In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob

In [2]:
# Let pandas render up to 1000 rows before truncating displayed frames/series
pd.options.display.max_rows = 1000

Read files

In [3]:
# Build the CSV locations relative to the directory the notebook runs from.
# os.path.join inserts the platform's separator and avoids a doubled slash
# when the working directory already ends with one (e.g. the filesystem root).
current_path = os.getcwd()
train_path = os.path.join(current_path, 'train.csv')
test_path = os.path.join(current_path, 'test.csv')

In [4]:
# Load the train and test CSV files into DataFrames
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Report (rows, columns) of each frame as a quick sanity check
print('shape of train data:', train_df.shape)
print('shape of test data:', test_df.shape)

shape of train data: (16756, 8)
shape of test data: (12212, 6)


In [5]:
# Preview the first rows of the training data to check the available columns
train_df.head()

Unnamed: 0,Id,Province/State,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities
0,1,,Afghanistan,33.0,65.0,2020-01-22,0.0,0.0
1,2,,Afghanistan,33.0,65.0,2020-01-23,0.0,0.0
2,3,,Afghanistan,33.0,65.0,2020-01-24,0.0,0.0
3,4,,Afghanistan,33.0,65.0,2020-01-25,0.0,0.0
4,5,,Afghanistan,33.0,65.0,2020-01-26,0.0,0.0


In [6]:
# Count how many test rows fall on each date.
# NOTE(review): a uniform count per date suggests one row per location per day - confirm.
test_df['Date'].value_counts()

2020-03-14    284
2020-04-12    284
2020-04-19    284
2020-03-29    284
2020-04-03    284
2020-03-24    284
2020-04-10    284
2020-04-08    284
2020-04-23    284
2020-04-04    284
2020-03-13    284
2020-04-14    284
2020-03-21    284
2020-03-12    284
2020-04-22    284
2020-03-19    284
2020-04-05    284
2020-04-01    284
2020-04-13    284
2020-04-02    284
2020-04-06    284
2020-04-11    284
2020-04-20    284
2020-03-23    284
2020-04-07    284
2020-03-20    284
2020-03-16    284
2020-03-31    284
2020-03-15    284
2020-03-22    284
2020-04-17    284
2020-03-18    284
2020-04-15    284
2020-04-21    284
2020-03-25    284
2020-03-26    284
2020-03-27    284
2020-04-09    284
2020-03-30    284
2020-04-16    284
2020-03-17    284
2020-04-18    284
2020-03-28    284
Name: Date, dtype: int64

Perform data transformation

In [7]:
# ConfirmedCases and Fatalities are counts, so store them as integers
for count_column in ('ConfirmedCases', 'Fatalities'):
    train_df[count_column] = train_df[count_column].astype(int)

In [8]:
# Parse the string 'Date' column into pandas datetimes and keep the result
# in a separate 'Modified_Date' column (the original strings are left as-is)
parsed_dates = pd.to_datetime(train_df['Date'])
train_df['Modified_Date'] = parsed_dates

In [9]:
# Extract the calendar month (1-12) from the parsed dates.
# The vectorised .dt accessor replaces a per-row Python lambda.
train_df["month"] = train_df['Modified_Date'].dt.month

In [10]:
# Inspect the training frame after adding the Modified_Date and month columns
train_df

Unnamed: 0,Id,Province/State,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities,Modified_Date,month
0,1,,Afghanistan,33.0000,65.0000,2020-01-22,0,0,2020-01-22,1
1,2,,Afghanistan,33.0000,65.0000,2020-01-23,0,0,2020-01-23,1
2,3,,Afghanistan,33.0000,65.0000,2020-01-24,0,0,2020-01-24,1
3,4,,Afghanistan,33.0000,65.0000,2020-01-25,0,0,2020-01-25,1
4,5,,Afghanistan,33.0000,65.0000,2020-01-26,0,0,2020-01-26,1
...,...,...,...,...,...,...,...,...,...,...
16751,26374,,Zambia,-15.4167,28.2833,2020-03-16,0,0,2020-03-16,3
16752,26375,,Zambia,-15.4167,28.2833,2020-03-17,0,0,2020-03-17,3
16753,26376,,Zambia,-15.4167,28.2833,2020-03-18,2,0,2020-03-18,3
16754,26377,,Zambia,-15.4167,28.2833,2020-03-19,2,0,2020-03-19,3


In [11]:
# Show the rows that have no Province/State value
train_df[train_df['Province/State'].isna()]

Unnamed: 0,Id,Province/State,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities,Modified_Date,month
0,1,,Afghanistan,33.0000,65.0000,2020-01-22,0,0,2020-01-22,1
1,2,,Afghanistan,33.0000,65.0000,2020-01-23,0,0,2020-01-23,1
2,3,,Afghanistan,33.0000,65.0000,2020-01-24,0,0,2020-01-24,1
3,4,,Afghanistan,33.0000,65.0000,2020-01-25,0,0,2020-01-25,1
4,5,,Afghanistan,33.0000,65.0000,2020-01-26,0,0,2020-01-26,1
...,...,...,...,...,...,...,...,...,...,...
16751,26374,,Zambia,-15.4167,28.2833,2020-03-16,0,0,2020-03-16,3
16752,26375,,Zambia,-15.4167,28.2833,2020-03-17,0,0,2020-03-17,3
16753,26376,,Zambia,-15.4167,28.2833,2020-03-18,2,0,2020-03-18,3
16754,26377,,Zambia,-15.4167,28.2833,2020-03-19,2,0,2020-03-19,3


In [12]:
# Distinct values of each categorical column, in order of first appearance
unique_country, unique_state = (
    train_df[column].unique() for column in ('Country/Region', 'Province/State')
)

In [13]:
# Two-way lookup between each country name and an integer code
# (the code is the country's position in unique_country)
unique_country_to_index_dict = {
    country: position for position, country in enumerate(unique_country)
}
index_to_unique_country_dict = dict(enumerate(unique_country))

In [14]:
# Two-way lookup between each province/state value and an integer code
# (the code is the value's position in unique_state)
unique_state_to_index_dict = {
    state: position for position, state in enumerate(unique_state)
}
index_to_unique_state_dict = dict(enumerate(unique_state))

In [15]:
# Encode the string columns as integers using the lookup dictionaries.
train_df['country_index'] = train_df['Country/Region'].map(
    lambda country: unique_country_to_index_dict[country]
)

# Missing provinces are NaN, and NaN != NaN, so a plain dict lookup only
# succeeds when the probe is the very same NaN object that was stored as the
# key. Find the code assigned to the missing value once (via pd.isna) and use
# it for every missing entry, so the encoding does not depend on object identity.
missing_state_index = next(
    (code for state, code in unique_state_to_index_dict.items() if pd.isna(state)),
    None,
)
train_df['state_index'] = train_df['Province/State'].map(
    lambda state: missing_state_index if pd.isna(state) else unique_state_to_index_dict[state]
)


In [16]:
# Split the training data into one frame per calendar month (Jan, Feb, Mar)
month1_df, month2_df, month3_df = (
    train_df.loc[train_df['month'].eq(month_number)] for month_number in (1, 2, 3)
)

In [17]:
# Inspect the January subset, which now carries the country_index / state_index columns
month1_df

Unnamed: 0,Id,Province/State,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities,Modified_Date,month,country_index,state_index
0,1,,Afghanistan,33.0000,65.0000,2020-01-22,0,0,2020-01-22,1,0,0
1,2,,Afghanistan,33.0000,65.0000,2020-01-23,0,0,2020-01-23,1,0,0
2,3,,Afghanistan,33.0000,65.0000,2020-01-24,0,0,2020-01-24,1,0,0
3,4,,Afghanistan,33.0000,65.0000,2020-01-25,0,0,2020-01-25,1,0,0
4,5,,Afghanistan,33.0000,65.0000,2020-01-26,0,0,2020-01-26,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
16702,26325,,Zambia,-15.4167,28.2833,2020-01-27,0,0,2020-01-27,1,162,0
16703,26326,,Zambia,-15.4167,28.2833,2020-01-28,0,0,2020-01-28,1,162,0
16704,26327,,Zambia,-15.4167,28.2833,2020-01-29,0,0,2020-01-29,1,162,0
16705,26328,,Zambia,-15.4167,28.2833,2020-01-30,0,0,2020-01-30,1,162,0


In [18]:
# Per-country number of March rows that report at least one confirmed case
has_confirmed_cases = month3_df['ConfirmedCases'] > 0
month3_df.loc[has_confirmed_cases, 'Country/Region'].value_counts()

China                               660
US                                  585
Australia                           140
Canada                              126
France                               91
United Kingdom                       59
Denmark                              37
Netherlands                          35
Japan                                20
Russia                               20
San Marino                           20
Oman                                 20
Egypt                                20
Azerbaijan                           20
North Macedonia                      20
Iraq                                 20
Iceland                              20
Vietnam                              20
Lebanon                              20
Austria                              20
Sri Lanka                            20
Mexico                               20
Georgia                              20
Sweden                               20
Romania                              20


In [19]:
# Number of January rows per country.
# NOTE(review): countries with many rows presumably have several Province/State entries - confirm.
month1_df['Country/Region'].value_counts()

US                                  580
China                               330
Canada                              110
Australia                            90
France                               80
United Kingdom                       50
Netherlands                          30
Denmark                              20
Estonia                              10
Iran                                 10
Luxembourg                           10
Serbia                               10
Switzerland                          10
Barbados                             10
Congo (Kinshasa)                     10
Bolivia                              10
Lebanon                              10
Benin                                10
South Africa                         10
Argentina                            10
Russia                               10
Japan                                10
Saint Lucia                          10
Montenegro                           10
Andorra                              10
