In [111]:
import json
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime
%matplotlib inline

In [103]:
path = 'data/kaggle/'
data = pd.read_csv(path + 'train.csv',
                   parse_dates=['Date'],
                   dtype={
                       'Id': 'int',
                       'Province_State': 'category',
                       'Country_Region': 'category',
                       'ConfirmedCases': 'float',
                       'Fatalities': 'float'
                   })
data.replace([np.inf, -np.inf], np.nan)
print(data.shape)
data

(22644, 6)


Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities
0,1,,Afghanistan,2020-01-22,0.0,0.0
1,2,,Afghanistan,2020-01-23,0.0,0.0
2,3,,Afghanistan,2020-01-24,0.0,0.0
3,4,,Afghanistan,2020-01-25,0.0,0.0
4,5,,Afghanistan,2020-01-26,0.0,0.0
...,...,...,...,...,...,...
22639,32705,,Zimbabwe,2020-03-31,8.0,1.0
22640,32706,,Zimbabwe,2020-04-01,8.0,1.0
22641,32707,,Zimbabwe,2020-04-02,9.0,1.0
22642,32708,,Zimbabwe,2020-04-03,9.0,1.0


In [104]:
print(data['Province_State'].describe())
data.groupby('Province_State')['ConfirmedCases'].sum()

count         9620
unique         130
top       Zhejiang
freq            74
Name: Province_State, dtype: object


Province_State
Alabama     11316.0
Alaska       1310.0
Alberta      9429.0
Anguilla       19.0
Anhui       59527.0
             ...   
Wyoming      1344.0
Xinjiang     4435.0
Yukon          46.0
Yunnan      10979.0
Zhejiang    76233.0
Name: ConfirmedCases, Length: 130, dtype: float64

more than half province state is NA, dropping province state sounds ok, we can link population density later if needed.

In [105]:
#data.drop('Province_State', axis=1, inplace=True)
data.drop('Id', axis=1, inplace=True)

### EDA

In [140]:
cases = data.groupby(['Country_Region', 'Date'])['ConfirmedCases'].sum()
cases_latest = cases.iloc[cases.index.get_level_values('Date') == datetime.datetime(2020,4, 4)]
ind = np.argsort(cases_latest)[::-1]

deaths = data.groupby(['Country_Region', 'Date'])['Fatalities'].sum()
deaths_latest = deaths.iloc[deaths.index.get_level_values('Date') == datetime.datetime(2020,4, 4)]

In [141]:
fatality_rate_bycountry = deaths_latest / cases_latest

In [142]:
indf = np.argsort(fatality_rate_bycountry)[::-1]
fatality_rate_bycountry = fatality_rate_bycountry[indf]

In [143]:
top_20_fatal = fatality_rate_bycountry[:20]
top_20_cases = cases_latest[:20]

In [144]:
top_20_fatal

Country_Region    Date      
Botswana          2020-04-04    0.250000
Gambia            2020-04-04    0.250000
MS Zaandam        2020-04-04    0.222222
Sudan             2020-04-04    0.200000
Angola            2020-04-04    0.200000
Nicaragua         2020-04-04    0.200000
Guyana            2020-04-04    0.173913
Mauritania        2020-04-04    0.166667
Bahamas           2020-04-04    0.142857
Cabo Verde        2020-04-04    0.142857
Syria             2020-04-04    0.125000
San Marino        2020-04-04    0.123552
Italy             2020-04-04    0.123259
Congo (Kinshasa)  2020-04-04    0.116883
Bangladesh        2020-04-04    0.114286
Zimbabwe          2020-04-04    0.111111
Algeria           2020-04-04    0.103917
United Kingdom    2020-04-04    0.101704
Liberia           2020-04-04    0.100000
Suriname          2020-04-04    0.100000
dtype: float64

In [145]:
top_20_cases

Country_Region       Date      
Afghanistan          2020-04-04      299.0
Albania              2020-04-04      333.0
Algeria              2020-04-04     1251.0
Andorra              2020-04-04      466.0
Angola               2020-04-04       10.0
Antigua and Barbuda  2020-04-04       15.0
Argentina            2020-04-04     1451.0
Armenia              2020-04-04      770.0
Australia            2020-04-04     5550.0
Austria              2020-04-04    11781.0
Azerbaijan           2020-04-04      521.0
Bahamas              2020-04-04       28.0
Bahrain              2020-04-04      688.0
Bangladesh           2020-04-04       70.0
Barbados             2020-04-04       52.0
Belarus              2020-04-04      440.0
Belgium              2020-04-04    18431.0
Belize               2020-04-04        4.0
Benin                2020-04-04       16.0
Bhutan               2020-04-04        5.0
Name: ConfirmedCases, dtype: float64

Aside from Spain and Italy, top 20 fatalaties do not have the highest number of cases, in fact it seems African/Developed coutries, related to health care system? most likely, 
we can check this dataset later: https://www.kaggle.com/danevans/world-bank-wdi-212-health-systems

other factor could be pollution, China has considerably higher pollution rate than most countries but health care and tech advance plays a huge role clearly. We'll see...

## Data Featuring

In [87]:
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day
data['weekday'] = data['Date'].dt.weekday

weekday_name = data['Date'].dt.weekday_name  # for reference if needed

In [88]:
#data.drop('Date', axis=1, inplace=True)

In [89]:
cases, fatalities = data['ConfirmedCases'], data['Fatalities']  # Get y_1, y_2
data.drop('ConfirmedCases', axis=1, inplace=True)
data.drop('Fatalities', axis=1, inplace=True)

### Join density data
source = https://worldpopulationreview.com/countries/countries-by-density/#dataTable

In [90]:
dens = {}
with open('data/rho.json', 'r') as js:
    d = json.load(js)['data']
for js in d:
    dens[js['name']] = js['density']  ## /km2

In [91]:
data['Density'] = data['Country_Region'].apply(lambda c: float(dens[c]) if c in dens else float('nan'))

### Join health care data

In [92]:
health_care = pd.read_csv(path+'../2.12_Health_systems.csv')

In [93]:
missing_countries = set(data['Country_Region']) - set(health_care['Country_Region'])
print ("Missing countries are: {}".format(missing_countries))

Missing countries are: {'Kosovo', 'Sierra Leone', 'Holy See', 'West Bank and Gaza', 'Taiwan*', 'Diamond Princess', 'MS Zaandam', 'Burma'}


In [94]:
countries = set(data['Country_Region'])
health_js = {}
for i, rec in health_care.iterrows():
    if rec['Country_Region'] in countries:
        health_js[rec['Country_Region']] = [
            rec['Health_exp_pct_GDP_2016'], rec['Physicians_per_1000_2009-18'],
            rec['Completeness_of_death_reg_2008-16']
        ]

data['Health_GDP'] = data['Country_Region'].apply(
    lambda c: health_js[c][0] if c in health_js else float('nan'))
data['Physicians'] = data['Country_Region'].apply(
    lambda c: health_js[c][1] if c in health_js else float('nan'))
data['Completeness_of_death_reg'] = data['Country_Region'].apply(
    lambda c: health_js[c][2] if c in health_js else float('nan'))

we just need to fill Density, Health GDP, and physicians with mean from other countries and create new columns

In [95]:

for column in data:
    print(column)
    print(
        f"\t\t dtype: {data[column].dtype}, # NA's{data[column].isnull().sum()}"
    )
    print()

Province_State
		 dtype: category, # NA's13024

Country_Region
		 dtype: category, # NA's0

Date
		 dtype: datetime64[ns], # NA's0

Year
		 dtype: int64, # NA's0

Month
		 dtype: int64, # NA's0

Day
		 dtype: int64, # NA's0

weekday
		 dtype: int64, # NA's0

Density
		 dtype: float64, # NA's5106

Health_GDP
		 dtype: float64, # NA's8584

Physicians
		 dtype: float64, # NA's8510

Completeness_of_death_reg
		 dtype: float64, # NA's13098



In [96]:
for column in ['Density', 'Health_GDP', 'Health_GDP', 'Completeness_of_death_reg']:
    data[column+'_NA'] = data[column].isnull()
    data[column] = data[column].fillna(data[column].mean())

In [97]:
data.drop('Date', axis=1, inplace=True)

In [98]:
data.head()

Unnamed: 0,Province_State,Country_Region,Year,Month,Day,weekday,Density,Health_GDP,Physicians,Completeness_of_death_reg,Density_NA,Health_GDP_NA,Completeness_of_death_reg_NA
0,,Afghanistan,2020,1,22,2,59.685,10.2,0.3,91.907752,False,False,True
1,,Afghanistan,2020,1,23,3,59.685,10.2,0.3,91.907752,False,False,True
2,,Afghanistan,2020,1,24,4,59.685,10.2,0.3,91.907752,False,False,True
3,,Afghanistan,2020,1,25,5,59.685,10.2,0.3,91.907752,False,False,True
4,,Afghanistan,2020,1,26,6,59.685,10.2,0.3,91.907752,False,False,True


In [99]:
data['ConfirmedCases'], data['Fatalities'] = cases, fatalities

In [100]:
data.to_feather('data/covid19_data')

### Feature importance and analysis 

In [None]:
data['']