# Modelling

## Steps

* Load Data Sets
* Select Relevant Columns
* write function to add file indicator
* Pipelines in Scikit Learn
* Modelling

In [577]:
# Loading Necessary Files

import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
%matplotlib inline
import io
import seaborn as sns
import time
import datetime

from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder# creating instance of one-hot-encoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

#from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
#from statsmodels.tsa.arima_model import ARIMA
#from statsmodels.tsa.stattools import adfuller
from geopy.distance import geodesic
from geopy.distance import distance
from geopy import Point


In [564]:
# Loading Kaggle Files

train_data = pd.read_csv("train.csv", encoding= 'unicode_escape', parse_dates = ['Date'])
test_data = pd.read_csv("test.csv", encoding= 'unicode_escape', parse_dates = ['Date'])
submission_data = pd.read_csv("submission.csv", encoding= 'unicode_escape')

# Loading Distance From China Data
lat_long = pd.read_csv("johns-hopkins-covid-19-daily-dashboard-cases-by-country.csv", encoding= 'unicode_escape')

# Loading Government Measurement Data

govt_measures_data = pd.read_csv("acaps-covid-19-government-measures-dataset.csv", encoding= 'unicode_escape')

# Loading Covid Indicators Data

covid_indicators_data = pd.read_csv("inform-covid-indicators.csv", encoding= 'unicode_escape')

# Combine Train and Test Data

In [565]:
#Adding Indicator Columns to identify datasets
train_data['data_set'] = 'Train'
test_data['data_set'] = 'Test'

#Convert Target columns into log scale
train_data['ConfirmedCases'] = np.log(train_data['ConfirmedCases']+1)
train_data['Fatalities'] = np.log(train_data['Fatalities']+1)

#Adding columns to test data set
test_data = test_data.rename(columns={"ForecastId": "Id"})
test_data['ConfirmedCases'] = None
test_data['Fatalities'] = None

data = pd.concat([train_data,test_data])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [566]:
# Days since first occurence

data['days'] = data['Date']-data['Date'].min()
data['days'] = data['days'].astype('timedelta64[D]').astype('int32')

# Prepare Government Measures Data

In [567]:
#Clean Data

var_req = ['country', 'measure']
govt_measures_data = govt_measures_data[var_req]
govt_measures_data['measure'] = govt_measures_data['measure'].str.lower()
govt_measures_data = govt_measures_data.drop_duplicates()


In [568]:
#Create Categorical Columns

govt_measures_data = govt_measures_data.reset_index()
govt_measures_data['val'] = 1
govt_measures_data = govt_measures_data.set_index(['index','country','measure']).unstack(level=2).fillna(0).groupby('country').max()
govt_measures_data = govt_measures_data.reset_index()

In [569]:
#Renaming Columns

names = govt_measures_data.columns
new_names = ['Country_Region', 'additional health/documents requirements upon arrival', 'amendments to funeral and burial regulations', 'awareness campaigns', 'border checks', 'border closure', 'changes in prison-related policies', 'checkpoints within the country', 'complete border closure', 'curfews', 'domestic travel restrictions', 'economic measures', 'emergency administrative structures activated or established', 'full lockdown', 'general recommendations', 'health screenings in airports and border crossings', 'humanitarian exemptions', 'international flights suspension', 'introduction of quarantine policies', 'limit product imports/exports', 'limit public gatherings', 'lockdown of refugee/idp camps or other minorities', 'mass population testing', 'military deployment', 'obligatory medical tests not related to covid-19', 'other public health measures enforced', 'partial lockdown', 'psychological assistance and medical social work', 'public services closure', 'requirement to wear protective gear in public', 'schools closure', 'state of emergency declared', 'strengthening the public health system', 'surveillance and monitoring', 'testing policy', 'visa restrictions']
new_names = ["gm_"+ s for s in new_names]
new_names[0] = 'Country_Region'
govt_measures_data.columns = new_names


In [570]:
#Join with the original dataset

data = data.merge(govt_measures_data, how = 'left', on='Country_Region')

# Add Distance From China

In [571]:
lat_long = lat_long[['country_region','lat','long']]
lat_long = lat_long.dropna(0)

#Wuhan Co-ordinates
Wuhan_Cord = (30.583332, 114.2833330)

#Calculate Distance from China
def calc_distance(row, site_coords):
    target_coords = (row['lat'], row['long'])
    dist = geodesic(site_coords, target_coords).miles
    return(dist)

lat_long['distance_from_china'] = lat_long.apply(calc_distance, site_coords=Wuhan_Cord, axis=1)

#Get Rid of Lat, Long Columns
lat_long = lat_long.rename(columns={"country_region": "Country_Region"}).drop(['lat', 'long'], axis=1)

In [572]:
data = data.merge(lat_long, how = 'left', on='Country_Region')

# Add CoVID Indicators

In [573]:
# Adding Indicator Columns
covid_indicators_data = covid_indicators_data.drop(['iso3'], axis=1)
names2 = covid_indicators_data.columns
names2 = ["ci_"+ s for s in names2]
names2[0] = 'Country_Region'
covid_indicators_data.columns = names2
covid_indicators_data = covid_indicators_data.replace({'No data': 0.0001, 'x':0.0001})

In [574]:
# Merge with Existing Data
data = data.merge(covid_indicators_data, how = 'left', on='Country_Region')

# Prepare Data

In [575]:
#Define function to deal with overfitting/incorrect predictions later on 
def data_prep(data):
    data = data.astype('float32')
    data = np.nan_to_num(data)
    
    pt = PowerTransformer()
    pt.fit_transform(data)
    
    scaler = StandardScaler()
    scaler.fit_transform(data)

    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp_mean.fit_transform(data)

    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    sel.fit_transform(data)
    
    return(data)

In [576]:
#Isolate Training and Testing Test

#Training Set
train = data[data['data_set'] == 'Train']
train = train.drop(['data_set', 'Id', 'Province_State', 'Country_Region', 'Date'], axis = 1)

train = data_prep(train)

#Testing Set
test = data[data['data_set'] == 'Test']
test = test.drop(['data_set', 'Id', 'Province_State', 'Country_Region', 'Date','ConfirmedCases', 'Fatalities'], axis = 1)

test = data_prep(test)

  loglike = -n_samples / 2 * np.log(x_trans.var())
  x = um.multiply(x, x, out=x)
  loglike = -n_samples / 2 * np.log(x_trans.var())
  x = um.multiply(x, x, out=x)


# Split into train and test 



Unnamed: 0,ConfirmedCases,Country_Region,Date,Fatalities,Id,Province_State,data_set,days,gm_additional health/documents requirements upon arrival,gm_amendments to funeral and burial regulations,...,ci_inform_health_conditions,ci_inform_epidemic_vulnerability,ci_mortality_rate_under_5,ci_prevalence_of_undernourishment,ci_inform_lack_of_coping_capacity,ci_inform_access_to_healthcare,ci_inform_epidemic_lack_of_coping_capacity,ci_physicians_density,ci_current_health_expenditure_per_capita,ci_maternal_mortality_ratio
77,0.0,Albania,2020-01-22,0.0,108,,Train,0.0,0.0,0.0,...,0.2,3.7,8.8000001907348597,6.2,4.2,3.7,4.9,12.00,759.67,29.00
78,0.0,Albania,2020-01-23,0.0,109,,Train,1.0,0.0,0.0,...,0.2,3.7,8.8000001907348597,6.2,4.2,3.7,4.9,12.00,759.67,29.00
79,0.0,Albania,2020-01-24,0.0,110,,Train,2.0,0.0,0.0,...,0.2,3.7,8.8000001907348597,6.2,4.2,3.7,4.9,12.00,759.67,29.00
80,0.0,Albania,2020-01-25,0.0,111,,Train,3.0,0.0,0.0,...,0.2,3.7,8.8000001907348597,6.2,4.2,3.7,4.9,12.00,759.67,29.00
81,0.0,Albania,2020-01-26,0.0,112,,Train,4.0,0.0,0.0,...,0.2,3.7,8.8000001907348597,6.2,4.2,3.7,4.9,12.00,759.67,29.00
82,0.0,Albania,2020-01-27,0.0,113,,Train,5.0,0.0,0.0,...,0.2,3.7,8.8000001907348597,6.2,4.2,3.7,4.9,12.00,759.67,29.00
83,0.0,Albania,2020-01-28,0.0,114,,Train,6.0,0.0,0.0,...,0.2,3.7,8.8000001907348597,6.2,4.2,3.7,4.9,12.00,759.67,29.00
84,0.0,Albania,2020-01-29,0.0,115,,Train,7.0,0.0,0.0,...,0.2,3.7,8.8000001907348597,6.2,4.2,3.7,4.9,12.00,759.67,29.00
85,0.0,Albania,2020-01-30,0.0,116,,Train,8.0,0.0,0.0,...,0.2,3.7,8.8000001907348597,6.2,4.2,3.7,4.9,12.00,759.67,29.00
86,0.0,Albania,2020-01-31,0.0,117,,Train,9.0,0.0,0.0,...,0.2,3.7,8.8000001907348597,6.2,4.2,3.7,4.9,12.00,759.67,29.00
