# Modelling

## Steps

* Load Data Sets
* Select Relevant Columns
* write function to add file indicator
* Pipelines in Scikit Learn
* Modelling

In [78]:
# Loading Necessary Files

import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
%matplotlib inline
import io
import seaborn as sns
import time
import datetime

from sklearn.preprocessing import OneHotEncoder# creating instance of one-hot-encoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import adfuller
from geopy.distance import geodesic
from geopy import Point


In [138]:
# Loading Kaggle Files

train_data = pd.read_csv("train.csv", encoding= 'unicode_escape', parse_dates = ['Date'])
test_data = pd.read_csv("test.csv", encoding= 'unicode_escape', parse_dates = ['Date'])
submission_data = pd.read_csv("submission.csv", encoding= 'unicode_escape')

# Loading Distance From China Data
lat_long = pd.read_csv("johns-hopkins-covid-19-daily-dashboard-cases-by-country.csv", encoding= 'unicode_escape')

# Loading Government Measurement Data

govt_measures_data = pd.read_csv("acaps-covid-19-government-measures-dataset.csv", encoding= 'unicode_escape')

# Loading Covid Indicators Data

covid_indicators_data = pd.read_csv("inform-covid-indicators.csv", encoding= 'unicode_escape')

# Combine Train and Test Data

In [139]:
#Adding Indicator Columns to identify datasets
train_data['data_set'] = 'Train'
test_data['data_set'] = 'Test'

#Adding columns to test data set
test_data = test_data.rename(columns={"ForecastId": "Id"})
test_data['ConfirmedCases'] = None
test_data['Fatalities'] = None

data = pd.concat([train_data,test_data])


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.


# Prepare Government Measures Data

In [140]:
var_req = ['country', 'measure']
govt_measures_data = govt_measures_data[var_req]
govt_measures_data.drop_duplicates()

#Create Categorical Columns
govt_measures_data = govt_measures_data.reset_index()
govt_measures_data['val'] = 1
govt_measures_data = govt_measures_data.set_index(['index','country','measure']).unstack(level=2).fillna(0).groupby('country').max()

In [141]:
govt_measures_data.columns

MultiIndex(levels=[['val'], ['Additional health/documents requirements upon arrival', 'Amendments to funeral and burial regulations', 'Awareness campaigns', 'Border checks', 'Border closure', 'Changes in prison-related policies', 'Checkpoints within the country', 'Complete border closure', 'Curfews', 'Domestic travel restrictions', 'Economic measures', 'Emergency administrative structures activated or established', 'Full lockdown', 'General recommendations', 'Health screenings in airports and border crossings', 'Humanitarian exemptions', 'International flights suspension', 'Introduction of quarantine policies', 'Limit product imports/exports', 'Limit public gatherings', 'Lockdown of refugee/idp camps or other minorities', 'Mass population testing', 'Military deployment', 'Obligatory medical tests not related to COVID-19', 'Other public health measures enforced', 'Partial lockdown', 'Psychological assistance and medical social work', 'Public services closure', 'Requirement to wear prote

In [76]:
data = train_data.merge(govt_measures_data, how = 'left', left_on='Country_Region', right_on='country')

# Add Distance From China

In [77]:
lat_long = lat_long[['country_region','lat','long']]

data = data.merge(lat_long, how = 'left', left_on ='Country_Region', right_on= 'country_region' )
data.isnull().sum()

Id                     0
Province_State    280049
Country_Region         0
Date                   0
ConfirmedCases         0
Fatalities             0
data_set               0
country             1078
iso                 1078
measure             1078
country_region         0
lat                  154
long                 154
dtype: int64

In [53]:
data['point'] = data.apply(lambda row: Point(latitude=row['lat'], longitude=row['long']), axis=1)
data

ValueError: ('Point coordinates must be finite. (nan, nan, 0.0) has been passed as coordinates.', 'occurred at index 228382')

In [54]:
data

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,country,iso,measure,country_region,lat,long
0,1,,Afghanistan,2020-01-22,0.0,0.0,Afghanistan,AFG,Health screenings in airports and border cross...,Afghanistan,33.939110,67.709953
1,1,,Afghanistan,2020-01-22,0.0,0.0,Afghanistan,AFG,Introduction of quarantine policies,Afghanistan,33.939110,67.709953
2,1,,Afghanistan,2020-01-22,0.0,0.0,Afghanistan,AFG,Awareness campaigns,Afghanistan,33.939110,67.709953
3,1,,Afghanistan,2020-01-22,0.0,0.0,Afghanistan,AFG,Emergency administrative structures activated ...,Afghanistan,33.939110,67.709953
4,1,,Afghanistan,2020-01-22,0.0,0.0,Afghanistan,AFG,Limit public gatherings,Afghanistan,33.939110,67.709953
5,1,,Afghanistan,2020-01-22,0.0,0.0,Afghanistan,AFG,Border closure,Afghanistan,33.939110,67.709953
6,1,,Afghanistan,2020-01-22,0.0,0.0,Afghanistan,AFG,Schools closure,Afghanistan,33.939110,67.709953
7,1,,Afghanistan,2020-01-22,0.0,0.0,Afghanistan,AFG,Limit public gatherings,Afghanistan,33.939110,67.709953
8,1,,Afghanistan,2020-01-22,0.0,0.0,Afghanistan,AFG,Strengthening the public health system,Afghanistan,33.939110,67.709953
9,1,,Afghanistan,2020-01-22,0.0,0.0,Afghanistan,AFG,Limit product imports/exports,Afghanistan,33.939110,67.709953
