# Covid Data Creation

This notebook covers the complete development of the dataset for the countries for which we have performed a contact matrix analysis (144 different countries).

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn import preprocessing

In [3]:
# Load the per-country table (population, contact indices, density, reproduction
# number) and discard the leftover 'Unnamed: 0' column stored in the CSV.
data_df = (
    pd.read_csv(r"C:\Users\varun\Desktop\Udacity\UGP_data_notebook2.csv")
    .drop(columns=['Unnamed: 0'])
)
print(data_df.shape)
data_df.head()

(146, 9)


Unnamed: 0,Country,Population,Total_index,Home,Other,School,Work,Population_Density,Reproduction
0,Albania,2880913,38380680.0,10683260.0,14303260.0,5625936.0,7768229.0,105.0,0.95
1,Algeria,43053054,637359600.0,175955300.0,286382100.0,114798700.0,60223550.0,18.41,0.93
2,Antigua and Barbuda,97115,1584368.0,416127.4,546484.4,224288.5,397467.9,222.6,1.4
3,Argentina,44780675,673122100.0,202079000.0,237314500.0,96132510.0,137596100.0,16.52,0.94
4,Armenia,2957728,41850800.0,11644230.0,14558150.0,6163886.0,9484537.0,104.1,1.1


In [4]:
# Per-country COVID time series with Date, Confirmed, Recovered and Deaths columns.
covid_csv_path = r"C:\Users\varun\Desktop\Udacity\UGP\world_covid.csv"
covid_df = pd.read_csv(covid_csv_path)
covid_df.head()

Unnamed: 0,Date,Country,Confirmed,Recovered,Deaths
0,2020-01-22,Afghanistan,0,0,0
1,2020-01-23,Afghanistan,0,0,0
2,2020-01-24,Afghanistan,0,0,0
3,2020-01-25,Afghanistan,0,0,0
4,2020-01-26,Afghanistan,0,0,0


In [5]:
# Country names exactly as spelled in the contact-matrix table, in row order.
countries = data_df['Country'].tolist()
countries

['Albania',
 'Algeria',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia (Plurinational State of',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Chile',
 'China',
 'Colombia',
 'Congo',
 'Costa Rica',
 'Croatia',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Estonia',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Guatemala',
 'Guinea',
 'Guyana',
 'Haiti',
 'Honduras',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran (Islamic Republic of)',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kiribati',
 'Kuwait',
 'Kyrgyzstan',
 "Lao People's Democratic Republi",
 'Latvia',


In [6]:
# Distinct country names found in the COVID time series.
countries_sim = covid_df['Country'].unique().tolist()
countries_sim

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burma',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czechia',
 'Denmark',
 'Diamond Princess',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',

In order to automate the process, we first need to find the countries in our contact-matrix list that are not present in the COVID dataset.

In [7]:
# Contact-matrix countries whose name has no exact match in the COVID data.
diff = [name for name in countries if name not in countries_sim]
diff

['Bolivia (Plurinational State of',
 'Brunei Darussalam',
 'Congo',
 'Czech Republic',
 'Iran (Islamic Republic of)',
 'Kiribati',
 "Lao People's Democratic Republi",
 'Republic of Korea',
 'Russian Federation',
 'Sao Tome and Principe ',
 'Syrian Arab Republic',
 'TFYR of Macedonia',
 'Tonga',
 'United Kingdom of Great Britain',
 'United States of America',
 'Venezuela (Bolivarian Republic ',
 'Viet Nam']

Let's check whether the differences above are due to unavailable data or to a mismatch in country names.

• Viet Nam - Vietnam
• Venezuela (Bolivarian Republic - Venezuela
• United Kingdom of Great Britain - United Kingdom
• Syrian Arab Republic - Syria
• Sao Tome and Principe  - remove the space at the end
• Russian Federation - Russia
• Republic of Korea - Korea, South
• Lao People's Democratic Republi - Laos
• Iran (Islamic Republic of) - Iran
• Czech Republic - Czechia
• Congo - Congo (Brazzaville)
• Bolivia (Plurinational State of - Bolivia

In [8]:
# Contact-matrix spelling -> spelling used in the COVID dataset.
# Some keys are truncated (or carry a trailing space) because that is how the
# names are stored in the contact-matrix CSV; they must be matched verbatim.
# NOTE(review): 'Brunei Darussalam' stays unmatched after this step although the
# COVID country list contains 'Brunei' - confirm whether it belongs in this map.
country_renames = {
    'Viet Nam': 'Vietnam',
    'Venezuela (Bolivarian Republic ': 'Venezuela',
    'United Kingdom of Great Britain': 'United Kingdom',
    'Syrian Arab Republic': 'Syria',
    'Sao Tome and Principe ': 'Sao Tome and Principe',
    'Russian Federation': 'Russia',
    'Republic of Korea': 'Korea, South',
    "Lao People's Democratic Republi": 'Laos',
    'Iran (Islamic Republic of)': 'Iran',
    'Czech Republic': 'Czechia',
    'Congo': 'Congo (Brazzaville)',
    'Bolivia (Plurinational State of': 'Bolivia',
}

# Series.replace only touches exact matches and leaves every other value alone,
# so re-running this cell is a no-op instead of an IndexError on a name that
# has already been renamed.
data_df['Country'] = data_df['Country'].replace(country_renames)

In [9]:
# Refresh the name list so it reflects the renamed countries.
countries = data_df['Country'].tolist()

In [10]:
# Countries that still have no exact match in the COVID data after renaming.
diff = [name for name in countries if name not in countries_sim]
diff

['Brunei Darussalam',
 'Kiribati',
 'TFYR of Macedonia',
 'Tonga',
 'United States of America']

Let's create the list of countries for which we need to build a dataset.

In [11]:
# Countries present in both sources; these are the ones exported below.
final_count = [name for name in countries if name in countries_sim]
final_count

['Albania',
 'Algeria',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Bulgaria',
 'Burkina Faso',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Chile',
 'China',
 'Colombia',
 'Congo (Brazzaville)',
 'Costa Rica',
 'Croatia',
 'Cyprus',
 'Czechia',
 'Denmark',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Estonia',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Guatemala',
 'Guinea',
 'Guyana',
 'Haiti',
 'Honduras',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kuwait',
 'Kyrgyzstan',
 'Laos',
 'Latvia',
 'Lebanon',
 'Lesotho',
 'Liberia',
 'Lithuania',
 'Luxembourg',
 'Malaysia',
 'Maldives',
 'Malta',


## Creating different Datasets

In [12]:
# Preview one country's series; reset_index() keeps the old row labels in an 'index' column.
yemen_rows = covid_df['Country'] == 'Yemen'
covid_df.loc[yemen_rows].reset_index()

Unnamed: 0,index,Date,Country,Confirmed,Recovered,Deaths
0,80892,2020-01-22,Yemen,0,0,0
1,80893,2020-01-23,Yemen,0,0,0
2,80894,2020-01-24,Yemen,0,0,0
3,80895,2020-01-25,Yemen,0,0,0
4,80896,2020-01-26,Yemen,0,0,0
...,...,...,...,...,...,...
423,81315,2021-03-20,Yemen,3278,1530,737
424,81316,2021-03-21,Yemen,3418,1534,751
425,81317,2021-03-22,Yemen,3516,1546,771
426,81318,2021-03-23,Yemen,3612,1566,785


In [13]:
# Spot-check the population lookup for a single country via its row label.
albania_rows = data_df['Country'] == 'Albania'
temp = data_df.loc[albania_rows].index[0]
data_df.at[temp, 'Population']

2880913

In [19]:
# Write one CSV per matched country, without header or index. Each file holds
# that country's rows with Confirmed > 0, minus the Country and Date columns,
# plus a constant Population column taken from the contact-matrix table.
for item in final_count:
    # Rows of this country that already report at least one confirmed case.
    has_cases = (covid_df['Country'] == item) & (covid_df['Confirmed'] > 0)

    # The country's population (first matching row of the contact-matrix table).
    val = data_df.loc[data_df['Country'] == item, 'Population'].iloc[0]

    df = (
        covid_df.loc[has_cases]
        .reset_index(drop=True)
        .drop(columns=['Country', 'Date'])
        .assign(Population=val)
    )

    s = "C:\\Users\\varun\\Desktop\\Udacity\\UGP\\Covid\\" + item + ".csv"
    df.to_csv(s, index=False, header = False)

# Mobility Time Series

Since we are particularly interested in India, it makes sense to build the mobility time series for the different regions of India. <br/>
• Acquire the population of the various regions of India listed in the Google mobility dataset <br/>
• Predict the contact rates using the model we built <br/>
• Construct the time series

In [12]:
# NOTE: data_df is rebound here - from this cell on it holds the India mobility
# table, no longer the per-country contact-matrix table loaded earlier.
india_mobility_csv = r"C:\Users\varun\Desktop\Udacity\UGP\Mobility\India.csv"
data_df = pd.read_csv(india_mobility_csv)
data_df.head()

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,place_id,date,recreation,grocery,parks,stations,workplace,home
0,IN,India,,,,,,ChIJkbeSa_BfYzARphNChaFPjNc,15-02-2020,1.0,2.0,3.0,3.0,5.0,0.0
1,IN,India,,,,,,ChIJkbeSa_BfYzARphNChaFPjNc,16-02-2020,2.0,2.0,3.0,2.0,0.0,0.0
2,IN,India,,,,,,ChIJkbeSa_BfYzARphNChaFPjNc,17-02-2020,-1.0,1.0,3.0,1.0,4.0,0.0
3,IN,India,,,,,,ChIJkbeSa_BfYzARphNChaFPjNc,18-02-2020,0.0,2.0,4.0,2.0,3.0,0.0
4,IN,India,,,,,,ChIJkbeSa_BfYzARphNChaFPjNc,19-02-2020,0.0,2.0,1.0,1.0,4.0,1.0


In [24]:
# Lower-case the sub_region_2 names so they can be compared case-insensitively
# with the names from the population table (missing values stay NaN).
data_df['sub_region_2'] = data_df['sub_region_2'].str.lower()

In [25]:
# Distinct lower-cased sub_region_2 values from the mobility table (NaN included).
l1 = data_df['sub_region_2'].unique().tolist()
l1

[nan,
 'north and middle andaman',
 'south andaman',
 'anantapuram',
 'chittoor',
 'east godavari',
 'guntur',
 'krishna',
 'kurnool',
 'prakasam',
 'sri potti sriramulu nellore district',
 'srikakulam',
 'vishakhapatnam',
 'vizianagaram',
 'west godavari',
 'ysr district',
 'east kameng',
 'east siang',
 'papum pare',
 'tawang',
 'west kameng',
 'baksa',
 'barpeta',
 'bongaigaon',
 'cachar',
 'chirang',
 'darrang',
 'dhemaji',
 'dhubri',
 'dibrugarh',
 'dima hasao',
 'goalpara',
 'golaghat',
 'hailakandi',
 'jorhat',
 'kamrup',
 'kamrup metropolitan',
 'karbi anglong',
 'karimganj',
 'kokrajhar',
 'lakhimpur',
 'morigaon',
 'nagaon',
 'nalbari',
 'sivasagar',
 'sonitpur',
 'tinsukia',
 'udalguri',
 'araria',
 'arwal',
 'aurangabad',
 'banka',
 'begusarai',
 'bhagalpur',
 'bhojpur',
 'buxar',
 'darbhanga',
 'gaya',
 'gopalganj',
 'jamui',
 'jehanabad',
 'kaimur',
 'katihar',
 'khagaria',
 'kishanganj',
 'lakhisarai',
 'madhepura',
 'madhubani',
 'munger',
 'muzaffarpur',
 'nalanda',
 '

In [18]:
# Number of distinct sub_region_2 values; the count includes the NaN entry.
len(l1)

628

In [20]:
# Population table with one row per place name.
population_csv = r"C:\Users\varun\Desktop\Udacity\UGP\Mobility\pop_data.csv"
pop_df = pd.read_csv(population_csv)
pop_df.head()

Unnamed: 0,Population,name
0,12691836,Mumbai
1,10927986,Delhi
2,5104047,Bengaluru
3,4631392,Kolkata
4,4328063,Chennai


In [26]:
# Lower-case the place names so they can be compared case-insensitively with
# the mobility table's sub_region_2 values.
pop_df['name'] = pop_df['name'].str.lower()

In [27]:
# Distinct lower-cased place names from the population table.
l2 = pop_df['name'].unique().tolist()
l2

['mumbai',
 'delhi',
 'bengaluru',
 'kolkata',
 'chennai',
 'ahmedabad',
 'hyderabad',
 'pune',
 'surat',
 'kanpur',
 'jaipur',
 'navi mumbai',
 'lucknow',
 'nagpur',
 'indore',
 'patna',
 'bhopal',
 'ludhiana',
 'tirunelveli',
 'agra',
 'vadodara',
 'gorakhpur',
 'nashik',
 'pimpri',
 'kalyan',
 'thane',
 'meerut',
 'nowrangapur',
 'faridabad',
 'ghaziabad',
 'dombivli',
 'rajkot',
 'varanasi',
 'amritsar',
 'allahabad',
 'visakhapatnam',
 'teni',
 'jabalpur',
 'haora',
 'aurangabad',
 'shivaji nagar',
 'solapur',
 'srinagar',
 'chandigarh',
 'coimbatore',
 'jodhpur',
 'madurai',
 'guwahati',
 'gwalior',
 'vijayawada',
 'mysore',
 'ranchi',
 'hubli',
 'jalandhar',
 'thiruvananthapuram',
 'salem',
 'tiruchirappalli',
 'kota',
 'bhubaneshwar',
 'aligarh',
 'bareilly',
 'moradabad',
 'bhiwandi',
 'raipur',
 'bhilai',
 'jamshedpur',
 'borivli',
 'cochin',
 'amravati',
 'sangli',
 'cuttack',
 'bikaner',
 'warangal',
 'bhavnagar',
 'nanded',
 'raurkela',
 'guntur',
 'dehra dun',
 'bhayandar

In [28]:
# Population-table names with no exact match among the mobility sub_region_2 values.
diff = [name for name in l2 if name not in l1]
diff

['delhi',
 'bengaluru',
 'kanpur',
 'navi mumbai',
 'pimpri',
 'kalyan',
 'nowrangapur',
 'dombivli',
 'allahabad',
 'visakhapatnam',
 'teni',
 'haora',
 'shivaji nagar',
 'srinagar',
 'chandigarh',
 'guwahati',
 'vijayawada',
 'mysore',
 'hubli',
 'bhubaneshwar',
 'bhiwandi',
 'bhilai',
 'jamshedpur',
 'borivli',
 'cochin',
 'raurkela',
 'dehra dun',
 'bhayandar',
 'durgapur',
 'ulhasnagar',
 'shiliguri',
 'bilimora',
 'karol bagh',
 'asansol',
 'bhatpara',
 'jammu',
 'ramgundam',
 'shyamnagar',
 'nangi',
 'malegaon',
 'davangere',
 'belgaum',
 'mangalore',
 'nellore',
 'panihati',
 'ahmadnagar',
 'dhulia',
 'punasa',
 'kukatpalli',
 'ambattur',
 'kamarhati',
 'chanda',
 'trichur',
 'brahmapur',
 'shahjanpur',
 'kulti',
 'rajahmundry',
 'barddhaman',
 'barasat',
 'bali',
 'noida',
 'greater noida',
 'kakinada',
 'ichalkaranji',
 'lal bahadur nagar',
 'baranagar',
 'gajuwaka',
 'naihati',
 'tirupati',
 'sonipat',
 'avadi',
 'tiruvottiyur',
 'saugor',
 'bihar sharif',
 'anantapur',
 'ra