In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta,date
import matplotlib.pyplot as plt
import folium
import plotly.express as px
from matplotlib import ticker 
from scipy.interpolate import make_interp_spline, BSpline
import pycountry_convert as pc
#import geopandas as gpd

## Interactive web-based dashboard https://www.arcgis.com/apps/opsdashboard/index.html#/bda7594740fd40299423467b48e9ecf6

## COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University
### https://github.com/CSSEGISandData/COVID-19/

### Get Data

In [2]:
# Source: COVID-19 Data Repository by CSSE at Johns Hopkins University
# https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data
# (the repository is updated daily, so results depend on when this cell runs)

# Raw CSV locations of the three global time-series tables
url_confirmed = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
url_death = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
url_recovered = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"

# One wide-format frame per metric (one column per reporting date)
df_confirmed = pd.read_csv(url_confirmed)
df_deaths = pd.read_csv(url_death)
df_recovered = pd.read_csv(url_recovered)

# US confirmed time series (currently not loaded):
#url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv"
#df_confirmed_US = pd.read_csv( url )

In [3]:
# Preview the first rows of the wide-format confirmed table (one column per date)
df_confirmed.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,1/6/21,1/7/21,1/8/21,1/9/21,1/10/21,1/11/21,1/12/21,1/13/21,1/14/21,1/15/21
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,53105,53207,53332,53400,53489,53538,53584,53584,53775,53831
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,61008,61705,62378,63033,63595,63971,64627,65334,65994,66635
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,101120,101382,101657,101913,102144,102369,102641,102860,103127,103381
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,8348,8348,8489,8586,8586,8586,8682,8818,8868,8946
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,17864,17974,18066,18156,18193,18254,18343,18425,18613,18679


In [4]:
# Report (rows, columns) for each of the three wide-format tables
for frame in (df_confirmed, df_deaths, df_recovered):
    print(frame.shape)

(272, 364)
(272, 364)
(257, 364)


In [5]:
# The recovered table has fewer rows because Canada appears in it as a single
# country-level row, with no Province/State breakdown
is_canada = df_recovered["Country/Region"].eq('Canada')
df_recovered.loc[is_canada]

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,1/6/21,1/7/21,1/8/21,1/9/21,1/10/21,1/11/21,1/12/21,1/13/21,1/14/21,1/15/21
39,,Canada,56.1304,-106.3468,0,0,0,0,0,0,...,537024,544047,551983,558594,565049,575152,582822,591131,599753,608322


In [6]:
# Reshape confirmed from wide to long: the first four columns identify the
# location, every remaining (date) column becomes a 'date'/'confirmed' pair
confirmed_id_columns = df_confirmed.columns[:4].tolist()
confirmed_date_columns = df_confirmed.columns[4:].tolist()
df_confirmed = df_confirmed.melt(
    id_vars=confirmed_id_columns,
    value_vars=confirmed_date_columns,
    var_name='date',
    value_name='confirmed',
)
df_confirmed.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,confirmed
0,,Afghanistan,33.93911,67.709953,1/22/20,0
1,,Albania,41.1533,20.1683,1/22/20,0
2,,Algeria,28.0339,1.6596,1/22/20,0
3,,Andorra,42.5063,1.5218,1/22/20,0
4,,Angola,-11.2027,17.8739,1/22/20,0


In [7]:
# Reshape deaths from wide to long: the first four columns identify the
# location, every remaining (date) column becomes a 'date'/'deaths' pair
deaths_id_columns = df_deaths.columns[:4].tolist()
deaths_date_columns = df_deaths.columns[4:].tolist()
df_deaths = df_deaths.melt(
    id_vars=deaths_id_columns,
    value_vars=deaths_date_columns,
    var_name='date',
    value_name='deaths',
)

In [8]:
# Reshape recovered from wide to long: the first four columns identify the
# location, every remaining (date) column becomes a 'date'/'recovered' pair
recovered_id_columns = df_recovered.columns[:4].tolist()
recovered_date_columns = df_recovered.columns[4:].tolist()
df_recovered = df_recovered.melt(
    id_vars=recovered_id_columns,
    value_vars=recovered_date_columns,
    var_name='date',
    value_name='recovered',
)

In [9]:
# Report (rows, columns) for each of the three long-format tables
for frame in (df_confirmed, df_deaths, df_recovered):
    print(frame.shape)

(97920, 6)
(97920, 6)
(92520, 6)


In [10]:
# Collapse Canada to one row per date in the confirmed and deaths tables so
# they line up with the recovered table, which has only a single Canada row.

# Per-date Canada totals. The value column is selected *before* summing so the
# aggregation never touches the string/coordinate columns.
is_canada_confirmed = df_confirmed['Country/Region'] == 'Canada'
is_canada_deaths = df_deaths['Country/Region'] == 'Canada'
df_confirmed_canada = df_confirmed[is_canada_confirmed].groupby('date')[['confirmed']].sum()
df_deaths_canada = df_deaths[is_canada_deaths].groupby('date')[['deaths']].sum()

# Use the recovered table's Canada rows as a template for the identifying
# columns: keep everything except the trailing 'recovered' value column
df_recovered_canada = df_recovered[df_recovered['Country/Region'] == 'Canada']
df_canada_template = df_recovered_canada[df_recovered_canada.columns[:-1]].reset_index(drop=True)

# Attach the aggregated totals to the template, matching on date
# (the aggregated frames are indexed by date after the groupby)
df_confirmed_canada = df_canada_template.merge(df_confirmed_canada, how='inner', left_on='date', right_index=True)
df_deaths_canada = df_canada_template.merge(df_deaths_canada, how='inner', left_on='date', right_index=True)

# Replace the original Canada rows with the aggregated ones.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
df_confirmed = pd.concat([df_confirmed[~is_canada_confirmed], df_confirmed_canada])
df_deaths = pd.concat([df_deaths[~is_canada_deaths], df_deaths_canada])

In [11]:
# Preview the first rows of the long-format recovered table
df_recovered.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,recovered
0,,Afghanistan,33.93911,67.709953,1/22/20,0
1,,Albania,41.1533,20.1683,1/22/20,0
2,,Algeria,28.0339,1.6596,1/22/20,0
3,,Andorra,42.5063,1.5218,1/22/20,0
4,,Angola,-11.2027,17.8739,1/22/20,0


In [12]:
# Combine confirmed, deaths and recovered into one table, matching rows on
# location and date. Lat/Long exist in all three inputs, so the first merge
# produces suffixed copies (Lat_x/Lat_y, Long_x/Long_y) alongside the
# unsuffixed pair contributed by the recovered table.
merge_keys = ['Country/Region', 'Province/State', 'date']
df_data = (
    df_confirmed
    .merge(df_deaths, how='inner', on=merge_keys)
    .merge(df_recovered, how='inner', on=merge_keys)
)

In [13]:
# Inspect the merged table (coordinate columns appear with _x/_y suffixes)
df_data.head()

Unnamed: 0,Province/State,Country/Region,Lat_x,Long_x,date,confirmed,Lat_y,Long_y,deaths,Lat,Long,recovered
0,,Afghanistan,33.93911,67.709953,1/22/20,0,33.93911,67.709953,0,33.93911,67.709953,0
1,,Albania,41.1533,20.1683,1/22/20,0,41.1533,20.1683,0,41.1533,20.1683,0
2,,Algeria,28.0339,1.6596,1/22/20,0,28.0339,1.6596,0,28.0339,1.6596,0
3,,Andorra,42.5063,1.5218,1/22/20,0,42.5063,1.5218,0,42.5063,1.5218,0
4,,Angola,-11.2027,17.8739,1/22/20,0,-11.2027,17.8739,0,-11.2027,17.8739,0


In [14]:
# Discard the suffixed coordinate copies created by the merges; the
# unsuffixed Lat/Long columns are kept
duplicate_coord_columns = ['Lat_x', 'Lat_y', 'Long_x', 'Long_y']
df_data = df_data.drop(columns=duplicate_coord_columns)

In [15]:
# Confirm that only one Lat/Long pair remains after the drop
df_data.head()

Unnamed: 0,Province/State,Country/Region,date,confirmed,deaths,Lat,Long,recovered
0,,Afghanistan,1/22/20,0,0,33.93911,67.709953,0
1,,Albania,1/22/20,0,0,41.1533,20.1683,0
2,,Algeria,1/22/20,0,0,28.0339,1.6596,0
3,,Andorra,1/22/20,0,0,42.5063,1.5218,0
4,,Angola,1/22/20,0,0,-11.2027,17.8739,0


In [16]:
# Load the JHU CSSE UID/ISO/FIPS lookup table; its Population column is what
# gets attached to each Country_Region / Province_State later in the notebook
url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv"
df_pop = pd.read_csv( url )

In [17]:
# Preview the first rows of the lookup table
df_pop.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Population
0,4,AF,AFG,4.0,,,,Afghanistan,33.93911,67.709953,Afghanistan,38928341.0
1,8,AL,ALB,8.0,,,,Albania,41.1533,20.1683,Albania,2877800.0
2,12,DZ,DZA,12.0,,,,Algeria,28.0339,1.6596,Algeria,43851043.0
3,20,AD,AND,20.0,,,,Andorra,42.5063,1.5218,Andorra,77265.0
4,24,AO,AGO,24.0,,,,Angola,-11.2027,17.8739,Angola,32866268.0


In [18]:
# Check that every country name in the case data also exists in the
# population lookup table; any name printed here has no match.
# The lookup names are collected once into a set instead of being recomputed
# (and scanned linearly) on every loop iteration.
known_countries = set(df_pop['Country_Region'].unique())
for c in df_data['Country/Region'].unique():
  if c not in known_countries:
    print(c)

In [19]:
# Align the key column names with those used by the population lookup table
column_renames = {
    "Country/Region": "Country_Region",
    "Province/State": "Province_State",
}
df_data = df_data.rename(columns=column_renames)

In [20]:
# Attach the Population figure to every row, matching on country and province
population_lookup = df_pop[['Country_Region', 'Province_State', 'Population']]
df_data1 = df_data.merge(
    population_lookup,
    how='inner',
    on=['Country_Region', 'Province_State'],
)

In [21]:
# List the Country_Region of every row that has no population value
missing_population = df_data1['Population'].isna()
df_data1.loc[missing_population, 'Country_Region']

31680    Diamond Princess
31681    Diamond Princess
31682    Diamond Princess
31683    Diamond Princess
31684    Diamond Princess
               ...       
56155          MS Zaandam
56156          MS Zaandam
56157          MS Zaandam
56158          MS Zaandam
56159          MS Zaandam
Name: Country_Region, Length: 720, dtype: object

In [22]:
# Write the combined table to a CSV file (relative path), omitting the index
df_data1.to_csv('covid19_confirmed_deaths_recovered_global.csv',index=False)

In [23]:
# Read the exported CSV back in to check what was written
test = pd.read_csv('covid19_confirmed_deaths_recovered_global.csv' )

In [24]:
# Preview the first rows of the reloaded export
test.head()

Unnamed: 0,Province_State,Country_Region,date,confirmed,deaths,Lat,Long,recovered,Population
0,,Afghanistan,1/22/20,0,0,33.93911,67.709953,0,38928341.0
1,,Afghanistan,1/23/20,0,0,33.93911,67.709953,0,38928341.0
2,,Afghanistan,1/24/20,0,0,33.93911,67.709953,0,38928341.0
3,,Afghanistan,1/25/20,0,0,33.93911,67.709953,0,38928341.0
4,,Afghanistan,1/26/20,0,0,33.93911,67.709953,0,38928341.0
