In [1]:
import pandas as pd 
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Load the WHO COVID-19 global dataset from the raw-data folder.
covid_data = pd.read_csv("rawdata/WHO-COVID-19-global-data.csv")

In [3]:
# Display the loaded WHO table to check its columns and row count.
covid_data

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
0,2020/1/3,AF,Afghanistan,EMRO,0,0,0,0
1,2020/1/4,AF,Afghanistan,EMRO,0,0,0,0
2,2020/1/5,AF,Afghanistan,EMRO,0,0,0,0
3,2020/1/6,AF,Afghanistan,EMRO,0,0,0,0
4,2020/1/7,AF,Afghanistan,EMRO,0,0,0,0
...,...,...,...,...,...,...,...,...
185803,2022/2/20,ZW,Zimbabwe,AFRO,194,233224,1,5386
185804,2022/2/21,ZW,Zimbabwe,AFRO,128,233352,0,5386
185805,2022/2/22,ZW,Zimbabwe,AFRO,219,233571,0,5386
185806,2022/2/23,ZW,Zimbabwe,AFRO,409,233980,2,5388


Country_code in the original dataset is a two-letter code. It needs to be converted to the three-letter ISO country code used by Plotly's built-in country geometries. The mapping comes from "countries_codes_and_coordinates_new.csv" in the rawdata folder.

In [4]:
# Load the country-code mapping table (two-letter and three-letter codes,
# numeric code, and average coordinates per country).
country_code = pd.read_csv('rawdata/countries_codes_and_coordinates_new.csv')

In [5]:
# Display the mapping table to check its columns before joining.
country_code

Unnamed: 0,Country,Alpha-2code,Alpha-3code,Numeric code,Latitude (average),Longitude (average)
0,Afghanistan,AF,AFG,4,33.0000,65.0
1,Albania,AL,ALB,8,41.0000,20.0
2,Algeria,DZ,DZA,12,28.0000,3.0
3,American Samoa,AS,ASM,16,-14.3333,-170.0
4,Andorra,AD,AND,20,42.5000,1.6
...,...,...,...,...,...,...
251,Wallis and Futuna,WF,WLF,876,-13.3000,-176.2
252,Western Sahara,EH,ESH,732,24.5000,-13.0
253,Yemen,YE,YEM,887,15.0000,48.0
254,Zambia,ZM,ZMB,894,-15.0000,30.0


Clean the data:
1. Join the two datasets on the two-letter country code (not on the country name, because names may differ between the files, e.g. "United States of America" in covid_data but "United States" in country_code)
2. Align country names to the standard names in "countries_codes_and_coordinates_new.csv"
3. Keep the fields of  "country_name", "Date_reported", "Cumulative_cases", "Cumulative_deaths", "Alpha-3code"
4. Since the project focuses on showing cumulative cases and deaths, keep only the rows whose Date_reported is a 2020 quarter-end date ("2020/3/31", "2020/6/30", "2020/9/30", "2020/12/31") for all countries

In [6]:
# Rename the mapping table's "Country" column: covid_data also has a "Country"
# column, and the two names would overlap when the tables are joined.
country_code = country_code.rename(columns={"Country": "country_name"})

In [7]:
# Display the mapping table again to confirm the column is now "country_name".
country_code

Unnamed: 0,country_name,Alpha-2code,Alpha-3code,Numeric code,Latitude (average),Longitude (average)
0,Afghanistan,AF,AFG,4,33.0000,65.0
1,Albania,AL,ALB,8,41.0000,20.0
2,Algeria,DZ,DZA,12,28.0000,3.0
3,American Samoa,AS,ASM,16,-14.3333,-170.0
4,Andorra,AD,AND,20,42.5000,1.6
...,...,...,...,...,...,...
251,Wallis and Futuna,WF,WLF,876,-13.3000,-176.2
252,Western Sahara,EH,ESH,732,24.5000,-13.0
253,Yemen,YE,YEM,887,15.0000,48.0
254,Zambia,ZM,ZMB,894,-15.0000,30.0


In [8]:
# Preview the join: look up each WHO row's two-letter Country_code in the
# mapping table (indexed by Alpha-2code) and attach the mapping columns.
# how="left" is DataFrame.join's default; it is spelled out so it is clear
# that every WHO row is kept even when no mapping entry matches.
covid_data.join(
    country_code.set_index("Alpha-2code"),
    on="Country_code",
    how="left",
)

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths,country_name,Alpha-3code,Numeric code,Latitude (average),Longitude (average)
0,2020/1/3,AF,Afghanistan,EMRO,0,0,0,0,Afghanistan,AFG,4.0,33.0,65.0
1,2020/1/4,AF,Afghanistan,EMRO,0,0,0,0,Afghanistan,AFG,4.0,33.0,65.0
2,2020/1/5,AF,Afghanistan,EMRO,0,0,0,0,Afghanistan,AFG,4.0,33.0,65.0
3,2020/1/6,AF,Afghanistan,EMRO,0,0,0,0,Afghanistan,AFG,4.0,33.0,65.0
4,2020/1/7,AF,Afghanistan,EMRO,0,0,0,0,Afghanistan,AFG,4.0,33.0,65.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
185803,2022/2/20,ZW,Zimbabwe,AFRO,194,233224,1,5386,Zimbabwe,ZWE,716.0,-20.0,30.0
185804,2022/2/21,ZW,Zimbabwe,AFRO,128,233352,0,5386,Zimbabwe,ZWE,716.0,-20.0,30.0
185805,2022/2/22,ZW,Zimbabwe,AFRO,219,233571,0,5386,Zimbabwe,ZWE,716.0,-20.0,30.0
185806,2022/2/23,ZW,Zimbabwe,AFRO,409,233980,2,5388,Zimbabwe,ZWE,716.0,-20.0,30.0


In [9]:
# Attach the mapping columns (country_name, Alpha-3code, ...) to every WHO row
# by matching Country_code against the mapping table's Alpha-2code index.
# how="left" is DataFrame.join's default, written out to make the intent clear.
# NOTE(review): pandas.read_csv treats the literal string "NA" as missing by
# default. If either CSV uses "NA" as Namibia's two-letter code, confirm that
# those rows still join to the right country (see keep_default_na / na_values).
df = covid_data.join(
    country_code.set_index("Alpha-2code"),
    on="Country_code",
    how="left",
)

In [87]:
# Quarter-end reporting dates to keep, mapped to their quarter labels.
dic = {'2020/3/31': '2020Q1', '2020/6/30': '2020Q2', '2020/9/30': '2020Q3', '2020/12/31': '2020Q4'}

# Select the quarter-end rows and the plotting columns in one .loc call, then
# take an explicit copy. dff gets new columns assigned to it later; without the
# copy it is a slice of df and that assignment raises SettingWithCopyWarning.
dff = df.loc[
    df['Date_reported'].isin(list(dic)),
    ["country_name", "Date_reported", "Cumulative_cases", "Cumulative_deaths", "Alpha-3code"],
].copy()
dff

Unnamed: 0,country_name,Date_reported,Cumulative_cases,Cumulative_deaths,Alpha-3code
88,Afghanistan,2020/3/31,166,4,AFG
179,Afghanistan,2020/6/30,31445,739,AFG
271,Afghanistan,2020/9/30,39354,1462,AFG
363,Afghanistan,2020/12/31,52330,2189,AFG
872,Albania,2020/3/31,205,12,ALB
...,...,...,...,...,...
184603,Zambia,2020/12/31,20462,386,ZMB
185112,Zimbabwe,2020/3/31,8,1,ZWE
185203,Zimbabwe,2020/6/30,574,7,ZWE
185295,Zimbabwe,2020/9/30,7837,228,ZWE


In [89]:
def f(row):
    """Return the 2020 quarter label for a row's ``Date_reported`` value.

    ``row`` is anything indexable by ``'Date_reported'`` (for example a
    DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    The first three quarter-end dates map to '2020Q1'..'2020Q3'; every other
    value, including '2020/12/31', falls through to '2020Q4'.
    """
    reported = row['Date_reported']
    quarter_ends = (
        ('2020/3/31', '2020Q1'),
        ('2020/6/30', '2020Q2'),
        ('2020/9/30', '2020Q3'),
    )
    for end_date, label in quarter_ends:
        if reported == end_date:
            return label
    # Anything that is not one of the dates above is labelled as Q4.
    return '2020Q4'

# Label every row with its quarter. assign() returns a new frame instead of
# writing a column into dff in place, so no SettingWithCopyWarning is raised
# when dff is a slice of another DataFrame, and re-running the cell is safe.
dff = dff.assign(Quarter=dff.apply(f, axis=1))

dff

Unnamed: 0,country_name,Date_reported,Cumulative_cases,Cumulative_deaths,Alpha-3code,Quarter
88,Afghanistan,2020/3/31,166,4,AFG,2020Q1
179,Afghanistan,2020/6/30,31445,739,AFG,2020Q2
271,Afghanistan,2020/9/30,39354,1462,AFG,2020Q3
363,Afghanistan,2020/12/31,52330,2189,AFG,2020Q4
872,Albania,2020/3/31,205,12,ALB,2020Q1
...,...,...,...,...,...,...
184603,Zambia,2020/12/31,20462,386,ZMB,2020Q4
185112,Zimbabwe,2020/3/31,8,1,ZWE,2020Q1
185203,Zimbabwe,2020/6/30,574,7,ZWE,2020Q2
185295,Zimbabwe,2020/9/30,7837,228,ZWE,2020Q3
