In [1]:
import pandas as pd
import requests
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
#get data from Johns Hopkins University Center for Systems Science and Engineering (JHU CCSE)
def get_data(start_date, end_date):
    result = pd.DataFrame()
    temp_date = pd.to_datetime(start_date)
    e_date = pd.to_datetime(end_date)
    while temp_date <= e_date:
        try:
            temp_df = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/{:02d}-{:02d}-{}.csv".format(temp_date.month, temp_date.day, temp_date.year))
            temp_df["date"] = temp_date
            result = result.append(temp_df, ignore_index = True)
        except:
            pass
        temp_date = temp_date + pd.Timedelta(days = 1)
    return result

In [3]:
df = get_data("2020-01-22", "today")
df

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Unnamed: 0,Confirmed,Country/Region,Deaths,Last Update,Latitude,Longitude,Province/State,Recovered,date
0,1.0,Mainland China,,1/22/2020 17:00,,,Anhui,,2020-01-22
1,14.0,Mainland China,,1/22/2020 17:00,,,Beijing,,2020-01-22
2,6.0,Mainland China,,1/22/2020 17:00,,,Chongqing,,2020-01-22
3,1.0,Mainland China,,1/22/2020 17:00,,,Fujian,,2020-01-22
4,,Mainland China,,1/22/2020 17:00,,,Gansu,,2020-01-22
...,...,...,...,...,...,...,...,...,...
4242,0.0,Australia,0.0,2020-03-06T04:33:03,-12.4634,130.8456,Northern Territory,0.0,2020-03-08
4243,0.0,US,0.0,2020-02-24T23:33:02,29.3829,-98.6134,"Lackland, TX (From Diamond Princess)",0.0,2020-03-08
4244,0.0,US,0.0,2020-03-07T19:53:02,30.3213,-95.4778,"Montgomery County, TX",0.0,2020-03-08
4245,0.0,US,0.0,2020-02-24T23:33:02,41.2545,-95.9758,"Omaha, NE (From Diamond Princess)",0.0,2020-03-08


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4247 entries, 0 to 4246
Data columns (total 9 columns):
Confirmed         4228 non-null float64
Country/Region    4247 non-null object
Deaths            3806 non-null float64
Last Update       4247 non-null object
Latitude          1429 non-null float64
Longitude         1429 non-null float64
Province/State    2749 non-null object
Recovered         3859 non-null float64
date              4247 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(5), object(3)
memory usage: 298.7+ KB


In [5]:
df_co = df[~df["Country/Region"].isin(["Mainland China", "Others"])].groupby(["Country/Region", "date"])[["Confirmed", "Deaths", "Recovered"]].sum().reset_index().sort_values(["Country/Region", "date"])
df_co.head()

Unnamed: 0,Country/Region,date,Confirmed,Deaths,Recovered
0,Azerbaijan,2020-02-28,1.0,0.0,0.0
1,Afghanistan,2020-02-24,1.0,0.0,0.0
2,Afghanistan,2020-02-25,1.0,0.0,0.0
3,Afghanistan,2020-02-26,1.0,0.0,0.0
4,Afghanistan,2020-02-27,1.0,0.0,0.0


In [6]:
co_code = pd.read_csv("data/iso-country-codes.csv")
co_code.head()

Unnamed: 0,English short name lower case,Alpha-2 code,Alpha-3 code,Numeric code,ISO 3166-2
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX
2,Albania,AL,ALB,8,ISO 3166-2:AL
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ
4,American Samoa,AS,ASM,16,ISO 3166-2:AS


In [29]:
df_merge = pd.merge(left = df_co, right = co_code, how = "left", left_on = "Country/Region", right_on = "English short name lower case")
df_merge

Unnamed: 0,Country/Region,date,Confirmed,Deaths,Recovered,English short name lower case,Alpha-2 code,Alpha-3 code,Numeric code,ISO 3166-2
0,Azerbaijan,2020-02-28,1.0,0.0,0.0,,,,,
1,Afghanistan,2020-02-24,1.0,0.0,0.0,Afghanistan,AF,AFG,4.0,ISO 3166-2:AF
2,Afghanistan,2020-02-25,1.0,0.0,0.0,Afghanistan,AF,AFG,4.0,ISO 3166-2:AF
3,Afghanistan,2020-02-26,1.0,0.0,0.0,Afghanistan,AF,AFG,4.0,ISO 3166-2:AF
4,Afghanistan,2020-02-27,1.0,0.0,0.0,Afghanistan,AF,AFG,4.0,ISO 3166-2:AF
...,...,...,...,...,...,...,...,...,...,...
1778,Vietnam,2020-03-04,16.0,0.0,16.0,Vietnam,VN,VNM,704.0,ISO 3166-2:VN
1779,Vietnam,2020-03-05,16.0,0.0,16.0,Vietnam,VN,VNM,704.0,ISO 3166-2:VN
1780,Vietnam,2020-03-06,16.0,0.0,16.0,Vietnam,VN,VNM,704.0,ISO 3166-2:VN
1781,Vietnam,2020-03-07,18.0,0.0,16.0,Vietnam,VN,VNM,704.0,ISO 3166-2:VN


In [30]:
unknown_co = list(df_merge[df_merge["Alpha-2 code"].isnull()]["Country/Region"].unique())
mapper = {' Azerbaijan': 'AZ', 
    'Ivory Coast': 'CI',
    'Macau': 'MO',
    'Moldova': 'MD',
    'North Ireland': '',
    'Palestine': 'PS',
    'Republic of Ireland': 'IE',
    'Saint Barthelemy': 'BL',
    'North Macedonia': 'MK',
    'South Korea': 'KR',
    'UK': 'UK',
    'US': 'US',
    'Vatican City': 'VA'}
for i in unknown_co:
    print(i)
    print(mapper[i])

 Azerbaijan
AZ
Ivory Coast
CI
Macau
MO
Moldova
MD
North Ireland

North Macedonia
MK
Palestine
PS
Republic of Ireland
IE
Saint Barthelemy
BL
South Korea
KR
UK
UK
US
US
Vatican City
VA


In [32]:
for i in unknown_co:
    print(i)

 Azerbaijan
Ivory Coast
Macau
Moldova
North Ireland
North Macedonia
Palestine
Republic of Ireland
Saint Barthelemy
South Korea
UK
US
Vatican City


In [33]:
# add countrycode manually for unmapped country
df_merge.loc[df_merge["Country/Region"].isin(unknown_co), "Alpha-2 code"] = df_merge.loc[df_merge["Country/Region"].isin(unknown_co), "Country/Region"].map(mapper)

In [34]:
df_merge["Image"] = "https://www.countryflags.io/"+df_merge["Alpha-2 code"]+"/shiny/64.png"
df_merge["date"] = df_merge.date.dt.date
df_merge.to_excel("df_merge.xlsx")

In [35]:
confirmed = df_merge.pivot_table(index = ["Country/Region", "Alpha-2 code", "Image"], columns = "date", values = "Confirmed", aggfunc = "sum", fill_value = 0)
confirmed.to_excel("confirmed.xlsx")