In [1]:
import pandas as pd
import requests
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Get data from Johns Hopkins University Center for Systems Science and Engineering (JHU CSSE)
def get_data(start_date, end_date):
    """Download and stack the JHU CSSE COVID-19 daily reports for a date range.

    One CSV per calendar day is read from the CSSEGISandData/COVID-19 GitHub
    repository. Each report gets a ``date`` column holding the day it was
    requested for, and all reports are concatenated into a single frame.

    Parameters
    ----------
    start_date, end_date
        Inclusive bounds of the range; anything ``pd.to_datetime`` accepts
        (e.g. ``"2020-01-22"`` or ``"today"``).

    Returns
    -------
    pandas.DataFrame
        Every report that could be loaded, re-indexed 0..n-1. The frame is
        empty when nothing could be loaded or when ``start_date`` lies after
        ``end_date``.

    Notes
    -----
    A day whose report cannot be downloaded or parsed (for example because it
    has not been published yet) is skipped with a warning; any other error
    propagates instead of being silently swallowed.
    """
    import warnings  # local import keeps this cell independent of the import cell

    url_template = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/{:02d}-{:02d}-{}.csv"

    day = pd.to_datetime(start_date)
    last_day = pd.to_datetime(end_date)
    one_day = pd.Timedelta(days = 1)

    # Collect the daily frames and concatenate once at the end: growing a
    # DataFrame inside the loop is quadratic, and DataFrame.append no longer
    # exists in pandas >= 2.0.
    frames = []
    while day <= last_day:
        url = url_template.format(day.month, day.day, day.year)
        try:
            daily = pd.read_csv(url)
        except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
            # HTTP/URL errors are OSError subclasses; the pandas errors cover
            # a file that exists but cannot be parsed.
            warnings.warn("Skipping report for {}: {}".format(day.date(), exc))
        else:
            daily["date"] = day
            frames.append(daily)
        day = day + one_day

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index = True, sort = False)

In [26]:
# Load every daily report from 2020-01-22 up to the day the cell is run.
df = get_data("2020-01-22", "today")
df

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,date
0,Anhui,Mainland China,1/22/2020 17:00,1.0,,,2020-01-22
1,Beijing,Mainland China,1/22/2020 17:00,14.0,,,2020-01-22
2,Chongqing,Mainland China,1/22/2020 17:00,6.0,,,2020-01-22
3,Fujian,Mainland China,1/22/2020 17:00,1.0,,,2020-01-22
4,Gansu,Mainland China,1/22/2020 17:00,,,,2020-01-22
...,...,...,...,...,...,...,...
2374,"Seattle, WA",US,2020-02-09T07:03:04,1.0,0.0,1.0,2020-02-25
2375,"Tempe, AZ",US,2020-02-25T21:23:03,1.0,0.0,1.0,2020-02-25
2376,"Lackland, TX (From Diamond Princess)",US,2020-02-24T23:33:02,0.0,0.0,0.0,2020-02-25
2377,"Omaha, NE (From Diamond Princess)",US,2020-02-24T23:33:02,0.0,0.0,0.0,2020-02-25


In [55]:
# Calendar-date view of the report timestamps (plain date objects, so the Series dtype is object).
df.date.dt.date

0       2020-01-22
1       2020-01-22
2       2020-01-22
3       2020-01-22
4       2020-01-22
           ...    
2374    2020-02-25
2375    2020-02-25
2376    2020-02-25
2377    2020-02-25
2378    2020-02-25
Name: date, Length: 2379, dtype: object

In [4]:
# Column dtypes and non-null counts of the combined reports.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2379 entries, 0 to 2378
Data columns (total 7 columns):
Province/State    1718 non-null object
Country/Region    2379 non-null object
Last Update       2379 non-null object
Confirmed         2360 non-null float64
Deaths            1938 non-null float64
Recovered         1991 non-null float64
date              2379 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(3), object(3)
memory usage: 130.2+ KB


In [6]:
# Rows that belong to the 2020-02-25 report only.
df[df.date == "2020-02-25"]

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,date
2285,Hubei,Mainland China,2020-02-25T15:23:04,64786.0,2563.0,18971.0,2020-02-25
2286,Guangdong,Mainland China,2020-02-25T08:53:02,1347.0,7.0,822.0,2020-02-25
2287,Henan,Mainland China,2020-02-25T12:43:02,1271.0,19.0,1002.0,2020-02-25
2288,Zhejiang,Mainland China,2020-02-25T09:13:05,1205.0,1.0,808.0,2020-02-25
2289,Hunan,Mainland China,2020-02-25T15:03:05,1016.0,4.0,768.0,2020-02-25
...,...,...,...,...,...,...,...
2374,"Seattle, WA",US,2020-02-09T07:03:04,1.0,0.0,1.0,2020-02-25
2375,"Tempe, AZ",US,2020-02-25T21:23:03,1.0,0.0,1.0,2020-02-25
2376,"Lackland, TX (From Diamond Princess)",US,2020-02-24T23:33:02,0.0,0.0,0.0,2020-02-25
2377,"Omaha, NE (From Diamond Princess)",US,2020-02-24T23:33:02,0.0,0.0,0.0,2020-02-25


In [20]:
# Daily totals per country, leaving out the "Mainland China" and "Others" entries.
df_co = (
    df.loc[~df["Country/Region"].isin(["Mainland China", "Others"])]
    .groupby(["Country/Region", "date"])[["Confirmed", "Deaths", "Recovered"]]
    .sum()
    .reset_index()
    .sort_values(["Country/Region", "date"])
)
df_co.head()

Unnamed: 0,Country/Region,date,Confirmed,Deaths,Recovered
0,Afghanistan,2020-02-24,1.0,0.0,0.0
1,Afghanistan,2020-02-25,1.0,0.0,0.0
2,Algeria,2020-02-25,1.0,0.0,0.0
3,Australia,2020-01-23,0.0,0.0,0.0
4,Australia,2020-01-25,4.0,0.0,0.0


In [27]:
# ISO country-code lookup table.
# By default pandas parses the literal string "NA" as a missing value, and "NA"
# is also Namibia's alpha-2 code. Restricting NA parsing to empty fields keeps
# such a code as text instead of turning it into NaN.
co_code = pd.read_csv("data/iso-country-codes.csv", keep_default_na = False, na_values = [""])
co_code.head()

Unnamed: 0,English short name lower case,Alpha-2 code,Alpha-3 code,Numeric code,ISO 3166-2
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX
2,Albania,AL,ALB,8,ISO 3166-2:AL
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ
4,American Samoa,AS,ASM,16,ISO 3166-2:AS


In [29]:
# Attach the ISO codes to every country/day row. The left join keeps countries
# whose name has no match in the lookup table; their code columns are NaN.
df_merge = df_co.merge(
    co_code,
    how = "left",
    left_on = "Country/Region",
    right_on = "English short name lower case",
)
df_merge

Unnamed: 0,Country/Region,date,Confirmed,Deaths,Recovered,English short name lower case,Alpha-2 code,Alpha-3 code,Numeric code,ISO 3166-2
0,Afghanistan,2020-02-24,1.0,0.0,0.0,Afghanistan,AF,AFG,4.0,ISO 3166-2:AF
1,Afghanistan,2020-02-25,1.0,0.0,0.0,Afghanistan,AF,AFG,4.0,ISO 3166-2:AF
2,Algeria,2020-02-25,1.0,0.0,0.0,Algeria,DZ,DZA,12.0,ISO 3166-2:DZ
3,Australia,2020-01-23,0.0,0.0,0.0,Australia,AU,AUS,36.0,ISO 3166-2:AU
4,Australia,2020-01-25,4.0,0.0,0.0,Australia,AU,AUS,36.0,ISO 3166-2:AU
...,...,...,...,...,...,...,...,...,...,...
869,Vietnam,2020-02-21,16.0,0.0,14.0,Vietnam,VN,VNM,704.0,ISO 3166-2:VN
870,Vietnam,2020-02-22,16.0,0.0,14.0,Vietnam,VN,VNM,704.0,ISO 3166-2:VN
871,Vietnam,2020-02-23,16.0,0.0,14.0,Vietnam,VN,VNM,704.0,ISO 3166-2:VN
872,Vietnam,2020-02-24,16.0,0.0,14.0,Vietnam,VN,VNM,704.0,ISO 3166-2:VN


In [37]:
# Countries that still have no alpha-2 code after the merge, in order of first appearance.
unknown_co = list(df_merge.loc[df_merge["Alpha-2 code"].isnull(), "Country/Region"].unique())
# Hand-picked alpha-2 codes, paired with `unknown_co` purely by position.
code_list = ["CI", "MO", "KR", "GB", "US"]
# zip() silently truncates to the shorter input, so a changed set of unmapped
# countries (newer data, or a re-run after the codes were already filled in)
# would assign wrong codes or none at all. Fail loudly instead.
if len(unknown_co) != len(code_list):
    raise ValueError(
        "Expected {} unmapped countries but found {}: {}. Update code_list to match.".format(
            len(code_list), len(unknown_co), unknown_co
        )
    )
mapper = dict(zip(unknown_co, code_list))
mapper

In [42]:
# Fill in the alpha-2 code by hand for the countries the ISO lookup missed.
is_unmapped = df_merge["Country/Region"].isin(unknown_co)
df_merge.loc[is_unmapped, "Alpha-2 code"] = df_merge.loc[is_unmapped, "Country/Region"].map(mapper)

In [57]:
# Flag image URL built from each row's alpha-2 code.
df_merge["Image"] = "https://www.countryflags.io/"+df_merge["Alpha-2 code"]+"/shiny/64.png"
# Reduce the timestamps to plain dates. Going through pd.to_datetime keeps the
# cell re-runnable: after a first run the column holds date objects (dtype
# object), on which a direct `.dt` access would raise.
df_merge["date"] = pd.to_datetime(df_merge["date"]).dt.date

In [58]:
# Wide table of confirmed counts: one row per country (with its code and flag
# URL), one column per date, missing combinations filled with 0.
confirmed = pd.pivot_table(
    df_merge,
    index = ["Country/Region", "Alpha-2 code", "Image"],
    columns = "date",
    values = "Confirmed",
    aggfunc = "sum",
    fill_value = 0,
)
confirmed.to_excel("confirmed.xlsx")