In [219]:
import pandas as pd

In [220]:
# Load the two raw tables (confirmed cases and deaths) with identical reader settings.
cases, deaths = map(
    pd.read_csv,
    (
        'data/covid19/RAW_global_confirmed_cases.csv',
        'data/covid19/RAW_global_deaths.csv',
    ),
)

# Clean dataset

In [218]:
def preprocess(df):
    """Turn a wide table of cumulative totals into tidy per-country daily counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table containing a 'Country/Region' column, 'Lat' and 'Long'
        columns, and one numeric column per date holding running totals.
        Any other non-numeric column (presumably a province/state label --
        confirm against the CSV) is discarded. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A single 'value' column holding the day-over-day difference of the
        totals, indexed by a sorted ('date', 'Country') MultiIndex whose
        'Country' level is categorical. The first date of each country has no
        predecessor, so its difference is reported as 0.
    """
    # no need for geographic info (drop returns a new frame, the caller's is untouched)
    totals = df.drop(columns=['Lat', 'Long'])
    # Merge all regions into their country. numeric_only=True keeps text columns
    # out of the result: recent pandas no longer drops them silently, and a
    # leftover text column would be mistaken for a date column below.
    totals = totals.groupby('Country/Region').sum(numeric_only=True)
    totals = totals.rename_axis('Country').reset_index()
    # Let's turn this into tidy data format: one row per (Country, date)
    date_columns = totals.columns[1:]
    tidy = pd.melt(
        totals,
        id_vars=['Country'],
        value_vars=date_columns,
        var_name='date',
        value_name='value',
    )
    tidy['date'] = pd.to_datetime(tidy['date'])
    tidy['Country'] = tidy['Country'].astype('category')
    # set double index; sorting is required for label slicing on the MultiIndex
    tidy = tidy.set_index(['date', 'Country']).sort_index()
    # cumulative totals -> daily counts; each country's first day becomes NaN, then 0
    daily = tidy.groupby(level='Country').diff().fillna(0)
    return daily

# Run both raw tables through the same pipeline, naming each value column after its metric.
cases_cleaned = preprocess(cases).rename(columns={'value': 'confirmed_cases'})
deaths_cleaned = preprocess(deaths).rename(columns={'value': 'deaths'})

# Place the two metrics side by side on their shared (date, Country) index,
# deaths first so the column order stays deaths, confirmed_cases.
cleaned_dataset = pd.concat([deaths_cleaned, cases_cleaned], axis=1)
cleaned_dataset.head(3)
# Export is disabled; enable the next line to (re)write the cleaned CSV.
#cleaned_dataset.to_csv('data/covid19data_cleaned.csv')

Unnamed: 0_level_0,Unnamed: 1_level_0,deaths,confirmed_cases
date,Country,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-22,Afghanistan,0.0,0.0
2020-01-22,Albania,0.0,0.0
2020-01-22,Algeria,0.0,0.0


# How to read the cleaned dataset

In [221]:
# Reload the cleaned CSV and rebuild the (date, Country) index in a single chain.
df = (
    pd.read_csv('data/covid19data_cleaned.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index(['date', 'Country'])
    .sort_index()  # very important: the index must be sorted
)
df.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,deaths,confirmed_cases
date,Country,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-22,Afghanistan,0.0,0.0
2020-01-22,Albania,0.0,0.0


# Example queries

In [228]:
# Short alias for pd.IndexSlice, used below to write slices over the
# double-indexed (date, Country) dataframe inside .loc[...].
idx = pd.IndexSlice

In [229]:
# Confirmed cases for every country, dates sliced from 1 April to 1 May 2020.
april_window = idx['1st april 2020':'1st may 2020', :]
cleaned_dataset.loc[april_window, ['confirmed_cases']].head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,confirmed_cases
date,Country,Unnamed: 2_level_1
2020-04-01,Afghanistan,63.0
2020-04-01,Albania,16.0
2020-04-01,Algeria,131.0


In [230]:
# The three rows with the largest 'deaths' value on 17 April 2020.
one_day = cleaned_dataset.loc[idx['17th april 2020'], :]
one_day.nlargest(3, 'deaths')

Unnamed: 0_level_0,Unnamed: 1_level_0,deaths,confirmed_cases
date,Country,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-04-17,US,2091.0,32806.0
2020-04-17,China,1290.0,357.0
2020-04-17,United Kingdom,914.0,4990.0


In [231]:
# create masks
# DatetimeIndex.week was removed in pandas 2.0; isocalendar().week is its replacement.
# The comparison result is labelled by date only, so it is converted to a plain
# boolean array before being used to select rows of the (date, Country) frame.
iso_week = cleaned_dataset.index.get_level_values(0).isocalendar().week
week_42 = (iso_week == 42).to_numpy(dtype=bool)
cleaned_dataset.loc[week_42].head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,deaths,confirmed_cases
date,Country,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-10-12,Afghanistan,2.0,71.0
2020-10-12,Albania,4.0,171.0
2020-10-12,Algeria,8.0,253.0


In [232]:
# For each date: the (date, Country) label with the most deaths, and that maximum.
# Rows are shuffled to preview a varied selection of dates; the fixed random_state
# keeps the displayed sample the same on every run.
(
    cleaned_dataset
    .groupby(level='date')
    .agg({'deaths': ['idxmax', 'max']})
    .sample(frac=1, random_state=42)
    .head(3)
)

Unnamed: 0_level_0,deaths,deaths
Unnamed: 0_level_1,idxmax,max
date,Unnamed: 1_level_2,Unnamed: 2_level_2
2020-09-22,"(2020-09-22 00:00:00, India)",1085.0
2020-07-10,"(2020-07-10 00:00:00, Brazil)",1214.0
2020-11-13,"(2020-11-13 00:00:00, US)",1138.0


In [234]:
# Which country has the highest deaths each day
# idxmax yields, per date, the (date, Country) index label of the row with the most deaths.
when_deaths_are_higher = cleaned_dataset.groupby(level='date')['deaths'].idxmax()
# Shuffle for a varied preview; the fixed random_state keeps the output reproducible.
cleaned_dataset.loc[when_deaths_are_higher, ['deaths']].sample(frac=1, random_state=42).head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,deaths
date,Country,Unnamed: 2_level_1
2020-06-03,Brazil,1349.0
2020-07-26,India,711.0
2020-05-27,US,1508.0
