# Data wrangling for COVID-19 data

## Import libraries

In [31]:
import pandas as pd
import numpy as np

## Load data from GitHub

In [32]:
# Download the standardized JHU CSSE table and convert its `date` column
# to datetimes in the same chained expression.
data = pd.read_csv(
    "https://raw.githubusercontent.com/coviddata/coviddata/master/data/sources/jhu_csse/standardized/standardized.csv"
).assign(date=lambda frame: pd.to_datetime(frame.date))

In [33]:
# Preview the first five rows of the raw table.
data.head(n=5)

Unnamed: 0,date,country,region,place,cases,deaths,recoveries
0,2020-01-22,Mainland China,Anhui,,1,0,0
1,2020-01-22,Mainland China,Beijing,,14,0,0
2,2020-01-22,Mainland China,Chongqing,,6,0,0
3,2020-01-22,Mainland China,Fujian,,1,0,0
4,2020-01-22,Mainland China,Gansu,,0,0,0


In [34]:
# Summary statistics of the raw table (quartiles spelled out; same as the defaults).
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,cases,deaths,recoveries
count,48552.0,48552.0,48552.0
mean,281.07493,12.204214,76.589656
std,3396.702516,240.281361,1544.60278
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,2.0,0.0,0.0
75%,17.0,0.0,0.0
max,119827.0,14681.0,63612.0


In [35]:
# Build one synthetic "World" row per calendar day by summing the counts of
# every location reported for that day.
#
# min_count=180: a day's total is left as NaN unless at least 180 non-missing
# values contribute to it (see the NaN rows at the start of the output).
# NOTE(review): the reason for choosing 180 is not recorded in this notebook — confirm.
df_world = (
    data
    .drop(columns=["country", "region", "place"])  # keep only date + numeric counters
    .set_index("date")
    .resample("D")  # canonical daily alias; the lowercase "d" spelling is deprecated in recent pandas
    .sum(min_count=180)
    .reset_index()
    .assign(
        country="World",
        # The world aggregate has no sub-national parts.
        region=np.nan,
        place=np.nan,
    )
)
df_world

Unnamed: 0,date,cases,deaths,recoveries,country,region,place
0,2020-01-22,,,,World,,
1,2020-01-23,,,,World,,
2,2020-01-24,,,,World,,
3,2020-01-25,,,,World,,
4,2020-01-26,,,,World,,
...,...,...,...,...,...,...,...
68,2020-03-30,782365.0,37582.0,164566.0,World,,
69,2020-03-31,857487.0,42107.0,178034.0,World,,
70,2020-04-01,932605.0,46809.0,193177.0,World,,
71,2020-04-02,1013157.0,52983.0,210263.0,World,,


In [36]:
# Append the synthetic "World" rows to the per-location rows.
#
# This cell rebinds `df_world` to the combined table, so on a re-run `df_world`
# already contains the per-location rows. Selecting only country == "World"
# before concatenating keeps the cell idempotent: the first run is unchanged
# (every row of the aggregate is a "World" row) and a second run does not
# stack another copy of the data.
# Assumes the raw `data` has no country literally named "World" — TODO confirm.
df_world = pd.concat(
    (data, df_world.loc[df_world.country == "World"])
).sort_values(["country", "date"])
df_world.describe()

Unnamed: 0,cases,deaths,recoveries
count,48581.0,48581.0,48581.0
mean,517.0431,23.084086,140.260781
std,12625.15,640.25324,3242.042092
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,2.0,0.0,0.0
75%,17.0,0.0,0.0
max,1095917.0,58787.0,225796.0


In [41]:
# Expose the cumulative counters under *_total names, attach a per-location
# key, and group the rows by that key (ordered by key, then date).
# Note: `df_grouped` is a GroupBy object, so `.head()` below returns the first
# five rows of every group, not of the whole table.
df_grouped = (
    df_world
    .assign(
        confirmed_total=lambda frame: frame.cases,
        recovered_total=lambda frame: frame.recoveries,
        dead_total=lambda frame: frame.deaths,
    )
    .drop(columns=["cases", "recoveries", "deaths"])
    .assign(
        # "<country>_<region>_<place>"; the literal "na" stands in for a missing part.
        geo_key=lambda frame: frame.country.str.cat(
            frame.region.str.cat(frame.place, sep="_", na_rep="na"),
            sep="_",
            na_rep="na",
        )
    )
    .sort_values(["geo_key", "date"])
    .groupby(["geo_key"])
)
df_grouped.head()

Unnamed: 0,date,country,region,place,confirmed_total,recovered_total,dead_total,geo_key
8777,2020-03-22,US,Alabama,Autauga,0.0,0.0,0.0,US_Alabama_Autauga
11144,2020-03-23,US,Alabama,Autauga,0.0,0.0,0.0,US_Alabama_Autauga
14559,2020-03-24,US,Alabama,Autauga,1.0,0.0,0.0,US_Alabama_Autauga
17976,2020-03-25,US,Alabama,Autauga,4.0,0.0,0.0,US_Alabama_Autauga
21396,2020-03-26,US,Alabama,Autauga,6.0,0.0,0.0,US_Alabama_Autauga
...,...,...,...,...,...,...,...,...
12,2020-01-22,Hong Kong,Hong Kong,,0.0,0.0,0.0,
35,2020-01-22,Japan,,,2.0,0.0,0.0,
20,2020-01-22,Macau,Macau,,1.0,0.0,0.0,
0,2020-01-22,Mainland China,Anhui,,1.0,0.0,0.0,


In [None]:
# Cumulative column -> name of the per-day column derived from it.
# NOTE(review): "dayli" looks like a typo for "daily"; the names are kept
# unchanged here in case later cells reference them.
DAILY_COLUMNS = {
    "confirmed_total": "confirmed_dayli",
    "recovered_total": "recovered_dayli",
    "dead_total": "dead_dayli",
}
LOCATION_COLUMNS = ["country", "region", "place"]


def add_daily_columns(totals):
    """Append per-day increments for every cumulative column in DAILY_COLUMNS.

    Parameters
    ----------
    totals : pandas.DataFrame
        Must contain `date`, the LOCATION_COLUMNS and the keys of DAILY_COLUMNS.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by location and date, with a fresh 0..n-1 index and
        one extra column per entry of DAILY_COLUMNS. `totals` is not modified.
    """
    ordered = totals.sort_values(LOCATION_COLUMNS + ["date"]).reset_index(drop=True)

    # groupby drops rows whose key is NaN by default, which would discard every
    # row without a region or place. Group on placeholder-filled copies of the
    # key columns instead; the columns in the result keep their NaN values.
    location_keys = [ordered[col].fillna("na") for col in LOCATION_COLUMNS]

    # Previous day's cumulative value within the same location
    # (NaN on each location's first row).
    previous = ordered.groupby(location_keys)[list(DAILY_COLUMNS)].shift(1)

    return ordered.assign(
        **{
            # fill_value=0: a location's first row has no predecessor, so its
            # increment equals its total. Negative differences (a cumulative
            # count going down) are clipped to 0.
            daily: ordered[total]
            .subtract(previous[total], fill_value=0)
            .clip(lower=0)
            for total, daily in DAILY_COLUMNS.items()
        }
    )


df_new_column = add_daily_columns(
    df_world
    .assign(
        confirmed_total=lambda frame: frame.cases,
        recovered_total=lambda frame: frame.recoveries,
        dead_total=lambda frame: frame.deaths,
    )
    .drop(columns=["cases", "recoveries", "deaths"])
)
df_new_column.describe()