# Data wrangling for European Centre for Disease Prevention and Control

## Import libraries

In [18]:
import pandas as pd
import numpy as np

## Load data from open data

In [46]:
url = "https://opendata.ecdc.europa.eu/covid19/casedistribution/csv"
data = pd.read_csv(url)
data.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018
0,05/04/2020,5,4,2020,35,1,Afghanistan,AF,AFG,37172386.0
1,04/04/2020,4,4,2020,0,0,Afghanistan,AF,AFG,37172386.0
2,03/04/2020,3,4,2020,43,0,Afghanistan,AF,AFG,37172386.0
3,02/04/2020,2,4,2020,26,0,Afghanistan,AF,AFG,37172386.0
4,01/04/2020,1,4,2020,25,0,Afghanistan,AF,AFG,37172386.0


In [47]:
df_rn =(
    data
    .assign(date=pd.to_datetime(data.dateRep))
    .assign(country=data.countriesAndTerritories)
    .assign(confirmed_dayli=data.cases)
    .assign(dead_dayli=data.deaths)
    .drop(columns=[
        "countriesAndTerritories", 
        "cases", 
        "deaths", 
        "dateRep", 
        "day", 
        "month", 
        "year",
        'geoId', 
        'countryterritoryCode']
    )
    .sort_values(["country", "date"])
)
df_rn.describe()

Unnamed: 0,popData2018,confirmed_dayli,dead_dayli
count,8873.0,8905.0,8905.0
mean,65495050.0,131.909264,7.231892
std,203789400.0,1041.73709,62.844102
min,1000.0,-9.0,0.0
25%,3731000.0,0.0,0.0
50%,10625700.0,0.0,0.0
75%,44494500.0,10.0,0.0
max,1392730000.0,34272.0,2004.0


In [48]:
df_rn

Unnamed: 0,popData2018,date,country,confirmed_dayli,dead_dayli
86,37172386.0,2019-12-31,Afghanistan,0,0
85,37172386.0,2020-01-01,Afghanistan,0,0
54,37172386.0,2020-01-02,Afghanistan,0,0
25,37172386.0,2020-01-03,Afghanistan,0,0
4,37172386.0,2020-01-04,Afghanistan,25,0
...,...,...,...,...,...
8896,14439018.0,2020-03-29,Zimbabwe,2,0
8895,14439018.0,2020-03-30,Zimbabwe,0,0
8894,14439018.0,2020-03-31,Zimbabwe,0,0
8890,14439018.0,2020-04-04,Zimbabwe,1,0


In [54]:
df_world = (
    df_rn.set_index('date')
    .resample('d')
    .sum()
    .reset_index()
    .sort_values('date')
    .assign(countriesAndTerritories='World')
)
df_world = df_world.assign(popData2018=df_world.popData2018.max())
df_world

Unnamed: 0,date,popData2018,confirmed_dayli,dead_dayli,countriesAndTerritories
0,2019-12-31,7.498379e+09,27,0,World
1,2020-01-01,7.498379e+09,0,0,World
2,2020-01-02,7.498379e+09,2120,46,World
3,2020-01-03,7.498379e+09,1821,58,World
4,2020-01-04,7.498379e+09,73512,4614,World
...,...,...,...,...,...
334,2020-11-29,7.498379e+09,0,0,World
335,2020-11-30,7.498379e+09,0,0,World
336,2020-12-01,7.498379e+09,0,0,World
337,2020-12-02,7.498379e+09,2072,97,World


In [32]:
df_all = pd.concat((data, df_world))
df_all = (
    df_all
    .assign(country=df_all.countriesAndTerritories)
    .assign(confirmed_dayli=df_world.cases)
    .assign(dead_dayli=df_world.deaths)
    .drop(columns=["countriesAndTerritories", "cases", "deaths"])
    .sort_values(["country", "date"])
)
df_all.describe()

Unnamed: 0,popData2018,confirmed_dayli,dead_dayli
count,9212.0,678.0,678.0
mean,339023900.0,3465.050147,189.970501
std,1413685000.0,13127.199554,802.707703
min,1000.0,0.0,0.0
25%,4089400.0,0.0,0.0
50%,10727670.0,0.0,0.0
75%,51635260.0,0.75,0.0
max,7498379000.0,92598.0,6627.0


In [34]:
df_all[df_all.confirmed_dayli < 0]

Unnamed: 0,geoId,countryterritoryCode,popData2018,date,country,confirmed_dayli,dead_dayli


In [35]:
df_all

Unnamed: 0,geoId,countryterritoryCode,popData2018,date,country,confirmed_dayli,dead_dayli
86,AF,AFG,37172386.0,2019-12-31,Afghanistan,51204.0,2426.0
85,AF,AFG,37172386.0,2020-01-01,Afghanistan,38927.0,2200.0
54,AF,AFG,37172386.0,2020-01-02,Afghanistan,1008.0,104.0
25,AF,AFG,37172386.0,2020-01-03,Afghanistan,453.0,15.0
4,AF,AFG,37172386.0,2020-01-04,Afghanistan,73512.0,4614.0
...,...,...,...,...,...,...,...
8896,ZW,ZWE,14439018.0,2020-03-29,Zimbabwe,,
8895,ZW,ZWE,14439018.0,2020-03-30,Zimbabwe,,
8894,ZW,ZWE,14439018.0,2020-03-31,Zimbabwe,,
8890,ZW,ZWE,14439018.0,2020-04-04,Zimbabwe,,


In [36]:
df_all['confirmed_total'] = (
    df_all
    .groupby(["country", "date"])['confirmed_dayli'].apply(lambda x: x.cumsum())
)

In [37]:
df_all

Unnamed: 0,geoId,countryterritoryCode,popData2018,date,country,confirmed_dayli,dead_dayli,confirmed_total
86,AF,AFG,37172386.0,2019-12-31,Afghanistan,51204.0,2426.0,51204.0
85,AF,AFG,37172386.0,2020-01-01,Afghanistan,38927.0,2200.0,38927.0
54,AF,AFG,37172386.0,2020-01-02,Afghanistan,1008.0,104.0,1008.0
25,AF,AFG,37172386.0,2020-01-03,Afghanistan,453.0,15.0,453.0
4,AF,AFG,37172386.0,2020-01-04,Afghanistan,73512.0,4614.0,73512.0
...,...,...,...,...,...,...,...,...
8896,ZW,ZWE,14439018.0,2020-03-29,Zimbabwe,,,
8895,ZW,ZWE,14439018.0,2020-03-30,Zimbabwe,,,
8894,ZW,ZWE,14439018.0,2020-03-31,Zimbabwe,,,
8890,ZW,ZWE,14439018.0,2020-04-04,Zimbabwe,,,
