In [1]:
import pandas as pd
import numpy as np

# Preprocessing Sources

We will use data from the [Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) repo at GitHub.

## Deaths dataset

In [2]:
deaths = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')

In [3]:
deaths.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,5/4/21,5/5/21,5/6/21,5/7/21,5/8/21,5/9/21,5/10/21,5/11/21,5/12/21,5/13/21
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,2648,2654,2664,2673,2683,2686,2698,2710,2713,2721
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,2402,2403,2406,2408,2411,2412,2416,2420,2423,2426
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,3289,3299,3307,3315,3321,3328,3335,3343,3350,3355
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,127,127,127,127,127,127,127,127,127,127
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,609,618,622,628,630,633,636,639,645,649


Dropping unnecessary features as we'll work with country-wise information.

In [4]:
deaths.drop(columns=['Province/State'], inplace=True)

### Changing data display from wide format to long format

This will be useful later on for easier data viz.

It's simply a reshape of the dataframe grabbing all the dates columns and turning them into one `Date` column and using each of their values for a new column called `Deaths`.

In [5]:
dates_list = deaths.columns.drop(['Country/Region', 'Lat', 'Long' ])

deaths_df = deaths.melt(id_vars=['Country/Region', 'Lat', 'Long' ], value_vars=dates_list,
           var_name='Date', value_name='Deaths')

deaths_df.head()

Unnamed: 0,Country/Region,Lat,Long,Date,Deaths
0,Afghanistan,33.93911,67.709953,1/22/20,0
1,Albania,41.1533,20.1683,1/22/20,0
2,Algeria,28.0339,1.6596,1/22/20,0
3,Andorra,42.5063,1.5218,1/22/20,0
4,Angola,-11.2027,17.8739,1/22/20,0


As some countries have information per `Province/State` we need to group information by `Country/Region` and `Date` to get the correspondent total `Deaths`.

In [6]:
deaths_grouped = deaths_df.groupby(by=['Country/Region', 'Date'], as_index=False).agg({
    'Lat': np.mean, # We'll use the mean to get a correct location to be used in Tableau
    'Long': np.mean,
    'Deaths': sum
})

## Confirmed cases data set

Given that's the same format as in the `deaths` dataset, we'll perform the same operations.

In [7]:
confirmed = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
confirmed.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,5/4/21,5/5/21,5/6/21,5/7/21,5/8/21,5/9/21,5/10/21,5/11/21,5/12/21,5/13/21
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,60563,60797,61162,61455,61755,61842,62063,62403,62718,63045
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,131327,131419,131510,131577,131666,131723,131753,131803,131845,131890
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,122999,123272,123473,123692,123900,124104,124288,124483,124682,124889
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,13316,13340,13363,13390,13406,13423,13429,13447,13470,13470
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,27284,27529,27921,28201,28477,28740,28875,29146,29405,29695


In [8]:
confirmed.drop(columns=['Province/State'], inplace=True)

confirmed_df = confirmed.melt(id_vars=['Country/Region', 'Lat', 'Long' ], value_vars=dates_list,
           var_name='Date', value_name='Confirmed')


confirmed_grouped = confirmed_df.groupby(by=['Country/Region', 'Date'], as_index=False).agg({
    'Lat': np.mean, # We'll use the mean to get a correct location to be used in Tableau
    'Long': np.mean,
    'Confirmed': sum
})

## Recovered cases data set

In [9]:
recovered = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')

In [10]:
recovered.drop(columns=['Province/State'], inplace=True)

recovered_df = recovered.melt(id_vars=['Country/Region', 'Lat', 'Long' ], value_vars=dates_list,
           var_name='Date', value_name='Recovered')


recovered_grouped = recovered_df.groupby(by=['Country/Region', 'Date'], as_index=False).agg({
    'Lat': np.mean, # We'll use the mean to get a correct location to be used in Tableau
    'Long': np.mean,
    'Recovered': sum
})

# Merging data

In [11]:
recovered_grouped.shape, deaths_grouped.shape, confirmed_grouped.shape 

((91776, 5), (91776, 5), (91776, 5))

In [12]:
data = recovered_grouped.merge(deaths_grouped, on=['Country/Region', 'Lat', 'Long', 'Date']).merge(confirmed_grouped, on=['Country/Region', 'Lat', 'Long', 'Date'])

In [13]:
data.sample(5)

Unnamed: 0,Country/Region,Date,Lat,Long,Recovered,Deaths,Confirmed
13299,Burma,7/16/20,21.9162,95.956,270,6,339
60006,Nigeria,4/11/20,9.082,8.6753,70,10,318
64781,Poland,3/9/21,51.9194,19.1451,1503353,45599,1811036
23379,Dominica,8/26/20,15.415,-61.371,18,0,20
36576,Iceland,3/8/20,64.9631,-19.0208,0,0,50


In [14]:
data['Country/Region'].nunique()

187

# Combining our data with total population for each country

We get another dataset containing the population for each country in order to be able to present data normalized by inhabitants.

Source: [World Population Review](https://worldpopulationreview.com/countries)

In [15]:
population = pd.read_csv('population.csv').drop(columns='Rank')

population['pop2021'] = population['pop2021'] * 1000
population['pop2020'] = population['pop2020'] * 1000

Unnamed: 0,name,pop2021,pop2020,GrowthRate,area,Density
0,China,1444216.107,1439323.776,1.0034,9706961,147.7068
1,India,1393409.038,1380004.385,1.0097,3287590,415.6290
2,United States,332915.073,331002.651,1.0058,9372610,35.1092
3,Indonesia,276361.783,273523.615,1.0104,1904569,142.0928
4,Pakistan,225199.937,220892.340,1.0195,881912,245.5634
...,...,...,...,...,...,...
227,Montserrat,4.977,4.992,0.9970,102,48.9118
228,Falkland Islands,3.533,3.480,1.0152,12173,0.2774
229,Niue,1.619,1.626,0.9957,260,6.2115
230,Tokelau,1.373,1.357,1.0118,12,111.6667


In [18]:
population

Unnamed: 0,name,pop2021,pop2020,GrowthRate,area,Density
0,China,1.444216e+09,1.439324e+09,1.0034,9706961,147.7068
1,India,1.393409e+09,1.380004e+09,1.0097,3287590,415.6290
2,United States,3.329151e+08,3.310027e+08,1.0058,9372610,35.1092
3,Indonesia,2.763618e+08,2.735236e+08,1.0104,1904569,142.0928
4,Pakistan,2.251999e+08,2.208923e+08,1.0195,881912,245.5634
...,...,...,...,...,...,...
227,Montserrat,4.977000e+03,4.992000e+03,0.9970,102,48.9118
228,Falkland Islands,3.533000e+03,3.480000e+03,1.0152,12173,0.2774
229,Niue,1.619000e+03,1.626000e+03,0.9957,260,6.2115
230,Tokelau,1.373000e+03,1.357000e+03,1.0118,12,111.6667


In [21]:
 [country for country in data['Country/Region'].unique() if country not in population.name.unique()]

['Burma',
 'Cabo Verde',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 "Cote d'Ivoire",
 'Czechia',
 'Diamond Princess',
 'Eswatini',
 'Holy See',
 'Korea, South',
 'Kosovo',
 'MS Zaandam',
 'Taiwan*',
 'US',
 'West Bank and Gaza']