In [1]:
import pandas as pd
import numpy as np

# Preprocessing Sources

We will use data from the [Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) repo at GitHub.

## Deaths dataset

In [2]:
deaths = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')

In [3]:
deaths.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,5/7/21,5/8/21,5/9/21,5/10/21,5/11/21,5/12/21,5/13/21,5/14/21,5/15/21,5/16/21
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,2673,2683,2686,2698,2710,2713,2721,2730,2733,2742
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,2408,2411,2412,2416,2420,2423,2426,2427,2429,2432
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,3315,3321,3328,3335,3343,3350,3355,3360,3366,3374
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,127,127,127,127,127,127,127,127,127,127
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,628,630,633,636,639,645,649,651,655,659


Dropping unnecessary features as we'll work with country-wise information.

In [4]:
deaths.drop(columns=['Province/State'], inplace=True)

### Changing data display from wide format to long format

This will be useful later on for easier data viz.

It's simply a reshape of the dataframe grabbing all the dates columns and turning them into one `Date` column and using each of their values for a new column called `Deaths`.

In [5]:
dates_list = deaths.columns.drop(['Country/Region', 'Lat', 'Long' ])

deaths_df = deaths.melt(id_vars=['Country/Region', 'Lat', 'Long' ], value_vars=dates_list,
           var_name='Date', value_name='Deaths')

deaths_df.head()

Unnamed: 0,Country/Region,Lat,Long,Date,Deaths
0,Afghanistan,33.93911,67.709953,1/22/20,0
1,Albania,41.1533,20.1683,1/22/20,0
2,Algeria,28.0339,1.6596,1/22/20,0
3,Andorra,42.5063,1.5218,1/22/20,0
4,Angola,-11.2027,17.8739,1/22/20,0


As some countries have information per `Province/State` we need to group information by `Country/Region` and `Date` to get the correspondent total `Deaths`.

In [6]:
deaths_grouped = deaths_df.groupby(by=['Country/Region', 'Date'], as_index=False).agg({
    'Lat': np.mean, # We'll use the mean to get a correct location to be used in Tableau
    'Long': np.mean,
    'Deaths': sum
})

## Confirmed cases data set

Given that's the same format as in the `deaths` dataset, we'll perform the same operations.

In [7]:
confirmed = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
confirmed.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,5/7/21,5/8/21,5/9/21,5/10/21,5/11/21,5/12/21,5/13/21,5/14/21,5/15/21,5/16/21
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,61455,61755,61842,62063,62403,62718,63045,63355,63412,63484
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,131577,131666,131723,131753,131803,131845,131890,131939,131978,132015
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,123692,123900,124104,124288,124483,124682,124889,125059,125194,125311
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,13390,13406,13423,13429,13447,13470,13470,13510,13510,13510
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,28201,28477,28740,28875,29146,29405,29695,30030,30354,30637


In [8]:
confirmed.drop(columns=['Province/State'], inplace=True)

confirmed_df = confirmed.melt(id_vars=['Country/Region', 'Lat', 'Long' ], value_vars=dates_list,
           var_name='Date', value_name='Confirmed')


confirmed_grouped = confirmed_df.groupby(by=['Country/Region', 'Date'], as_index=False).agg({
    'Lat': np.mean, # We'll use the mean to get a correct location to be used in Tableau
    'Long': np.mean,
    'Confirmed': sum
})

## Recovered cases data set

In [9]:
recovered = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')

In [10]:
recovered.drop(columns=['Province/State'], inplace=True)

recovered_df = recovered.melt(id_vars=['Country/Region', 'Lat', 'Long' ], value_vars=dates_list,
           var_name='Date', value_name='Recovered')


recovered_grouped = recovered_df.groupby(by=['Country/Region', 'Date'], as_index=False).agg({
    'Lat': np.mean, # We'll use the mean to get a correct location to be used in Tableau
    'Long': np.mean,
    'Recovered': sum
})

# Merging data

In [11]:
recovered_grouped.shape, deaths_grouped.shape, confirmed_grouped.shape 

((92352, 5), (92352, 5), (92352, 5))

In [12]:
data = recovered_grouped.merge(deaths_grouped, on=['Country/Region', 'Lat', 'Long', 'Date']).merge(confirmed_grouped, on=['Country/Region', 'Lat', 'Long', 'Date'])

In [13]:
data.sample(5)

Unnamed: 0,Country/Region,Date,Lat,Long,Recovered,Deaths,Confirmed
64020,Peru,10/15/20,-9.19,-75.0152,759597,33577,859740
21046,Cyprus,6/13/20,35.1264,33.4299,807,18,980
55457,Monaco,2/13/21,43.7333,7.4167,1510,21,1755
30259,Gambia,8/25/20,13.4432,-15.3101,601,90,2686
45084,Laos,5/6/20,19.85627,102.495496,10,0,19


In [14]:
data['Country/Region'].nunique()

187

# Combining our data with total population for each country

We get another dataset containing the population for each country in order to be able to present data normalized by inhabitants.

Source: [World Population Review](https://worldpopulationreview.com/countries)

In [15]:
population = pd.read_csv('population.csv').drop(columns='Rank')

population['pop2021'] = population['pop2021'] * 1000
population['pop2020'] = population['pop2020'] * 1000

In [16]:
population.head()

Unnamed: 0,name,pop2021,pop2020,GrowthRate,area,Density
0,China,1444216000.0,1439324000.0,1.0034,9706961,147.7068
1,India,1393409000.0,1380004000.0,1.0097,3287590,415.629
2,United States,332915100.0,331002700.0,1.0058,9372610,35.1092
3,Indonesia,276361800.0,273523600.0,1.0104,1904569,142.0928
4,Pakistan,225199900.0,220892300.0,1.0195,881912,245.5634


We'll check if every country's name is spelled the same in both datasets.

In [17]:
 [country for country in data['Country/Region'].unique() if country not in population.name.unique()]

['Burma',
 'Cabo Verde',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 "Cote d'Ivoire",
 'Czechia',
 'Diamond Princess',
 'Eswatini',
 'Holy See',
 'Korea, South',
 'Kosovo',
 'MS Zaandam',
 'Taiwan*',
 'US',
 'West Bank and Gaza']

We'll need to manually input the names in our source dataset in order to match the population one.

In [18]:
# We'll get rid of the Diamond Princess and MS Zaandam as we'll only consider countries.
diamond = data[data['Country/Region'].str.contains('Diamond Princess') == True].index
data.drop(diamond, inplace=True)
zaandam = data[data['Country/Region'].str.contains('MS Zaandam') == True].index
data.drop(zaandam, inplace=True)
# We'll drop territories under dispute as they may be duplicating data.
kosovo = data[data['Country/Region'].str.contains('Kosovo') == True].index
data.drop(kosovo, inplace=True)
west_bank = data[data['Country/Region'].str.contains('West Bank and Gaza') == True].index
data.drop(west_bank, inplace=True)

In [19]:
data['Country/Region'] = data['Country/Region'].str.replace('Burma', 'Myanmar')
data['Country/Region'] = data['Country/Region'].str.replace('Cabo Verde', 'Cape Verde')
data['Country/Region'] = data['Country/Region'].str.replace('''Congo (Brazzaville)''', 'Republic of the Congo',
                                                            regex=False)
data['Country/Region'] = data['Country/Region'].str.replace('''Congo (Kinshasa)''', 'DR Congo',
                                                            regex=False)
data['Country/Region'] = data['Country/Region'].str.replace('''Cote d'Ivoire''', 'Ivory Coast')
data['Country/Region'] = data['Country/Region'].str.replace('Czechia', 'Czech Republic')
data['Country/Region'] = data['Country/Region'].str.replace('Eswatini', 'Swaziland')
data['Country/Region'] = data['Country/Region'].str.replace('Holy See', 'Vatican City')
data['Country/Region'] = data['Country/Region'].str.replace('Korea, South', 'South Korea')
data['Country/Region'] = data['Country/Region'].str.replace('Taiwan*', 'Taiwan', regex=False)
data['Country/Region'] = data['Country/Region'].str.replace('US', 'United States')


Test to check that every country is accounted for at the population dataframe:

In [20]:
missing_countries = [country for country in data['Country/Region'].unique() 
                      if country not in population.name.unique()]
def test_countries():
    assert len(missing_countries) == 0, 'There are some countries not listed in population dataframe'
    return '✅ Everything OK'

test_countries()

'✅ Everything OK'

# Export data

In [21]:
data.to_csv('covid.csv', index=False)