# Data Preparation
## Merge Datasets

In [7]:
# import libraries and adjust settings
from datetime import date
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Render our plots inline
%matplotlib inline

# set figure size
plt.rcParams['figure.figsize'] = (15, 8)

# load datasets
df_CPI = pd.read_csv('data/preprocessed/exp_CountriesCPI.csv')
df_COVID = pd.read_csv('data/preprocessed/exp_keyCountriesCovid.csv')
df_UNEMPLOYMENT = pd.read_csv('data/preprocessed/exp_keyCountriesUnemployment.csv')
df_GDP = pd.read_csv('data/preprocessed/exp_worldGDP.csv')


### Previewing imported datasets

In [8]:
# Peek at the first five rows of the price-index table
# (note: this dataset covers Germany only)
df_CPI.head(n=5)

Unnamed: 0,Year,Months,import_prices_index,Change on previous month in(%),Change on previous year's month in(%),export_prices_index,Change on previous month in(%).1,Change on previous year's month in(%).1,Consumer price index,Change on previous year's month in(%).2,Change on previous month in(%).2
0,2019,January,102.2,-0.2,0.8,102.2,0.1,1.1,103.4,1.4,-0.8
1,2019,February,102.5,0.3,1.6,102.3,0.1,1.3,103.8,1.5,0.4
2,2019,March,102.5,-,1.7,102.4,0.1,1.3,104.2,1.3,0.4
3,2019,April,102.8,0.3,1.4,102.6,0.2,1.3,105.2,2.0,1.0
4,2019,May,102.7,-0.1,-0.2,102.5,-0.1,0.7,105.4,1.4,0.2


In [60]:
# Summarise the COVID table: column names, non-null counts and dtypes.
# NOTE(review): the saved output lists an all-null Unemployment_Rate_Percent
# column; presumably that comes from a later cell having been executed before
# this one rather than from the CSV itself -- verify with Restart & Run All.
df_COVID.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   year                       72 non-null     int64  
 1   location                   72 non-null     object 
 2   month                      72 non-null     int64  
 3   Monthly_new_cases          72 non-null     float64
 4   Percentage Growth Rate     71 non-null     float64
 5   PG_Rate                    71 non-null     float64
 6   Unemployment_Rate_Percent  0 non-null      float64
dtypes: float64(4), int64(2), object(1)
memory usage: 4.1+ KB


In [61]:
# Summarise the unemployment table: column names, non-null counts and dtypes.
# Missing values in Unemployment_Rate_Percent show up as a non-null count
# below the number of entries.
df_UNEMPLOYMENT.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Month                      72 non-null     int64  
 1   Country                    72 non-null     object 
 2   Unemployment_Rate_Percent  67 non-null     float64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.8+ KB


In [13]:
# Peek at the first five rows of the GDP table
df_GDP.head(n=5)

Unnamed: 0,TIME,GEO,GDP
0,2010Q3,European Union - 27 countries (from 2020),2737806.5
1,2010Q3,European Union - 28 countries (2013-2020),3219688.1
2,2010Q3,Euro area - 19 countries (from 2015),2372179.6
3,2010Q3,Belgium,88635.0
4,2010Q3,Bulgaria,10342.2


### Merge: covid <--> unemployment

In [70]:
# The tables are joined on location and month, so first check that both
# use the same spelling for the country names.
print(df_COVID.location.unique())
print(df_UNEMPLOYMENT.Country.unique())

# Write out the abbreviations USA and UK in full within df_UNEMPLOYMENT so
# they match the names used in df_COVID (safe to re-run: already-expanded
# names are left untouched).
df_UNEMPLOYMENT['Country'] = df_UNEMPLOYMENT['Country'].replace(
    {'USA': 'United States', 'UK': 'United Kingdom'}
)

# Build a lookup with the same key names as df_COVID.
# df_UNEMPLOYMENT is used with Country and Month as the only key columns, so
# the join key is (location, month); this assumes both tables cover the same
# single year -- TODO confirm.
# Keep the last entry per key so a duplicated (location, month) pair cannot
# multiply rows in the merge below.
unemployment_lookup = (
    df_UNEMPLOYMENT
    .rename(columns={'Country': 'location', 'Month': 'month'})
    [['location', 'month', 'Unemployment_Rate_Percent']]
    .drop_duplicates(subset=['location', 'month'], keep='last')
)

# Create the merged dataframe as a NEW object: df_COVID itself is left
# unmodified, so re-running this or any earlier cell gives the same result.
# A left join keeps every COVID row; rows without a matching unemployment
# entry get NaN in Unemployment_Rate_Percent.
df_MERGED = (
    df_COVID
    # drop a stale copy of the target column, if one is present
    .drop(columns='Unemployment_Rate_Percent', errors='ignore')
    .merge(
        unemployment_lookup,
        on=['location', 'month'],
        how='left',
        validate='many_to_one',
    )
)

# A many-to-one left join keeps the row count and order of df_COVID; carry
# over its index as well so row labels stay aligned with the source table.
assert len(df_MERGED) == len(df_COVID), "merge changed the number of rows"
df_MERGED.index = df_COVID.index

df_MERGED



['China' 'Germany' 'India' 'Italy' 'United Kingdom' 'United States']
['Germany' 'India' 'China' 'Italy' 'United States' 'United Kingdom']


Unnamed: 0,year,location,month,Monthly_new_cases,Percentage Growth Rate,PG_Rate,Unemployment_Rate_Percent
0,2020,China,1,9254.0,,,5.3
1,2020,China,2,69554.0,651.610115,651.610115,6.2
2,2020,China,3,2923.0,-95.797510,-95.797510,5.9
3,2020,China,4,1677.0,-42.627438,-42.627438,6.0
4,2020,China,5,190.0,-88.670244,-88.670244,5.9
...,...,...,...,...,...,...,...
67,2020,United States,8,1458899.0,-24.200596,-24.200596,7.8
68,2020,United States,9,1206247.0,-17.317991,-17.317991,6.9
69,2020,United States,10,1926777.0,59.733206,59.733206,6.7
70,2020,United States,11,4496140.0,133.350305,133.350305,6.7
