# Merging the Data

### Reading in Data and Preparing to Merge

In [1]:
import pandas as pd
OECDdf = pd.read_csv('OECD_Clean.csv')
WBdf = pd.read_csv('WB_Clean.csv')
WEOdf = pd.read_csv('WEO_Clean.csv', na_values='--')
WITSdf = pd.read_csv('WITS_Clean.csv')

# Renaming columns so that they are an exact match
WBdf.rename({'country':'Country', 'year':'Year'}, axis=1, inplace=True)

In [2]:
# Fixing some country names in WEO data
update_dic = {'Hong Kong SAR':'Hong Kong',
              'Taiwan Province of China':'Taiwan, Province of China',
              'Czech Republic':'Czechia',
              'Democratic Republic of the Congo':'Congo, The Democratic Republic of the',
              'Islamic Republic of Iran':'Iran, Islamic Republic of',
              'Korea':'Korea, Republic of',
              'Micronesia':'Micronesia, Federated States of',
              'Moldova':'Moldova, Republic of',
              'Russia':'Russian Federation',
              'Slovak Republic':'Slovakia',
              'Tanzania':'Tanzania, United Republic of',
              'Venezuela':'Venezuela, Bolivarian Republic of',
              'Vietnam':'Viet Nam',
              'Kyrgyz Republic':'Kyrgyzstan',
              'Lao P.D.R.':"Lao People's Democratic Republic",
              'Macao SAR':'Macao',
              'Republic of Congo':'Congo',
              'São Tomé and Príncipe':'Sao Tome and Principe',
              'St. Kitts and Nevis':'Saint Kitts and Nevis',
              'St. Lucia':'Saint Lucia',
              'St. Vincent and the Grenadines':'Saint Vincent and the Grenadines',
              'Syria':'Syrian Arab Republic',
              'The Bahamas':'Bahamas',
              'The Gambia':'Gambia'}
WEOdf = WEOdf.replace({"Country": update_dic})
WEOdf.sort_values(['Country', 'Year'], inplace=True)
WEOdf.reset_index(drop=True, inplace=True)

In [3]:
# Dropping GDP and population columns from WEO data, since we have that in WB data
WEOdf.drop(['NGDP_R', 'NGDP_RPCH', 'NGDP_FY', 'LP'], axis=1, inplace=True)

# Renaming some columns to be more readable
update_dic = {'PPPEX':'PPP_Conv_Rate',
             'PPPSH':'PPP_Share_GDP',
             'TM_RPCH':'Imports_PC',
             'TX_RPCH':'Exports_PC',
             'LUR':'Unemp_Rate',
             'LE':'Employment',
             'GGXONLB_NGDP':'Govt_Revenue'}
WEOdf.rename(update_dic, axis=1, inplace=True)

### Merging the Data

In [4]:
# I'm making the executive decision to just keep the countries
# that show up in all of the datasets
merge1 = pd.merge(left=WITSdf, right=OECDdf, how='inner', on=['Country', 'Year'])
merge2 = pd.merge(left=merge1, right=WEOdf, how='inner', on=['Country', 'Year'])
merged = pd.merge(left=merge2, right=WBdf, how='inner', on=['Country', 'Year'])

merged.head()

Unnamed: 0,Year,Country,Max_Partners,GDP_per_unit_CO2,PPP_Conv_Rate,PPP_Share_GDP,FLIBOR6,Imports_PC,Exports_PC,Unemp_Rate,...,agri_perc_gdp,agg.empl.agri.perc,rural.pop.perc,pop.tot,mobilesub_per100peeps,intl_tourist_arrival,total_life_exp,life_expectancy_fe,life_exp_male,trade_perGDP
0,1990,Albania,75,2.504851,2.117,0.035,,0.0,0.0,8.457,...,36.410703,55.914001,63.572,3286542.0,0.0,1062000.0,71.836,74.991,69.07,39.436963
1,1991,Albania,75,2.684573,2.775,0.024,,0.0,0.0,8.9,...,36.410703,55.914001,63.3,3266790.0,0.0,1062000.0,71.803,74.98,69.017,36.07052
2,1992,Albania,75,4.443426,9.488,0.02,,0.0,,26.5,...,36.410703,56.133999,62.751,3247039.0,0.0,1062000.0,71.802,74.985,68.997,108.785472
3,1993,Albania,75,5.26484,19.912,0.022,,0.0,0.0,22.3,...,36.410703,55.470001,62.201,3227287.0,0.0,1062000.0,71.86,75.039,69.037,80.518333
4,1994,Albania,75,5.542105,26.714,0.023,,0.0,0.0,18.4,...,36.410703,54.841,61.646,3207536.0,0.0,1062000.0,71.992,75.158,69.15,53.102585


### Cleaning the Data

In [5]:
merged.info()
# I'm going to drop FLIBOR6 and Employment, since they are missing so many.

# I also need to fix the column that is object instead of float
# due to the comma for the numbers >= 1000.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2700 entries, 0 to 2699
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   2700 non-null   int64  
 1   Country                2700 non-null   object 
 2   Max_Partners           2700 non-null   int64  
 3   GDP_per_unit_CO2       2700 non-null   float64
 4   PPP_Conv_Rate          2621 non-null   object 
 5   PPP_Share_GDP          2656 non-null   float64
 6   FLIBOR6                54 non-null     float64
 7   Imports_PC             2589 non-null   float64
 8   Exports_PC             2581 non-null   float64
 9   Unemp_Rate             2105 non-null   float64
 10  Employment             894 non-null    float64
 11  Govt_Revenue           2333 non-null   float64
 12  gdp_per_cap            2700 non-null   float64
 13  agri_perc_gdp          2700 non-null   float64
 14  agg.empl.agri.perc     2700 non-null   float64
 15  rura

In [6]:
# Dropping FLIBOR6 and LE
cleaned_df = merged.drop(['FLIBOR6', 'Employment'], axis=1)

# Fixing object columns
cleaned_df['PPP_Conv_Rate'] = cleaned_df['PPP_Conv_Rate'].str.replace(',', '').astype(float)

cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2700 entries, 0 to 2699
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   2700 non-null   int64  
 1   Country                2700 non-null   object 
 2   Max_Partners           2700 non-null   int64  
 3   GDP_per_unit_CO2       2700 non-null   float64
 4   PPP_Conv_Rate          2621 non-null   float64
 5   PPP_Share_GDP          2656 non-null   float64
 6   Imports_PC             2589 non-null   float64
 7   Exports_PC             2581 non-null   float64
 8   Unemp_Rate             2105 non-null   float64
 9   Govt_Revenue           2333 non-null   float64
 10  gdp_per_cap            2700 non-null   float64
 11  agri_perc_gdp          2700 non-null   float64
 12  agg.empl.agri.perc     2700 non-null   float64
 13  rural.pop.perc         2700 non-null   float64
 14  pop.tot                2700 non-null   float64
 15  mobi

### Imputing Data

In [7]:
# I'm going to fill in the missing data (grouped by country) by
# defining a function based on interpolate.

# This function takes one column of the dataframe and fills
# in missing values by country (unless the country is missing all of them).
def impute_func(df, col):
    # List of countries to loop through
    countries = sorted(list(set(df['Country'])))
    
    # Initializing master df to add results to
    imputed_df = pd.DataFrame(columns=['Country', 'Year', col])

    for country in countries:
        # Set up dataframe for the one country
        country_df = df.loc[df['Country']==country, ['Country','Year',col]].copy()
        
        # Convert the Year column to DateTime format and set it as index
        # (requirement for interpolate)
        country_df['Year'] = pd.to_datetime(country_df['Year'], format='%Y')
        country_df = country_df.set_index('Year')
    
        # Use interpolate on the given column with the time method
        # to fill in missing values.
        country_df[col] = country_df[col].interpolate(method='time', limit_direction='both')
        
        # Change Year back to an int column
        country_df.reset_index(inplace=True)
        country_df['Year'] = country_df['Year'].dt.year
        
        # Add specific country df to the master df
        imputed_df = pd.concat([imputed_df, country_df], axis=0)
        
    return imputed_df

In [8]:
# Now I need to use that function on all the columns of the WB df

# List of columns other than country and year
cols = list(cleaned_df.columns[2:])

# Initializing master df to add the imputed columns to
imputed_df = cleaned_df[['Country', 'Year']].copy()

# For loop imputes each column and adds it to the master df
for col in cols:
    imputed_col = impute_func(cleaned_df, col)
    imputed_df = pd.merge(left=imputed_df, right=imputed_col, how='right',
                  on=['Country', 'Year'])


imputed_df.info()
# All columns are good except Unemp_Rate and Govt_Revenue

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2700 entries, 0 to 2699
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Country                2700 non-null   object 
 1   Year                   2700 non-null   object 
 2   Max_Partners           2700 non-null   object 
 3   GDP_per_unit_CO2       2700 non-null   float64
 4   PPP_Conv_Rate          2700 non-null   float64
 5   PPP_Share_GDP          2700 non-null   float64
 6   Imports_PC             2700 non-null   float64
 7   Exports_PC             2700 non-null   float64
 8   Unemp_Rate             2268 non-null   float64
 9   Govt_Revenue           2673 non-null   float64
 10  gdp_per_cap            2700 non-null   float64
 11  agri_perc_gdp          2700 non-null   float64
 12  agg.empl.agri.perc     2700 non-null   float64
 13  rural.pop.perc         2700 non-null   float64
 14  pop.tot                2700 non-null   float64
 15  mobi

In [9]:
# Let's see what countries are missing from Unemp_Rate
UR = imputed_df.loc[:,['Country', 'Year', 'Unemp_Rate']].copy()
UR = UR.dropna()
UR_countries = set(UR['Country'])
all_countries = set(imputed_df['Country'])
all_countries - UR_countries
# This column is missing significant countries, so
# I am going to drop it

{'Bangladesh',
 'Benin',
 'Cameroon',
 "Côte d'Ivoire",
 'Ethiopia',
 'Ghana',
 'Guatemala',
 'India',
 'Lebanon',
 'Mozambique',
 'Oman',
 'Senegal',
 'Tanzania, United Republic of',
 'Togo',
 'United Arab Emirates',
 'Zambia'}

In [10]:
imputed_df.drop('Unemp_Rate', axis=1, inplace=True)

In [11]:
# Let's see what countries are missing from Govt_Revenue
GR = imputed_df.loc[:,['Country', 'Year', 'Govt_Revenue']].copy()
GR = GR.dropna()
GR_countries = set(GR['Country'])
all_countries = set(imputed_df['Country'])
all_countries - GR_countries
# This is only missing one country, so I will remove Singapore
# (although, Govt_Revenue may not be useful--in that case
# we should just remove this column and leave Singapore in)

{'Singapore'}

In [12]:
# Removing Singapore
imputed_df.dropna(inplace=True)
imputed_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2673 entries, 0 to 2699
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Country                2673 non-null   object 
 1   Year                   2673 non-null   object 
 2   Max_Partners           2673 non-null   object 
 3   GDP_per_unit_CO2       2673 non-null   float64
 4   PPP_Conv_Rate          2673 non-null   float64
 5   PPP_Share_GDP          2673 non-null   float64
 6   Imports_PC             2673 non-null   float64
 7   Exports_PC             2673 non-null   float64
 8   Govt_Revenue           2673 non-null   float64
 9   gdp_per_cap            2673 non-null   float64
 10  agri_perc_gdp          2673 non-null   float64
 11  agg.empl.agri.perc     2673 non-null   float64
 12  rural.pop.perc         2673 non-null   float64
 13  pop.tot                2673 non-null   float64
 14  mobilesub_per100peeps  2673 non-null   float64
 15  intl

In [13]:
# We have the years 1990-2016 and the following countries
# available for analysis
sorted(list(set(imputed_df['Country'])))

['Albania',
 'Algeria',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bangladesh',
 'Belarus',
 'Belgium',
 'Benin',
 'Bolivia, Plurinational State of',
 'Brazil',
 'Brunei Darussalam',
 'Bulgaria',
 'Cameroon',
 'Canada',
 'Chile',
 'China',
 'Colombia',
 'Costa Rica',
 'Croatia',
 'Cyprus',
 'Czechia',
 "Côte d'Ivoire",
 'Denmark',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Estonia',
 'Ethiopia',
 'Finland',
 'France',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Guatemala',
 'Honduras',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Ireland',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Korea, Republic of',
 'Kyrgyzstan',
 'Latvia',
 'Lebanon',
 'Lithuania',
 'Luxembourg',
 'Malaysia',
 'Malta',
 'Mauritius',
 'Mexico',
 'Moldova, Republic of',
 'Morocco',
 'Mozambique',
 'Netherlands',
 'New Zealand',
 'Nicaragua',
 'Nigeria',
 'North Macedonia',
 'Norway',
 'Oman',
 'Panama',
 'Paraguay',
 'Peru',
 'Philippines',
 'Poland',
 'Port

### Writing to CSV

In [14]:
imputed_df.to_csv('merged.csv', index=False)