In [1]:
import numpy as np
import pandas as pd

## HDR preprocessing

In [2]:
def read_hdr_indicator(path, value_name, year = '2018'):
    """Read one HDR csv file and keep a single year's value per country.

    Parameters
    ----------
    path : str
        Location of the csv file (read as latin1; '..' is treated as missing).
    value_name : str
        Name given to the selected year's column in the result.
    year : str
        Header of the year column to keep.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'Country' (surrounding whitespace removed) and
        `value_name`.
    """
    table = pd.read_csv(path, encoding = 'latin1', na_values = '..')
    return table.loc[:, ['Country', year]].rename(
        columns = {year: value_name}
    ).assign(
        # normalise the join key here, before any merge, so that padding
        # differences between files cannot silently drop countries
        Country = lambda d: d['Country'].str.strip()
    )


# import hdi data
hdi = read_hdr_indicator('raw/hdi_1990-2019.csv', 'hdi')

# import gdi data
gdi = read_hdr_indicator('raw/gdi_1995-2019.csv', 'gdi')

# merge hdi and gdi; the inner join keeps only countries present in both files
hdr_data = pd.merge(hdi, gdi, how = 'inner', on = 'Country')

# preview
hdr_data.head()

Unnamed: 0,Country,hdi,gdi
0,Afghanistan,0.509,0.663
1,Albania,0.792,0.971
2,Algeria,0.746,0.86
3,Andorra,0.867,
4,Angola,0.582,0.903


## WDI preprocessing

In [3]:
# import world bank development indicators ('..' marks a missing value)
wdi = pd.read_csv('raw/wdi-data.csv', na_values = '..')

# replace column names by variable codes: the first four columns keep their
# names, every later column is reduced to the text inside its square brackets
# (raw string so that \[ and \] are regex escapes, not string escapes)
id_columns = wdi.columns[:4].tolist()
indicator_codes = wdi.columns[4:].str.extract(r'.*\[(.*)\].*', expand = False).tolist()
codenames = id_columns + indicator_codes
wdi.columns = codenames

# substitute short names for variable codes, using the 'Code' -> 'New name'
# lookup stored alongside the raw data
varnames = pd.read_csv('raw/wdi-variablenames.csv')
code_dict = varnames.set_index('Code')['New name'].to_dict()
wdi = wdi.rename(columns = code_dict).set_index(['Time', 'Country Name'])

# preview
wdi.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Time Code,Country Code,educ_bach_f,educ_sec_f,educ_psec_f,educ_prim_f,educ_tert_f,educ_upsec_f,educ_master_f,educ_phd_f,...,mortality_infant_f,mortality_infant_m,mortality_child_f,mortality_child_m,mortality_suicide_f,mortality_suicide_m,fertility_total,fertility_adolescent,contraceptive_any,contraceptive_modern
Time,Country Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2010,Afghanistan,YR2010,AFG,,,,,,,,,...,60.0,67.9,83.7,91.3,3.9,4.6,5.977,113.715,21.8,19.9
2010,Albania,YR2010,ALB,,,,,,,,,...,10.3,13.3,11.9,14.6,6.1,9.5,1.66,19.8208,,
2010,Algeria,YR2010,DZA,,,,,,,,,...,22.1,25.0,25.9,28.9,2.2,3.8,2.86,10.8084,,
2010,American Samoa,YR2010,ASM,,,,,,,,,...,,,,,,,,,,
2010,Andorra,YR2010,AND,,,,,,,,,...,3.8,4.6,4.1,5.1,,,1.27,,,


In [9]:
# non-gender-specific variables
variable_set1 = [
    'fertility_total',
    'fertility_adolescent',
    'gdp_percap',
    'immunization_dpt',
    'immunization_measles',
]

# variables to remove
variable_drop = [
    'pop_65up',
    'pop_15to64',
    'pop_0to14',
    'mortality_infant_f',
    'mortality_infant_m',
    'mortality_child_f',
    'mortality_maternal',
    'mortality_child_m',
    'mortality_suicide_f',
    'mortality_suicide_m',
    'contraceptive_any',
    'educ_expected_yrs',
]

# the two immunization rates that get collapsed into a single average
immunization_cols = ['immunization_dpt', 'immunization_measles']

# separate gender-specific and non-gender specific variables:
# the gender-specific frame is everything that is neither in set 1 nor dropped
wdi_gender = wdi.drop(columns = variable_set1 + variable_drop)

# non-gender-specific frame: average the immunization rates (selected by name,
# so reordering variable_set1 cannot change which columns are averaged), drop
# the individual rates, slice 2018 data, rename country
wdi_nongender = wdi.loc[:, variable_set1].assign(
    immunization = lambda d: d[immunization_cols].mean(axis = 1)
).drop(
    columns = immunization_cols
).loc[2018, :].reset_index().rename(
    columns = {'Country Name': 'Country'}
)

## Export datasets

In [10]:
# merge nongender data with hdr data; the inner join keeps only countries
# whose name appears in both sources
nongender_data = pd.merge(wdi_nongender, hdr_data, how = 'inner', on = 'Country').set_index('Country')

# separate into the two exported tables, selecting columns by name so the
# split does not depend on the column order produced by the merge
fertility_rates = nongender_data.loc[
    :, ['fertility_total', 'fertility_adolescent']
].reset_index()
country_indicators = nongender_data.loc[
    :, ['gdp_percap', 'immunization', 'hdi', 'gdi']
].reset_index()

# slice 2018 and reformat gender-specific variables
gender_data = wdi_gender.loc[2018].reset_index().rename(
    columns = {'Country Name': 'Country'}
).drop(columns = ['Time Code'])

In [11]:
# write each prepared table to its csv file, without the row index
exports = {
    'gender-data.csv': gender_data,
    'fertility.csv': fertility_rates,
    'country-indicators.csv': country_indicators,
}
for filename, frame in exports.items():
    frame.to_csv(filename, index = False)