In [6]:
import pandas as pd
import numpy as np
import os
import pycountry

In [7]:
RAW_FOLDER = "raw_data"
OUTPUT_CSV = "final_panel_2010_2024.csv"
OUTPUT_XLSX = "final_panel_2010_2024.xlsx"

YEAR_START, YEAR_END = 2010, 2024

In [8]:
VARIABLE_NAMES = {
    "poverty_nat": "Poverty headcount ratio at national poverty lines",
    "poverty_83": "Poverty headcount ratio at 8.30",
    "emp_share": "Employment by industry",
    "food_price": "Food price index",
    "ag_va": "Gross Agri Production Value Added",
    "wage_food": "Wage in the food industry",
    "gdp_pc": "GDP per capita (current US$)",
    "gdp_growth": "GDP per capita growth (annual %)",
    "inflation": "Inflation, GDP deflator (annual %)",
    "enrollment": "Secondary enrollment",
    "transfers": "Subsidies and other transfers",
    "urban": "Urban population (% of total population)",
    "precip": "Average precipitation in depth (mm per year)",
    "lpi": "Logistic performance index",
    "region": "region",
    "income_class": "income_class"
}

In [27]:
poverty_nat = pd.read_excel('raw_data\\Poverty headcount ratio at national poverty lines.xls')

In [28]:
country_list = poverty_nat['Country Name'].unique()
print(f"Number of countries in master list: {len(country_list)}")
print("First 5 countries:", country_list[:5])


Number of countries in master list: 65
First 5 countries: ['Albania' 'Argentina' 'Armenia' 'Austria' 'Belgium']


In [292]:
poverty_nat.to_excel('preclean_data\\Poverty headcount ratio at national poverty lines.xlsx')

In [15]:
def filter_countries(df, country_column='Country Name'):
    """
    Filters a dataframe to keep only countries in the country_list
    """
    # Check if the specified country column exists
    if country_column not in df.columns:
        print(f"Warning: Column '{country_column}' not found. Available columns: {df.columns.tolist()}")
        return df
    
    # Filter the dataframe
    filtered_df = df[df[country_column].isin(country_list)].copy()
    
    print(f"Original shape: {df.shape}, Filtered shape: {filtered_df.shape}")
    print(f"Countries kept: {filtered_df[country_column].nunique()}")
    
    return filtered_df

In [94]:
poverty_83 = pd.read_excel('raw_data\\Poverty headcount ratio at 8.30.xlsx',
                           na_values=['..', '…', '', 'NULL', 'null', 'NA', 'n/a', '#N/A'])
print(poverty_83.columns.to_list())

['Series Name', 'Series Code', 'Country Name', 'Country Code', '2010 [YR2010]', '2011 [YR2011]', '2012 [YR2012]', '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]', '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]', '2021 [YR2021]', '2022 [YR2022]', '2023 [YR2023]', '2024 [YR2024]']


In [95]:
poverty_83 = filter_countries(poverty_83, 'Country Name')

Original shape: (271, 19), Filtered shape: (65, 19)
Countries kept: 65


In [96]:
poverty_83 = poverty_83.rename(columns={'2010 [YR2010]': '2010',
                           '2011 [YR2011]': '2011',
                           '2012 [YR2012]': '2012',
                           '2013 [YR2013]': '2013',
                           '2014 [YR2014]': '2014',
                           '2015 [YR2015]': '2015',
                           '2016 [YR2016]': '2016',
                           '2017 [YR2017]': '2017',
                           '2018 [YR2018]': '2018',
                           '2019 [YR2019]': '2019',
                           '2020 [YR2020]': '2020',
                           '2021 [YR2021]': '2021',
                           '2022 [YR2022]': '2022',
                           '2023 [YR2023]': '2023',
                           '2024 [YR2024]': '2024'})
poverty_83.head()

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
1,Poverty headcount ratio at $8.30 a day (2021 P...,SI.POV.UMIC,Albania,ALB,,,45.9,,46.5,32.8,32.9,32.7,24.1,21.5,19.9,,,,
7,Poverty headcount ratio at $8.30 a day (2021 P...,SI.POV.UMIC,Argentina,ARG,13.6,11.3,10.3,10.4,11.7,,11.5,9.8,12.5,14.9,18.3,14.2,13.9,16.4,15.2
8,Poverty headcount ratio at $8.30 a day (2021 P...,SI.POV.UMIC,Armenia,ARM,66.0,63.0,59.6,57.4,55.2,50.7,45.9,52.7,52.8,57.1,58.2,57.0,55.7,56.9,
11,Poverty headcount ratio at $8.30 a day (2021 P...,SI.POV.UMIC,Austria,AUT,0.8,1.0,1.1,0.7,0.7,1.2,1.3,0.7,1.2,0.9,1.1,1.1,1.1,1.0,
17,Poverty headcount ratio at $8.30 a day (2021 P...,SI.POV.UMIC,Belarus,BLR,18.0,14.6,9.2,5.8,5.4,6.8,7.4,6.2,4.3,3.9,2.6,,,,


In [97]:
poverty_83.to_excel('preclean_data\\Poverty headcount ratio at 8.30.xlsx', index=False)

In [297]:
ag_va = pd.read_excel('raw_data\\Share of Food VA in GDP.xls', 
                      na_values=['..', '…', '', 'NULL', 'null', 'NA', 'n/a', '#N/A'])
print(ag_va.columns.to_list())

['Country Name', 'Indicator', 'Year', 'Value']


In [298]:
ag_va.head()

Unnamed: 0,Country Name,Indicator,Year,Value
0,Afghanistan,"Agriculture Share of GDP US$, 2015 prices",2010,29.927071
1,Afghanistan,"Agriculture Share of GDP US$, 2015 prices",2011,29.068881
2,Afghanistan,"Agriculture Share of GDP US$, 2015 prices",2012,27.282895
3,Afghanistan,"Agriculture Share of GDP US$, 2015 prices",2013,27.975367
4,Afghanistan,"Agriculture Share of GDP US$, 2015 prices",2014,28.394494


In [299]:
ag_va = ag_va.pivot(index=['Country Name', 'Indicator'], columns='Year').reset_index()
ag_va.head()


Unnamed: 0_level_0,Country Name,Indicator,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Afghanistan,"Agriculture Share of GDP US$, 2015 prices",29.927071,29.068881,27.282895,27.975367,28.394494,24.240142,26.525034,26.991481,26.587779,30.606768,32.618118,37.12771,36.999484,36.765292,36.048834
1,Albania,"Agriculture Share of GDP US$, 2015 prices",18.838472,19.247593,20.00197,20.028236,20.065053,19.780173,19.537605,18.979104,18.460519,18.196361,19.071596,17.826641,17.01049,16.332572,16.362968
2,Algeria,"Agriculture Share of GDP US$, 2015 prices",9.692806,10.512314,10.898647,11.471144,11.327478,11.578715,11.421639,11.342714,11.600503,11.659044,11.900684,11.066839,11.231845,11.109666,11.078726
3,Andorra,"Agriculture Share of GDP US$, 2015 prices",0.466561,0.490616,0.601234,0.585626,0.537684,0.51342,0.521889,0.559619,0.54667,0.522782,0.559559,0.531902,0.487599,0.478778,0.483182
4,Angola,"Agriculture Share of GDP US$, 2015 prices",7.517585,7.839903,7.733747,7.559591,8.359599,9.122534,9.722951,9.780066,9.754476,9.688445,10.470594,12.075625,12.185854,12.382942,12.445733


In [300]:
ag_va.columns

MultiIndex([('Country Name',   ''),
            (   'Indicator',   ''),
            (       'Value', 2010),
            (       'Value', 2011),
            (       'Value', 2012),
            (       'Value', 2013),
            (       'Value', 2014),
            (       'Value', 2015),
            (       'Value', 2016),
            (       'Value', 2017),
            (       'Value', 2018),
            (       'Value', 2019),
            (       'Value', 2020),
            (       'Value', 2021),
            (       'Value', 2022),
            (       'Value', 2023),
            (       'Value', 2024)],
           names=[None, 'Year'])

In [302]:
new_columns = ['Country Name', 'Indicator', '2010', '2011', '2012', '2013', '2014', '2015',
               '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']

In [303]:
ag_va.columns = new_columns

In [304]:
ag_va.head()

Unnamed: 0,Country Name,Indicator,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Afghanistan,"Agriculture Share of GDP US$, 2015 prices",29.927071,29.068881,27.282895,27.975367,28.394494,24.240142,26.525034,26.991481,26.587779,30.606768,32.618118,37.12771,36.999484,36.765292,36.048834
1,Albania,"Agriculture Share of GDP US$, 2015 prices",18.838472,19.247593,20.00197,20.028236,20.065053,19.780173,19.537605,18.979104,18.460519,18.196361,19.071596,17.826641,17.01049,16.332572,16.362968
2,Algeria,"Agriculture Share of GDP US$, 2015 prices",9.692806,10.512314,10.898647,11.471144,11.327478,11.578715,11.421639,11.342714,11.600503,11.659044,11.900684,11.066839,11.231845,11.109666,11.078726
3,Andorra,"Agriculture Share of GDP US$, 2015 prices",0.466561,0.490616,0.601234,0.585626,0.537684,0.51342,0.521889,0.559619,0.54667,0.522782,0.559559,0.531902,0.487599,0.478778,0.483182
4,Angola,"Agriculture Share of GDP US$, 2015 prices",7.517585,7.839903,7.733747,7.559591,8.359599,9.122534,9.722951,9.780066,9.754476,9.688445,10.470594,12.075625,12.185854,12.382942,12.445733


In [306]:
ag_va.to_excel('preclean_data\\Share of Food VA in GDP.xlsx', index=False)

In [154]:
wage_food = pd.read_excel('raw_data\\Wage in the food industry.xlsx',
                          na_values=['..', '…', '', 'NULL', 'null', 'NA', 'n/a', '#N/A'])
print(wage_food.columns. to_list())

['ref_area.label', 'source.label', 'indicator.label', 'sex.label', 'classif1.label', 'classif2.label', 'time', 'obs_value', 'obs_status.label', 'note_classif.label', 'note_indicator.label', 'note_source.label']


In [155]:
wage_food.head()

Unnamed: 0,ref_area.label,source.label,indicator.label,sex.label,classif1.label,classif2.label,time,obs_value,obs_status.label,note_classif.label,note_indicator.label,note_source.label
0,Aruba,PC - Population and Housing Census,Average monthly earnings of employees by sex a...,Total,Economic activity (Broad sector): Total,Currency: Local currency,2010,3013.0,,,Job coverage: Main job currently held | Workin...,Data reference period: September
1,Aruba,PC - Population and Housing Census,Average monthly earnings of employees by sex a...,Total,Economic activity (Broad sector): Total,Currency: 2021 PPP $,2010,1860.382,,,Job coverage: Main job currently held | Workin...,Data reference period: September
2,Aruba,PC - Population and Housing Census,Average monthly earnings of employees by sex a...,Total,Economic activity (Broad sector): Total,Currency: U.S. dollars,2010,1683.24,,,Job coverage: Main job currently held | Workin...,Data reference period: September
3,Aruba,PC - Population and Housing Census,Average monthly earnings of employees by sex a...,Total,Economic activity (Broad sector): Agriculture,Currency: Local currency,2010,2187.0,,,Job coverage: Main job currently held | Workin...,Data reference period: September
4,Aruba,PC - Population and Housing Census,Average monthly earnings of employees by sex a...,Total,Economic activity (Broad sector): Agriculture,Currency: 2021 PPP $,2010,1350.367,,,Job coverage: Main job currently held | Workin...,Data reference period: September


In [156]:
wage_food = wage_food[wage_food['sex.label'] == 'Total']

In [157]:
wage_food = wage_food[wage_food['classif1.label'] == 'Economic activity (Broad sector): Agriculture']

In [158]:
wage_food = wage_food[wage_food['classif2.label'] == 'Currency: U.S. dollars']
wage_food.head()

Unnamed: 0,ref_area.label,source.label,indicator.label,sex.label,classif1.label,classif2.label,time,obs_value,obs_status.label,note_classif.label,note_indicator.label,note_source.label
5,Aruba,PC - Population and Housing Census,Average monthly earnings of employees by sex a...,Total,Economic activity (Broad sector): Agriculture,Currency: U.S. dollars,2010,1221.788,,,Job coverage: Main job currently held | Workin...,Data reference period: September
245,Afghanistan,LFS - Labour Force Survey,Average monthly earnings of employees by sex a...,Total,Economic activity (Broad sector): Agriculture,Currency: U.S. dollars,2020,134.013,Break in series,,Currency: AFG - Afghani (AFN) | Break in serie...,Repository: ILO-STATISTICS - Micro data proces...
344,Afghanistan,HIES - Households Living Conditions Survey,Average monthly earnings of employees by sex a...,Total,Economic activity (Broad sector): Agriculture,Currency: U.S. dollars,2014,94.035,,,Currency: AFG - Afghani (AFN),Repository: ILO-STATISTICS - Micro data proces...
593,Angola,LFS - Employment Survey,Average monthly earnings of employees by sex a...,Total,Economic activity (Broad sector): Agriculture,Currency: U.S. dollars,2021,48.2,,,Currency: AGO - Kwanza (AOA),Repository: ILO-STATISTICS - Micro data proces...
854,Angola,LFS - Employment Survey,Average monthly earnings of employees by sex a...,Total,Economic activity (Broad sector): Agriculture,Currency: U.S. dollars,2019,83.328,Break in series,,Currency: AGO - Kwanza (AOA) | Break in series...,Repository: ILO-STATISTICS - Micro data proces...


In [159]:
wage_food = wage_food[['ref_area.label', 'indicator.label', 'classif1.label', 'classif2.label', 'time', 'obs_value']]

In [160]:
wage_food = wage_food.rename(columns={'ref_area.label': 'Country Name',
                          'indicator.label': 'Indicator',            
                          'classif1.label': 'Sector',
                          'classif2.label': 'Currency',
                          'time': 'date',
                          'obs_value': 'value'})
wage_food.head()

Unnamed: 0,Country Name,Indicator,Sector,Currency,date,value
5,Aruba,Average monthly earnings of employees by sex a...,Economic activity (Broad sector): Agriculture,Currency: U.S. dollars,2010,1221.788
245,Afghanistan,Average monthly earnings of employees by sex a...,Economic activity (Broad sector): Agriculture,Currency: U.S. dollars,2020,134.013
344,Afghanistan,Average monthly earnings of employees by sex a...,Economic activity (Broad sector): Agriculture,Currency: U.S. dollars,2014,94.035
593,Angola,Average monthly earnings of employees by sex a...,Economic activity (Broad sector): Agriculture,Currency: U.S. dollars,2021,48.2
854,Angola,Average monthly earnings of employees by sex a...,Economic activity (Broad sector): Agriculture,Currency: U.S. dollars,2019,83.328


In [161]:
wage_food.to_excel('preclean_data\\Wage in the food industry_long.xlsx', index=False)

In [162]:
wage_food = wage_food.pivot(index=['Country Name', 'Indicator', 'Sector', 'Currency'], columns='date')

In [163]:
wage_food = wage_food.reset_index()
wage_food.columns

MultiIndex([('Country Name',   ''),
            (   'Indicator',   ''),
            (      'Sector',   ''),
            (    'Currency',   ''),
            (       'value', 1990),
            (       'value', 1991),
            (       'value', 1992),
            (       'value', 1993),
            (       'value', 1994),
            (       'value', 1995),
            (       'value', 1996),
            (       'value', 1997),
            (       'value', 1998),
            (       'value', 1999),
            (       'value', 2000),
            (       'value', 2001),
            (       'value', 2002),
            (       'value', 2003),
            (       'value', 2004),
            (       'value', 2005),
            (       'value', 2006),
            (       'value', 2007),
            (       'value', 2008),
            (       'value', 2009),
            (       'value', 2010),
            (       'value', 2011),
            (       'value', 2012),
            (       'value',

In [164]:
# Create new column names
new_columns = []
for col in wage_food.columns:
    if col[0] in ['Country Name', 'Indicator', 'Sector', 'Currency']:
        # Keep the names from the first level
        new_columns.append(col[0])
    else:
        # For the 'value' columns, use the year (second level)
        new_columns.append(str(col[1]))  # Convert year to string

In [165]:
wage_food.columns = new_columns

In [166]:
wage_food.head()

Unnamed: 0,Country Name,Indicator,Sector,Currency,1990,1991,1992,1993,1994,1995,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Afghanistan,Average monthly earnings of employees by sex a...,Economic activity (Broad sector): Agriculture,Currency: U.S. dollars,,,,,,,...,,,,,,134.013,,,,
1,Albania,Average monthly earnings of employees by sex a...,Economic activity (Broad sector): Agriculture,Currency: U.S. dollars,,,,,,,...,268.336,274.379,304.487,341.766,336.01,307.032,344.484,358.549,,
2,Angola,Average monthly earnings of employees by sex a...,Economic activity (Broad sector): Agriculture,Currency: U.S. dollars,,,,,,,...,,,,,83.328,,48.2,,,
3,Antigua and Barbuda,Average monthly earnings of employees by sex a...,Economic activity (Broad sector): Agriculture,Currency: U.S. dollars,,,,,,,...,,,,477.793,,,,,,
4,Argentina,Average monthly earnings of employees by sex a...,Economic activity (Broad sector): Agriculture,Currency: U.S. dollars,,,,,,,...,,,839.116,569.864,549.519,451.296,468.7,577.779,633.377,657.868


In [167]:
wage_food.to_excel('preclean_data\\Wage in the food industry_wide.xlsx', index=False)

In [199]:
emp_share = pd.read_excel('raw_data\\Employment by industry.xlsx',
                          na_values=['..', '…', '', 'NULL', 'null', 'NA', 'n/a', '#N/A'])
print(emp_share.columns.to_list())

['ref_area.label', 'indicator.label', 'sex.label', 'classif1.label', 'time', 'obs_value', 'obs_status.label', 'note_classif.label', 'note_indicator.label', 'note_source.label']


In [200]:
emp_share.head()

Unnamed: 0,ref_area.label,indicator.label,sex.label,classif1.label,time,obs_value,obs_status.label,note_classif.label,note_indicator.label,note_source.label
0,Aruba,Employment by sex and economic activity (thous...,Total,Economic activity (Broad sector): Total,2011,47.915,,,,
1,Aruba,Employment by sex and economic activity (thous...,Total,Economic activity (Broad sector): Agriculture,2011,0.286,,,,
2,Aruba,Employment by sex and economic activity (thous...,Total,Economic activity (Broad sector): Non-agriculture,2011,47.629,,,,
3,Aruba,Employment by sex and economic activity (thous...,Total,Economic activity (Broad sector): Industry,2011,6.728,,,,
4,Aruba,Employment by sex and economic activity (thous...,Total,Economic activity (Broad sector): Services,2011,40.763,,,,


In [201]:
emp_share.shape

(398498, 10)

In [202]:
emp_share = emp_share[emp_share['sex.label'] == 'Total']
emp_share.shape

(140395, 10)

In [203]:
emp_share = emp_share[(emp_share['classif1.label'] == 'Economic activity (Broad sector): Total') | (emp_share['classif1.label'] == 'Economic activity (Broad sector): Agriculture')]
emp_share.shape

(9307, 10)

In [204]:
emp_share = emp_share[['ref_area.label', 'indicator.label', 'classif1.label', 'time', 'obs_value']]

In [205]:
emp_share.head()

Unnamed: 0,ref_area.label,indicator.label,classif1.label,time,obs_value
0,Aruba,Employment by sex and economic activity (thous...,Economic activity (Broad sector): Total,2011,47.915
1,Aruba,Employment by sex and economic activity (thous...,Economic activity (Broad sector): Agriculture,2011,0.286
90,Aruba,Employment by sex and economic activity (thous...,Economic activity (Broad sector): Total,2010,46.526
91,Aruba,Employment by sex and economic activity (thous...,Economic activity (Broad sector): Agriculture,2010,0.296
189,Aruba,Employment by sex and economic activity (thous...,Economic activity (Broad sector): Total,2007,51.607


In [206]:
emp_share = emp_share.pivot(index=['ref_area.label', 'indicator.label', 'time'], columns='classif1.label').reset_index()
emp_share.head()

Unnamed: 0_level_0,ref_area.label,indicator.label,time,obs_value,obs_value
classif1.label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Economic activity (Broad sector): Agriculture,Economic activity (Broad sector): Total
0,Afghanistan,Employment by sex and economic activity (thous...,2008,4426.731,7486.564
1,Afghanistan,Employment by sex and economic activity (thous...,2012,2477.079,6419.616
2,Afghanistan,Employment by sex and economic activity (thous...,2014,2824.353,7003.038
3,Afghanistan,Employment by sex and economic activity (thous...,2017,2740.235,6396.534
4,Afghanistan,Employment by sex and economic activity (thous...,2020,2714.301,6078.509


In [208]:
# Create new column names
new_columns = ['ref_area.label', 'indicator.label', 'time', 'Economic activity (Broad sector): Agriculture', 'Economic activity (Broad sector): Total']

In [209]:
emp_share.columns = new_columns

In [210]:
emp_share['emp_share'] = emp_share['Economic activity (Broad sector): Agriculture']/emp_share['Economic activity (Broad sector): Total']

In [211]:
emp_share = emp_share[['ref_area.label', 'indicator.label', 'time', 'emp_share']]
emp_share = emp_share.rename(columns={'ref_area.label': 'Country Name',
                                      'indicator.label': 'Indicator',
                                      'time': 'date'})

In [212]:
emp_share = emp_share.pivot(index=['Country Name', 'Indicator'], columns='date').reset_index()
emp_share.columns

MultiIndex([('Country Name',   ''),
            (   'Indicator',   ''),
            (   'emp_share', 1947),
            (   'emp_share', 1948),
            (   'emp_share', 1949),
            (   'emp_share', 1950),
            (   'emp_share', 1951),
            (   'emp_share', 1952),
            (   'emp_share', 1953),
            (   'emp_share', 1954),
            (   'emp_share', 1955),
            (   'emp_share', 1956),
            (   'emp_share', 1957),
            (   'emp_share', 1958),
            (   'emp_share', 1959),
            (   'emp_share', 1960),
            (   'emp_share', 1961),
            (   'emp_share', 1962),
            (   'emp_share', 1963),
            (   'emp_share', 1964),
            (   'emp_share', 1965),
            (   'emp_share', 1966),
            (   'emp_share', 1967),
            (   'emp_share', 1968),
            (   'emp_share', 1969),
            (   'emp_share', 1970),
            (   'emp_share', 1971),
            (   'emp_share',

In [216]:
emp_share.columns = ['Country Name', 'Indicator'] + [str(year) for year in range(1947, 2025)]

In [217]:
emp_share.head()

Unnamed: 0,Country Name,Indicator,1947,1948,1949,1950,1951,1952,1953,1954,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Afghanistan,Employment by sex and economic activity (thous...,,,,,,,,,...,,,0.428394,,,0.446541,0.487952,,,
1,Albania,Employment by sex and economic activity (thous...,,,,,,,,,...,0.411782,0.400308,0.380772,0.372858,0.364169,0.360409,0.337481,0.339269,0.322701,
2,Algeria,Employment by sex and economic activity (thous...,,,,,,,,,...,0.086558,0.083441,0.101496,,,,,,,
3,American Samoa,Employment by sex and economic activity (thous...,,,,,,,,,...,,,,,,,,,,
4,Angola,Employment by sex and economic activity (thous...,,,,,,,,,...,,,,,0.518941,,0.557627,0.514927,,


In [218]:
emp_share.to_excel('preclean_data\\Employment by industry.xlsx', index=False)

In [219]:
food_price = pd.read_excel('raw_data\\Food price index.xls',
                           na_values=['..', '…', '', 'NULL', 'null', 'NA', 'n/a', '#N/A'])
print(food_price.columns.to_list())

['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', '2010 [2010]', '2010 [2010].1', '2010 [2010].2', '2011 [2011]', '2011 [2011].1', '2011 [2011].2', '2012 [2012]', '2012 [2012].1', '2012 [2012].2', '2013 [2013]', '2013 [2013].1', '2013 [2013].2', '2014 [2014]', '2014 [2014].1', '2014 [2014].2', '2015 [2015]', '2015 [2015].1', '2015 [2015].2', '2016 [2016]', '2016 [2016].1', '2016 [2016].2', '2017 [2017]', '2017 [2017].1', '2017 [2017].2', '2018 [2018]', '2018 [2018].1', '2018 [2018].2', '2019 [2019]', '2019 [2019].1', '2019 [2019].2', '2020 [2020]', '2020 [2020].1', '2020 [2020].2', '2021 [2021]', '2021 [2021].1', '2021 [2021].2', '2022 [2022]', '2022 [2022].1', '2022 [2022].2', '2023 [2023]', '2023 [2023].1', '2023 [2023].2', '2024 [2024]', '2024 [2024].1', '2024 [2024].2']


In [220]:
food_price.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,2010 [2010],2010 [2010].1,2010 [2010].2,2011 [2011],2011 [2011].1,2011 [2011].2,2012 [2012],...,2021 [2021].2,2022 [2022],2022 [2022].1,2022 [2022].2,2023 [2023],2023 [2023].1,2023 [2023].2,2024 [2024],2024 [2024].1,2024 [2024].2
0,,,,Unit,Value,Flag Description,Unit,Value,Flag Description,Unit,...,Flag Description,Unit,Value,Flag Description,Unit,Value,Flag Description,Unit,Value,Flag Description
1,Afghanistan [004],"Value US$, 2015 prices [6179]",GDP Deflator [22024],USD,101.368737,Figure from external organization,USD,111.212507,Figure from external organization,USD,...,Figure from external organization,USD,88.323325,Figure from external organization,USD,99.591629,Figure from external organization,USD,121.46049,Estimated value
2,Afghanistan [004],"Value US$, 2015 prices [6179]","Value Added Deflator (Agriculture, forestry an...",USD,100.491475,Figure from external organization,USD,111.233985,Figure from external organization,USD,...,Figure from external organization,USD,80.450313,Figure from external organization,USD,93.511802,Figure from external organization,USD,118.894029,Estimated value
3,Albania [008],"Value US$, 2015 prices [6179]",GDP Deflator [22024],USD,114.460966,Figure from external organization,USD,120.639493,Figure from external organization,USD,...,Figure from external organization,USD,132.098754,Figure from external organization,USD,155.124994,Figure from external organization,USD,174.708899,Estimated value
4,Albania [008],"Value US$, 2015 prices [6179]","Value Added Deflator (Agriculture, forestry an...",USD,109.09877,Figure from external organization,USD,114.241683,Figure from external organization,USD,...,Figure from external organization,USD,144.619673,Figure from external organization,USD,173.49786,Figure from external organization,USD,196.280088,Estimated value


In [221]:
food_price = food_price.rename(columns={'Unnamed: 0': 'Country Name',
                      'Unnamed: 1': 'Currency',
                      'Unnamed: 2': 'Indicator',
                      '2010 [2010].1': '2010',
                      '2011 [2011].1': '2011',
                      '2012 [2012].1': '2012',
                      '2013 [2013].1': '2013',
                      '2014 [2014].1': '2014',
                      '2015 [2015].1': '2015',
                      '2016 [2016].1': '2016',
                      '2017 [2017].1': '2017',
                      '2018 [2018].1': '2018',
                      '2019 [2019].1': '2019',
                      '2020 [2020].1': '2020',
                      '2021 [2021].1': '2021',
                      '2022 [2022].1': '2022',
                      '2023 [2023].1': '2023',
                      '2024 [2024].1': '2024'})
food_price.head()

Unnamed: 0,Country Name,Currency,Indicator,2010 [2010],2010,2010 [2010].2,2011 [2011],2011,2011 [2011].2,2012 [2012],...,2021 [2021].2,2022 [2022],2022,2022 [2022].2,2023 [2023],2023,2023 [2023].2,2024 [2024],2024,2024 [2024].2
0,,,,Unit,Value,Flag Description,Unit,Value,Flag Description,Unit,...,Flag Description,Unit,Value,Flag Description,Unit,Value,Flag Description,Unit,Value,Flag Description
1,Afghanistan [004],"Value US$, 2015 prices [6179]",GDP Deflator [22024],USD,101.368737,Figure from external organization,USD,111.212507,Figure from external organization,USD,...,Figure from external organization,USD,88.323325,Figure from external organization,USD,99.591629,Figure from external organization,USD,121.46049,Estimated value
2,Afghanistan [004],"Value US$, 2015 prices [6179]","Value Added Deflator (Agriculture, forestry an...",USD,100.491475,Figure from external organization,USD,111.233985,Figure from external organization,USD,...,Figure from external organization,USD,80.450313,Figure from external organization,USD,93.511802,Figure from external organization,USD,118.894029,Estimated value
3,Albania [008],"Value US$, 2015 prices [6179]",GDP Deflator [22024],USD,114.460966,Figure from external organization,USD,120.639493,Figure from external organization,USD,...,Figure from external organization,USD,132.098754,Figure from external organization,USD,155.124994,Figure from external organization,USD,174.708899,Estimated value
4,Albania [008],"Value US$, 2015 prices [6179]","Value Added Deflator (Agriculture, forestry an...",USD,109.09877,Figure from external organization,USD,114.241683,Figure from external organization,USD,...,Figure from external organization,USD,144.619673,Figure from external organization,USD,173.49786,Figure from external organization,USD,196.280088,Estimated value


In [222]:
food_price = food_price[['Country Name', 'Currency', 'Indicator', '2010', 
               '2011', '2012', '2013', '2014', '2015', '2016', '2017',
               '2018', '2019', '2020', '2021', '2022', '2023', '2024']]

In [225]:
food_price = food_price.drop(0)
food_price.head()

Unnamed: 0,Country Name,Currency,Indicator,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
1,Afghanistan [004],"Value US$, 2015 prices [6179]",GDP Deflator [22024],101.368737,111.212507,111.552999,106.141015,101.652308,100,94.815368,97.314846,94.70049,87.855273,92.531648,88.571653,88.323325,99.591629,121.46049
2,Afghanistan [004],"Value US$, 2015 prices [6179]","Value Added Deflator (Agriculture, forestry an...",100.491475,111.233985,112.840986,102.590772,95.261805,100,92.571637,91.659138,81.70547,74.507451,85.035317,80.150291,80.450313,93.511802,118.894029
3,Albania [008],"Value US$, 2015 prices [6179]",GDP Deflator [22024],114.460966,120.639493,113.684884,116.726131,118.747357,100,100.823802,106.617541,119.319379,118.771659,120.920822,131.29549,132.098754,155.124994,174.708899
4,Albania [008],"Value US$, 2015 prices [6179]","Value Added Deflator (Agriculture, forestry an...",109.09877,114.241683,106.664661,114.02753,118.304682,100,102.435818,106.859006,119.118827,120.031253,121.894875,135.209948,144.619673,173.49786,196.280088
5,Algeria [012],"Value US$, 2015 prices [6179]",GDP Deflator [22024],114.350063,137.882068,139.374979,136.03009,133.583641,100,93.428564,98.028877,99.607893,96.845241,86.592278,93.932063,109.829683,115.836076,125.14894


In [227]:
food_price = pd.melt(food_price, id_vars=['Country Name', 'Currency', 'Indicator'], var_name='year', value_name='value')

In [230]:
food_price = food_price.pivot(index=['Country Name', 'Currency', 'year'], columns='Indicator').reset_index()
food_price.columns

MultiIndex([('Country Name', ...),
            (    'Currency', ...),
            (        'year', ...),
            (       'value', ...),
            (       'value', ...)],
           names=[None, 'Indicator'])

In [231]:
new_columns = ['Country Name', 'Currency', 'Year', 'GDP Deflator', 'Agri Deflator']

In [232]:
food_price.columns = new_columns

In [234]:
food_price['Food Price'] = food_price['Agri Deflator']/food_price['GDP Deflator']

In [236]:
food_price = food_price[['Country Name', 'Year', 'Food Price']]

In [238]:
food_price= food_price.pivot(index='Country Name', columns='Year').reset_index()
food_price.columns

MultiIndex([('Country Name',     ''),
            (  'Food Price', '2010'),
            (  'Food Price', '2011'),
            (  'Food Price', '2012'),
            (  'Food Price', '2013'),
            (  'Food Price', '2014'),
            (  'Food Price', '2015'),
            (  'Food Price', '2016'),
            (  'Food Price', '2017'),
            (  'Food Price', '2018'),
            (  'Food Price', '2019'),
            (  'Food Price', '2020'),
            (  'Food Price', '2021'),
            (  'Food Price', '2022'),
            (  'Food Price', '2023'),
            (  'Food Price', '2024')],
           names=[None, 'Year'])

In [239]:
new_columns = ['Country Name'] + [str(year) for year in range(2010, 2025)]
food_price.columns = new_columns

In [241]:
food_price['Indicator'] = 'Relative price of staple foods'

In [243]:
food_price = food_price[['Country Name', 'Indicator', '2010', '2021', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', 
                         '2020', '2021', '2022', '2023', '2024']]
food_price.head()

Unnamed: 0,Country Name,Indicator,2010,2021,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021.1,2022,2023,2024
0,Afghanistan [004],Relative price of staple foods,0.991346,0.90492,1.011546,0.966552,0.937134,1.0,0.976336,0.941882,0.862778,0.84807,0.918986,0.90492,0.910861,0.938952,0.97887
1,Albania [008],Relative price of staple foods,0.953153,1.029814,0.938248,0.976881,0.996272,1.0,1.015988,1.002265,0.998319,1.010605,1.008055,1.029814,1.094785,1.118439,1.123469
2,Algeria [012],Relative price of staple foods,0.873477,1.015234,0.80475,0.858774,0.908093,1.0,1.069907,1.036427,1.023594,1.045861,1.090423,1.015234,0.944918,1.177964,1.165327
3,Andorra [020],Relative price of staple foods,0.999887,0.999894,0.999887,0.999887,0.999887,1.0,1.000211,0.999857,0.999447,0.999714,0.999793,0.999894,0.999781,0.999403,0.98544
4,Angola [024],Relative price of staple foods,0.822112,0.959703,0.784824,0.860826,0.902801,1.0,1.01113,1.024226,0.88244,0.813611,0.94995,0.959703,1.127355,1.202726,1.061243


In [244]:
food_price['Country Name'] = food_price['Country Name'].str.replace(r'\s*\[\d+\]\s*', '', regex=True)
food_price.head()

Unnamed: 0,Country Name,Indicator,2010,2021,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021.1,2022,2023,2024
0,Afghanistan,Relative price of staple foods,0.991346,0.90492,1.011546,0.966552,0.937134,1.0,0.976336,0.941882,0.862778,0.84807,0.918986,0.90492,0.910861,0.938952,0.97887
1,Albania,Relative price of staple foods,0.953153,1.029814,0.938248,0.976881,0.996272,1.0,1.015988,1.002265,0.998319,1.010605,1.008055,1.029814,1.094785,1.118439,1.123469
2,Algeria,Relative price of staple foods,0.873477,1.015234,0.80475,0.858774,0.908093,1.0,1.069907,1.036427,1.023594,1.045861,1.090423,1.015234,0.944918,1.177964,1.165327
3,Andorra,Relative price of staple foods,0.999887,0.999894,0.999887,0.999887,0.999887,1.0,1.000211,0.999857,0.999447,0.999714,0.999793,0.999894,0.999781,0.999403,0.98544
4,Angola,Relative price of staple foods,0.822112,0.959703,0.784824,0.860826,0.902801,1.0,1.01113,1.024226,0.88244,0.813611,0.94995,0.959703,1.127355,1.202726,1.061243


In [245]:
food_price.to_excel('preclean_data\\Food price index.xlsx', index=False)

In [246]:
gdp_pc = pd.read_excel('raw_data\\GDP per capita (current US$).xls',
                       na_values=['..', '…', '', 'NULL', 'null', 'NA', 'n/a', '#N/A'])
print(gdp_pc.columns.to_list())

['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']


In [247]:
gdp_pc.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Aruba,ABW,GDP per capita (current US$),NY.GDP.PCAP.CD,,,,,,,...,27458.225331,27441.529662,28440.051964,30082.127645,31096.205074,22855.93232,27200.061079,30559.533535,33984.79062,
1,Africa Eastern and Southern,AFE,GDP per capita (current US$),NY.GDP.PCAP.CD,186.121835,186.941781,197.402402,225.440494,208.999748,226.876513,...,1479.61526,1329.807285,1520.212231,1538.901679,1493.817938,1344.10321,1522.393346,1628.318944,1510.742951,1567.635839
2,Afghanistan,AFG,GDP per capita (current US$),NY.GDP.PCAP.CD,,,,,,,...,565.56973,522.082216,525.469771,491.337221,496.602504,510.787063,356.496214,357.261153,413.757895,
3,Africa Western and Central,AFW,GDP per capita (current US$),NY.GDP.PCAP.CD,121.939925,127.454189,133.827044,139.008291,148.549379,155.565216,...,1860.727694,1630.039447,1574.23056,1720.14028,1798.340685,1680.039332,1765.954788,1796.668633,1599.392983,1284.154441
4,Angola,AGO,GDP per capita (current US$),NY.GDP.PCAP.CD,,,,,,,...,3213.902611,1807.952941,2437.259712,2538.591391,2189.855714,1449.922867,1925.874661,2929.694455,2309.53413,2122.08369


In [248]:
gdp_pc = filter_countries(gdp_pc, 'Country Name')
gdp_pc.head()

Original shape: (266, 69), Filtered shape: (65, 69)
Countries kept: 65


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
5,Albania,ALB,GDP per capita (current US$),NY.GDP.PCAP.CD,,,,,,,...,3981.726623,4168.375445,4614.047969,5365.488802,5460.430509,5370.7775,6413.282808,6846.426694,8575.171346,10011.627986
9,Argentina,ARG,GDP per capita (current US$),NY.GDP.PCAP.CD,778.251707,971.338043,870.217491,852.972425,1176.200862,1281.83338,...,13679.626498,12699.962314,14532.500931,11752.799892,9955.974787,8535.59938,10738.017922,13935.681111,14187.482725,13858.20398
10,Armenia,ARM,GDP per capita (current US$),NY.GDP.PCAP.CD,,,,,,,...,3512.393503,3524.424769,3869.053739,4196.005623,4597.228874,4268.680933,4685.179971,6571.974455,8125.273958,8500.605225
14,Austria,AUT,GDP per capita (current US$),NY.GDP.PCAP.CD,939.914815,1036.728204,1093.014191,1172.557446,1275.457153,1381.077258,...,43915.228021,45061.499392,47163.742578,51194.074984,49885.994736,48716.40989,53648.719074,52176.664914,56033.573792,56833.196047
17,Belgium,BEL,GDP per capita (current US$),NY.GDP.PCAP.CD,1290.286072,1367.788852,1457.26517,1555.022932,1724.018942,1859.509987,...,40893.804538,41854.54983,44035.323936,47487.210039,46716.622747,45906.287581,51658.238295,50822.251854,54690.094273,55954.610626


In [250]:
gdp_pc.to_excel('preclean_data\\GDP per capita (current US$).xlsx', index=False)

In [None]:
gdp_growth = pd.read_excel('raw_data\\GDP per capita growth (annual %).xls',
                           na_values=['..', '…', '', 'NULL', 'null', 'NA', 'n/a', '#N/A'])
print(gdp_growth.columns.to_list())

['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']


In [252]:
gdp_growth = filter_countries(gdp_growth, 'Country Name')
gdp_growth.head()

Original shape: (266, 69), Filtered shape: (65, 69)
Countries kept: 65


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
5,Albania,ALB,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,,,,,,,...,2.525831,4.075198,3.378212,3.927526,2.498299,-2.75698,9.984331,6.108948,5.137124,5.162522
9,Argentina,ARG,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,,3.697198,-2.481212,-6.851643,8.371062,8.853456,...,1.660913,-3.024501,1.916396,-3.415537,-2.695067,-10.33611,10.14856,5.048195,-1.89295,-2.058403
10,Armenia,ARM,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,,,,,,,...,3.515995,0.611877,7.965446,5.568501,7.836084,-7.168665,5.771428,12.338334,8.47902,3.484216
14,Austria,AUT,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,,4.960717,2.02147,3.471466,5.419336,2.810013,...,0.17426,1.018877,1.564307,1.986263,1.303503,-6.706394,4.339758,4.275933,-1.930146,-1.676594
17,Belgium,BEL,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,,4.630257,4.794035,3.574354,5.949139,2.624445,...,0.887876,0.682455,1.084312,1.415289,1.890719,-5.20244,5.771346,3.420168,0.281208,0.258325


In [253]:
gdp_growth.to_excel('preclean_data\\GDP per capita growth (annual %).xlsx', index=False)

In [None]:
inflation = pd.read_excel('raw_data\\Inflation, GDP deflator (annual %).xls',
                          na_values=['..', '…', '', 'NULL', 'null', 'NA', 'n/a', '#N/A'])
print(inflation.columns.to_list())

['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']


In [255]:
inflation = filter_countries(inflation, 'Country Name')
inflation.head()

Original shape: (266, 69), Filtered shape: (65, 69)
Countries kept: 65


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
5,Albania,ALB,"Inflation, GDP deflator (annual %)",NY.GDP.DEFL.KD.ZG,,,,,,,...,0.771576,-0.864265,2.725069,1.453287,1.000633,0.041044,3.444391,9.861583,6.066052,2.688874
9,Argentina,ARG,"Inflation, GDP deflator (annual %)",NY.GDP.DEFL.KD.ZG,,20.310698,28.871842,25.591154,28.774617,21.232935,...,26.579992,41.11938,26.006379,42.033669,49.195579,40.083088,53.802088,69.876084,135.368876,207.921861
10,Armenia,ARM,"Inflation, GDP deflator (annual %)",NY.GDP.DEFL.KD.ZG,,,,,,,...,1.21391,0.268575,2.150642,2.78771,1.065623,1.806593,6.900514,7.985879,2.673004,1.161269
14,Austria,AUT,"Inflation, GDP deflator (annual %)",NY.GDP.DEFL.KD.ZG,,5.087154,3.340203,3.678266,3.183788,5.321415,...,2.292635,1.814751,0.975318,1.81048,1.473685,2.593133,1.926129,4.754778,6.647654,3.050594
17,Belgium,BEL,"Inflation, GDP deflator (annual %)",NY.GDP.DEFL.KD.ZG,,1.315469,1.667692,3.025797,4.642427,5.100713,...,1.121839,1.895289,1.983163,1.717282,1.854241,1.596117,2.742627,6.843976,4.508122,1.943705


In [256]:
inflation.to_excel('preclean_data\\Inflation, GDP deflator (annual %).xlsx', index=False)

In [262]:
enrollment = pd.read_excel('raw_data\\Secondary enrollment.xlsx',
                            na_values=['..', '…', '', 'NULL', 'null', 'NA', 'n/a', '#N/A'])
print(enrollment.columns.to_list())

['Series Name', 'Series Code', 'Country Name', 'Country Code', '2010 [YR2010]', '2011 [YR2011]', '2012 [YR2012]', '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]', '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]', '2021 [YR2021]', '2022 [YR2022]', '2023 [YR2023]', '2024 [YR2024]']


In [263]:
enrollment = filter_countries(enrollment, 'Country Name')
enrollment.head()

Original shape: (271, 19), Filtered shape: (65, 19)
Countries kept: 65


Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022],2023 [YR2023],2024 [YR2024]
1,"School enrollment, secondary (% gross)",SE.SEC.ENRR,Albania,ALB,93.681824,95.470528,97.608612,102.90321,103.321378,102.440729,99.170755,97.57437,98.822388,99.015154,95.929447,97.665033,97.322471,96.136877,108.355392
7,"School enrollment, secondary (% gross)",SE.SEC.ENRR,Argentina,ARG,99.070999,100.88456,102.709991,104.540863,106.038902,107.504868,109.598328,110.3778,109.671242,110.460152,112.376732,115.735779,114.148117,105.574584,
8,"School enrollment, secondary (% gross)",SE.SEC.ENRR,Armenia,ARM,107.732391,94.877403,,86.872192,86.520401,86.442139,,,83.024941,86.826063,88.50803,91.485374,91.710661,89.411103,90.945871
11,"School enrollment, secondary (% gross)",SE.SEC.ENRR,Austria,AUT,100.263969,99.672958,100.108727,101.17321,101.30481,101.411583,100.227989,100.337898,100.692596,101.543633,101.690559,102.307663,101.458,100.504372,
17,"School enrollment, secondary (% gross)",SE.SEC.ENRR,Belarus,BLR,111.028999,108.989867,107.657786,104.523124,103.510351,102.895731,102.969006,102.48085,101.596011,100.438744,100.57571,100.420639,99.060203,97.4266,94.79027


In [264]:
enrollment = enrollment.rename(columns={'2010 [YR2010]': '2010',
                           '2011 [YR2011]': '2011',
                           '2012 [YR2012]': '2012',
                           '2013 [YR2013]': '2013',
                           '2014 [YR2014]': '2014',
                           '2015 [YR2015]': '2015',
                           '2016 [YR2016]': '2016',
                           '2017 [YR2017]': '2017',
                           '2018 [YR2018]': '2018',
                           '2019 [YR2019]': '2019',
                           '2020 [YR2020]': '2020',
                           '2021 [YR2021]': '2021',
                           '2022 [YR2022]': '2022',
                           '2023 [YR2023]': '2023',
                           '2024 [YR2024]': '2024'})
enrollment.head()

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
1,"School enrollment, secondary (% gross)",SE.SEC.ENRR,Albania,ALB,93.681824,95.470528,97.608612,102.90321,103.321378,102.440729,99.170755,97.57437,98.822388,99.015154,95.929447,97.665033,97.322471,96.136877,108.355392
7,"School enrollment, secondary (% gross)",SE.SEC.ENRR,Argentina,ARG,99.070999,100.88456,102.709991,104.540863,106.038902,107.504868,109.598328,110.3778,109.671242,110.460152,112.376732,115.735779,114.148117,105.574584,
8,"School enrollment, secondary (% gross)",SE.SEC.ENRR,Armenia,ARM,107.732391,94.877403,,86.872192,86.520401,86.442139,,,83.024941,86.826063,88.50803,91.485374,91.710661,89.411103,90.945871
11,"School enrollment, secondary (% gross)",SE.SEC.ENRR,Austria,AUT,100.263969,99.672958,100.108727,101.17321,101.30481,101.411583,100.227989,100.337898,100.692596,101.543633,101.690559,102.307663,101.458,100.504372,
17,"School enrollment, secondary (% gross)",SE.SEC.ENRR,Belarus,BLR,111.028999,108.989867,107.657786,104.523124,103.510351,102.895731,102.969006,102.48085,101.596011,100.438744,100.57571,100.420639,99.060203,97.4266,94.79027


In [265]:
enrollment.to_excel('preclean_data\\Secondary enrollment.xlsx', index=False)

In [266]:
transfers = pd.read_excel('raw_data\\Subsidies and other transfers.xls',
                          na_values=['..', '…', '', 'NULL', 'null', 'NA', 'n/a', '#N/A'])
print(transfers.columns.to_list())

['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']


In [267]:
transfers = filter_countries(transfers, 'Country Name')
transfers.head()

Original shape: (266, 69), Filtered shape: (65, 69)
Countries kept: 65


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
5,Albania,ALB,Subsidies and other transfers (% of expense),GC.XPN.TRFT.ZS,,,,,,,...,46.985575,51.979204,53.383339,53.402081,53.269567,54.700653,54.281205,58.738313,58.631946,
9,Argentina,ARG,Subsidies and other transfers (% of expense),GC.XPN.TRFT.ZS,,,,,,,...,65.4836,63.768041,64.403323,66.257258,67.201552,77.220404,75.086401,74.675824,68.763812,
10,Armenia,ARM,Subsidies and other transfers (% of expense),GC.XPN.TRFT.ZS,,,,,,,...,37.276318,46.09493,38.472619,40.211234,38.187382,41.130239,42.065928,39.777432,40.791654,
14,Austria,AUT,Subsidies and other transfers (% of expense),GC.XPN.TRFT.ZS,,,,,,,...,73.116848,73.433165,73.440133,74.075935,74.457594,75.905943,75.909688,73.89545,74.140412,
17,Belgium,BEL,Subsidies and other transfers (% of expense),GC.XPN.TRFT.ZS,,,,,,,...,81.665101,82.689391,83.416628,84.186162,84.583563,85.2964,85.741444,84.993667,,


In [268]:
transfers.to_excel('preclean_data\\Subsidies and other transfers.xlsx', index=False)

In [269]:
urban = pd.read_excel('raw_data\\Urban population (% of total population).xls',
                      na_values=['..', '…', '', 'NULL', 'null', 'NA', 'n/a', '#N/A'])
print(urban.columns.to_list())

['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']


In [270]:
urban = filter_countries(urban, 'Country Name')
urban.head()

Original shape: (266, 69), Filtered shape: (65, 69)
Countries kept: 65


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
5,Albania,ALB,Urban population (% of total population),SP.URB.TOTL.IN.ZS,30.705,30.943,31.015,31.086,31.158,31.23,...,57.434,58.421,59.383,60.319,61.229,62.112,62.969,63.799,64.603,65.38
9,Argentina,ARG,Urban population (% of total population),SP.URB.TOTL.IN.ZS,73.611,74.217,74.767,75.309,75.844,76.369,...,91.503,91.627,91.749,91.87,91.991,92.111,92.229,92.347,92.463,92.579
10,Armenia,ARM,Urban population (% of total population),SP.URB.TOTL.IN.ZS,51.275,52.147,53.019,53.889,54.758,55.622,...,63.085,63.082,63.103,63.149,63.219,63.313,63.431,63.573,63.739,63.929
14,Austria,AUT,Urban population (% of total population),SP.URB.TOTL.IN.ZS,64.72,64.814,64.863,64.913,64.962,65.011,...,57.715,57.905,58.094,58.297,58.515,58.748,58.995,59.256,59.53,59.819
17,Belgium,BEL,Urban population (% of total population),SP.URB.TOTL.IN.ZS,92.46,92.554,92.679,92.835,92.988,93.137,...,97.876,97.919,97.961,98.001,98.041,98.079,98.117,98.153,98.189,98.224


In [271]:
urban.to_excel('preclean_data\\Urban population (% of total population).xlsx', index=False)

In [272]:
precip = pd.read_excel('raw_data\\Average precipitation in depth (mm per year).xls',
                       na_values=['..', '…', '', 'NULL', 'null', 'NA', 'n/a', '#N/A'])
print(precip.columns.to_list())

['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']


In [273]:
precip = filter_countries(precip, 'Country Name')
precip.head()

Original shape: (266, 19), Filtered shape: (65, 19)
Countries kept: 65


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
5,Albania,ALB,Average precipitation in depth (mm per year),AG.LND.PRCP.MM,1485.0,1485.0,1485.0,1485.0,1485.0,1485.0,1485.0,1485.0,1485.0,1485.0,1485.0,1485.0,1485.0,,
9,Argentina,ARG,Average precipitation in depth (mm per year),AG.LND.PRCP.MM,591.0,591.0,591.0,591.0,591.0,591.0,591.0,591.0,591.0,591.0,591.0,591.0,591.0,,
10,Armenia,ARM,Average precipitation in depth (mm per year),AG.LND.PRCP.MM,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,562.0,,
14,Austria,AUT,Average precipitation in depth (mm per year),AG.LND.PRCP.MM,1110.0,1110.0,1110.0,1110.0,1110.0,1110.0,1110.0,1110.0,1110.0,1110.0,1110.0,1110.0,1110.0,,
17,Belgium,BEL,Average precipitation in depth (mm per year),AG.LND.PRCP.MM,847.0,847.0,847.0,847.0,847.0,847.0,847.0,847.0,847.0,847.0,847.0,847.0,847.0,,


In [274]:
precip.to_excel('preclean_data\\Average precipitation in depth (mm per year).xlsx', index=False)

In [279]:
lpi_2010 = pd.read_excel('raw_data\\Logistic performance index.xlsx', sheet_name='2010')
print(lpi_2010.columns.to_list())

['Country', 'Code', 'score', 'lower bound', 'upper bound', 'rank', 'lower bound.1', 'upper bound.1', '% of highest performer', 'score.1', 'rank.1', 'score.2', 'rank.2', 'score.3', 'rank.3', 'score.4', 'rank.4', 'score.5', 'rank.5', 'score.6', 'rank.6']


  warn(msg)


In [280]:
lpi_2010 = lpi_2010[['Country', 'score']]
lpi_2010.shape

(155, 2)

In [281]:
lpi_2012 = pd.read_excel('raw_data\\Logistic performance index.xlsx', sheet_name='2012')
print(lpi_2012.columns.to_list())

['Country', 'Code', 'score', 'lower bound', 'upper bound', 'rank', 'lower bound.1', 'upper bound.1', '% of highest performer', 'score.1', 'rank.1', 'score.2', 'rank.2', 'score.3', 'rank.3', 'score.4', 'rank.4', 'score.5', 'rank.5', 'score.6', 'rank.6']


  warn(msg)


In [282]:
lpi_2012 = lpi_2012[['Country', 'score']]
lpi_2012.shape

(155, 2)

In [283]:
lpi_2014 = pd.read_excel('raw_data\\Logistic performance index.xlsx', sheet_name='2014')
print(lpi_2014.columns.to_list())

['Country', 'Code', 'score', 'lower bound', 'upper bound', 'rank', 'lower bound.1', 'upper bound.1', '% of highest performer', 'score.1', 'rank.1', 'score.2', 'rank.2', 'score.3', 'rank.3', 'score.4', 'rank.4', 'score.5', 'rank.5', 'score.6', 'rank.6']


  warn(msg)


In [284]:
lpi_2014 = lpi_2014[['Country', 'score']]
lpi_2014.shape

(160, 2)

In [285]:
lpi_2016 = pd.read_excel('raw_data\\Logistic performance index.xlsx', sheet_name='2016')
print(lpi_2016.columns.to_list())

['Country', 'Code', 'score', 'lower bound', 'upper bound', 'rank', 'lower bound.1', 'upper bound.1', '% of highest performer', 'score.1', 'rank.1', 'score.2', 'rank.2', 'score.3', 'rank.3', 'score.4', 'rank.4', 'score.5', 'rank.5', 'score.6', 'rank.6']


  warn(msg)


In [286]:
lpi_2016 = lpi_2016[['Country', 'score']]
lpi_2016.shape

(160, 2)

In [287]:
lpi_2018 = pd.read_excel('raw_data\\Logistic performance index.xlsx', sheet_name='2018')
print(lpi_2018.columns.to_list())

['Country', 'Code', 'score', 'lower bound', 'upper bound', 'rank', 'lower bound.1', 'upper bound.1', '% of highest performer', 'score.1', 'rank.1', 'score.2', 'rank.2', 'score.3', 'rank.3', 'score.4', 'rank.4', 'score.5', 'rank.5', 'score.6', 'rank.6']


  warn(msg)


In [288]:
lpi_2018 = lpi_2018[['Country', 'score']]
lpi_2018.shape

(160, 2)

In [289]:
lpi_2023 = pd.read_excel('raw_data\\Logistic performance index.xlsx', sheet_name='2023')
print(lpi_2023.columns.to_list())

['Economy', 'LPI Score', 'LPI Grouped Rank', 'Customs Score', 'Customs Grouped Rank', 'Infrastructure Score', 'Infrastructure Grouped Rank', 'International Shipments Score', 'International Shipments Grouped Rank', 'Logistics Competence and Quality Score', 'Logistics Competence and Quality Grouped Rank', 'Timeliness Score', 'Timeliness Grouped Rank', 'Tracking and Tracing Score', 'Tracking and Tracing Grouped Rank']


In [291]:
lpi_2023 = lpi_2023[['Economy', 'LPI Score']]
lpi_2023.shape

(139, 2)