# World Bank Ed Stats Model Building

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Open and read the CSV file to a Dataframe
data = pd.read_csv('EdStatsData.csv')
ind_codes = data['Indicator Code'].unique().tolist()
ind_names = data['Indicator Name'].unique().tolist()
indicator_dict = {k:v for (k,v) in zip(ind_codes, ind_names)}
#data.dropna(axis=1, thresh=75000, inplace=True)

In [2]:
data.drop(['Country Code', 'Indicator Name'], axis=1, inplace=True)
data.head()

Unnamed: 0,Country Name,Indicator Code,1970,1971,1972,1973,1974,1975,1976,1977,...,2060,2065,2070,2075,2080,2085,2090,2095,2100,Unnamed: 69
0,Arab World,UIS.NERA.2,,,,,,,,,...,,,,,,,,,,
1,Arab World,UIS.NERA.2.F,,,,,,,,,...,,,,,,,,,,
2,Arab World,UIS.NERA.2.GPI,,,,,,,,,...,,,,,,,,,,
3,Arab World,UIS.NERA.2.M,,,,,,,,,...,,,,,,,,,,
4,Arab World,SE.PRM.TENR,54.822121,54.894138,56.209438,57.267109,57.991138,59.36554,60.999962,61.92268,...,,,,,,,,,,


In [3]:
#data.dropna(axis=0, thresh=10, inplace=True)

In [4]:
regions = ['Arab World', 'East Asia & Pacific', 'East Asia & Pacific (excluding high income)', 'Euro area', 'Europe & Central Asia', 
 'Europe & Central Asia (excluding high income)', 'European Union', 'Latin America & Caribbean', 'Latin America & Caribbean (excluding high income)', 
  'Middle East & North Africa', 'Middle East & North Africa (excluding high income)', 'Middle income', 'North America', 'South Asia', 
           'Sub-Saharan Africa', 'Sub-Saharan Africa (excluding high income)', 'OECD members', 'World']

income_levels = ['Heavily indebted poor countries (HIPC)', 'High income', 'Least developed countries: UN classification', 'Low & middle income', 
 'Low income', 'Lower middle income', 'Upper middle income']

reg_df = data[data['Country Name'].isin(regions)]

inc_df = data[data['Country Name'].isin(income_levels)]

cntry_df = data[~data['Country Name'].isin(regions)]
cntry_df = cntry_df[~cntry_df['Country Name'].isin(income_levels)]

del data

cntry_df.head()

Unnamed: 0,Country Name,Indicator Code,1970,1971,1972,1973,1974,1975,1976,1977,...,2060,2065,2070,2075,2080,2085,2090,2095,2100,Unnamed: 69
91625,Afghanistan,UIS.NERA.2,,,,,7.05911,,,,...,,,,,,,,,,
91626,Afghanistan,UIS.NERA.2.F,,,,,2.53138,,,,...,,,,,,,,,,
91627,Afghanistan,UIS.NERA.2.GPI,,,,,0.22154,,,,...,,,,,,,,,,
91628,Afghanistan,UIS.NERA.2.M,,,,,11.42652,,,,...,,,,,,,,,,
91629,Afghanistan,SE.PRM.TENR,,,,,,,,,...,,,,,,,,,,


In [5]:
cntry_df.set_index(['Country Name', 'Indicator Code'], inplace=True)

In [6]:
df_t = cntry_df.transpose()

In [7]:
df_t = df_t.stack('Country Name')
df_t.head()

Unnamed: 0_level_0,Indicator Code,BAR.NOED.1519.FE.ZS,BAR.NOED.1519.ZS,BAR.NOED.15UP.FE.ZS,BAR.NOED.15UP.ZS,BAR.NOED.2024.FE.ZS,BAR.NOED.2024.ZS,BAR.NOED.2529.FE.ZS,BAR.NOED.2529.ZS,BAR.NOED.25UP.FE.ZS,BAR.NOED.25UP.ZS,...,UIS.XUNIT.US.4.FSGOV,UIS.XUNIT.US.56.FSGOV,UIS.XUNIT.USCONST.1.FSGOV,UIS.XUNIT.USCONST.2.FSGOV,UIS.XUNIT.USCONST.23.FSGOV,UIS.XUNIT.USCONST.3.FSGOV,UIS.XUNIT.USCONST.4.FSGOV,UIS.XUNIT.USCONST.56.FSGOV,XGDP.23.FSGOV.FDINSTADM.FFD,XGDP.56.FSGOV.FDINSTADM.FFD
Unnamed: 0_level_1,Country Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1970,Afghanistan,91.44,77.08,97.21,88.81,94.8,78.4,98.6,91.09,99.25,94.22,...,,,,,,,,,,
1970,Albania,26.56,21.7,41.88,37.92,28.2,28.38,31.77,28.91,48.28,43.8,...,,,,,,,,,,
1970,Algeria,69.7,52.9,87.39,73.64,69.7,52.9,91.5,77.3,95.9,84.4,...,,,,,,,,,,
1970,American Samoa,,,,,,,,,,,...,,,,,,,,,,
1970,Andorra,,,,,,,,,,,...,,,,,,,,,,


In [8]:
df_t = df_t.unstack(0).stack()
df_t.head()

Unnamed: 0_level_0,Indicator Code,BAR.NOED.1519.FE.ZS,BAR.NOED.1519.ZS,BAR.NOED.15UP.FE.ZS,BAR.NOED.15UP.ZS,BAR.NOED.2024.FE.ZS,BAR.NOED.2024.ZS,BAR.NOED.2529.FE.ZS,BAR.NOED.2529.ZS,BAR.NOED.25UP.FE.ZS,BAR.NOED.25UP.ZS,...,UIS.XUNIT.US.4.FSGOV,UIS.XUNIT.US.56.FSGOV,UIS.XUNIT.USCONST.1.FSGOV,UIS.XUNIT.USCONST.2.FSGOV,UIS.XUNIT.USCONST.23.FSGOV,UIS.XUNIT.USCONST.3.FSGOV,UIS.XUNIT.USCONST.4.FSGOV,UIS.XUNIT.USCONST.56.FSGOV,XGDP.23.FSGOV.FDINSTADM.FFD,XGDP.56.FSGOV.FDINSTADM.FFD
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Afghanistan,1970,91.44,77.08,97.21,88.81,94.8,78.4,98.6,91.09,99.25,94.22,...,,,,,,,,,,
Afghanistan,1971,,,,,,,,,,,...,,,,,,,,,,
Afghanistan,1972,,,,,,,,,,,...,,,,,,,,,,
Afghanistan,1973,,,,,,,,,,,...,,,,,,,,,,
Afghanistan,1974,,,,,,,,,,,...,,,,,,,,,,


In [9]:
idx = df_t.columns.str.split('.', 1, expand=True)
#df_t.columns = idx
#idx = pd.MultiIndex.from_product([idx.levels[0], idx.levels[1]])

In [10]:
igroup_df = df_t.reindex(columns=idx)

In [11]:
igroup_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,BAR,BAR,BAR,BAR,BAR,BAR,BAR,BAR,BAR,BAR,...,UIS,UIS,UIS,UIS,UIS,UIS,UIS,UIS,XGDP,XGDP
Unnamed: 0_level_1,Unnamed: 1_level_1,NOED.1519.FE.ZS,NOED.1519.ZS,NOED.15UP.FE.ZS,NOED.15UP.ZS,NOED.2024.FE.ZS,NOED.2024.ZS,NOED.2529.FE.ZS,NOED.2529.ZS,NOED.25UP.FE.ZS,NOED.25UP.ZS,...,XUNIT.US.4.FSGOV,XUNIT.US.56.FSGOV,XUNIT.USCONST.1.FSGOV,XUNIT.USCONST.2.FSGOV,XUNIT.USCONST.23.FSGOV,XUNIT.USCONST.3.FSGOV,XUNIT.USCONST.4.FSGOV,XUNIT.USCONST.56.FSGOV,23.FSGOV.FDINSTADM.FFD,56.FSGOV.FDINSTADM.FFD
Country Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Afghanistan,1970,,,,,,,,,,,...,,,,,,,,,,
Afghanistan,1971,,,,,,,,,,,...,,,,,,,,,,
Afghanistan,1972,,,,,,,,,,,...,,,,,,,,,,
Afghanistan,1973,,,,,,,,,,,...,,,,,,,,,,
Afghanistan,1974,,,,,,,,,,,...,,,,,,,,,,


In [12]:
#pd.options.display.max_rows = None
igroup_df.isna().sum()

BAR   NOED.1519.FE.ZS            12997
      NOED.1519.ZS               12997
      NOED.15UP.FE.ZS            12997
      NOED.15UP.ZS               12997
      NOED.2024.FE.ZS            12997
      NOED.2024.ZS               12997
      NOED.2529.FE.ZS            12997
      NOED.2529.ZS               12997
      NOED.25UP.FE.ZS            12997
      NOED.25UP.ZS               12997
      NOED.3034.FE.ZS            12997
      NOED.3034.ZS               12997
      NOED.3539.FE.ZS            12997
      NOED.3539.ZS               12997
      NOED.4044.FE.ZS            12997
      NOED.4044.ZS               12997
      NOED.4549.FE.ZS            12997
      NOED.4549.ZS               12997
      NOED.5054.FE.ZS            12997
      NOED.5054.ZS               12997
      NOED.5559.FE.ZS            12997
      NOED.5559.ZS               12997
      NOED.6064.FE.ZS            12997
      NOED.6064.ZS               12997
      NOED.6569.FE.ZS            12997
      NOED.6569.ZS       

In [None]:
def missingness_percentage(df):
    missingness_dict = {}
    for key in df.columns:
        l = df[key].size
        missingness_dict[key] = df[key].isna().sum() / l
    return missingness_dict

In [None]:
cntry_miss = missingness_percentage(igroup_df)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 

In [None]:
sklearn_pca = PCA(n_components=10)
Y_sklearn = sklearn_pca.fit_transform(df_t)

print(
    'The percentage of total variance in the dataset explained by each',
    'component from Sklearn PCA.\n',
    sklearn_pca.explained_variance_ratio_)