## Getting Data

In [2]:
# WDI package
!pip install wbgapi

from pathlib import Path

import pandas as pd
import wbgapi as wb



In [12]:
# School enrollment indicators to download, keyed by WDI series code.
# The values are short readable names for each code (total / male / female
# for each of the three education levels).
indicators = {
    'SE.PRM.ENRR'      : 'Primary_total',
    'SE.PRM.ENRR.MA'   : 'Primary_male',
    'SE.PRM.ENRR.FE'   : 'Primary_female',

    'SE.SEC.ENRR'      : 'Secondary_total',
    'SE.SEC.ENRR.MA'   : 'Secondary_male',
    'SE.SEC.ENRR.FE'   : 'Secondary_female',

    'SE.TER.ENRR'      : 'Tertiary_total',
    'SE.TER.ENRR.MA'   : 'Tertiary_male',
    'SE.TER.ENRR.FE'   : 'Tertiary_female'
}

# Data for years 2000–2020 (range() excludes its stop value, hence 2021).
df = wb.data.DataFrame(
    list(indicators),
    time=range(2000, 2021),
    labels=True
)

# NOTE: no column rename happens here. The data columns are year labels
# ("YR2000", ...) and the indicator codes are row values in `series`, which is
# where the later cells read them from; renaming columns by indicator code
# would match nothing.
df = (
    df.reset_index()                         # expose the `economy` / `series` ids as columns
    .drop(columns=['economy', 'Series'])     # keep the `Country` label and the `series` code
)

# Put `Country` first, leave the remaining columns in their existing order.
cols = ['Country'] + [c for c in df.columns if c != 'Country']
df = df[cols].sort_values(by='Country').reset_index(drop=True)

# Create the output folder first so the notebook also runs on a fresh checkout;
# to_csv raises OSError when the target directory does not exist.
Path("data").mkdir(parents=True, exist_ok=True)
df.to_csv("data/wdi_edu.csv", index=False)


## Cleaning

In [4]:
# Indicator code -> short label (education level, optionally split by sex).
mapping = {
    "SE.PRM.ENRR": "primary",
    "SE.PRM.ENRR.FE": "primary_female",
    "SE.PRM.ENRR.MA": "primary_male",
    "SE.SEC.ENRR": "secondary",
    "SE.SEC.ENRR.FE": "secondary_female",
    "SE.SEC.ENRR.MA": "secondary_male",
    "SE.TER.ENRR": "tertiary",
    "SE.TER.ENRR.FE": "tertiary_female",
    "SE.TER.ENRR.MA": "tertiary_male",
}

# dropna() with its defaults removes every ROW that has at least one missing
# value (not columns). The `series` codes are then swapped for the labels above;
# codes without an entry in `mapping` are left unchanged.
df_clean = (
    df.dropna()
    .assign(series=lambda frame: frame["series"].replace(mapping))
)

df_clean.to_csv("data/wdi_edu_clean.csv", index=False)

## Filter for Analysis

In [9]:
# Names kept for the analysis; they are matched against the `Country` label column.
target_regions = [
    "Sub-Saharan Africa",
    "South Asia",
    "Middle East, North Africa, Afghanistan & Pakistan",
    "Latin America & Caribbean",
    "East Asia & Pacific",
    "Europe & Central Asia",
    "North America",
    "World"
]

# Keep only the rows for the target regions and call the label column `region`.
# Names in `target_regions` that do not occur in the data are silently ignored by isin().
is_target = df_clean["Country"].isin(target_regions)
df_filtered = df_clean.loc[is_target].rename(columns={"Country": "region"})
df_filtered = df_filtered.sort_values(["region", "series"]).reset_index(drop=True)

# Every column other than the two identifiers is treated as a per-year value column.
value_cols = [col for col in df_filtered.columns if col not in ["region", "series"]]

# Wide -> long: one row per (region, series, year).
# Year column labels carry a "YR" prefix, which is stripped before casting to int.
df_long = (
    df_filtered
    .melt(
        id_vars=["region", "series"],
        value_vars=value_cols,
        var_name="year",
        value_name="value",
    )
    .assign(year=lambda frame: frame["year"].str.replace("YR", "").astype(int))
    .sort_values(["region", "series", "year"])
)

df_long.to_csv("data/wdi_edu_filtered_long.csv", index=False)