# Pull data from World Bank WDI and save to local CSV

In [1]:
"""
Pull data from World Bank WDI and save to local CSV

Dependencies:
    pip install wbgapi pandas
"""

import wbgapi as wb
import pandas as pd
import os

def fetch_and_save_wdi(
    indicators: list,
    countries: list = None,
    years: list = None,
    output_dir: str = "data/raw",
    output_filename: str = "wdi_data.csv"
):
    """
    Fetch World Bank WDI data through wbgapi and save it as a CSV file.

    Parameters:
        indicators: List of indicator codes, e.g. ["NY.GDP.PCAP.KD", "SL.EMP.TOTL.SP.ZS", "NY.GDP.MKTP.KD.ZG"]
        countries: List of economy codes, e.g. ["USA", "CHN", "IND"]; None requests every available economy
        years: List of years, e.g. list(range(2000, 2024)); None means 1960 through the current calendar year
        output_dir: Local save directory (created if it does not exist)
        output_filename: Output file name

    The CSV is written to os.path.join(output_dir, output_filename) with a
    UTF-8 BOM ("utf-8-sig") and without the pandas index column.
    """
    os.makedirs(output_dir, exist_ok=True)

    if countries is None:
        # wb.economy.list() yields economy records rather than plain codes, so
        # it cannot be passed as the economy selector. "all" is wbgapi's own
        # selector for every economy.
        countries = "all"

    # If years not specified, use 1960 through the current calendar year
    if years is None:
        years = list(range(1960, pd.Timestamp.now().year + 1))

    # Fetch data. With wbgapi's default layout the frame is wide: the index
    # holds the economy code (plus the series code when several indicators are
    # requested), each requested year is its own YRxxxx column, and
    # labels=True adds human-readable name columns.
    df = wb.data.DataFrame(
        indicators,
        economy=countries,
        time=years,
        labels=True
    )

    # Turn the index levels into regular columns. "economy" becomes "country";
    # the "time" mapping only applies if a time column exists and is a no-op
    # for the wide layout.
    df = df.reset_index().rename(columns={"economy": "country", "time": "year"})

    # Save as CSV (no index column)
    output_path = os.path.join(output_dir, output_filename)
    df.to_csv(output_path, index=False, encoding="utf-8-sig")

    print(f"Saved WDI data to {output_path}")

if __name__ == "__main__":
    # Economies to download
    countries = ["USA", "CHN", "IND"]

    # Years 2000 through 2023 (the range end is exclusive)
    years = [*range(2000, 2024)]

    # WDI indicator codes to download
    indicators = ["NY.GDP.PCAP.KD", "SL.EMP.TOTL.SP.ZS", "NY.GDP.MKTP.KD.ZG"]

    # Download the selection and write it to the raw-data folder
    fetch_and_save_wdi(
        indicators,
        countries,
        years,
        output_dir="../data/raw",
        output_filename="economic_dev_2000_2023.csv",
    )

Saved WDI data to ../data/raw/economic_dev_2000_2023.csv


# Data cleaning

In [2]:
"""
Use DuckDB in Python to perform SQL cleaning/unpivoting of WDI wide table, and export long-format CSV.
Dependencies:
    pip install duckdb pandas
"""

import duckdb
import pandas as pd
import os

# Input (wide CSV written by the download step), output (long CSV) and the
# DuckDB database file used as scratch storage.
RAW_CSV        = "../data/raw/economic_dev_2000_2023.csv"
OUTPUT_CSV     = "../data/clean/wdi_long_clean.csv"
DB_FILE        = "../data/tmp/wdi.duckdb"

# Year columns of the wide table (YR2000 ... YR2023), built once so the SELECT
# list and the UNPIVOT list cannot drift apart.
YEAR_COLUMNS = ", ".join(f"YR{year}" for year in range(2000, 2024))

os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
os.makedirs(os.path.dirname(DB_FILE), exist_ok=True)

# 1) Connect to DuckDB (file storage), or create in-memory database with ":memory:"
con = duckdb.connect(database=DB_FILE, read_only=False)

# try/finally guarantees the database file is released even if a query or the
# CSV export raises.
try:
    # 2) Read CSV into a DuckDB table raw_wdi, renaming the code/name columns
    #    to avoid case conflicts in SELECT
    con.execute(f"""
    CREATE OR REPLACE TABLE raw_wdi AS
    SELECT
        country        AS country_code,
        series         AS indicator_code,
        "Country"      AS country_name,
        "Series"       AS indicator_name,
        {YEAR_COLUMNS}
    FROM read_csv_auto('{RAW_CSV}');
    """)

    # 3) Use SQL UNPIVOT to transform wide table to long table, filter out NULLs;
    #    the year is recovered by stripping the "YR" prefix from the column name
    con.execute(f"""
    CREATE OR REPLACE TABLE clean_wdi AS
    SELECT
        country_code,
        indicator_code,
        country_name,
        indicator_name,
        CAST(REPLACE(col, 'YR', '') AS INTEGER) AS year,
        value
    FROM raw_wdi
    UNPIVOT (
        value FOR col IN (
            {YEAR_COLUMNS}
        )
    )
    WHERE value IS NOT NULL
    ORDER BY country_code, indicator_code, year;
    """)

    # 4) Export cleaned long table to local CSV
    df_clean = con.execute("SELECT * FROM clean_wdi").df()
    df_clean.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
finally:
    con.close()

print("Clean data saved to", OUTPUT_CSV)

Clean data saved to ../data/clean/wdi_long_clean.csv
