# Imports & setup

In [29]:
import os
import json
from datetime import datetime
import requests
import pandas as pd

# Root of the WHO GHO OData API; every endpoint URL below is built from it
BASE_URL = "https://ghoapi.azureedge.net/api"
PROJECT_ROOT = ".."  # adjust if needed

# Raw API snapshots and processed CSVs live in sibling folders under data/
RAW_DIR = os.path.join(PROJECT_ROOT, "data", "raw")
PROCESSED_DIR = os.path.join(PROJECT_ROOT, "data", "processed")
os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(PROCESSED_DIR, exist_ok=True)

# One shared HTTP session so every request carries the same User-Agent
session = requests.Session()
session.headers.update({"User-Agent": "who-air-pollution-portfolio/1.0 (data-collection)"})

print("Folders ready:", RAW_DIR, PROCESSED_DIR)


Folders ready: ../data/raw ../data/processed


In [2]:
# Cell 2 — Helper: robust GET + pagination for WHO OData

def who_get(url: str, params: dict | None = None, timeout: int = 60) -> dict:
    r = session.get(url, params=params, timeout=timeout)
    r.raise_for_status()
    return r.json()


def fetch_odata_all(endpoint: str, params: dict | None = None, page_size: int = 1000, max_pages: int = 2000) -> list[dict]:
    """
    Fetch all rows from a WHO GHO OData endpoint using $top and $skip pagination.
    endpoint: full URL like f"{BASE_URL}/Indicator" or f"{BASE_URL}/<INDICATOR_CODE>"
    """
    params = dict(params or {})
    params.setdefault("$top", page_size)

    all_rows = []
    skip = 0
    pages = 0

    while True:
        pages += 1
        if pages > max_pages:
            raise RuntimeError("Too many pages — stopping to avoid infinite loop.")

        params["$skip"] = skip
        payload = who_get(endpoint, params=params)
        rows = payload.get("value", [])

        all_rows.extend(rows)

        # stop condition
        if len(rows) < page_size:
            break

        skip += page_size

    return all_rows


In [3]:
# Cell 3 — Search the indicator catalog to find the RIGHT indicator codes
# Tip: run this first, inspect results, pick your indicator(s)

def search_indicators(keyword: str, top: int = 50) -> pd.DataFrame:
    """
    Search Indicator catalog for keyword in IndicatorName.

    keyword: text to look for; single quotes are escaped for the OData filter.
    top: sent as "$top" and (capped at 1000) used as the page length handed to
        the paginating helper, so the total number of rows returned can
        exceed `top` when more indicators match.

    Returns a DataFrame with columns IndicatorCode and IndicatorName, sorted by
    IndicatorName with a fresh 0..n-1 index. When nothing matches, an empty
    DataFrame with those two columns is returned.
    """
    # OData function 'contains' works on many OData services; if it fails,
    # you can fallback to pulling a bigger list and filtering in pandas.
    # Inside an OData string literal a single quote is escaped by doubling it.
    safe_keyword = keyword.replace("'", "''")
    params = {
        "$filter": f"contains(IndicatorName,'{safe_keyword}')",
        "$select": "IndicatorCode,IndicatorName",
        "$top": top
    }
    rows = fetch_odata_all(f"{BASE_URL}/Indicator", params=params, page_size=min(top, 1000))

    # An empty row list yields a column-less frame, on which sort_values would
    # raise KeyError; return a correctly-shaped empty result instead.
    if not rows:
        return pd.DataFrame(columns=["IndicatorCode", "IndicatorName"])

    return pd.DataFrame(rows).sort_values("IndicatorName").reset_index(drop=True)

# Example catalog lookups; the last expression is shown as the cell output
df_pm = search_indicators(keyword="particulate", top=50)
df_air = search_indicators(keyword="air pollution", top=50)
df_air

Unnamed: 0,IndicatorCode,IndicatorName
0,AIR_10,Ambient air pollution attributable DALYs per ...
1,AIR_6,Ambient air pollution attributable deaths per...
2,AIR_7,Ambient air pollution attributable DALYs
3,AIR_43,Ambient air pollution attributable DALYs
4,AIR_9,Ambient air pollution attributable DALYs (per...
5,AIR_90,Ambient air pollution attributable DALYs (per...
6,AIR_8,Ambient air pollution attributable DALYs in c...
7,AIR_71,Ambient air pollution attributable DALYs in ch...
8,AIR_73,Ambient air pollution attributable YLL in chil...
9,AIR_45,Ambient air pollution attributable YLLs


In [4]:
# WHO GHO indicator codes downloaded by this notebook; each code is appended
# to BASE_URL to form the endpoint URL.
PM25_CODE = "SDGPM25"  # presumably ambient PM2.5 concentration — confirm in the Indicator catalog
DEATH_RATE_CODE = "AIR_5"  # presumably an air-pollution-attributable death rate — confirm in the catalog

In [5]:
def download_indicator(indicator_code: str) -> list[dict]:
    """
    Download every row of one indicator.

    The endpoint is BASE_URL followed by "/" and `indicator_code`; rows are
    pulled through `fetch_odata_all` with no extra query options and a page
    size of 1000, and returned as that helper delivers them.
    """
    endpoint = "{}/{}".format(BASE_URL, indicator_code)
    rows = fetch_odata_all(endpoint, params={}, page_size=1000)
    return rows

def save_raw(rows: list[dict], name: str) -> str:
    """
    Write `rows` as pretty-printed UTF-8 JSON into RAW_DIR and return the file path.

    rows: JSON-serialisable records to store.
    name: file-name prefix; the file is named "<name>_<YYYYmmdd_HHMMSS>.json",
        where the timestamp is the current time in UTC.

    Non-ASCII characters are written as-is (ensure_ascii=False).
    """
    # Local import: the notebook's import cell only brings in `datetime` itself.
    from datetime import timezone

    # datetime.utcnow() is deprecated; an aware UTC "now" formats to the same string.
    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    path = os.path.join(RAW_DIR, f"{name}_{ts}.json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(rows, f, ensure_ascii=False, indent=2)
    return path

# Download every row of both indicators (network calls; order matters only for
# which request is issued first).
pm25_rows = download_indicator(PM25_CODE)
death_rows = download_indicator(DEATH_RATE_CODE)

# Snapshot the untouched API rows to timestamped JSON files in RAW_DIR so the
# exact download can be inspected later.
pm25_path = save_raw(pm25_rows, PM25_CODE)
death_path = save_raw(death_rows, DEATH_RATE_CODE)

# Quick sanity output: row counts per indicator and where the snapshots went.
print("Downloaded rows:", len(pm25_rows), len(death_rows))
print("Saved to:", pm25_path, death_path)


  ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S")


Downloaded rows: 10750 9258
Saved to: ../data/raw/SDGPM25_20260110_131819.json ../data/raw/AIR_5_20260110_131819.json


In [15]:
def tidy_who_indicator(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert WHO indicator raw DataFrame into a tidy format
    while preserving essential metadata (e.g., SpatialDimType).

    The input frame is not modified. The result keeps, in this order:
    SpatialDim (renamed "country_code"), SpatialDimType (if present),
    TimeDim (renamed "year"), the value column (renamed "value"; taken from
    "NumericValue" when that column exists, otherwise "Value"), and every
    column whose name starts with "Dim".

    "year" and "value" are coerced to numbers (unparseable entries become
    NaN), rows missing country_code, year or value are dropped, and the index
    is reset. "year" is returned as int64 whenever all remaining years are
    whole numbers, so its dtype does not depend on whether bad rows existed.

    Raises KeyError if SpatialDim, TimeDim or the value column is absent.
    """
    # Choose the numeric value column
    value_col = "NumericValue" if "NumericValue" in df.columns else "Value"

    # Fail early with a readable message instead of an opaque KeyError later on
    required = ["SpatialDim", "TimeDim", value_col]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise KeyError(
            f"WHO indicator frame is missing required column(s) {missing}; "
            "the value column is looked up as 'NumericValue', then 'Value'."
        )

    # Base columns we always want (if present)
    base_cols = ["SpatialDim", "SpatialDimType", "TimeDim", value_col]
    keep_cols = [c for c in base_cols if c in df.columns]

    # Keep all dimension columns (Dim1, Dim2, Dim3, ...); non-string labels are skipped
    keep_cols += [c for c in df.columns if isinstance(c, str) and c.startswith("Dim")]

    # Selecting by list and renaming builds a new frame, so `df` stays untouched
    out = df[keep_cols].rename(columns={
        "SpatialDim": "country_code",
        "TimeDim": "year",
        value_col: "value"
    })

    # Clean data types
    out = out.assign(
        year=pd.to_numeric(out["year"], errors="coerce"),
        value=pd.to_numeric(out["value"], errors="coerce"),
    )

    # Drop rows missing core fields
    out = out.dropna(subset=["country_code", "year", "value"]).reset_index(drop=True)

    # Coercion turns the column into float64 as soon as one year was invalid;
    # restore an integer dtype when no information would be lost.
    if (out["year"] % 1 == 0).all():
        out["year"] = out["year"].astype("int64")

    return out



# Where the processed CSVs go
pm25_csv = os.path.join(PROCESSED_DIR, "pm25_tidy.csv")
death_csv = os.path.join(PROCESSED_DIR, "air_pollution_death_rate_tidy.csv")

# Raw API rows → DataFrame
pm25_df = pd.DataFrame(pm25_rows)
death_df = pd.DataFrame(death_rows)

# DataFrame → tidy frame
pm25_tidy = pm25_df.pipe(tidy_who_indicator)
death_tidy = death_df.pipe(tidy_who_indicator)

# Write both tidy frames without the index column
pm25_tidy.to_csv(pm25_csv, index=False)
death_tidy.to_csv(death_csv, index=False)

print("Saved:", pm25_csv, death_csv)
print(pm25_tidy.head(), death_tidy.head(), sep="\n")


Saved: ../data/processed/pm25_tidy.csv ../data/processed/air_pollution_death_rate_tidy.csv
  country_code SpatialDimType  year     value           Dim1Type  \
0          AFG        COUNTRY  2010  61.81464  RESIDENCEAREATYPE   
1          HRV        COUNTRY  2019  15.48858  RESIDENCEAREATYPE   
2          LBN        COUNTRY  2013  24.41514  RESIDENCEAREATYPE   
3          URY        COUNTRY  2019   8.23782  RESIDENCEAREATYPE   
4          LBN        COUNTRY  2015  22.72126  RESIDENCEAREATYPE   

                     Dim1 Dim2Type  Dim2 Dim3Type  Dim3  
0   RESIDENCEAREATYPE_RUR     None  None     None  None  
1   RESIDENCEAREATYPE_URB     None  None     None  None  
2  RESIDENCEAREATYPE_CITY     None  None     None  None  
3   RESIDENCEAREATYPE_RUR     None  None     None  None  
4   RESIDENCEAREATYPE_RUR     None  None     None  None  
                   country_code SpatialDimType  year    value Dim1Type  \
0                            30       UNREGION  2019  0.05425      SEX   
1   

In [16]:
# Keep one PM2.5 value per country and year:
#   - SpatialDimType == "COUNTRY" drops any non-country rows, matching the
#     death-rate filter, so "country_code" really holds country codes only;
#   - Dim1 == "RESIDENCEAREATYPE_TOTL" keeps a single residence-area category
#     (presumably the all-areas total — confirm against WHO dimension metadata).
pm25_f = (
    pm25_tidy
    .loc[
        (pm25_tidy["SpatialDimType"] == "COUNTRY")
        & (pm25_tidy["Dim1"] == "RESIDENCEAREATYPE_TOTL"),
        ["country_code", "year", "value"],
    ]
    .rename(columns={"value": "pm25"})
)

print("PM2.5 filtered shape:", pm25_f.shape)
pm25_f.head()

PM2.5 filtered shape: (2210, 3)


Unnamed: 0,country_code,year,pm25
7,COG,2019,29.48397
22,NPL,2018,40.05945
23,VEN,2019,16.21392
24,EST,2014,7.84577
26,BFA,2016,43.05181


In [17]:
# Inspect column dtypes and non-null counts of the tidy frame built from
# DEATH_RATE_CODE, to see which Dim* columns are populated before filtering.
death_tidy.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9258 entries, 0 to 9257
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   country_code    9258 non-null   object 
 1   SpatialDimType  9258 non-null   object 
 2   year            9258 non-null   int64  
 3   value           9258 non-null   float64
 4   Dim1Type        9258 non-null   object 
 5   Dim1            9258 non-null   object 
 6   Dim2Type        9258 non-null   object 
 7   Dim2            9258 non-null   object 
 8   Dim3Type        5964 non-null   object 
 9   Dim3            5964 non-null   object 
dtypes: float64(1), int64(1), object(8)
memory usage: 723.4+ KB


In [18]:
# Frequency of each Dim3 category (at most the 30 most common), to see which
# codes that dimension holds
death_tidy["Dim3"].value_counts().iloc[:30]


Dim3
ENVCAUSE_ENVCAUSE039    1512
ENVCAUSE_ENVCAUSE114    1092
ENVCAUSE_ENVCAUSE118    1092
ENVCAUSE_ENVCAUSE068    1092
ENVCAUSE_ENVCAUSE113    1092
ENVCAUSE_ENVCAUSE000      84
Name: count, dtype: int64

In [26]:
# Narrow the death-rate frame to one row per country and year: country-level
# rows only, Dim1 "SEX_BTSX" (presumably both sexes combined — confirm) and
# Dim2 "ENVCAUSE_ENVCAUSE000" (presumably the all-causes total — confirm),
# then keep just the join keys and the value under a descriptive name.
death_f = (
    death_tidy
    .loc[
        death_tidy["SpatialDimType"].eq("COUNTRY")
        & death_tidy["Dim1"].eq("SEX_BTSX")
        & death_tidy["Dim2"].eq("ENVCAUSE_ENVCAUSE000"),
        ["country_code", "year", "value"],
    ]
    .rename(columns={"value": "death_rate"})
)

print("Death-rate filtered shape:", death_f.shape)
death_f.head()


Death-rate filtered shape: (183, 3)


Unnamed: 0,country_code,year,death_rate
24,SLB,2019,28.8246
90,FSM,2019,32.6839
143,COL,2019,25.98715
175,TGO,2019,39.35098
245,QAT,2019,15.12658


In [28]:
# Inner join: only (country_code, year) pairs present in both frames survive
merged = pd.merge(pm25_f, death_f, how="inner", on=["country_code", "year"])

print("Merged shape:", merged.shape)
merged.head()


Merged shape: (183, 4)


Unnamed: 0,country_code,year,pm25,death_rate
0,COG,2019,29.48397,26.246
1,VEN,2019,16.21392,30.17941
2,TUR,2019,23.25106,42.32768
3,FSM,2019,7.78563,32.6839
4,KOR,2019,24.03774,37.64416
