Imports & setup

In [3]:
import os
import json
from datetime import datetime
import requests
import pandas as pd

# Root URL of the WHO GHO OData API; endpoint paths are appended to it.
BASE_URL = "https://ghoapi.azureedge.net/api"  # WHO GHO OData API
# Relative path to the project root; it is resolved against the current
# working directory of the kernel, so adjust if the notebook is run elsewhere.
PROJECT_ROOT = ".."  # adjust if needed

# Data folders under <PROJECT_ROOT>/data.
# NOTE(review): presumably raw API downloads vs. cleaned outputs — confirm against later cells.
RAW_DIR = os.path.join(PROJECT_ROOT, "data", "raw")
PROCESSED_DIR = os.path.join(PROJECT_ROOT, "data", "processed")

# exist_ok=True makes this cell safe to re-run when the folders already exist.
os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(PROCESSED_DIR, exist_ok=True)

# One shared HTTP session for the whole notebook; every request sent through it
# carries the User-Agent header set below.
session = requests.Session()
session.headers.update({
    "User-Agent": "who-air-pollution-portfolio/1.0 (data-collection)"
})

# Echo the resolved folder paths as a quick sanity check.
print("Folders ready:", RAW_DIR, PROCESSED_DIR)


Folders ready: ../data/raw ../data/processed


In [4]:
# Cell 2 — Helper: robust GET + pagination for WHO OData

def who_get(url: str, params: dict | None = None, timeout: int = 60) -> dict:
    r = session.get(url, params=params, timeout=timeout)
    r.raise_for_status()
    return r.json()

def fetch_odata_all(endpoint: str, params: dict | None = None, page_size: int = 1000, max_pages: int = 2000) -> list[dict]:
    """
    Fetch all rows from a WHO GHO OData endpoint using $top and $skip pagination.
    endpoint: full URL like f"{BASE_URL}/Indicator" or f"{BASE_URL}/<INDICATOR_CODE>"
    """
    params = dict(params or {})
    params.setdefault("$top", page_size)

    all_rows = []
    skip = 0
    pages = 0

    while True:
        pages += 1
        if pages > max_pages:
            raise RuntimeError("Too many pages — stopping to avoid infinite loop.")

        params["$skip"] = skip
        payload = who_get(endpoint, params=params)
        rows = payload.get("value", [])

        all_rows.extend(rows)

        # stop condition
        if len(rows) < page_size:
            break

        skip += page_size

    return all_rows


In [5]:
# Cell 3 — Search the indicator catalog to find the RIGHT indicator codes
# Tip: run this first, inspect results, pick your indicator(s)

def search_indicators(keyword: str, top: int = 50) -> pd.DataFrame:
    """
    Search Indicator catalog for keyword in IndicatorName.

    keyword: text to look for; it is embedded in an OData `contains(...)` filter.
        Single quotes in the keyword are escaped so they cannot break the filter.
    top: value sent as "$top" and used as the page size (capped at 1000) for
        the paginated fetch. Further pages are requested while pages come back
        full, so this is a per-request size rather than a cap on the total.

    Returns a DataFrame of the matching rows sorted by IndicatorName with a
    fresh 0..n-1 index. When nothing matches, an empty DataFrame with the
    columns IndicatorCode and IndicatorName is returned instead of raising.
    """
    # OData function 'contains' works on many OData services; if it fails,
    # you can fallback to pulling a bigger list and filtering in pandas.

    # Inside an OData string literal a single quote is written as two quotes.
    escaped_keyword = keyword.replace("'", "''")
    params = {
        "$filter": f"contains(IndicatorName,'{escaped_keyword}')",
        "$select": "IndicatorCode,IndicatorName",
        "$top": top
    }
    rows = fetch_odata_all(f"{BASE_URL}/Indicator", params=params, page_size=min(top, 1000))

    # With no rows the frame has no columns, so sorting by name would raise KeyError.
    if not rows:
        return pd.DataFrame(columns=["IndicatorCode", "IndicatorName"])

    return pd.DataFrame(rows).sort_values("IndicatorName").reset_index(drop=True)

# Try a few searches:
# Each call queries the Indicator catalog for one keyword via search_indicators
# and keeps the resulting DataFrame under its own name for later inspection.
df_pm = search_indicators("particulate", top=50)
df_air = search_indicators("air pollution", top=50)
# The trailing tuple is the cell's displayed value: the first 20 rows of each result.
df_pm.head(20), df_air.head(20)


(  IndicatorCode                                      IndicatorName
 0       SDGPM25  Concentrations of fine particulate matter (PM2.5)
 1         OCC_6  Occupational airborne particulates attributabl...
 2         OCC_8  Occupational airborne particulates attributabl...
 3         OCC_5  Occupational airborne particulates attributabl...
 4         OCC_7  Occupational airborne particulates attributabl...,
    IndicatorCode                                      IndicatorName
 0         AIR_10  Ambient air pollution  attributable DALYs per ...
 1          AIR_6  Ambient air pollution  attributable deaths per...
 2          AIR_7           Ambient air pollution attributable DALYs
 3         AIR_43           Ambient air pollution attributable DALYs
 4          AIR_9  Ambient air pollution attributable DALYs  (per...
 5         AIR_90  Ambient air pollution attributable DALYs  (per...
 6          AIR_8  Ambient air pollution attributable DALYs  in c...
 7         AIR_71  Ambient air pollutio