# NO2 and economic activity model

## Addis-Ababa Random Forest + SHAP
Here we load **all 730** daily meshes for Addis, build lag & neighbor features, train a global RF, and visualize SHAP.


#### Imports, constants & helpers

In [2]:
import sys
from pathlib import Path

import pandas as pd
import geopandas as gpd
import numpy as np
import shap

from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Make the project's src/ package directory importable from this notebook.
# The repository root is taken to be the parent of the current working directory.
CURR_PATH = Path().resolve()
REPO_PATH = CURR_PATH.parent
sys.path.append(str(REPO_PATH.joinpath("src")))

# our feature-engineering helpers
from feature_engineering import load_mesh_series, make_lag_features, NeighborAggregator

# Constants
# Folder with the daily Addis Ababa mesh files. Prefer the copy that lives inside
# this repository (<repo>/data/Populated meshes/addis-mesh-data) so the notebook
# runs on any machine; fall back to the original author's absolute Windows path
# only when the repo-relative folder is not present.
_REPO_ADDIS_FOLDER = REPO_PATH / "data" / "Populated meshes" / "addis-mesh-data"
_LEGACY_ADDIS_FOLDER = Path(
    r"C:\Users\Luis.ParraMorales\AirPollution_Analysis"
    r"\air-pollution-mobility-research-project\data"
    r"\Populated meshes\addis-mesh-data"
)
ADDIS_FOLDER = _REPO_ADDIS_FOLDER if _REPO_ADDIS_FOLDER.is_dir() else _LEGACY_ADDIS_FOLDER

NLAGS   = 7     # number of autoregressive lag days; cut in half for speed, can tune
K_NEIGH = 8     # number of neighbours


#### Load & build lag features

In [3]:
# Read every daily mesh found under ADDIS_FOLDER into a single frame.
gdf = load_mesh_series(ADDIS_FOLDER)
print(f"Total rows: {len(gdf)}")

Total rows: 399126


### Create autoregressive lags 1…`NLAGS` days (`NLAGS = 7`)


In [4]:
# Build the no2_mean lag columns, then keep only rows where every lag 1..NLAGS is present.
df = (
    make_lag_features(gdf, nlags=NLAGS)
    .dropna(subset=[f"no2_mean_lag{k}" for k in range(1, NLAGS + 1)])
)
print(f"Shape after lags: {df.shape}")

Shape after lags: (341292, 11)


#### Vectorised neighbour aggregation

In [6]:
# --- Static layer: one row per mesh cell (id + geometry) ---------------------
static = gdf.drop_duplicates(["geom_id"])[["geom_id", "geometry"]]

# --- Fit the neighbour index on the static cells ------------------------------
neighborer = NeighborAggregator(k=K_NEIGH, id_col="geom_id")
neighborer.fit(static.reset_index(), None)

# Names of the autoregressive lag columns produced earlier.
lag_cols = [f"no2_mean_lag{i}" for i in range(1, NLAGS + 1)]

# --- Every (cell, date) combination present in the lagged frame ---------------
df_center = df[["geom_id", "date"]].drop_duplicates()

# --- Edge list: each centre id repeated K_NEIGH times next to its neighbours --
# assumes neighborer.neigh_idx has K_NEIGH columns of positions into ids_ — TODO confirm
edges = pd.DataFrame(
    {
        "geom_id": np.repeat(neighborer.ids_, K_NEIGH),
        "neigh_id": neighborer.ids_[neighborer.neigh_idx.ravel()],
    }
)

# --- Attach every observed date of the centre cell to each of its edges -------
edges_date = df_center.merge(edges, on="geom_id")

# --- Look up the neighbour's own lag values on the same date ------------------
df_lags = df[["geom_id", "date"] + lag_cols]

# Renaming the lookup key to "neigh_id" lets the merge match on the neighbour
# without producing geom_id_x / geom_id_y suffix columns; the centre id stays
# in "geom_id" and the neighbour id is dropped once the lags are attached.
df_nei = edges_date.merge(
    df_lags.rename(columns={"geom_id": "neigh_id"}),
    on=["neigh_id", "date"],
    how="left",
).drop(columns="neigh_id")

# --- Average the neighbours' lags per centre cell and date --------------------
neigh_feats = (
    df_nei
    .groupby(["geom_id", "date"])[lag_cols]
    .mean()
    .add_prefix("neigh_")
    .reset_index()
)

# --- Join the neighbour averages back onto the lagged frame -------------------
df_full = df.merge(neigh_feats, on=["geom_id", "date"], how="left")
print(f"Shape with neighbour feats: {df_full.shape}")


Shape with neighbour feats: (341292, 18)


#### Train/test split

In [7]:
# Temporal hold-out: rows dated before 2024 train the model, 2024 onward test it.
df_full["year"] = df_full["date"].dt.year

# Only rows with an observed target can be used on either side of the split.
labelled = df_full.dropna(subset=["no2_mean"])
train = labelled[labelled["year"] < 2024]
test  = labelled[labelled["year"] >= 2024]

# Everything except the target and the bookkeeping columns is a feature.
# NOTE(review): geom_id is not in drop_cols, so the raw cell id is fed to the
# model as a numeric feature — confirm this is intended.
drop_cols = ["no2_mean", "geometry", "date", "year"]
X_cols = list(filter(lambda col: col not in drop_cols, train.columns))

X_train = train[X_cols]
y_train = train["no2_mean"]
X_test = test[X_cols]
y_test = test["no2_mean"]

print(f"Num features: {len(X_cols)}")
print(X_train.dtypes.value_counts())

Num features: 15
float64    14
int64       1
Name: count, dtype: int64


### Random Forest Pipeline

In [8]:
# Two-stage model: standardise the features, then fit a random forest on them.
rf_params = dict(
    n_estimators=200,
    max_depth=15,
    n_jobs=-1,
    random_state=42,
    oob_score=True,   # keep the out-of-bag R² as an in-sample generalisation estimate
)
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("rf", RandomForestRegressor(**rf_params)),
    ]
)

pipeline.fit(X_train, y_train)

# OOB score comes from the forest step; the test score from the whole pipeline.
print(f"OOB R²: {pipeline.named_steps['rf'].oob_score_}")
print(f"Test R²: {pipeline.score(X_test, y_test)}")

OOB R²: 0.72995726580713
Test R²: 0.17453496300736593


### Fast SHAP on a sub-sample

In [None]:
# Explain a reproducible sub-sample of the test set so the cell stays fast.
SHAP_MAX_ROWS = 2000
X_shap = X_test.sample(n=min(len(X_test), SHAP_MAX_ROWS), random_state=0)

# The forest inside the pipeline was fitted on StandardScaler output, so its split
# thresholds live in scaled space. Feed SHAP the scaled features; keep the raw
# frame only for labelling/colouring the plots in original units.
X_shap_scaled = pd.DataFrame(
    pipeline.named_steps["scaler"].transform(X_shap),
    columns=X_shap.columns,
    index=X_shap.index,
)

explainer = shap.TreeExplainer(pipeline.named_steps["rf"])
shap_values = explainer.shap_values(X_shap_scaled)

# bar plot of mean |SHAP| per feature
# (shap.plots.bar needs an Explanation object, not the explainer itself, so the
#  array-based summary_plot API is used for both figures)
shap.summary_plot(shap_values, X_shap, plot_type="bar", max_display=15)
# beeswarm
shap.summary_plot(shap_values, X_shap, max_display=15)


### Dependence plot for top-2 features, and mapping one SHAP feature back to space


In [None]:
# pick background & explain sets (capped at the available row counts)
X_bg   = X_train.sample(min(len(X_train), 1000), random_state=0)
X_expl = X_test.sample(min(len(X_test), 800),   random_state=1)

# The forest was trained on StandardScaler output, so both the background and the
# explained rows must be expressed in that scaled space before SHAP sees them.
# X_bg / X_expl themselves stay in raw units for plotting and for later cells.
_scaler = pipeline.named_steps["scaler"]
X_bg_scaled = pd.DataFrame(
    _scaler.transform(X_bg), columns=X_bg.columns, index=X_bg.index
)
X_expl_scaled = pd.DataFrame(
    _scaler.transform(X_expl), columns=X_expl.columns, index=X_expl.index
)

explainer = shap.TreeExplainer(
    pipeline.named_steps["rf"],
    data=X_bg_scaled,
    feature_perturbation="interventional"   # attributions relative to the background sample
)

# plain (n_rows, n_features) array — kept under this name for downstream cells
shap_vals = explainer.shap_values(X_expl_scaled)

# shap.plots.* require an Explanation object; wrap the array together with the
# raw-unit feature values so the beeswarm colouring is interpretable.
explanation = shap.Explanation(
    values=shap_vals,
    base_values=np.full(len(X_expl), np.ravel(explainer.expected_value)[0]),
    data=X_expl.to_numpy(),
    feature_names=list(X_expl.columns),
)

# global plots
shap.plots.bar(explanation, max_display=15)
shap.plots.beeswarm(explanation, max_display=15)

# dependence for top-2 (ranked by mean |SHAP|, most important last)
mean_abs = np.abs(shap_vals).mean(axis=0)
top2 = np.array(X_expl.columns)[np.argsort(mean_abs)[-2:]]
for feat in top2:
    shap.dependence_plot(feat, shap_vals, X_expl)

# map mean SHAP of the top feature
# shap_df holds SHAP values only; the cell id is used as an external grouping key
# so the SHAP column of the geom_id feature is not overwritten by the ids.
shap_df = pd.DataFrame(shap_vals, columns=X_expl.columns, index=X_expl.index)
top_feat = top2[-1]
shap_col = f"shap_{top_feat}"   # distinct name: stays valid even if top_feat is "geom_id"
mean_shap = (
    pd.DataFrame({
        "geom_id": test.loc[X_expl.index, "geom_id"].to_numpy(),
        shap_col: shap_df[top_feat].to_numpy(),
    })
    .groupby("geom_id", as_index=False)[shap_col]
    .mean()
)

map_gdf = static.merge(mean_shap, on="geom_id")
map_gdf.plot(column=shap_col, legend=True, cmap="plasma")


### Approximate elasticities from SHAP


In [None]:
# compute elasticities on the SAME subsample we explained (X_expl / shap_vals)
X_sub = X_expl.copy()
y_pred = pipeline.predict(X_sub)
dxs    = X_sub.quantile(0.75) - X_sub.quantile(0.25)   # per-feature inter-quartile range

# Work on plain NumPy arrays: a pandas Series cannot be indexed with [None, :],
# and explicit arrays make the broadcasting unambiguous.
#   shap_vals is (n_rows, n_features)
#   y_pred    is (n_rows,)
#   x_arr     is (n_rows, n_features)
#   iqr       is (n_features,)
x_arr = X_sub.to_numpy(dtype=float)
iqr   = dxs.to_numpy(dtype=float)

# rel = (SHAP / prediction) * (x / IQR); a zero IQR or a zero prediction gives
# inf/NaN, which is masked so it does not distort the summary statistics.
with np.errstate(divide="ignore", invalid="ignore"):
    rels = (shap_vals / y_pred[:, None]) * (x_arr / iqr[None, :])
rels[~np.isfinite(rels)] = np.nan

elasticities = pd.DataFrame({
    "feature": X_sub.columns,
    "median": np.nanmedian(rels, axis=0),
    "p10":     np.nanpercentile(rels, 10, axis=0),
    "p90":     np.nanpercentile(rels, 90, axis=0)
})
elasticities.sort_values("median", ascending=False).head(10)
