# NO2 and economic activity model

## Addis-Ababa Random Forest + SHAP
Here we load **all 730** daily meshes for Addis, build lag & neighbor features, train a global RF, and visualize SHAP.


#### Imports, constants & helpers

In [14]:
import sys
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap

from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# bring src/ into path
CURR_PATH = Path().resolve()
REPO_PATH = CURR_PATH.parent
sys.path.append(str(REPO_PATH / "src"))

# our feature-engineering helpers
from feature_engineering import load_mesh_series, make_lag_features, NeighborAggregator
from feature_engineering import (
    train_rf_pipeline,
    evaluate_model,
    explain_shap,
    plot_shap_dependence,
    compute_elasticities_shap
)

# Extra numeric feature columns requested from load_mesh_series
# (edit this list to add or remove covariates).
FEATURE_COLS = [
    "pop_sum_m", "NTL_mean", "road_len", "poi_count",
    "lu_industrial_area", "lu_commercial_area",
    "lu_residential_area", "lu_retail_area",
    "lu_farmland_area", "lu_farmyard_area",
]

# Constants
# The mesh folder is resolved relative to the repository root (REPO_PATH,
# defined with the path setup above) rather than a user-specific absolute
# Windows path, so the notebook runs on any checkout.
# NOTE(review): assumes the data lives under <repo>/data, as it did in the
# previous hard-coded path — confirm for your checkout.
ADDIS_FOLDER = REPO_PATH / "data" / "Populated meshes" / "addis-mesh-data"
NLAGS   = 7     # number of autoregressive lag days; halved for speed, can tune
K_NEIGH = 8     # number of neighbours


#### Load daily meshes

In [2]:
# Load the full daily mesh series from ADDIS_FOLDER, requesting the
# FEATURE_COLS covariates, and report how many rows came back.
gdf = load_mesh_series(ADDIS_FOLDER, features=FEATURE_COLS)
n_rows = len(gdf)
print("Total rows:", n_rows)

  df = pd.concat(records, ignore_index=True)


Total rows: 399126


### Create autoregressive lags 1…`NLAGS` days (currently `NLAGS = 7`)


In [3]:
# Build the lag columns, then keep only rows where every one of the
# no2_mean_lag1 … no2_mean_lag{NLAGS} values is present.
lag_names = [f"no2_mean_lag{k}" for k in range(1, NLAGS + 1)]
df = make_lag_features(gdf, nlags=NLAGS).dropna(subset=lag_names)
print("Shape after lags:", df.shape)

Shape after lags: (341292, 21)


#### Vectorised neighbour aggregation

In [4]:
# Purpose: for every (geom_id, date) row, average the NO2 lag columns of the
# cell's K_NEIGH neighbours and attach them as neigh_no2_mean_lag* columns.

# One row per mesh cell: its id and geometry.
unique_cells = gdf.drop_duplicates(["geom_id"])
static = unique_cells[["geom_id", "geometry"]]

# Fit the neighbour index on the static geometries.
neighborer = NeighborAggregator(k=K_NEIGH, id_col="geom_id")
neighborer.fit(static.reset_index(), None)

key_cols = ["geom_id", "date"]
lag_cols = [f"no2_mean_lag{i}" for i in range(1, NLAGS + 1)]

# 1) every distinct (cell, day) pair present after the lag step
df_center = df[key_cols].drop_duplicates()

# 2) edge list centre → neighbour
# assumes neigh_idx is laid out as (n_cells, K_NEIGH) in the same row order
# as ids_ — TODO confirm against NeighborAggregator
center_ids = np.repeat(neighborer.ids_, K_NEIGH)
neighbour_ids = neighborer.ids_[neighborer.neigh_idx.ravel()]
edges = pd.DataFrame({"geom_id": center_ids, "neigh_id": neighbour_ids})

# 3) replicate each edge for every date on which its centre cell appears
edges_date = df_center.merge(edges, on="geom_id")

# 4) look up the neighbour's lag values on the same date; renaming the id
#    column up front means the join produces no _x/_y suffixes to clean up
df_lags = df[key_cols + lag_cols]
df_nei = edges_date.merge(
    df_lags.rename(columns={"geom_id": "neigh_id"}),
    on=["neigh_id", "date"],
    how="left",
)

# 5) only the centre id is needed for grouping from here on
df_nei = df_nei.drop(columns="neigh_id")

# 6) mean of each lag over the neighbours of a (centre, date) pair
neigh_feats = (
    df_nei
    .groupby(key_cols)[lag_cols]
    .mean()
    .add_prefix("neigh_")
    .reset_index()
)

# 7) attach the neighbour features to the modelling frame
df_full = df.merge(neigh_feats, on=key_cols, how="left")
print("Shape with neighbour feats:", df_full.shape)


Shape with neighbour feats: (341292, 28)


#### Train/test split

In [5]:
# ─── Train/Test Split ───────────────────────────────────────────────────────
# Temporal hold-out: rows dated before 2024 train the model, rows from 2024
# onwards evaluate it. Rows without a target value are excluded from both.
df_full["year"] = df_full["date"].dt.year

target_col = "no2_mean"
has_target = df_full[target_col].notna()

train = df_full[(df_full["year"] < 2024) & has_target]
test  = df_full[(df_full["year"] >= 2024) & has_target]

# Everything except the target, the geometry and the date bookkeeping columns
# becomes a feature.
# NOTE(review): "geom_id" is not in drop_cols, so it is fed to the model as a
# feature — confirm that is intended.
drop_cols = [target_col, "geometry", "date", "year"]
X_cols    = [col for col in train.columns if col not in drop_cols]

X_train = train[X_cols]
y_train = train[target_col]
X_test  = test[X_cols]
y_test  = test[target_col]

print(f"Training on {len(X_train)} rows; evaluating on {len(X_test)} rows")
print("Num features:", len(X_cols))


Training on 171486 rows; evaluating on 163254 rows
Num features: 25


### Random Forest Pipeline

In [6]:
# Fit the random-forest pipeline on the training split and score it on the
# held-out split.
pipeline = train_rf_pipeline(X_train, y_train)
metrics  = evaluate_model(pipeline, X_test, y_test)

# Labels carry their own padding so the printed values line up.
for label, key in (
    ("OOB R²:   ", "oob_r2"),
    ("Test R²:  ", "test_r2"),
    ("Test RMSE:", "test_rmse"),
):
    print(label, metrics[key])

OOB R²:    0.73713995869615
Test R²:   0.18441785334598337
Test RMSE: 1.9305755239440867e-05


### Fast SHAP on a sub-sample

In [15]:
# Sub-sample both splits so the SHAP computation stays fast.
# The requested sizes are capped at the number of available rows because
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement). With the full dataset the caps are inactive
# and the draws are identical to sample(1000) / sample(800).
N_BACKGROUND = 1000   # rows drawn from the training split
N_EXPLAIN    = 800    # rows drawn from the test split

X_bg   = X_train.sample(min(N_BACKGROUND, len(X_train)), random_state=0)
X_expl = X_test.sample(min(N_EXPLAIN, len(X_test)), random_state=1)

# NOTE(review): the recorded output of this cell is
#   "TypeError: The shap_values argument must be an Explanation object, ..."
# presumably raised by a plotting call inside explain_shap — confirm and fix
# in feature_engineering; the cells below need `explainer` and `shap_vals`.
explainer, shap_vals = explain_shap(pipeline, X_bg, X_expl, max_display=15)

TypeError: The shap_values argument must be an Explanation object, Cohorts object, or dictionary of Explanation objects!

### Dependence plots for the top-2 features, and a map of the mean SHAP value of the top feature


In [None]:
# ─── Dependence Plots for Top-2 Features ─────────────────────────────────────
plot_shap_dependence(explainer, shap_vals, X_expl, top_k=2)

# ─── Map Mean SHAP of Primary Driver ────────────────────────────────────────
# Work on a plain ndarray. If shap_vals exposes `.values` (as shap Explanation
# objects do) use that, otherwise treat it as array-like.
# assumes the result is (n_rows, n_features), aligned with X_expl — TODO confirm
shap_arr = np.asarray(getattr(shap_vals, "values", shap_vals))

# pick the single top feature by mean absolute SHAP value
mean_abs = np.abs(shap_arr).mean(axis=0)
feat_idx = int(np.argmax(mean_abs))
top_feat = X_expl.columns[feat_idx]

shap_df = pd.DataFrame(shap_arr, columns=X_expl.columns, index=X_expl.index)

# Group by the cell ids taken from `test` as a separate, index-aligned Series.
# Writing them into shap_df["geom_id"] would overwrite the SHAP column of the
# "geom_id" feature (it is part of X_cols) and break the aggregation whenever
# that feature happens to be the top one.
geom_ids = test.loc[X_expl.index, "geom_id"]
mean_shap = (
    shap_df[top_feat]
    .groupby(geom_ids)
    .mean()
    .rename("mean_shap")
    .rename_axis("geom_id")
    .reset_index()
)

# Join the per-cell mean back onto the geometries and draw the choropleth.
map_gdf = static.merge(mean_shap, on="geom_id")
fig, ax = plt.subplots()
map_gdf.plot(column="mean_shap", legend=True, cmap="plasma", ax=ax)
ax.set_title(f"Mean SHAP: {top_feat}")
plt.show()

### Approximate elasticities from SHAP


In [None]:
# ─── Approximate Elasticities from SHAP ──────────────────────────────────────
# Compute the elasticity table for the explained sample and show its first
# ten rows.
elas_df = compute_elasticities_shap(pipeline, X_expl, shap_vals)
top_elasticities = elas_df.head(10)
display(top_elasticities)