# NO2 and economic activity model

## Addis-Ababa Random Forest + SHAP
Here we load **all 730** daily meshes for Addis, build lag & neighbor features, train a global RF, and visualize SHAP.


In [1]:
import sys
from pathlib import Path
import pandas as pd
import geopandas as gpd
import numpy as np

# bring src/ into path
CURR_PATH = Path().resolve()
REPO_PATH = CURR_PATH.parent
sys.path.append(str(REPO_PATH / "src"))

# uppercase constants
ADDIS_FOLDER = Path(
    r"C:\Users\Luis.ParraMorales\AirPollution_Analysis"
    r"\air-pollution-mobility-research-project\data"
    r"\Populated meshes\addis-mesh-data"
)

# modeling imports
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import shap

# our feature-engineering helpers
from feature_engineering import (
    load_mesh_series, make_lag_features, NeighborAggregator
)

### Load all daily meshes into one GeoDataFrame


In [2]:
gdf = load_mesh_series(ADDIS_FOLDER)
print("Total rows:", len(gdf))
gdf.head()

Total rows: 399126


Unnamed: 0,geom_id,no2_mean,geometry,date
0,0,5.1e-05,"POLYGON ((38.78925 8.83942, 38.78925 8.84841, ...",2023-01-01
1,1,3.3e-05,"POLYGON ((38.79824 8.83942, 38.79824 8.84841, ...",2023-01-01
2,2,3.3e-05,"POLYGON ((38.80722 8.83942, 38.80722 8.84841, ...",2023-01-01
3,3,3.3e-05,"POLYGON ((38.8162 8.83942, 38.8162 8.84841, 38...",2023-01-01
4,4,3.3e-05,"POLYGON ((38.82519 8.83942, 38.82519 8.84841, ...",2023-01-01


### Create autoregressive lags 1…14 days


In [3]:
df = make_lag_features(gdf, nlags=14)
# drop the first 14 days per cell (NaN lags)
df = df.dropna(subset=[f"no2_mean_lag{l}" for l in range(1,15)])
df.shape

(298158, 18)

### Precompute spatial neighbors and add mean-lag features


In [4]:
# we only need one GeoDataFrame (static geometry & id)
static = gdf.drop_duplicates(["geom_id"])[["geom_id","geometry"]].set_index("geom_id")
neighborer = NeighborAggregator(k=8, id_col="geom_id")
neighborer.fit(static.reset_index(), None)

# apply to each row-group: transform expects the full static+lags DataFrame per time slice
# so merge geometry into df and then extract neighbor features
df_merged = (
    df.merge(static.reset_index(), on="geom_id")
      .reset_index(drop=True)
)
neigh_feats = neighborer.transform(df_merged)
# join back
df_full = pd.concat([df_merged.reset_index(drop=True), neigh_feats], axis=1)

### Split into training (all 2023) vs test (last quarter 2024)


In [6]:
df_full["year"] = df_full["date"].dt.year
train = df_full[df_full["year"] < 2024]
test  = df_full[df_full["year"] >= 2024]

y_train = train["no2_mean"]
y_test  = test["no2_mean"]
cols_to_drop = [col for col in ["no2_mean", "geometry", "date", "year"] if col in train.columns]
X_train = train.drop(columns=cols_to_drop)
X_test  = test.drop(columns=cols_to_drop)

### Random Forest with scaling


In [15]:
# Drop rows where the target is missing
train = train.dropna(subset=["no2_mean"])
test = test.dropna(subset=["no2_mean"])

y_train = train["no2_mean"]
y_test  = test["no2_mean"]
X_train = train.drop(columns=["no2_mean", "geometry", "geometry_x", "geometry_y", "date", "year"], errors="ignore")
X_test  = test.drop(columns=["no2_mean", "geometry", "geometry_x", "geometry_y", "date", "year"], errors="ignore")


In [12]:
# See the actual column names and types
for col in X_train.columns:
    print(col, type(X_train.iloc[0][col]))


geom_id <class 'numpy.int64'>
geometry_x <class 'shapely.geometry.polygon.Polygon'>
no2_mean_lag1 <class 'numpy.float64'>
no2_mean_lag2 <class 'numpy.float64'>
no2_mean_lag3 <class 'numpy.float64'>
no2_mean_lag4 <class 'numpy.float64'>
no2_mean_lag5 <class 'numpy.float64'>
no2_mean_lag6 <class 'numpy.float64'>
no2_mean_lag7 <class 'numpy.float64'>
no2_mean_lag8 <class 'numpy.float64'>
no2_mean_lag9 <class 'numpy.float64'>
no2_mean_lag10 <class 'numpy.float64'>
no2_mean_lag11 <class 'numpy.float64'>
no2_mean_lag12 <class 'numpy.float64'>
no2_mean_lag13 <class 'numpy.float64'>
no2_mean_lag14 <class 'numpy.float64'>
geometry_y <class 'shapely.geometry.polygon.Polygon'>
neigh_no2_mean_lag1 <class 'numpy.float64'>
neigh_no2_mean_lag2 <class 'numpy.float64'>
neigh_no2_mean_lag3 <class 'numpy.float64'>
neigh_no2_mean_lag4 <class 'numpy.float64'>
neigh_no2_mean_lag5 <class 'numpy.float64'>
neigh_no2_mean_lag6 <class 'numpy.float64'>
neigh_no2_mean_lag7 <class 'numpy.float64'>
neigh_no2_mean_la

In [16]:
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("rf", RandomForestRegressor(
        n_estimators=300, 
        n_jobs=-1, 
        random_state=42,
        oob_score=True
    ))
])

pipeline.fit(X_train, y_train)
print("OOB R²:", pipeline.named_steps["rf"].oob_score_)
print("Test R²:", pipeline.score(X_test, y_test))

OOB R²: 0.9293340411741373
Test R²: 0.2123083860017977


### Global feature importance (mean |SHAP|) and beeswarm


In [None]:
explainer = shap.TreeExplainer(pipeline.named_steps["rf"])
shap_values = explainer.shap_values(X_test)

# bar plot
shap.plots.bar(explainer, max_display=15)
# beeswarm
shap.summary_plot(shap_values, X_test, max_display=15)


### Dependence plot for top-2 features, and mapping one SHAP feature back to space


In [None]:
top_feats = X_test.columns[np.argsort(np.abs(shap_values).mean(0))[-2:]]
for feat in top_feats:
    shap.dependence_plot(feat, shap_values, X_test)

# map per-cell mean SHAP of first feature
mean_shap = pd.DataFrame({
    "geom_id": test["geom_id"],
    "shap1": shap_values[:, X_test.columns.get_loc(top_feats[-1])]
}).groupby("geom_id").mean().reset_index()

map_gdf = static.reset_index().merge(mean_shap, on="geom_id")
map_gdf.plot(column="shap1", legend=True, cmap="plasma")


### Approximate elasticities from SHAP


In [None]:
elasticities = []
for feat in X_test.columns:
    dx = X_test[feat].quantile(0.75) - X_test[feat].quantile(0.25)
    r = (shap_values[:, X_test.columns.get_loc(feat)] / pipeline.predict(X_test)) \
        * (X_test[feat] / dx)
    elasticities.append({
        "feature": feat,
        "median": np.median(r),
        "p10": np.percentile(r,10),
        "p90": np.percentile(r,90)
    })

pd.DataFrame(elasticities).sort_values("median", ascending=False).head(10)
