# Baseline models for DeepDemand: Ridge (L2) and Random Forest

This notebook builds edge-level features from your existing data layout and evaluates:
- 5-fold random CV (k=5)
- 9-fold spatial CV (regions, fold_idx=1..9)

Metrics (train/test): MGEH, MAE, R² using your `model.utils` functions.


In [None]:
# path setting
import sys
from pathlib import Path
# Switch the kernel's working directory to the repo root. Later cells open
# files through relative paths (e.g. 'data/...'), which resolve against it.
# NOTE(review): this is a machine-specific absolute path — adjust it to your checkout.
%cd D:/deepdemand
# ROOT is the parent of the new working directory (parents[0] of cwd), and that
# parent — not the repo root itself — is what gets prepended to sys.path.
# NOTE(review): confirm the parent directory is really the intended import root.
ROOT = Path.cwd().parents[0]   # go up one level
sys.path.insert(0, str(ROOT))

In [2]:
import os
import json
from collections import Counter

import numpy as np
import pandas as pd
import torch

from config import DATA, TRAINING
from model.dataloader import load_gt, load_json, get_lsoa_vector
import model.utils as utils


## 0) Reproducibility

In [3]:
# Seed NumPy's global RNG and PyTorch from the shared training seed so that
# repeated runs of the notebook start from the same random state.
np.random.seed(TRAINING['seed'])
# Last expression of the cell: the value returned by torch.manual_seed is
# echoed as the cell output.
torch.manual_seed(TRAINING['seed'])


<torch._C.Generator at 0x24d3839cd10>

## 1) Load GT and feature bank (same logic as your trainer)

In [4]:
# --- Ground truth: edge_id -> target value, plus whatever scaler load_gt() hands back ---
edge_to_gt, scaler = load_gt()
all_edge_ids = [*edge_to_gt.keys()]
scaler_name = type(scaler).__name__ if scaler else None
print('Edges:', len(all_edge_ids), 'Scaler:', scaler_name)

# --- Raw LSOA feature records and the node -> LSOA lookup ---
lsoa_json = load_json(DATA['lsoa_json'])
node_to_lsoa = load_json('data/node_features/node_to_lsoa.json')

# --- Feature bank: one float32 row per LSOA code, codes in sorted order ---
lsoa_codes = sorted(lsoa_json.keys())
X_lsoa = np.vstack(
    # get_lsoa_vector returns a torch tensor; move it to CPU and convert to NumPy
    [get_lsoa_vector(lsoa_json[lsoa_code]).cpu().numpy() for lsoa_code in lsoa_codes]
).astype(np.float32)

# Row i of X_lsoa belongs to lsoa_codes[i]; iterating the matrix yields its rows.
feature_bank = dict(zip(lsoa_codes, X_lsoa))

feat_dim = X_lsoa.shape[1]
print('LSOA feature dim:', feat_dim)


Number of valid edges: 5088

=== GT Descriptive Statistics (raw) ===
Min     : 191.405
Max     : 113436.372
Mean    : 25243.410
Median  : 20618.627
Std     : 18893.461

Edges: 5088 Scaler: None
LSOA feature dim: 121


## 2) Edge-level feature construction (frequency-weighted)

For each edge_id, we read:
- `data/subgraphs/subgraphs/{edge_id}/od_use.feather` columns: O, D, t_OD

We build:
- XO: frequency-weighted mean of unique O nodes' LSOA vectors
- XD: frequency-weighted mean of unique D nodes' LSOA vectors
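- t summary: the statistics listed in `T_STATS` — currently the mean only (std, p10, p50 and p90 are supported but switched off)
- interaction: XO * XD (optional; currently disabled via `USE_INTERACTION = False`)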


In [5]:
from collections import Counter
import numpy as np
import os
import pandas as pd

# Root folder holding one sub-folder per edge_id; each is expected to contain od_use.feather.
SUBGRAPH_ROOT = 'data/subgraphs/subgraphs'

# Which t_OD summary statistics become features.
# This must be a real tuple: the trailing comma matters, because ('mean') is
# just the string 'mean'. With a string, `name in T_STATS` becomes a substring
# test and len(T_STATS) counts characters (4) instead of statistics (1).
# T_STATS = ('mean', 'std', 'p10', 'p50', 'p90')
T_STATS = ('mean',)
# When True, the element-wise product XO * XD is appended to every feature row.
USE_INTERACTION = False

def _get_lsoa_vec_from_node(node_id_str: str) -> np.ndarray:
    """
    Return the feature_bank vector for the first LSOA code listed for a node.

    The node id is stringified before the node_to_lsoa lookup; element 0 of
    the mapped entry is used as the key into feature_bank.
    Raises KeyError if the node or the code is unknown.
    """
    node_key = str(node_id_str)
    first_code = node_to_lsoa[node_key][0]
    return feature_bank[first_code]

def _compute_global_t_imputer(edge_ids: list) -> np.ndarray:
    """
    Compute the fallback t_OD summary used for edges that have no OD data.

    Each edge's `od_use.feather` under SUBGRAPH_ROOT is read (column `t_OD`
    only). Edges whose file is missing, unreadable or empty are skipped and
    counted. For the remaining edges the statistics enabled in T_STATS are
    computed, and the element-wise mean of those per-edge vectors is returned.

    Args:
        edge_ids: edge identifiers, used as folder names under SUBGRAPH_ROOT.

    Returns:
        float32 vector with one entry per enabled statistic, in the fixed
        order mean, std, p10, p50, p90. All zeros when no edge has usable data.
    """
    # Statistic name -> reducer, listed in output order.
    stat_fns = (
        ('mean', np.mean),
        ('std', np.std),
        ('p10', lambda arr: np.percentile(arr, 10)),
        ('p50', lambda arr: np.percentile(arr, 50)),
        ('p90', lambda arr: np.percentile(arr, 90)),
    )
    # Resolve the enabled statistics once. The zero fallback below is sized
    # from this list rather than len(T_STATS), which over-counts when T_STATS
    # is a plain string (e.g. 'mean' has length 4 but enables one statistic).
    active = [fn for name, fn in stat_fns if name in T_STATS]

    feats = []
    empty_or_missing = 0

    for eid in edge_ids:
        fpath = os.path.join(SUBGRAPH_ROOT, eid, 'od_use.feather')
        if not os.path.exists(fpath):
            empty_or_missing += 1
            continue
        try:
            df = pd.read_feather(fpath, columns=['t_OD'])
        except Exception:
            # Best effort: an unreadable file is treated like a missing one.
            empty_or_missing += 1
            continue
        if len(df) == 0:
            empty_or_missing += 1
            continue

        t = df['t_OD'].to_numpy(dtype=np.float32)
        feats.append([float(fn(t)) for fn in active])

    if len(feats) == 0:
        print("[t_imputer] No non-empty od_use found. Using zeros for t_stats.")
        # Same length as the per-edge t vectors, so rows can still be stacked.
        return np.zeros((len(active),), dtype=np.float32)

    feats = np.asarray(feats, dtype=np.float32)
    global_mean = feats.mean(axis=0).astype(np.float32)
    print(f"[t_imputer] Computed from {len(feats)} non-empty edges; "
          f"{empty_or_missing} empty/missing edges will use global mean t_stats.")
    return global_mean

def build_one_edge_feature(
    edge_id: str,
    t_imputer: np.ndarray,
    eps: float = 1e-6
) -> np.ndarray:
    """
    Assemble the feature vector of a single edge.

    Reads columns O, D, t_OD from the edge's od_use.feather.
      - file missing, unreadable or empty:
          XO = XD = zeros(feat_dim), t_feat = copy of t_imputer
      - otherwise:
          XO / XD = mean of the O / D nodes' LSOA vectors, where each
                    distinct node is weighted by its share of the rows
          t_feat  = the T_STATS summaries of t_OD
    Returns: concat([XO, XD, XO*XD (only if USE_INTERACTION), t_feat]).
    The number of OD rows (n_od) is NOT part of the returned vector.
    """
    def _freq_weighted_mean(nodes: list) -> np.ndarray:
        # Counter keeps first-seen order, so nodes are accumulated in the
        # order in which they first appear in the table.
        counts = Counter(nodes)
        weights = np.array([float(c) for c in counts.values()], dtype=np.float32)
        # eps guards the division; weights therefore sum to slightly below 1
        weights = weights / (weights.sum() + eps)
        acc = np.zeros((feat_dim,), dtype=np.float32)
        for node, weight in zip(counts, weights):
            acc += weight * _get_lsoa_vec_from_node(node)
        return acc

    od_path = os.path.join(SUBGRAPH_ROOT, edge_id, 'od_use.feather')

    # Fallback values, kept as-is when the edge has no usable OD table.
    XO = np.zeros((feat_dim,), dtype=np.float32)
    XD = np.zeros((feat_dim,), dtype=np.float32)
    t_feat = t_imputer.copy()

    od_table = None
    if os.path.exists(od_path):
        try:
            od_table = pd.read_feather(od_path, columns=['O', 'D', 't_OD'])
        except Exception:
            # an unreadable file is handled like a missing one
            od_table = None

    if od_table is not None and len(od_table) > 0:
        origins = od_table['O'].astype(str).tolist()
        destinations = od_table['D'].astype(str).tolist()
        t = od_table['t_OD'].to_numpy(dtype=np.float32)

        XO = _freq_weighted_mean(origins)
        XD = _freq_weighted_mean(destinations)

        # edge-specific t statistics, always emitted in this fixed order
        stat_fns = (
            ('mean', np.mean),
            ('std', np.std),
            ('p10', lambda arr: np.percentile(arr, 10)),
            ('p50', lambda arr: np.percentile(arr, 50)),
            ('p90', lambda arr: np.percentile(arr, 90)),
        )
        t_feat = np.asarray(
            [float(fn(t)) for name, fn in stat_fns if name in T_STATS],
            dtype=np.float32,
        )

    interaction = [XO * XD] if USE_INTERACTION else []
    parts = [XO, XD] + interaction + [t_feat]
    return np.concatenate(parts, axis=0)

def build_dataset(edge_ids: list) -> tuple[np.ndarray, np.ndarray, list]:
    """
    Build the design matrix and target vector for the given edges.

    The global t_OD fallback is computed once over all edge_ids and then
    reused for every edge. No edge is dropped.

    Returns:
        X    : float32 matrix, one row per edge (row order = edge_ids order)
        y    : float32 vector of edge_to_gt values in the same order
        kept : the edge ids in row order (a new list with the same content
               as edge_ids)
    """
    fallback_t = _compute_global_t_imputer(edge_ids)

    # One (edge id, feature row, target) record per edge, in input order.
    records = [
        (eid, build_one_edge_feature(eid, t_imputer=fallback_t), float(edge_to_gt[eid]))
        for eid in edge_ids
    ]

    kept = [eid for eid, _, _ in records]
    X = np.vstack([row for _, row, _ in records]).astype(np.float32)
    y = np.array([target for _, _, target in records], dtype=np.float32)
    return X, y, kept

### Build full dataset once
We build features once and then just slice by fold IDs.

In [6]:
# Featurise every GT edge a single time; later cells only index into these arrays.
X_all, y_all, kept_edges = build_dataset(all_edge_ids)
print('Built X:', X_all.shape, 'y:', y_all.shape)

# Lookup: edge_id -> position of that edge in kept_edges (and so its row in X_all / y_all)
edge_to_idx = dict(zip(kept_edges, range(len(kept_edges))))


[t_imputer] Computed from 4584 non-empty edges; 504 empty/missing edges will use global mean t_stats.
Built X: (5088, 243) y: (5088,)


## 3) Models: Ridge (L2) and Random Forest

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor


In [8]:
def make_ridge(alpha: float = 1.0):
    """
    Create an unfitted two-step pipeline: feature standardisation, then Ridge.

    Args:
        alpha: forwarded to Ridge as `alpha`.

    Returns:
        Pipeline with steps named 'scaler' and 'ridge'; the Ridge step gets
        TRAINING['seed'] as its random_state.
    """
    standardise = StandardScaler(with_mean=True, with_std=True)
    regressor = Ridge(alpha=alpha, random_state=TRAINING['seed'])
    steps = [
        ('scaler', standardise),
        ('ridge', regressor),
    ]
    return Pipeline(steps)

def make_rf(n_estimators: int = 500, max_depth=None, n_jobs: int = -1):
    """
    Create an unfitted RandomForestRegressor.

    Args:
        n_estimators: forwarded as `n_estimators`.
        max_depth: forwarded as `max_depth`.
        n_jobs: forwarded as `n_jobs`.

    Returns:
        RandomForestRegressor whose random_state is TRAINING['seed'].
    """
    forest_kwargs = dict(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=TRAINING['seed'],
        n_jobs=n_jobs,
    )
    return RandomForestRegressor(**forest_kwargs)


## 4) Metrics (use your exact definitions)

We compute metrics in the same way as your training loop:
- pass **normalized y** into utils and provide the scaler to invert (if enabled)


In [9]:
def eval_metrics(y_true_np: np.ndarray, y_pred_np: np.ndarray, scaler):
    """
    Score predictions with the metric functions from model.utils.

    Both arrays are copied into float32 torch tensors, because the utils
    functions are called with tensors, and `scaler` is passed through to each
    metric unchanged.

    Returns:
        dict with Python-float entries under the keys 'MAE', 'MGEH' and 'R2'.
    """
    targets, preds = (
        torch.tensor(arr, dtype=torch.float32) for arr in (y_true_np, y_pred_np)
    )
    # (output key, metric function), evaluated in this order
    metric_fns = (
        ('MAE', utils.MAE),
        ('MGEH', utils.MGEH),
        ('R2', utils.R_square),
    )
    return {key: fn(targets, preds, scaler).item() for key, fn in metric_fns}


## 5) Fold runners

- 5-fold CV: `utils.get_cv_split(..., k=5, fold_idx=0..4)`
- 9-fold spatial CV: `utils.get_spatial_cv_split(..., fold_idx=1..9)`


In [10]:
def ids_to_indices(ids: list[str]) -> np.ndarray:
    """
    Translate edge ids into the row indices stored in edge_to_idx.

    Ids that are not present in edge_to_idx are skipped without any warning,
    so the result can be shorter than `ids`. Order of the remaining ids is kept.

    Returns:
        int64 array of row indices.
    """
    rows = []
    for edge_id in ids:
        if edge_id not in edge_to_idx:
            continue
        rows.append(edge_to_idx[edge_id])
    return np.array(rows, dtype=np.int64)

def run_cv(model_name: str, model_factory, split_type: str):
    """
    Fit and score a fresh model on every fold of the chosen CV scheme.

    Args:
        model_name: label written to the 'model' column of the result.
        model_factory: zero-argument callable returning a new, unfitted
            estimator with fit / predict methods.
        split_type: 'kfold5'   -> utils.get_cv_split with k=5, folds 0..4
                    'spatial9' -> utils.get_spatial_cv_split, folds 1..9

    Returns:
        DataFrame with one row per fold and the columns
        split, fold, model, train_MAE, train_MGEH, train_R2,
        test_MAE, test_MGEH, test_R2.

    Raises:
        ValueError: if split_type is neither 'kfold5' nor 'spatial9'.
    """
    # Only the fold numbering and the split call differ between the two
    # schemes; everything after the split is shared.
    if split_type == 'kfold5':
        fold_ids = range(5)

        def split(fold_idx):
            return utils.get_cv_split(
                kept_edges,
                k=5,
                fold_idx=fold_idx,
                seed=TRAINING['seed'],
            )
    elif split_type == 'spatial9':
        fold_ids = range(1, 10)  # 1..9

        def split(fold_idx):
            return utils.get_spatial_cv_split(
                kept_edges,
                fold_idx=fold_idx,
            )
    else:
        raise ValueError('split_type must be kfold5 or spatial9')

    results = []
    for fold_idx in fold_ids:
        train_ids, test_ids = split(fold_idx)
        tr = ids_to_indices(train_ids)
        te = ids_to_indices(test_ids)

        # A new estimator per fold, so nothing fitted leaks between folds.
        model = model_factory()
        model.fit(X_all[tr], y_all[tr])
        pred_tr = model.predict(X_all[tr])
        pred_te = model.predict(X_all[te])

        m_tr = eval_metrics(y_all[tr], pred_tr, scaler)
        m_te = eval_metrics(y_all[te], pred_te, scaler)

        row = {'split': split_type, 'fold': fold_idx, 'model': model_name}
        # Explicit key order keeps the column layout fixed: train_* then test_*.
        for prefix, scores in (('train', m_tr), ('test', m_te)):
            for key in ('MAE', 'MGEH', 'R2'):
                row[f'{prefix}_{key}'] = scores[key]
        results.append(row)

    return pd.DataFrame(results)


## 6) Run baselines

In [11]:
# --- Hyper-parameters ---
RIDGE_ALPHA = 1.0      # Ridge alpha; tune if you want
RF_TREES = 500         # number of trees; lower it if the forest is too slow
RF_MAX_DEPTH = None    # forwarded to the forest as max_depth


def ridge_factory():
    """Return a new Ridge pipeline using the current RIDGE_ALPHA."""
    return make_ridge(alpha=RIDGE_ALPHA)


def rf_factory():
    """Return a new random forest using the current RF_TREES / RF_MAX_DEPTH."""
    return make_rf(n_estimators=RF_TREES, max_depth=RF_MAX_DEPTH)


# 5-fold CV first, then the 9-fold spatial CV; Ridge before RF in both.
df_ridge_k5 = run_cv('Ridge', ridge_factory, 'kfold5')
df_rf_k5 = run_cv('RF', rf_factory, 'kfold5')

df_ridge_sp = run_cv('Ridge', ridge_factory, 'spatial9')
df_rf_sp = run_cv('RF', rf_factory, 'spatial9')

per_run_frames = [df_ridge_k5, df_rf_k5, df_ridge_sp, df_rf_sp]
df_all = pd.concat(per_run_frames, ignore_index=True)
df_all.head()


[Spatial CV] Validation region: E12000001
[Spatial CV] #val_edges = 143, #train_edges = 4945
[Spatial CV] Validation region: E12000002
[Spatial CV] #val_edges = 608, #train_edges = 4480
[Spatial CV] Validation region: E12000003
[Spatial CV] #val_edges = 580, #train_edges = 4508
[Spatial CV] Validation region: E12000004
[Spatial CV] #val_edges = 426, #train_edges = 4662
[Spatial CV] Validation region: E12000005
[Spatial CV] #val_edges = 480, #train_edges = 4608
[Spatial CV] Validation region: E12000006
[Spatial CV] #val_edges = 667, #train_edges = 4421
[Spatial CV] Validation region: E12000007
[Spatial CV] #val_edges = 85, #train_edges = 5003
[Spatial CV] Validation region: E12000008
[Spatial CV] #val_edges = 957, #train_edges = 4131
[Spatial CV] Validation region: E12000009
[Spatial CV] #val_edges = 392, #train_edges = 4696
[Spatial CV] Validation region: E12000001
[Spatial CV] #val_edges = 143, #train_edges = 4945
[Spatial CV] Validation region: E12000002
[Spatial CV] #val_edges = 608

Unnamed: 0,split,fold,model,train_MAE,train_MGEH,train_R2,test_MAE,test_MGEH,test_R2
0,kfold5,0,Ridge,12218.013672,78.516602,0.318634,12903.316406,82.459572,0.188255
1,kfold5,1,Ridge,12085.897461,77.946404,0.316429,13157.112305,83.750801,0.251117
2,kfold5,2,Ridge,12275.780273,79.033234,0.306328,12551.689453,80.523827,0.272058
3,kfold5,3,Ridge,12182.669922,78.597954,0.312257,13024.542969,81.261765,0.147523
4,kfold5,4,Ridge,12158.456055,78.503624,0.323467,12968.589844,81.260834,0.163925


## 7) Summaries (mean ± std across folds)

In [12]:
def summarize(df: pd.DataFrame):
    """
    Collapse per-fold results into one row per (split, model) pair.

    For each of the six train_/test_ metric columns the group mean
    (`<col>_mean`) and the sample standard deviation with ddof=1
    (`<col>_std`) are computed. All mean columns come first, followed by all
    std columns; 'split' and 'model' are returned as ordinary columns.
    """
    metric_cols = [
        'train_MAE', 'train_MGEH', 'train_R2',
        'test_MAE', 'test_MGEH', 'test_R2',
    ]
    by_group = df.groupby(['split', 'model'])[metric_cols]
    blocks = [
        by_group.mean().rename(columns=lambda col: f'{col}_mean'),
        by_group.std(ddof=1).rename(columns=lambda col: f'{col}_std'),
    ]
    return pd.concat(blocks, axis=1).reset_index()

# Aggregate the per-fold rows of df_all into mean/std columns per (split, model).
summary = summarize(df_all)
# Bare last expression so the notebook renders the table.
summary


Unnamed: 0,split,model,train_MAE_mean,train_MGEH_mean,train_R2_mean,test_MAE_mean,test_MGEH_mean,test_R2_mean,train_MAE_std,train_MGEH_std,train_R2_std,test_MAE_std,test_MGEH_std,test_R2_std
0,kfold5,RF,3096.523828,23.728201,0.942049,7453.121094,50.558841,0.700684,18.288058,0.175644,0.000818,155.723144,1.064574,0.02751
1,kfold5,Ridge,12184.163477,78.519563,0.315423,12921.050195,81.85136,0.204576,70.460601,0.386904,0.006496,226.62631,1.268512,0.054529
2,spatial9,RF,3015.36613,23.191407,0.943371,9408.919054,61.174131,0.503046,42.149083,0.180442,0.001937,1820.438511,7.477283,0.119772
3,spatial9,Ridge,12169.063585,78.4608,0.31215,13590.974175,86.1077,0.039495,228.011606,0.925877,0.0063,2461.714252,8.494967,0.32465


## 8) Save outputs

In [13]:
# Output locations (relative to the current working directory).
out_dir = 'eval/baselines'
all_folds_csv = 'eval/baselines/baseline_ridge_rf_all_folds.csv'
summary_csv = 'eval/baselines/baseline_ridge_rf_summary.csv'

os.makedirs(out_dir, exist_ok=True)
# Per-fold rows first, then the aggregated table; neither writes the index.
df_all.to_csv(all_folds_csv, index=False)
summary.to_csv(summary_csv, index=False)

print('Saved:')
for written_path in (all_folds_csv, summary_csv):
    print(' - ' + written_path)


Saved:
 - eval/baselines/baseline_ridge_rf_all_folds.csv
 - eval/baselines/baseline_ridge_rf_summary.csv
