# US Accidents — SEMMA (Colab Lite Optimized)


**Focus:** A compute-aware SEMMA pipeline runnable on Colab free tier.

**Key choices**
- Chunked CSV sampling (≈1% of the rows in each chunk are pooled → stratified draw of **50k** rows)
- Hash encoding for high-cardinality features
- Top-128 feature selection
- Small ensembles and tiny tuning grids


## 0) Environment (installs)

In [None]:

#@title Install minimal dependencies (quiet)
%%capture
import sys, subprocess, pkgutil
def pip_install(pkg):
    if pkg not in {m.name for m in pkgutil.iter_modules()}:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])
pip_install("kaggle"); pip_install("opendatasets"); pip_install("category_encoders"); pip_install("pyarrow")


## 1) Data Access & Chunked Sampling (SEMMA — S)

In [None]:

#@title Download data and build a 50k stratified sample (chunked)
from pathlib import Path
import os, glob, shutil, numpy as np, pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# Working directories under the Colab content root.
BASE = Path("/content")
DATA = BASE / "data"
RAW = DATA / "raw"
SAMPLES = DATA / "samples"
for p in (DATA, RAW, SAMPLES):
    p.mkdir(parents=True, exist_ok=True)

# Download via opendatasets (Kaggle auth required in browser)
try:
    import opendatasets as od
    od.download("https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents", data_dir="/content/us-accidents")
except Exception as e:
    # Deliberately best-effort: the lookup below decides whether a usable
    # file is present, so a failed download only prints a hint.
    print("If download failed, ensure Kaggle TOS accepted. Error:", e)

# Pick largest US_Accidents*.csv
# A recursive "**" pattern also matches files directly in the root folder,
# so one glob is enough and no path is listed twice.
cands = sorted(glob.glob("/content/us-accidents/**/*.csv", recursive=True))
matches = [c for c in cands if "US_Accidents" in os.path.basename(c)]
if not matches:
    # Explicit raise instead of assert: asserts are stripped under `python -O`.
    raise FileNotFoundError("US_Accidents*.csv not found.")
CSV = max(matches, key=os.path.getsize)
size = os.path.getsize(CSV)

# Keep a copy under data/raw; skip the copy when a file of the same size is
# already there so re-running the cell does not rewrite it.
raw_copy = RAW / Path(CSV).name
if not (raw_copy.exists() and raw_copy.stat().st_size == size):
    shutil.copy(CSV, raw_copy)
print("Using:", CSV)

# Chunked sampling: 1% pool -> stratify to 50k
TARGET_SAMPLE_SIZE = 50_000
POOL_MULTIPLIER = 1.8
POOL_TARGET = int(TARGET_SAMPLE_SIZE * POOL_MULTIPLIER)
P_CHUNK = 0.01
CHUNK_SIZE = 200_000
SAMPLE_SEED = 42  # shared by the per-row draw and the stratified split

usecols = ["ID","Severity","Start_Time","End_Time","Start_Lat","Start_Lng","Distance(mi)",
           "City","County","State","Zipcode","Timezone","Temperature(F)","Humidity(%)","Pressure(in)",
           "Visibility(mi)","Wind_Speed(mph)","Wind_Direction","Precipitation(in)","Weather_Condition",
           "Amenity","Bump","Crossing","Junction","Traffic_Signal","Sunrise_Sunset"]
dtypes = {"ID":"string","Severity":"Int8","Start_Lat":"float32","Start_Lng":"float32","Distance(mi)":"float32",
          "City":"string","County":"string","State":"string","Zipcode":"string","Timezone":"string",
          "Temperature(F)":"float32","Humidity(%)":"float32","Pressure(in)":"float32","Visibility(mi)":"float32",
          "Wind_Speed(mph)":"float32","Wind_Direction":"string","Precipitation(in)":"float32","Weather_Condition":"string",
          "Amenity":"boolean","Bump":"boolean","Crossing":"boolean","Junction":"boolean","Traffic_Signal":"boolean",
          "Sunrise_Sunset":"string"}

# Seeded generator: re-running the cell reproduces the same pool.
rng = np.random.default_rng(SAMPLE_SEED)
pool_parts = []
acc = 0
reader = pd.read_csv(CSV, usecols=usecols, dtype=dtypes, parse_dates=["Start_Time","End_Time"], chunksize=CHUNK_SIZE)
for i, chunk in enumerate(reader):
    # NOTE(review): if read_csv could not parse a timestamp column it is left
    # as text; coerce it here so unparseable values become NaT and are dropped
    # below instead of breaking the subtraction. Confirm against the CSV.
    for col in ("Start_Time", "End_Time"):
        if not pd.api.types.is_datetime64_any_dtype(chunk[col]):
            chunk[col] = pd.to_datetime(chunk[col], errors="coerce")
    chunk = chunk.dropna(subset=["Severity","Start_Time","End_Time"]).copy()
    chunk["Duration_min"] = (chunk["End_Time"] - chunk["Start_Time"]).dt.total_seconds() / 60.0
    # Negative durations (end before start) are discarded.
    chunk = chunk[chunk["Duration_min"] >= 0]
    # Independent keep/skip draw per row with probability P_CHUNK.
    take = chunk.loc[rng.random(len(chunk)) < P_CHUNK]
    pool_parts.append(take)
    acc += len(take)
    print(f"Chunk {i}: took {len(take):,}, pool={acc:,}")
    if acc >= POOL_TARGET:
        break

if acc == 0:
    raise ValueError("No rows were sampled from the CSV; check the file and P_CHUNK.")

pool = pd.concat(pool_parts, ignore_index=True)
severity = pool["Severity"].astype(int)
n_classes = severity.nunique()
if len(pool) >= TARGET_SAMPLE_SIZE + n_classes:
    # The "test" side of the split is the stratified sample we keep.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=TARGET_SAMPLE_SIZE, random_state=SAMPLE_SEED)
    _, idx = next(sss.split(pool, severity))
    sample = pool.iloc[idx].reset_index(drop=True)
else:
    # The split needs rows left over on the other side; with a small pool
    # fall back to a plain shuffle capped at the target size.
    print(f"Pool has only {len(pool):,} rows; using a non-stratified sample.")
    sample = pool.sample(n=min(len(pool), TARGET_SAMPLE_SIZE), random_state=SAMPLE_SEED).reset_index(drop=True)

SAMPLE_PARQUET = SAMPLES/"us_accidents_sample_50k.parquet"
sample.to_parquet(SAMPLE_PARQUET, index=False)
print("Saved sample:", SAMPLE_PARQUET, " shape:", sample.shape)


## 2) Preprocessing (FeatureBuilder + ColumnTransformer)

In [None]:

#@title Build the lean preprocessing pipeline
import numpy as np, pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

class FeatureBuilder(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds calendar, weather and wind features.

    ``transform`` works on a copy of the input frame and adds ``hour``,
    ``dow``, ``month``, ``is_weekend``, ``rush_hour``, ``weather_bucket``,
    ``precip_bucket``, ``vis_bucket`` and ``wind8``; the five road-feature
    flags are filled with False where missing and cast to int8.
    """

    # (label, regex) pairs applied in order; a later match overrides an
    # earlier one. Patterns carry no capture groups, which avoids the pandas
    # "match groups" warning without changing what matches.
    _WEATHER_RULES = (
        ("snow", r"snow|sleet|blizzard|flurries|wintry"),
        ("ice", r"ice|freez"),
        ("rain/storm", r"rain|drizzle|shower|storm|thunder|t\-?storm"),
        ("fog/mist", r"fog|mist|haze|smoke"),
        ("clear", r"clear|fair"),
        ("cloudy", r"cloud"),
        ("windy", r"wind"),
    )
    _WIND_VARIABLE = frozenset({"CALM", "VAR", "VARIABLE", "VRB"})
    # 16-point compass collapsed to 8 sectors. The former duplicate "NNW" key
    # is gone; "NNW" -> "NW" is the entry that was effective at runtime.
    _WIND_TO_8 = {
        "N": "N", "NNE": "N", "NE": "NE", "ENE": "NE", "E": "E", "ESE": "E",
        "SE": "SE", "SSE": "SE", "S": "S", "SSW": "S", "SW": "SW", "WSW": "SW",
        "W": "W", "WNW": "W", "NW": "NW", "NNW": "NW",
        # Full-word spellings; presumably present in the raw Wind_Direction
        # column (they would otherwise land in "Other") - verify on the data.
        "NORTH": "N", "EAST": "E", "SOUTH": "S", "WEST": "W",
    }
    _BINARY_COLS = ("Amenity", "Bump", "Crossing", "Junction", "Traffic_Signal")

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    @classmethod
    def _wind8(cls, value):
        """Map one raw wind-direction value to an 8-sector label."""
        if pd.isna(value):
            return "Unknown"
        key = str(value).strip().upper()
        if key in cls._WIND_VARIABLE:
            return "Variable"
        return cls._WIND_TO_8.get(key, "Other")

    def transform(self, X):
        """Return a copy of *X* with the engineered columns added."""
        X = X.copy()

        # Accept text timestamps as well (e.g. a single row built from a
        # dict); already-datetime columns pass through unchanged.
        start = pd.to_datetime(X["Start_Time"])
        X["hour"] = start.dt.hour.astype("int16")
        X["dow"] = start.dt.dayofweek.astype("int16")
        X["month"] = start.dt.month.astype("int16")
        X["is_weekend"] = X["dow"].isin([5, 6]).astype("int8")
        X["rush_hour"] = X["hour"].isin([7, 8, 9, 16, 17, 18]).astype("int8")

        w = X["Weather_Condition"].fillna("Unknown").str.lower()
        cat = np.full(len(w), "other/unknown", dtype=object)
        for label, pattern in self._WEATHER_RULES:
            # Force a plain bool ndarray: a nullable-boolean Series is not a
            # reliable numpy index.
            hit = w.str.contains(pattern, regex=True, na=False).to_numpy(dtype=bool)
            cat[hit] = label
        X["weather_bucket"] = pd.Series(cat, index=w.index).astype("string")

        X["precip_bucket"] = pd.cut(X["Precipitation(in)"], [-0.001,0.0,0.1,0.5,np.inf],
                                    labels=["none","(0,0.1]","(0.1,0.5]",">0.5"], include_lowest=True).astype("string")
        X["vis_bucket"] = pd.cut(X["Visibility(mi)"], [-0.001,1,3,5,10,np.inf],
                                 labels=["<=1","(1,3]","(3,5]","(5,10]","(10,inf)"], include_lowest=True).astype("string")

        X["wind8"] = X["Wind_Direction"].map(self._wind8).astype("string")

        for b in self._BINARY_COLS:
            X[b] = X[b].fillna(False).astype("int8")
        return X

class WinsorClipper(BaseEstimator, TransformerMixin):
    """Clip each column to percentile bounds learned during ``fit``.

    ``lower`` and ``upper`` are fractions in [0, 1]; they are converted to
    percentiles and evaluated per column, ignoring NaNs.
    """

    def __init__(self, lower=0.01, upper=0.99):
        self.lower = lower
        self.upper = upper

    def fit(self, X, y=None):
        """Store the per-column lower/upper bounds in ``lo_`` and ``hi_``."""
        values = np.asarray(X, dtype=float)
        low_pct = self.lower*100
        high_pct = self.upper*100
        self.lo_ = np.nanpercentile(values, low_pct, axis=0)
        self.hi_ = np.nanpercentile(values, high_pct, axis=0)
        return self

    def transform(self, X):
        """Return *X* as a float array clipped to the fitted bounds."""
        values = np.asarray(X, dtype=float)
        return np.clip(values, self.lo_, self.hi_)

# Column groups consumed by the ColumnTransformer below. The calendar and
# bucket columns are the ones FeatureBuilder adds.
num_cols = ["Distance(mi)","Temperature(F)","Humidity(%)","Pressure(in)","Visibility(mi)","Wind_Speed(mph)",
            "Precipitation(in)","Start_Lat","Start_Lng","hour","dow","month","is_weekend","rush_hour"]
bin_cols = ["Amenity","Bump","Crossing","Junction","Traffic_Signal"]
low_card = ["State","Timezone","Sunrise_Sunset","wind8","weather_bucket","precip_bucket","vis_bucket"]
high_card= ["City","County","Zipcode"]

import category_encoders as ce
# High-cardinality text columns: impute, then hash into 2**10 components.
high_cat = Pipeline([("impute", SimpleImputer(strategy="most_frequent")),
                     ("hash", ce.HashingEncoder(n_components=2**10, drop_invariant=True, return_df=True))])
# Low-cardinality columns: impute, then one-hot. The `sparse=` keyword is not
# passed: it was deprecated in scikit-learn 1.2 and removed in 1.4, and sparse
# output is the default in both old and new releases.
low_cat  = Pipeline([("impute", SimpleImputer(strategy="most_frequent")),
                     ("onehot", OneHotEncoder(handle_unknown="ignore"))])
# Numeric columns: median impute, clip to the 1st/99th percentile, robust-scale.
num_pipe = Pipeline([("impute", SimpleImputer(strategy="median")),
                     ("winsor", WinsorClipper(0.01,0.99)),
                     ("scale", RobustScaler())])

from sklearn.compose import ColumnTransformer
# Columns not listed in any group are dropped.
preprocess = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("bin", "passthrough", bin_cols),
    ("lowcat", low_cat, low_card),
    ("highcat", high_cat, high_card)
], remainder="drop", sparse_threshold=0.3)

from sklearn.pipeline import Pipeline
# Feature engineering first, then the column-wise preprocessing.
preprocess_pipe = Pipeline([("features", FeatureBuilder()), ("preprocess", preprocess)])

print("Preprocess ready.")


## 3) Feature Selection & Temporal Split (SEMMA — M2)

In [None]:

#@title Temporal split, MI<=15k, embedded selection -> lean 128 features
import numpy as np, pandas as pd
from scipy import sparse

# Load the sample from disk; the path is the same file name under
# /content/data/samples that the sampling cell writes.
df = pd.read_parquet("/content/data/samples/us_accidents_sample_50k.parquet")
def temporal_split(df, time_col="Start_Time", test_frac=0.20, valid_frac=0.10):
    """Split *df* chronologically into (train, valid, test) copies.

    Rows are sorted by *time_col* and re-indexed from 0. The last
    ``int(n * test_frac)`` rows form the test set, the
    ``int(n * valid_frac)`` rows before them the validation set, and
    everything earlier the training set.
    """
    ordered = df.sort_values(time_col).reset_index(drop=True)
    total = len(ordered)
    n_test = int(total*test_frac)
    n_valid = int(total*valid_frac)
    valid_start = total - n_test - n_valid
    test_start = total - n_test
    train_part = ordered.iloc[:valid_start].copy()
    valid_part = ordered.iloc[valid_start:test_start].copy()
    test_part = ordered.iloc[test_start:].copy()
    return train_part, valid_part, test_part

df_train, df_valid, df_test = temporal_split(df)
# Regression target: log1p of the duration in minutes, added to every split.
df_train["y_reg"] = np.log1p(df_train["Duration_min"])
df_valid["y_reg"] = np.log1p(df_valid["Duration_min"])
df_test["y_reg"] = np.log1p(df_test["Duration_min"])

def drop_leakage(d):
    """Return *d* without the target and target-derived columns.

    Removes ``Severity``, ``End_Time``, ``Duration_min`` and ``y_reg`` where
    present; columns that are absent are skipped silently.
    """
    leaky = ["Severity", "End_Time", "Duration_min", "y_reg"]
    present = [col for col in leaky if col in d.columns]
    return d.drop(columns=present)
# Classification (Severity) and regression (log duration) share the same
# feature frames; only the targets differ.
Xc_train, yc_train = drop_leakage(df_train), df_train["Severity"].astype(int).values
Xc_valid, yc_valid = drop_leakage(df_valid), df_valid["Severity"].astype(int).values
Xc_test,  yc_test  = drop_leakage(df_test),  df_test["Severity"].astype(int).values
Xr_train, yr_train = drop_leakage(df_train), df_train["y_reg"].values
Xr_valid, yr_valid = drop_leakage(df_valid), df_valid["y_reg"].values
Xr_test,  yr_test  = drop_leakage(df_test),  df_test["y_reg"].values

# Fit the preprocessing once. Both tasks feed it the same rows, and the steps
# configured for this pipeline (imputers, clipping, scaling, one-hot, hashing)
# presumably do not use y, so a second fit on the regression target would
# repeat the work for the same result - confirm if a supervised encoder is
# ever added.
Xc_train_t = preprocess_pipe.fit_transform(Xc_train, yc_train)
Xc_valid_t = preprocess_pipe.transform(Xc_valid)
Xc_test_t = preprocess_pipe.transform(Xc_test)
# The regression matrices are the same objects; nothing downstream in this
# cell modifies them in place (column selection builds new matrices).
Xr_train_t, Xr_valid_t, Xr_test_t = Xc_train_t, Xc_valid_t, Xc_test_t

def nzv_mask(X, thresh=1e-9):
    """Return a boolean mask of columns whose variance exceeds *thresh*.

    Dense input uses ``X.var(axis=0)``. Sparse input computes the variance
    as mean(x**2) - mean(x)**2 per column so the matrix is never densified.
    """
    if not sparse.issparse(X):
        return X.var(axis=0) > thresh
    mean_of_squares = (X.power(2)).mean(axis=0)
    square_of_mean = np.square(X.mean(axis=0))
    variances = np.array(mean_of_squares - square_of_mean).ravel()
    return variances > thresh

# Near-zero-variance masks, computed on the training matrices only.
mask_c = nzv_mask(Xc_train_t)
mask_r = nzv_mask(Xr_train_t)
def mapply(X, mask):
    """Return the columns of *X* at the positions where *mask* is truthy."""
    import numpy as np
    keep_cols = np.nonzero(mask)[0]
    return X[:, keep_cols]
# Apply the train-derived masks to every split of each task.
Xc_train_t, Xc_valid_t, Xc_test_t = (
    mapply(mat, mask_c) for mat in (Xc_train_t, Xc_valid_t, Xc_test_t)
)
Xr_train_t, Xr_valid_t, Xr_test_t = (
    mapply(mat, mask_r) for mat in (Xr_train_t, Xr_valid_t, Xr_test_t)
)

from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
# Mutual information is estimated on at most 15k training rows to bound cost.
n_mi = min(Xc_train_t.shape[0], 15000)
# The training rows are in chronological order, so taking the first n_mi rows
# would score features on the oldest records only. Draw a seeded random
# subset instead (sorted so row order inside the subset is unchanged).
mi_rows = np.sort(np.random.default_rng(42).choice(Xc_train_t.shape[0], size=n_mi, replace=False))
mi_c = mutual_info_classif(Xc_train_t[mi_rows], yc_train[mi_rows], random_state=42)
mi_r = mutual_info_regression(Xr_train_t[mi_rows], yr_train[mi_rows], random_state=42)

# Keep the TOP_K highest-MI columns per task (all of them if fewer exist).
TOP_K = 128
top_c_idx = np.argsort(mi_c)[::-1][:TOP_K]; top_r_idx = np.argsort(mi_r)[::-1][:TOP_K]
Xc_train_mi, Xc_valid_mi, Xc_test_mi = Xc_train_t[:, top_c_idx], Xc_valid_t[:, top_c_idx], Xc_test_t[:, top_c_idx]
Xr_train_mi, Xr_valid_mi, Xr_test_mi = Xr_train_t[:, top_r_idx], Xr_valid_t[:, top_r_idx], Xr_test_t[:, top_r_idx]

from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor

# L1-penalised linear models: absolute coefficients as importance scores
# (averaged over the rows of coef_ for the classifier).
logit_l1 = LogisticRegression(penalty="l1", solver="liblinear", C=0.7,
                              class_weight="balanced", max_iter=200)
logit_l1.fit(Xc_train_mi, yc_train)
coef_c = np.mean(np.abs(logit_l1.coef_), axis=0)

lasso = Lasso(alpha=0.001, max_iter=1500)
lasso.fit(Xr_train_mi, yr_train)
coef_r = np.abs(lasso.coef_)

# Extra-trees ensembles: impurity-based feature importances.
etc = ExtraTreesClassifier(n_estimators=100, random_state=42,
                           class_weight="balanced_subsample", n_jobs=-1)
etc.fit(Xc_train_mi, yc_train)
imp_c = etc.feature_importances_

etr = ExtraTreesRegressor(n_estimators=100, random_state=42, n_jobs=-1)
etr.fit(Xr_train_mi, yr_train)
imp_r = etr.feature_importances_

def top_union(*arrs, k=128):
    """Merge several importance rankings into at most *k* distinct indices.

    Each array in *arrs* scores the same feature positions (higher is
    better). The rankings are consumed round-robin: the best index of every
    array, then the second best of every array, and so on, skipping indices
    already taken. Previously the first ranking was walked to the end before
    the others were looked at, so for k no larger than its length the result
    was just the top-k of the first array.

    Returns an integer ndarray (empty when no arrays are given or k <= 0).
    """
    from itertools import zip_longest

    rankings = [np.argsort(a)[::-1] for a in arrs]
    chosen = []
    seen = set()
    if k <= 0:
        return np.array(chosen, dtype=int)
    # zip_longest pads shorter rankings with None so arrays of unequal
    # length are still consumed completely.
    for tier in zip_longest(*rankings):
        for idx in tier:
            if idx is None:
                continue
            idx = int(idx)
            if idx in seen:
                continue
            seen.add(idx)
            chosen.append(idx)
            if len(chosen) >= k:
                return np.array(chosen, dtype=int)
    return np.array(chosen, dtype=int)

# Combine the MI, tree-importance and coefficient rankings per task. All
# three score the MI-selected columns, so the indices address the *_mi matrices.
lean_c_idx = top_union(mi_c[top_c_idx], imp_c, coef_c, k=128)
lean_r_idx = top_union(mi_r[top_r_idx], imp_r, coef_r, k=128)

Xc_train_lean = Xc_train_mi[:, lean_c_idx]
Xc_valid_lean = Xc_valid_mi[:, lean_c_idx]
Xc_test_lean = Xc_test_mi[:, lean_c_idx]

Xr_train_lean = Xr_train_mi[:, lean_r_idx]
Xr_valid_lean = Xr_valid_mi[:, lean_r_idx]
Xr_test_lean = Xr_test_mi[:, lean_r_idx]

print("Lean shapes — Cls:", Xc_train_lean.shape, Xc_valid_lean.shape, Xc_test_lean.shape,
      " Reg:", Xr_train_lean.shape, Xr_valid_lean.shape, Xr_test_lean.shape)


## 4) Modeling (baselines, tiny tuning, test)

In [None]:

#@title Fit baselines & small models; tiny tuning; test metrics
import numpy as np, pandas as pd
from sklearn.metrics import balanced_accuracy_score, f1_score, cohen_kappa_score, confusion_matrix
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.linear_model import LogisticRegression, HuberRegressor, Ridge
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from numpy import expm1, sqrt
from scipy.sparse import vstack, issparse

def cls_metrics(y_true, y_pred):
    """Return classification scores for one set of predictions.

    Keys: ``balanced_acc``, ``macro_f1``, ``qwk`` (quadratic-weighted Cohen
    kappa) and ``cm`` (confusion matrix normalised over the true labels).
    """
    scores = {}
    scores["balanced_acc"] = balanced_accuracy_score(y_true, y_pred)
    scores["macro_f1"] = f1_score(y_true, y_pred, average="macro")
    scores["qwk"] = cohen_kappa_score(y_true, y_pred, weights="quadratic")
    scores["cm"] = confusion_matrix(y_true, y_pred, normalize="true")
    return scores

def reg_metrics(y_true_log, y_pred_log, y_true_minutes):
    """Return regression errors on the log scale and in minutes.

    ``mae_log`` and ``rmse_log`` compare the log-scale values directly;
    ``mae_min`` compares *y_true_minutes* with ``expm1`` of the predictions.
    """
    log_mse = mean_squared_error(y_true_log, y_pred_log)
    predicted_minutes = expm1(y_pred_log)
    result = {
        "mae_log": mean_absolute_error(y_true_log, y_pred_log),
        "rmse_log": sqrt(log_mse),
        "mae_min": mean_absolute_error(y_true_minutes, predicted_minutes),
    }
    return result

# Baselines and first models
d_cls = DummyClassifier(strategy="most_frequent")
d_cls.fit(Xc_train_lean, yc_train)
d_reg = DummyRegressor(strategy="median")
d_reg.fit(Xr_train_lean, yr_train)
logit = LogisticRegression(max_iter=300, class_weight="balanced")
logit.fit(Xc_train_lean, yc_train)
rf_c = RandomForestClassifier(n_estimators=200, max_depth=14, min_samples_leaf=3,
                              class_weight="balanced_subsample", n_jobs=-1, random_state=42)
rf_c.fit(Xc_train_lean, yc_train)
huber = HuberRegressor(alpha=1e-4, epsilon=1.5, max_iter=400)
huber.fit(Xr_train_lean, yr_train)
ridge = Ridge(alpha=1.0, random_state=42)
ridge.fit(Xr_train_lean, yr_train)
rf_r = RandomForestRegressor(n_estimators=200, max_depth=16, min_samples_leaf=3,
                             n_jobs=-1, random_state=42)
rf_r.fit(Xr_train_lean, yr_train)

# Validation report for the classifiers; the confusion matrix is left out
# because it is not a scalar.
valid_cls_scores = {}
for label, clf in (("dummy", d_cls), ("logit", logit), ("rf", rf_c)):
    clf_metrics = cls_metrics(yc_valid, clf.predict(Xc_valid_lean))
    valid_cls_scores[label] = {k: round(v, 4) for k, v in clf_metrics.items() if k != 'cm'}
print("Valid — Classification:", valid_cls_scores)

# Validation report for the regressors.
valid_minutes = df_valid['Duration_min'].values
valid_reg_scores = {}
for label, reg in (("dummy", d_reg), ("huber", huber), ("ridge", ridge), ("rf", rf_r)):
    regr_metrics = reg_metrics(yr_valid, reg.predict(Xr_valid_lean), valid_minutes)
    valid_reg_scores[label] = {k: round(v, 4) for k, v in regr_metrics.items()}
print("Valid — Regression:", valid_reg_scores)

# Tiny tuning
sev_grid = {"C":[0.7,1.0], "rf_n":[150,250], "rf_d":[12,14]}
# The baseline forest is the incumbent; a candidate replaces it only on a
# strictly better validation QWK.
best_sev = rf_c
best_qwk = cohen_kappa_score(yc_valid, rf_c.predict(Xc_valid_lean), weights="quadratic")


def _severity_candidates(grid):
    """Yield unfitted severity classifiers: logistic models, then forests."""
    for c_value in grid["C"]:
        yield LogisticRegression(max_iter=350, class_weight="balanced", C=c_value)
    for n_trees in grid["rf_n"]:
        for depth in grid["rf_d"]:
            yield RandomForestClassifier(n_estimators=n_trees, max_depth=depth, min_samples_leaf=3,
                                         class_weight="balanced_subsample", n_jobs=-1, random_state=42)


for candidate in _severity_candidates(sev_grid):
    candidate.fit(Xc_train_lean, yc_train)
    candidate_qwk = cohen_kappa_score(yc_valid, candidate.predict(Xc_valid_lean), weights="quadratic")
    if candidate_qwk > best_qwk:
        best_sev, best_qwk = candidate, candidate_qwk

dur_grid = {"alpha":[0.7,1.0,3.0], "rf_n":[150,250], "rf_d":[14,16]}
# Same scheme for duration, minimising validation MAE in minutes.
best_dur = rf_r
best_mae = reg_metrics(yr_valid, rf_r.predict(Xr_valid_lean), df_valid['Duration_min'].values)["mae_min"]


def _duration_candidates(grid):
    """Yield unfitted duration regressors: ridge models, then forests."""
    for alpha_value in grid["alpha"]:
        yield Ridge(alpha=alpha_value, random_state=42)
    for n_trees in grid["rf_n"]:
        for depth in grid["rf_d"]:
            yield RandomForestRegressor(n_estimators=n_trees, max_depth=depth, min_samples_leaf=3,
                                        n_jobs=-1, random_state=42)


for candidate in _duration_candidates(dur_grid):
    candidate.fit(Xr_train_lean, yr_train)
    candidate_mae = reg_metrics(yr_valid, candidate.predict(Xr_valid_lean),
                                df_valid['Duration_min'].values)["mae_min"]
    if candidate_mae < best_mae:
        best_dur, best_mae = candidate, candidate_mae

# Ordinal via thresholded regression (validation-only)
def qwk_thresholds(y_true, y_score):
    """Search three cut points that turn continuous scores into classes 1-4.

    Starts from the 25th/50th/75th percentiles of *y_score* and moves one
    threshold at a time by +/- step, keeping a move only when it strictly
    improves the quadratic-weighted Cohen kappa against *y_true*. Step sizes
    shrink from 0.2 down to 0.02.

    Returns ``(thresholds, best_kappa)``.
    """
    thr = np.percentile(y_score, [25,50,75]).astype(float)
    # Class = 1 + number of thresholds the score exceeds, clipped to [1, 4].
    def disc(s,t): return np.clip(1 + (s>t[0]) + (s>t[1]) + (s>t[2]), 1, 4).astype(int)
    best = thr.copy(); best_q = cohen_kappa_score(y_true, disc(y_score,best), weights="quadratic")
    for step in [0.2,0.1,0.05,0.02]:
        improved=True
        # Repeat passes at this step size until a full pass improves nothing.
        while improved:
            improved=False
            for i in range(3):
                for delta in [-step, step]:
                    # Re-sort after the nudge so the thresholds stay ascending.
                    tr = best.copy(); tr[i]+=delta; tr=np.sort(tr)
                    q = cohen_kappa_score(y_true, disc(y_score,tr), weights="quadratic")
                    if q>best_q: best, best_q, improved = tr, q, True
    return best, best_q
# Ridge regression on the severity labels, scored on validation after
# thresholding its continuous output.
reg_sev = Ridge(alpha=1.0, random_state=42)
reg_sev.fit(Xc_train_lean, yc_train)
valid_sev_scores = reg_sev.predict(Xc_valid_lean)
thr, q = qwk_thresholds(yc_valid, valid_sev_scores)
print("Ordinal ridge QWK (valid):", round(q,4), " thresholds:", thr)

# Train on train+valid and evaluate on test
def stack(X_tr, X_va, y_tr, y_va):
    """Concatenate two feature matrices row-wise, plus their targets.

    If either matrix is sparse the result is a CSR matrix; previously only
    *X_tr* was inspected, so a dense/sparse mix went down the dense path.
    Two dense inputs are stacked with ``np.vstack``.

    Returns ``(X, y)``.
    """
    if issparse(X_tr) or issparse(X_va):
        from scipy.sparse import csr_matrix
        # Convert both sides to CSR first so mixed inputs stack cleanly and
        # the result supports row indexing.
        X = vstack([csr_matrix(X_tr), csr_matrix(X_va)], format="csr")
    else:
        X = np.vstack([X_tr, X_va])
    y = np.concatenate([y_tr, y_va])
    return X, y
Xc_trva, yc_trva = stack(Xc_train_lean, Xc_valid_lean, yc_train, yc_valid)
Xr_trva, yr_trva = stack(Xr_train_lean, Xr_valid_lean, yr_train, yr_valid)
# Refit the selected estimators in place on train+valid.
best_sev.fit(Xc_trva, yc_trva)
best_dur.fit(Xr_trva, yr_trva)

# Severity: scores on the held-out test split.
sev_pred_test = best_sev.predict(Xc_test_lean)
sev_test = {}
sev_test["qwk"] = cohen_kappa_score(yc_test, sev_pred_test, weights="quadratic")
sev_test["bal_acc"] = balanced_accuracy_score(yc_test, sev_pred_test)
sev_test["macro_f1"] = f1_score(yc_test, sev_pred_test, average="macro")

# Duration: errors in minutes (after expm1) and on the log scale.
dur_pred_test_log = best_dur.predict(Xr_test_lean)
test_minutes = df_test["Duration_min"].values
mae_min = mean_absolute_error(test_minutes, expm1(dur_pred_test_log))
mae_log = mean_absolute_error(yr_test, dur_pred_test_log)
rmse_log = sqrt(mean_squared_error(yr_test, dur_pred_test_log))

sev_report = {k: round(v, 4) for k, v in sev_test.items()}
print("TEST — Severity:", sev_report)
dur_report = {"mae_min": round(mae_min, 2), "mae_log": round(mae_log, 4), "rmse_log": round(rmse_log, 4)}
print("TEST — Duration:", dur_report)


## 5) Packaging — Save models & inference helpers

In [None]:

#@title Save fitted models and provide simple inference helpers
import joblib, os, numpy as np
# Persist the two selected estimators.
# NOTE(review): only the models are written here; preprocess_pipe and the
# column-selection indices used by the helpers below are not saved.
MODEL_DIR = "/content/models"
os.makedirs(MODEL_DIR, exist_ok=True)
for estimator, file_name in ((best_sev, "severity_best.pkl"), (best_dur, "duration_best.pkl")):
    joblib.dump(estimator, os.path.join(MODEL_DIR, file_name))
print("Saved models in /content/models")

def preprocess_single_row(raw_row: dict, task="severity"):
    """Turn one raw record into the lean feature row a fitted model expects.

    Parameters
    ----------
    raw_row : dict
        Column name -> value for a single accident record.
    task : str
        ``"severity"`` selects the classification columns; any other value
        selects the regression columns.

    Returns
    -------
    The one-row matrix produced by ``preprocess_pipe`` after the variance
    mask, the MI selection and the lean selection for the chosen task.
    """
    import numpy as np
    import pandas as pd

    X = pd.DataFrame([raw_row])
    # The feature step reads Start_Time through the .dt accessor, which fails
    # on text; a raw dict typically carries the timestamp as a string.
    if "Start_Time" in X.columns:
        X["Start_Time"] = pd.to_datetime(X["Start_Time"])
    Xt = preprocess_pipe.transform(X)

    if task == "severity":
        mask, top_idx, lean_idx = mask_c, top_c_idx, lean_c_idx
    else:
        mask, top_idx, lean_idx = mask_r, top_r_idx, lean_r_idx
    # Same three selection stages, in the same order, as during training.
    Xt = Xt[:, np.where(mask)[0]]
    Xt = Xt[:, top_idx][:, lean_idx]
    return Xt

def predict_severity(raw_row: dict):
    """Predict the severity class of one raw record as a plain int."""
    features = preprocess_single_row(raw_row, "severity")
    predicted = best_sev.predict(features)
    return int(predicted[0])

def predict_duration_minutes(raw_row: dict):
    """Predict the duration of one raw record in minutes as a float.

    The model output is on the log1p scale, so it is mapped back with expm1.
    """
    features = preprocess_single_row(raw_row, "duration")
    predicted_log = best_dur.predict(features)[0]
    return float(np.expm1(predicted_log))
