In [1]:
from pathlib import Path
import json
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error

# ---------------------------------------------------------------------------
# Input artifacts (paths are relative to the notebook's working directory)
# ---------------------------------------------------------------------------
MODEL_PATH = Path("random_forrest_model.joblib")
DATA_PATH = Path("data_forrest.parquet")

# Cluster ids for which predictions are exported.
CLUSTERS_TO_MODEL = [0, 1]

# Validation covers the half-open window [VAL_START, VAL_END); the test split
# takes every row at or after TEST_START.
VAL_START = pd.to_datetime("2018-10-01")
VAL_END = pd.to_datetime("2018-11-01")
TEST_START = pd.to_datetime("2018-11-01")

# Model input columns, in the order they are handed to the model.
# They must match the columns used at training time.
FEATURES = [
    "cluster_id",
    "hour",
    "day_of_week",
    "month",
    "is_weekend",
    "is_holiday",
    "lag_24h",
    "lag_168h",
]

# ---------------------------------------------------------------------------
# Output locations (created up front so later cells can write immediately)
# ---------------------------------------------------------------------------
ARTIFACTS_DIR = Path("artifacts_rf")
PRED_DIR = Path("preds_rf")
PRED_DIR.mkdir(parents=True, exist_ok=True)
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)


In [2]:
# Fail fast with explicit exceptions: unlike `assert`, these checks are not
# stripped when Python runs with -O.
if not MODEL_PATH.exists():
    raise FileNotFoundError(f"Missing model: {MODEL_PATH}")
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Missing data: {DATA_PATH}")

# NOTE(security): joblib.load unpickles arbitrary Python objects, so only load
# model files that come from a trusted source.
loaded = joblib.load(MODEL_PATH)
rf_pickups = loaded.get("pickups", None)
rf_dropoffs = loaded.get("dropoffs", None)  # may be None

if rf_pickups is None:
    raise ValueError(f"{MODEL_PATH.name} does not contain key 'pickups'.")

# read_parquet already returns a fresh frame, so no defensive copy is needed.
df = pd.read_parquet(DATA_PATH)

# A 'datetime' column is required; 'date' and 'hour' are derived from it below.
if "datetime" not in df.columns:
    raise ValueError(f"{DATA_PATH.name} must contain a 'datetime' column.")

df["datetime"] = pd.to_datetime(df["datetime"])
df["date"] = df["datetime"].dt.normalize()
# Overwrite any stored 'hour' so it is always consistent with 'datetime'.
df["hour"] = df["datetime"].dt.hour

missing = [c for c in FEATURES if c not in df.columns]
if missing:
    raise ValueError(f"Missing required feature columns: {missing}")

# Ground truth: 'pickups' is mandatory; 'dropoffs' is optional and only used
# later when both the column and a dropoffs model are present.
if "pickups" not in df.columns:
    raise ValueError(f"{DATA_PATH.name} must contain ground truth column 'pickups'.")


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [3]:
def make_split(df_, val_start=None, val_end=None, test_start=None):
    """Split a frame into validation and test slices by its 'datetime' column.

    Parameters
    ----------
    df_ : pd.DataFrame
        Frame with a 'datetime' column comparable to the given bounds.
    val_start, val_end : timestamp-like, optional
        Validation window, half-open: ``val_start <= datetime < val_end``.
        Default to the module-level ``VAL_START`` / ``VAL_END``.
    test_start : timestamp-like, optional
        Inclusive lower bound of the test slice (no upper bound is applied).
        Defaults to the module-level ``TEST_START``.

    Returns
    -------
    (val, test) : tuple of pd.DataFrame
        Independent copies, so callers may add columns without touching `df_`.
    """
    # Resolve defaults at call time so edits to the config cell are picked up.
    val_start = VAL_START if val_start is None else val_start
    val_end = VAL_END if val_end is None else val_end
    test_start = TEST_START if test_start is None else test_start

    stamps = df_["datetime"]
    val = df_[(stamps >= val_start) & (stamps < val_end)].copy()
    test = df_[stamps >= test_start].copy()
    return val, test

def predict_block(model, block: pd.DataFrame, target_name: str, pred_col: str, features=None):
    """Add model predictions (and absolute errors, when possible) to `block`.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)``; it receives ``block[features]``.
    block : pd.DataFrame
        Rows to score. NOTE: this frame is modified in place, so pass a copy
        if the original must stay untouched.
    target_name : str
        Ground-truth column; if present, ``ae_<pred_col>`` is also written.
    pred_col : str
        Name of the column that receives the predictions.
    features : sequence of str, optional
        Input columns handed to the model. Defaults to the module-level
        ``FEATURES`` list.

    Returns
    -------
    pd.DataFrame
        The same `block` object, with the new column(s) added.
    """
    feature_cols = FEATURES if features is None else list(features)
    block[pred_col] = model.predict(block[feature_cols])
    # Absolute error is only computable when ground truth is available.
    if target_name in block.columns:
        block[f"ae_{pred_col}"] = (block[target_name] - block[pred_col]).abs()
    return block

def _finalize_rf_block(block, split_name):
    """Project a scored block onto the standardized RF prediction schema.

    The output always carries the dropoff columns; they are filled with NaN
    when `block` lacks either the 'dropoffs' truth or its prediction column.
    """
    out = pd.DataFrame({
        "date": block["date"].values,
        "hour": block["hour"].values,
        "cluster_id": block["cluster_id"].values,
        "split": split_name,
        "y_true_pickups": block["pickups"].values,
        "y_pred_rf_pickups": block["y_pred_rf_pickups"].values,
    })
    out["ae_pickups_rf"] = (out["y_true_pickups"] - out["y_pred_rf_pickups"]).abs()

    if "dropoffs" in block.columns and "y_pred_rf_dropoffs" in block.columns:
        out["y_true_dropoffs"] = block["dropoffs"].values
        out["y_pred_rf_dropoffs"] = block["y_pred_rf_dropoffs"].values
        out["ae_dropoffs_rf"] = (out["y_true_dropoffs"] - out["y_pred_rf_dropoffs"]).abs()
        out["ae_mean_rf"] = 0.5 * (out["ae_pickups_rf"] + out["ae_dropoffs_rf"])
    else:
        out["y_true_dropoffs"] = np.nan
        out["y_pred_rf_dropoffs"] = np.nan
        out["ae_dropoffs_rf"] = np.nan
        out["ae_mean_rf"] = out["ae_pickups_rf"]  # fallback if only pickups
    return out


def export_cluster(df, cluster_id: int):
    """Score one cluster on the validation/test splits and write a parquet file.

    Parameters
    ----------
    df : pd.DataFrame
        Full dataset; must contain 'cluster_id', 'datetime', 'date', 'hour',
        'pickups' and the model feature columns.
    cluster_id : int
        Cluster whose rows are scored and exported.

    Returns
    -------
    dict
        Keys: 'cluster_id', 'val_mae_pickups', 'val_mae_dropoffs' (None when
        dropoffs are not modelled), 'val_mae_mean', 'parquet_path'.

    Raises
    ------
    ValueError
        If the cluster has no rows in the validation window.

    Side effects
    ------------
    Writes ``PRED_DIR / rf_cluster_<cluster_id>_preds.parquet`` and prints a
    warning when the test slice is empty.
    """
    dfi = df[df["cluster_id"] == cluster_id].copy()
    val, test = make_split(dfi)
    has_test = len(test) > 0

    if len(val) == 0:
        raise ValueError(f"Cluster {cluster_id}: empty October validation slice.")
    if not has_test:
        print(f"[WARN] Cluster {cluster_id}: empty test slice (Nov-Dec).")

    # --- VAL predictions (for MAE/weight) ---
    val = predict_block(rf_pickups, val, "pickups", "y_pred_rf_pickups")
    mae_val_pickups = mean_absolute_error(val["pickups"], val["y_pred_rf_pickups"])

    # Dropoffs are optional: both a model and a ground-truth column are needed.
    mae_val_dropoffs = None
    if rf_dropoffs is not None and "dropoffs" in val.columns:
        val = predict_block(rf_dropoffs, val, "dropoffs", "y_pred_rf_dropoffs")
        mae_val_dropoffs = mean_absolute_error(val["dropoffs"], val["y_pred_rf_dropoffs"])

    # --- TEST predictions ---
    if has_test:
        test = predict_block(rf_pickups, test, "pickups", "y_pred_rf_pickups")
        if rf_dropoffs is not None and "dropoffs" in test.columns:
            test = predict_block(rf_dropoffs, test, "dropoffs", "y_pred_rf_dropoffs")

    # Standardize output columns to match MLP parquet schema as closely as possible
    df_val_out = _finalize_rf_block(val, "val")

    # Only concatenate non-empty frames: concatenating a column-only placeholder
    # DataFrame is deprecated in recent pandas and can change result dtypes.
    frames = [df_val_out]
    if has_test:
        frames.append(_finalize_rf_block(test, "test"))
    df_out = pd.concat(frames, ignore_index=True)

    parquet_path = PRED_DIR / f"rf_cluster_{cluster_id}_preds.parquet"
    df_out.to_parquet(parquet_path, index=False)

    summary = {
        "cluster_id": cluster_id,
        "val_mae_pickups": float(mae_val_pickups),
        "val_mae_dropoffs": None if mae_val_dropoffs is None else float(mae_val_dropoffs),
        "val_mae_mean": float(df_val_out["ae_mean_rf"].mean()),
        "parquet_path": str(parquet_path),
    }
    return summary

# Export every configured cluster. A ValueError raised while exporting a
# cluster is reported as a warning and that cluster is left out of the summary.
summaries = []
for cid in CLUSTERS_TO_MODEL:
    try:
        cluster_summary = export_cluster(df, cid)
    except ValueError as e:
        print(f"[WARN] {e}")
    else:
        summaries.append(cluster_summary)
        print(f"[OK] Exported cluster {cid}")

# One row per successfully exported cluster; shown as the cell output.
df_summary = pd.DataFrame(summaries)
df_summary


[OK] Exported cluster 0
[OK] Exported cluster 1


Unnamed: 0,cluster_id,val_mae_pickups,val_mae_dropoffs,val_mae_mean,parquet_path
0,0,6.126288,6.256721,6.191504,preds_rf/rf_cluster_0_preds.parquet
1,1,16.897902,18.168927,17.533415,preds_rf/rf_cluster_1_preds.parquet


In [4]:
if len(df_summary) == 0:
    raise ValueError("No clusters exported; cannot compute general MAE/weight.")

# Unweighted mean of the per-cluster validation MAEs.
general_mae_rf = float(df_summary["val_mae_mean"].mean())
if not np.isfinite(general_mae_rf):
    # A NaN/inf MAE would yield a meaningless weight and invalid JSON output.
    raise ValueError(f"Non-finite general validation MAE: {general_mae_rf}")

eps = 1e-6  # guards against division by zero for a perfect (0.0) MAE
general_weight_rf = 1.0 / (general_mae_rf + eps)  # raw weight; normalize later across models

# Record the clusters that were actually exported: clusters skipped with a
# [WARN] during export do not contribute to the MAE above.
exported_clusters = [int(c) for c in df_summary["cluster_id"]]

weight_info = {
    "model": "RandomForest",
    "clusters": exported_clusters,
    "general_mae_val_oct": general_mae_rf,
    "general_weight_raw": general_weight_rf,
    "val_start": str(VAL_START.date()),
    "val_end": str(VAL_END.date()),
    "features": FEATURES,
    "has_dropoffs_model": rf_dropoffs is not None,
}

weights_path = ARTIFACTS_DIR / "rf_general_weight.json"
weights_path.write_text(json.dumps(weight_info, indent=2))
print(f"[INFO] RF general Oct MAE: {general_mae_rf:.6f}")
print(f"[INFO] RF raw weight: {general_weight_rf:.6f}")
print(f"[INFO] Saved: {weights_path}")

# Optional: copy model artifact into artifacts_rf for consistency
joblib.dump(loaded, ARTIFACTS_DIR / "random_forrest_model.joblib")


[INFO] RF general Oct MAE: 11.862459
[INFO] RF raw weight: 0.084300
[INFO] Saved: artifacts_rf/rf_general_weight.json


['artifacts_rf/random_forrest_model.joblib']