### 1. Setup and Configuration
Imports libraries, defines file paths, and sets time windows for validation (Nov) and testing (Dec).

In [1]:
from pathlib import Path
import json
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# --- Inputs ------------------------------------------------------------------
MODEL_PATH = Path("Data/random_forrest_model.joblib")
DATA_PATH = Path("Data/data_forrest.parquet")

# Clusters that are scored and exported by this notebook.
CLUSTERS_TO_MODEL = [0, 8]

# --- Evaluation windows --------------------------------------------------------
# Validation covers [VAL_START, VAL_END): the end bound is exclusive.
VAL_START = pd.Timestamp("2018-11-01 00:00:00")
VAL_END = pd.Timestamp("2018-12-01 00:00:00")
# Test covers [TEST_START, TEST_END]: the end bound is the last hour, inclusive.
TEST_START = pd.Timestamp("2018-12-01 00:00:00")
TEST_END = pd.Timestamp("2018-12-31 23:00:00")

# --- Model inputs ----------------------------------------------------------------
# Column names fed to the model; lag_24h / lag_168h are recomputed later
# in the notebook rather than read from the parquet.
FEATURES = [
    "cluster_id",
    "hour",
    "day_of_week",
    "month",
    "is_weekend",
    "is_holiday",
    "lag_24h",
    "lag_168h",
]

# --- Outputs -----------------------------------------------------------------------
ARTIFACTS_DIR = Path("artifacts_rf")
PRED_DIR = Path("preds_rf")
for _out_dir in (ARTIFACTS_DIR, PRED_DIR):
    # Create output folders up front so later cells can write unconditionally.
    _out_dir.mkdir(parents=True, exist_ok=True)

# Small constant added to the RMSE before inverting it into a weight.
EPS = 1e-6
print("[OK] Config set.")


[OK] Config set.


### 2. Data Loading and Preprocessing
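Loads the pre-trained Random Forest models and the raw hourly data, then keeps only the modeled clusters and the time span from 168 hours before the validation window (the history the longest lag feature needs) through the end of the test window.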

In [2]:
# Fail fast with a conventional exception type. A bare `assert` is removed
# when Python runs with -O, which would let a missing file slip through.
if not MODEL_PATH.exists():
    raise FileNotFoundError(f"Missing model: {MODEL_PATH}")
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Missing data: {DATA_PATH}")

# NOTE(review): joblib.load unpickles arbitrary objects; only load model files
# that come from a trusted source.
loaded = joblib.load(MODEL_PATH)
if not hasattr(loaded, "get"):
    # The notebook expects a dict-like container keyed by target name.
    raise TypeError(
        f"{MODEL_PATH} must contain a dict-like object with a 'pickups' entry, "
        f"got {type(loaded).__name__}."
    )
rf_pickups = loaded.get("pickups", None)
rf_dropoffs = loaded.get("dropoffs", None)  # optional; None disables dropoff predictions

if rf_pickups is None:
    raise ValueError("random_forrest_model.joblib does not contain key 'pickups'.")

# read_parquet already returns a fresh frame, so no defensive copy is needed.
df = pd.read_parquet(DATA_PATH)

# Validate the schema before any column is touched.
if "datetime" not in df.columns:
    raise ValueError("data_forrest.parquet must contain a 'datetime' column.")

# Must contain cluster_id + ground truth pickups
need_cols = {"cluster_id", "datetime", "pickups"}
missing = need_cols - set(df.columns)
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# Snap timestamps to the hour so they line up with the hourly grid built later.
df["datetime"] = pd.to_datetime(df["datetime"]).dt.floor("h")
df["date"] = df["datetime"].dt.normalize()
df["hour"] = df["datetime"].dt.hour.astype(int)

# Optional dropoffs
has_dropoffs_truth = "dropoffs" in df.columns
print("[INFO] dropoffs truth present:", has_dropoffs_truth)
print("[INFO] df rows:", len(df), "clusters:", sorted(df["cluster_id"].unique())[:10])

# Keep only the modeled clusters, and only the span the notebook needs:
# 168 hours before validation (history for the longest lag) through TEST_END.
HIST_START = (VAL_START - pd.Timedelta(hours=168)).floor("h")
df = df[(df["cluster_id"].isin(CLUSTERS_TO_MODEL)) &
        (df["datetime"] >= HIST_START) &
        (df["datetime"] <= TEST_END)].copy()

print("[INFO] filtered df rows:", len(df), "from", df["datetime"].min(), "to", df["datetime"].max())


[INFO] dropoffs truth present: True
[INFO] df rows: 171840 clusters: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[INFO] filtered df rows: 3264 from 2018-10-25 00:00:00 to 2018-12-31 23:00:00


### 3. Feature Engineering
Reconstructs the hourly time grid to handle missing data, computes calendar features, and generates lag features (24h, 168h) dynamically to prevent data leakage.

In [3]:
def make_hour_grid(clusters, start, end):
    """Build the complete cluster-by-hour grid.

    Args:
        clusters: Iterable of cluster identifiers.
        start: First timestamp of the hourly range (inclusive).
        end: Last timestamp of the hourly range (inclusive).

    Returns:
        DataFrame with columns ``cluster_id`` and ``datetime`` holding one row
        per (cluster, hour) pair, ordered cluster by cluster.
    """
    hourly_index = pd.date_range(start=start, end=end, freq="h")
    pairs = pd.MultiIndex.from_product(
        [clusters, hourly_index],
        names=["cluster_id", "datetime"],
    )
    # index=False turns both levels into ordinary columns with a fresh RangeIndex.
    return pairs.to_frame(index=False)

grid = make_hour_grid(CLUSTERS_TO_MODEL, HIST_START, TEST_END)

# Columns pulled from the parquet: the targets plus, when present, the holiday flag.
truth_cols = ["pickups"] + (["dropoffs"] if has_dropoffs_truth else [])
has_holiday_flag = "is_holiday" in df.columns
source_cols = ["cluster_id", "datetime"] + truth_cols + (["is_holiday"] if has_holiday_flag else [])

# A single validated merge. The shift-based lags below assume exactly one row
# per (cluster, hour); validate="one_to_one" raises a MergeError on duplicate
# keys instead of silently multiplying rows and misaligning the lags.
base = grid.merge(
    df[source_cols],
    on=["cluster_id", "datetime"],
    how="left",
    validate="one_to_one",
)

# Hours absent from the parquet are treated as zero activity.
base["pickups"] = base["pickups"].fillna(0).astype(float)
if has_dropoffs_truth:
    base["dropoffs"] = base["dropoffs"].fillna(0).astype(float)

# Recreate calendar features from the timestamp (don't trust the parquet values)
base["hour"] = base["datetime"].dt.hour.astype(int)
base["day_of_week"] = base["datetime"].dt.dayofweek.astype(int)
base["month"] = base["datetime"].dt.month.astype(int)
base["is_weekend"] = (base["day_of_week"] >= 5).astype(int)

# IMPORTANT: keep is_holiday from parquet if it exists, otherwise default to 0.
# Grid hours with no parquet row also default to 0.
if has_holiday_flag:
    base["is_holiday"] = base["is_holiday"].fillna(0).astype(int)
else:
    base["is_holiday"] = 0

# Lags are taken from the observed series 24 and 168 rows (= hours) earlier
# within each cluster; the grid is gap-free, so a row offset is an hour offset.
base = base.sort_values(["cluster_id","datetime"]).reset_index(drop=True)
base["lag_24h"]  = base.groupby("cluster_id")["pickups"].shift(24)
base["lag_168h"] = base.groupby("cluster_id")["pickups"].shift(168)

# For dropoffs model, compute dropoff lags too (only if predicting dropoffs)
if rf_dropoffs is not None and has_dropoffs_truth:
    base["lag_24h_drop"]  = base.groupby("cluster_id")["dropoffs"].shift(24)
    base["lag_168h_drop"] = base.groupby("cluster_id")["dropoffs"].shift(168)

# Drop rows where pickups lags are not available (first 168h of each cluster)
base_pred = base.dropna(subset=["lag_24h","lag_168h"]).copy()

print("[INFO] base_pred rows after lag drop:", len(base_pred))
print("[INFO] time span base_pred:", base_pred["datetime"].min(), "to", base_pred["datetime"].max())


[INFO] base_pred rows after lag drop: 2928
[INFO] time span base_pred: 2018-11-01 00:00:00 to 2018-12-31 23:00:00


### 4. Prediction and Evaluation Loop
Iterates through clusters to generate predictions for pickups and dropoffs, computes RMSE for the validation set, and saves results to parquet.

In [4]:
def make_split(df_):
    """Slice a frame into validation and test windows.

    Validation is ``VAL_START <= datetime < VAL_END``; test is
    ``TEST_START <= datetime <= TEST_END``.

    Returns:
        Tuple ``(val, test)`` of independent copies of the matching rows.
    """
    stamps = df_["datetime"]
    in_val = (stamps >= VAL_START) & (stamps < VAL_END)
    in_test = (stamps >= TEST_START) & (stamps <= TEST_END)
    return df_.loc[in_val].copy(), df_.loc[in_test].copy()

def predict_pickups(block: pd.DataFrame) -> pd.DataFrame:
    """Score a frame with the pickups model.

    The predictions are written into ``block`` itself as column
    ``y_pred_rf_pickups``; the same frame object is returned.
    """
    block["y_pred_rf_pickups"] = rf_pickups.predict(block[FEATURES])
    return block

def predict_dropoffs(block: pd.DataFrame) -> pd.DataFrame:
    """Score a frame with the dropoffs model.

    The model reads its history from the generic feature names ``lag_24h`` and
    ``lag_168h``, so the dropoff lags (``lag_24h_drop`` / ``lag_168h_drop``)
    are presented to it under those names.

    The substitution is done on a temporary copy: the ``lag_24h`` /
    ``lag_168h`` columns of ``block`` (the pickup lags) are left untouched, so
    a later pickups prediction on the same frame still sees the right history.

    Args:
        block: Frame holding every column in ``FEATURES`` plus
            ``lag_24h_drop`` and ``lag_168h_drop``.

    Returns:
        The same ``block`` object, with column ``y_pred_rf_dropoffs`` added.
    """
    # Selecting FEATURES last fixes the column order the model receives.
    X = block.assign(
        lag_24h=block["lag_24h_drop"],
        lag_168h=block["lag_168h_drop"],
    )[FEATURES]
    block["y_pred_rf_dropoffs"] = rf_dropoffs.predict(X)
    return block

def finalize(block: pd.DataFrame, split_name: str) -> pd.DataFrame:
    """Turn a scored frame into the export layout.

    Args:
        block: Frame with ``datetime``, ``cluster_id``, ``pickups`` and
            ``y_pred_rf_pickups``; ``dropoffs`` / ``y_pred_rf_dropoffs`` are
            used when dropoff predictions are available.
        split_name: Label stored in the ``split`` column.

    Returns:
        Frame with date, hour, cluster, split label, truth, prediction and
        squared error for pickups, followed by the same three dropoff columns
        (filled with NaN when dropoffs were not predicted).
    """
    stamps = block["datetime"]
    out = pd.DataFrame({
        "date": stamps.dt.normalize(),
        "hour": stamps.dt.hour.astype(int),
        "cluster_id": block["cluster_id"].astype(int),
        "split": split_name,
        "y_true_pickups": block["pickups"].astype(float),
        "y_pred_rf_pickups": block["y_pred_rf_pickups"].astype(float),
    })
    pickup_error = out["y_true_pickups"] - out["y_pred_rf_pickups"]
    out["se_pickups_rf"] = pickup_error ** 2

    dropoffs_scored = (
        rf_dropoffs is not None
        and has_dropoffs_truth
        and "y_pred_rf_dropoffs" in block.columns
    )
    if not dropoffs_scored:
        # Keep the schema stable across clusters: emit the columns as NaN.
        for empty_col in ("y_true_dropoffs", "y_pred_rf_dropoffs", "se_dropoffs_rf"):
            out[empty_col] = np.nan
        return out

    out["y_true_dropoffs"] = block["dropoffs"].astype(float)
    out["y_pred_rf_dropoffs"] = block["y_pred_rf_dropoffs"].astype(float)
    dropoff_error = out["y_true_dropoffs"] - out["y_pred_rf_dropoffs"]
    out["se_dropoffs_rf"] = dropoff_error ** 2
    return out

# One summary record per exported cluster; turned into a DataFrame after the loop.
summaries = []

for cid in CLUSTERS_TO_MODEL:
    dfi = base_pred[base_pred["cluster_id"] == cid].copy()
    val, test = make_split(dfi)

    if len(val) == 0:
        # Without validation rows there is no RMSE to report, so skip the cluster.
        print(f"[WARN] Cluster {cid}: empty validation slice.")
        continue

    has_test = len(test) > 0

    # pickups
    val = predict_pickups(val)
    if has_test:
        test = predict_pickups(test)

    rmse_val_pickups = float(np.sqrt(mean_squared_error(val["pickups"], val["y_pred_rf_pickups"])))

    # dropoffs model (optional, but handle leak-free by using dropoff lags)
    rmse_val_dropoffs = None
    if rf_dropoffs is not None and has_dropoffs_truth:
        needed_drop_lags = {"lag_24h_drop","lag_168h_drop"}
        if not needed_drop_lags.issubset(val.columns):
            raise ValueError("Dropoffs lags missing; cannot safely predict dropoffs.")
        val = predict_dropoffs(val)
        if has_test:
            test = predict_dropoffs(test)
        rmse_val_dropoffs = float(np.sqrt(mean_squared_error(val["dropoffs"], val["y_pred_rf_dropoffs"])))

    # Concatenate only the frames that actually exist. Padding with an empty,
    # all-object placeholder frame relies on deprecated pandas concat behaviour
    # and can upcast the exported columns to object.
    split_frames = [finalize(val, "val")]
    if has_test:
        split_frames.append(finalize(test, "test"))

    out = pd.concat(split_frames, ignore_index=True)
    out_path = PRED_DIR / f"rf_cluster_{cid}_preds.parquet"
    out.to_parquet(out_path, index=False)

    # rmse_mean definition consistent with your ensemble code:
    # average of pickups and dropoffs RMSE, or pickups alone when dropoffs were not scored.
    rmse_mean_val = 0.5*(rmse_val_pickups + (rmse_val_dropoffs if rmse_val_dropoffs is not None else rmse_val_pickups))

    summaries.append({
        "cluster_id": cid,
        "val_rmse_pickups": rmse_val_pickups,
        "val_rmse_dropoffs": rmse_val_dropoffs,
        "val_rmse_mean": rmse_mean_val,
        "parquet_path": str(out_path),
    })

    print(f"[OK] Cluster {cid} | VAL rmse_pick={rmse_val_pickups:.3f} | saved {out_path}")

df_summary = pd.DataFrame(summaries)
df_summary


[OK] Cluster 0 | VAL rmse_pick=13.361 | saved preds_rf\rf_cluster_0_preds.parquet
[OK] Cluster 8 | VAL rmse_pick=120.064 | saved preds_rf\rf_cluster_8_preds.parquet


Unnamed: 0,cluster_id,val_rmse_pickups,val_rmse_dropoffs,val_rmse_mean,parquet_path
0,0,13.360802,16.924097,15.14245,preds_rf\rf_cluster_0_preds.parquet
1,8,120.063918,165.202078,142.632998,preds_rf\rf_cluster_8_preds.parquet


### 5. Global Weighting and Artifacts
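Averages the per-cluster mean validation RMSE into a single global score, converts it into a raw ensemble weight $w = 1 / (\mathrm{RMSE} + \varepsilon)$, and saves the weight metadata together with a copy of the model in the artifacts directory.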

In [5]:
if len(df_summary) == 0:
    raise ValueError("No clusters exported; cannot compute general RMSE/weight.")

# Global score: unweighted mean of the per-cluster validation RMSE means.
general_rmse_rf = float(df_summary["val_rmse_mean"].mean())
# Inverse-RMSE weight; EPS keeps the division finite if the RMSE is exactly 0.
general_weight_rf = 1.0 / (general_rmse_rf + EPS)

weight_info = {
    "model": "RandomForest",
    "clusters": list(map(int, CLUSTERS_TO_MODEL)),
    "general_rmse_val_nov": general_rmse_rf,
    "general_weight_raw": general_weight_rf,
    "val_start": str(VAL_START),
    "val_end": str(VAL_END),
    "test_start": str(TEST_START),
    # Recorded so the metadata describes both evaluation windows completely.
    "test_end": str(TEST_END),
    "features": FEATURES,
    "lags_recomputed_safely": True,
    "has_dropoffs_model": rf_dropoffs is not None,
}

weights_path = ARTIFACTS_DIR / "rf_general_weight.json"
weights_path.write_text(json.dumps(weight_info, indent=2))

print(f"[INFO] RF general VAL RMSE mean: {general_rmse_rf:.6f}")
print(f"[INFO] RF raw weight: {general_weight_rf:.6f}")
print(f"[INFO] Saved: {weights_path}")

# Optional: store the model next to the weight file. This re-serializes the
# loaded object rather than copying the original file byte for byte.
joblib.dump(loaded, ARTIFACTS_DIR / "random_forrest_model.joblib")
print("[INFO] Copied model to artifacts_rf/random_forrest_model.joblib")


[INFO] RF general VAL RMSE mean: 78.887724
[INFO] RF raw weight: 0.012676
[INFO] Saved: artifacts_rf\rf_general_weight.json
[INFO] Copied model to artifacts_rf/random_forrest_model.joblib
