In [4]:
from pathlib import Path
import json
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error

# ===== INPUTS =====
DATA_PATH = Path("bike_data_clean.parquet")

# Your pretrained regressors (pickups + dropoffs)
DEP_MODEL_PATH = Path("alex_ridge_model.joblib")  # pickups model
ARR_MODEL_PATH = Path("alex_ridge_model_dropoffs.joblib")    # dropoffs model

CLUSTERS_TO_MODEL = [0, 1]

# Weights from October
VAL_START = pd.to_datetime("2018-10-01")
VAL_END   = pd.to_datetime("2018-11-01")  # exclusive
TEST_START = pd.to_datetime("2018-11-01")

# Feature columns used in training (from your code)
NUM_FEATS = ['lag_1','lag_24','lag_168','rmean_3h','rmean_24h']
CAT_FEATS = ['hour','weekday','gmm20_cluster']
FEATURES = NUM_FEATS + CAT_FEATS

# Lag/rolling history needed before Oct 1
# Need at least 168 hours + 24 hours + 1 hour buffer -> ~8 days is safe
HISTORY_BUFFER_DAYS = 10

# We'll build data only up to end of Dec
END_TIME = pd.Timestamp("2018-12-31 23:00:00")

# ===== OUTPUTS =====
ARTIFACTS_DIR = Path("artifacts_regressor")
PRED_DIR = Path("preds_regressor")
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
PRED_DIR.mkdir(parents=True, exist_ok=True)


In [6]:
from sklearn.base import TransformerMixin, BaseEstimator

class DenseTransformer(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        try:
            return X.toarray()
        except Exception:
            return X


In [7]:
assert DEP_MODEL_PATH.exists(), f"Missing: {DEP_MODEL_PATH}"
assert ARR_MODEL_PATH.exists(), f"Missing: {ARR_MODEL_PATH}"

dep_model = joblib.load(DEP_MODEL_PATH)
arr_model = joblib.load(ARR_MODEL_PATH)

print("[INFO] Departures model:", type(dep_model))
print("[INFO] Arrivals model  :", type(arr_model))


[INFO] Departures model: <class 'sklearn.pipeline.Pipeline'>
[INFO] Arrivals model  : <class 'sklearn.pipeline.Pipeline'>


In [8]:
# Time window to load: (Oct 1 - buffer) ... Dec 31
START_TIME = (VAL_START - pd.Timedelta(days=HISTORY_BUFFER_DAYS)).floor("D")
print("[INFO] Using time window:", START_TIME, "to", END_TIME)

# Read only necessary columns
needed_cols = ["gmm20_cluster","end_gmm20_cluster","start_date","start_hour","stop_date","stop_hour"]
df = pd.read_parquet(DATA_PATH, columns=needed_cols)

clusters = set(CLUSTERS_TO_MODEL)
mask = df["gmm20_cluster"].isin(clusters) | df["end_gmm20_cluster"].isin(clusters)
df = df.loc[mask].copy()

print("[INFO] Filtered df shape:", df.shape)
print("[INFO] Columns:", list(df.columns))


[INFO] Using time window: 2018-09-21 00:00:00 to 2018-12-31 23:00:00
[INFO] Filtered df shape: (1540295, 6)
[INFO] Columns: ['gmm20_cluster', 'end_gmm20_cluster', 'start_date', 'start_hour', 'stop_date', 'stop_hour']


In [9]:
# Build hourly timestamps (vectorized)
start_ts = pd.to_datetime(df["start_date"]).dt.normalize() + pd.to_timedelta(df["start_hour"], unit="h")
stop_ts  = pd.to_datetime(df["stop_date"]).dt.normalize()  + pd.to_timedelta(df["stop_hour"], unit="h")

# Pickups aggregation
pick_mask = df["gmm20_cluster"].isin(clusters)
pick_tbl = pd.DataFrame({
    "gmm20_cluster": df.loc[pick_mask, "gmm20_cluster"].astype("int16").values,
    "timestamp": start_ts.loc[pick_mask].values
})
pick_tbl["timestamp"] = pd.to_datetime(pick_tbl["timestamp"]).dt.floor("H")
pick_tbl = pick_tbl[(pick_tbl["timestamp"] >= START_TIME) & (pick_tbl["timestamp"] <= END_TIME)]
agg_pick = (
    pick_tbl.groupby(["gmm20_cluster","timestamp"])
    .size()
    .rename("pickups")
    .reset_index()
)

# Dropoffs aggregation
drop_mask = df["end_gmm20_cluster"].isin(clusters)
drop_tbl = pd.DataFrame({
    "gmm20_cluster": df.loc[drop_mask, "end_gmm20_cluster"].astype("int16").values,
    "timestamp": stop_ts.loc[drop_mask].values
})
drop_tbl["timestamp"] = pd.to_datetime(drop_tbl["timestamp"]).dt.floor("H")
drop_tbl = drop_tbl[(drop_tbl["timestamp"] >= START_TIME) & (drop_tbl["timestamp"] <= END_TIME)]
agg_drop = (
    drop_tbl.groupby(["gmm20_cluster","timestamp"])
    .size()
    .rename("dropoffs")
    .reset_index()
)

print("[INFO] agg_pick shape:", agg_pick.shape)
print("[INFO] agg_drop shape:", agg_drop.shape)
print(agg_pick.head())


[INFO] agg_pick shape: (4729, 3)
[INFO] agg_drop shape: (4784, 3)
   gmm20_cluster           timestamp  pickups
0              0 2018-09-21 00:00:00       18
1              0 2018-09-21 01:00:00        8
2              0 2018-09-21 02:00:00        9
3              0 2018-09-21 03:00:00        3
4              0 2018-09-21 05:00:00        2


  pick_tbl["timestamp"] = pd.to_datetime(pick_tbl["timestamp"]).dt.floor("H")
  drop_tbl["timestamp"] = pd.to_datetime(drop_tbl["timestamp"]).dt.floor("H")


In [10]:
def make_hour_grid(clusters_list, start, end):
    hours = pd.date_range(start, end, freq="H")
    grid = pd.MultiIndex.from_product([clusters_list, hours], names=["gmm20_cluster","timestamp"]).to_frame(index=False)
    return grid

grid = make_hour_grid(CLUSTERS_TO_MODEL, START_TIME, END_TIME)

ts_pick = grid.merge(agg_pick, on=["gmm20_cluster","timestamp"], how="left")
ts_pick["pickups"] = ts_pick["pickups"].fillna(0).astype("float32")

ts_drop = grid.merge(agg_drop, on=["gmm20_cluster","timestamp"], how="left")
ts_drop["dropoffs"] = ts_drop["dropoffs"].fillna(0).astype("float32")

print("[INFO] ts_pick shape:", ts_pick.shape, "ts_drop shape:", ts_drop.shape)


[INFO] ts_pick shape: (4896, 3) ts_drop shape: (4896, 3)


  hours = pd.date_range(start, end, freq="H")


In [11]:
def add_lags_rolls(df_in, value_col, lags=(1,24,168)):
    df = df_in.sort_values(["gmm20_cluster","timestamp"]).copy()

    for lag in lags:
        df[f"lag_{lag}"] = df.groupby("gmm20_cluster")[value_col].shift(lag)

    shifted = df.groupby("gmm20_cluster")[value_col].shift(1)
    df["rmean_3h"]  = shifted.groupby(df["gmm20_cluster"]).rolling(3).mean().reset_index(level=0, drop=True)
    df["rmean_24h"] = shifted.groupby(df["gmm20_cluster"]).rolling(24).mean().reset_index(level=0, drop=True)

    df["hour"] = df["timestamp"].dt.hour.astype("int8")
    df["weekday"] = df["timestamp"].dt.weekday.astype("int8")

    # This is your target in training
    df["y"] = df[value_col].astype(float)
    return df

ts_pick_feat = add_lags_rolls(ts_pick, "pickups", lags=(1,24,168)).dropna().reset_index(drop=True)
ts_drop_feat = add_lags_rolls(ts_drop, "dropoffs", lags=(1,24,168)).dropna().reset_index(drop=True)

print("[INFO] ts_pick_feat cols:", [c for c in FEATURES if c in ts_pick_feat.columns])
print("[INFO] ts_drop_feat cols:", [c for c in FEATURES if c in ts_drop_feat.columns])
print("[INFO] ts_pick_feat shape:", ts_pick_feat.shape)
print("[INFO] ts_drop_feat shape:", ts_drop_feat.shape)


[INFO] ts_pick_feat cols: ['lag_1', 'lag_24', 'lag_168', 'rmean_3h', 'rmean_24h', 'hour', 'weekday', 'gmm20_cluster']
[INFO] ts_drop_feat cols: ['lag_1', 'lag_24', 'lag_168', 'rmean_3h', 'rmean_24h', 'hour', 'weekday', 'gmm20_cluster']
[INFO] ts_pick_feat shape: (4560, 11)
[INFO] ts_drop_feat shape: (4560, 11)


In [12]:
missing_pick = [c for c in FEATURES if c not in ts_pick_feat.columns]
missing_drop = [c for c in FEATURES if c not in ts_drop_feat.columns]
if missing_pick:
    raise ValueError(f"Pickups feature table missing columns: {missing_pick}")
if missing_drop:
    raise ValueError(f"Dropoffs feature table missing columns: {missing_drop}")

oct_pick = ts_pick_feat[(ts_pick_feat["timestamp"] >= VAL_START) & (ts_pick_feat["timestamp"] < VAL_END)]
oct_drop = ts_drop_feat[(ts_drop_feat["timestamp"] >= VAL_START) & (ts_drop_feat["timestamp"] < VAL_END)]
if len(oct_pick) == 0 or len(oct_drop) == 0:
    raise ValueError("No October rows found after feature creation. Check time window / buffer.")

print("[OK] Sanity passed: features + October coverage.")
print("[INFO] October rows pickups:", len(oct_pick), "dropoffs:", len(oct_drop))


[OK] Sanity passed: features + October coverage.
[INFO] October rows pickups: 1488 dropoffs: 1488


In [13]:
def split_block(ts):
    val = ts[(ts["timestamp"] >= VAL_START) & (ts["timestamp"] < VAL_END)].copy()
    test = ts[ts["timestamp"] >= TEST_START].copy()
    return val, test

def to_output_df(cluster_id, split_name, pick_block, drop_block, pred_pick, pred_drop):
    # inner join on timestamp/hour for safety
    a = pick_block[["timestamp","hour","y"]].rename(columns={"y":"y_true_pickups"})
    b = drop_block[["timestamp","hour","y"]].rename(columns={"y":"y_true_dropoffs"})
    out = a.merge(b, on=["timestamp","hour"], how="inner")

    out["y_pred_reg_pickups"] = pred_pick[:len(out)]
    out["y_pred_reg_dropoffs"] = pred_drop[:len(out)]

    out["ae_pickups_reg"] = (out["y_true_pickups"] - out["y_pred_reg_pickups"]).abs()
    out["ae_dropoffs_reg"] = (out["y_true_dropoffs"] - out["y_pred_reg_dropoffs"]).abs()
    out["ae_mean_reg"] = 0.5 * (out["ae_pickups_reg"] + out["ae_dropoffs_reg"])

    out["date"] = pd.to_datetime(out["timestamp"]).dt.normalize()
    out["cluster_id"] = cluster_id
    out["split"] = split_name

    return out[[
        "date","hour","cluster_id","split",
        "y_true_pickups","y_pred_reg_pickups",
        "y_true_dropoffs","y_pred_reg_dropoffs",
        "ae_pickups_reg","ae_dropoffs_reg","ae_mean_reg"
    ]]

summaries = []

for cid in CLUSTERS_TO_MODEL:
    pick_c = ts_pick_feat[ts_pick_feat["gmm20_cluster"] == cid].copy()
    drop_c = ts_drop_feat[ts_drop_feat["gmm20_cluster"] == cid].copy()

    val_p, test_p = split_block(pick_c)
    val_d, test_d = split_block(drop_c)

    # --- October predictions (for MAE / weight) ---
    pred_val_p = dep_model.predict(val_p[FEATURES])
    pred_val_d = arr_model.predict(val_d[FEATURES])

    mae_val_pickups  = mean_absolute_error(val_p["y"].values, pred_val_p)
    mae_val_dropoffs = mean_absolute_error(val_d["y"].values, pred_val_d)
    mae_val_mean = 0.5 * (mae_val_pickups + mae_val_dropoffs)

    df_val_out = to_output_df(cid, "val", val_p, val_d, pred_val_p, pred_val_d)

    # --- Novâ€“Dec predictions ---
    if len(test_p) > 0 and len(test_d) > 0:
        pred_test_p = dep_model.predict(test_p[FEATURES])
        pred_test_d = arr_model.predict(test_d[FEATURES])
        df_test_out = to_output_df(cid, "test", test_p, test_d, pred_test_p, pred_test_d)
        df_out = pd.concat([df_val_out, df_test_out], ignore_index=True)
    else:
        df_out = df_val_out.copy()

    parquet_path = PRED_DIR / f"reg_cluster_{cid}_preds.parquet"
    df_out.to_parquet(parquet_path, index=False)

    summaries.append({
        "cluster_id": cid,
        "val_mae_pickups": float(mae_val_pickups),
        "val_mae_dropoffs": float(mae_val_dropoffs),
        "val_mae_mean": float(mae_val_mean),
        "parquet_path": str(parquet_path),
    })

    print(f"[OK] Cluster {cid} | Oct MAE mean={mae_val_mean:.6f} | saved {parquet_path}")

df_summary = pd.DataFrame(summaries)
df_summary


[OK] Cluster 0 | Oct MAE mean=12.332591 | saved preds_regressor/reg_cluster_0_preds.parquet
[OK] Cluster 1 | Oct MAE mean=19.066918 | saved preds_regressor/reg_cluster_1_preds.parquet


Unnamed: 0,cluster_id,val_mae_pickups,val_mae_dropoffs,val_mae_mean,parquet_path
0,0,12.110037,12.555144,12.332591,preds_regressor/reg_cluster_0_preds.parquet
1,1,20.278767,17.85507,19.066918,preds_regressor/reg_cluster_1_preds.parquet


In [14]:
if len(df_summary) == 0:
    raise ValueError("No clusters processed; cannot compute general MAE/weight.")

general_mae_reg = float(df_summary["val_mae_mean"].mean())
eps = 1e-6
general_weight_reg = 1.0 / (general_mae_reg + eps)  # raw; normalize later across models

info = {
    "model": "Regressor",
    "clusters": CLUSTERS_TO_MODEL,
    "general_mae_val_oct": general_mae_reg,
    "general_weight_raw": general_weight_reg,
    "val_start": str(VAL_START.date()),
    "val_end": str(VAL_END.date()),
    "features": FEATURES,
    "num_feats": NUM_FEATS,
    "cat_feats": CAT_FEATS,
    "history_buffer_days": HISTORY_BUFFER_DAYS,
    "dep_model_path": str(DEP_MODEL_PATH),
    "arr_model_path": str(ARR_MODEL_PATH),
}

(ARTIFACTS_DIR / "reg_general_weight.json").write_text(json.dumps(info, indent=2))
joblib.dump(dep_model, ARTIFACTS_DIR / "regressor_departures.joblib")
joblib.dump(arr_model, ARTIFACTS_DIR / "regressor_arrivals.joblib")

print(f"[INFO] General October MAE (Regressor): {general_mae_reg:.6f}")
print(f"[INFO] Raw weight (inverse MAE):        {general_weight_reg:.6f}")
print(f"[INFO] Saved: {ARTIFACTS_DIR / 'reg_general_weight.json'}")


[INFO] General October MAE (Regressor): 15.699755
[INFO] Raw weight (inverse MAE):        0.063695
[INFO] Saved: artifacts_regressor/reg_general_weight.json
