# MLP Demand Prediction (Task 2 & Task 3)

Este notebook carga el fichero `bike_data_clean.parquet` generado en `clustering_parquet_file.ipynb`
y ejecuta el modelo **MLP (Multi-Layer Perceptron)** para predecir la demanda horaria
(pickups y dropoffs) de las próximas 24 horas en distintos clusters, además de calcular
el número de bicicletas requeridas al inicio del día (Task 3).

Requisitos previos:
- Ejecutar antes el notebook `clustering_parquet_file.ipynb` para que exista `bike_data_clean.parquet`.
- Tener instaladas las librerías: `pandas`, `numpy`, `matplotlib`, `scikit-learn`, `pyarrow`.


In [15]:

from pathlib import Path
from typing import List, Tuple
from pathlib import Path
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error

import joblib

# Input dataset (cleaned trips; produced by the clustering notebook)
DATA_PATH = Path("bike_data_clean.parquet")

# Cluster ids for which models are trained
CLUSTERS_TO_MODEL = [0, 1]

# Temporal split boundaries, expressed on the day being predicted.
# Validation covers [VAL_START, VAL_END); test covers everything from TEST_START on.
VAL_START, VAL_END, TEST_START = map(
    pd.to_datetime, ("2018-10-01", "2018-11-01", "2018-11-01")
)

# Output folders for fitted models and for exported predictions
ARTIFACTS_DIR, PRED_DIR = Path("artifacts_mlp"), Path("preds_mlp")
PRED_DIR.mkdir(parents=True, exist_ok=True)
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

%matplotlib inline

In [16]:

# ------------------------------------------------------------
# Utilidades para construir la demanda horaria por cluster
# ------------------------------------------------------------

def _count_trips_by_hour(
    trips: pd.DataFrame, date_col: str, hour_col: str, value_name: str
) -> pd.DataFrame:
    """Count trips per (day, hour) as a frame with columns date, hour, value_name."""
    if trips.empty:
        # Typed empty frame: keeps the merge keys compatible with the full
        # (date, hour) grid and avoids grouping an empty selection.
        return pd.DataFrame(
            {
                "date": pd.Series(dtype="datetime64[ns]"),
                "hour": pd.Series(dtype="int64"),
                value_name: pd.Series(dtype="float32"),
            }
        )

    counts = (
        trips
        .groupby([date_col, hour_col])
        .size()
        .rename(value_name)
        .reset_index()
        .rename(columns={date_col: "date", hour_col: "hour"})
    )
    # Normalize to midnight so the dates line up with the daily grid
    counts["date"] = pd.to_datetime(counts["date"]).dt.normalize()
    return counts


def build_hourly_demand_for_cluster(df: pd.DataFrame, cluster_id: int) -> pd.DataFrame:
    """Build the hourly demand table for a single cluster.

    Args:
        df: Trip-level data. The columns read here are ``gmm20_cluster``,
            ``end_gmm20_cluster``, ``start_date``, ``start_hour``,
            ``stop_date`` and ``stop_hour``.
        cluster_id: Cluster whose demand is aggregated.

    Returns:
        DataFrame with one row per (date, hour) over the full observed date
        range, with columns:
            - date (datetime, normalized to midnight)
            - hour (0..23)
            - pickups  (trips starting in this cluster, float32)
            - dropoffs (trips ending in this cluster, float32)
            - cluster_id
        Hours without trips are filled with 0.

    Raises:
        ValueError: If the cluster has neither pickups nor dropoffs.
    """
    df_pick = df[df["gmm20_cluster"] == cluster_id]
    df_drop = df[df["end_gmm20_cluster"] == cluster_id]

    if df_pick.empty and df_drop.empty:
        raise ValueError(f"No hay datos de viajes para el cluster {cluster_id}")

    # Pickups: trips whose origin is this cluster; dropoffs: trips whose
    # destination is this cluster.
    pick_group = _count_trips_by_hour(df_pick, "start_date", "start_hour", "pickups")
    drop_group = _count_trips_by_hour(df_drop, "stop_date", "stop_hour", "dropoffs")

    # Date bounds come only from the sides that actually have data. Mixing in
    # the NaT of an empty side would propagate through min()/max() and make
    # pd.date_range fail for clusters that only have pickups or only dropoffs.
    observed = [g["date"] for g in (pick_group, drop_group) if not g.empty]
    if not observed:
        raise ValueError(f"No hay datos de viajes para el cluster {cluster_id}")
    min_date = min(s.min() for s in observed)
    max_date = max(s.max() for s in observed)

    # Full grid of (date, hour) combinations to cover
    all_dates = pd.date_range(min_date, max_date, freq="D")
    idx = pd.MultiIndex.from_product(
        [all_dates, np.arange(24)], names=["date", "hour"]
    )
    hourly = pd.DataFrame(index=idx).reset_index()

    # Left joins keep every grid cell; cells without trips come out as NaN
    hourly = hourly.merge(pick_group, on=["date", "hour"], how="left")
    hourly = hourly.merge(drop_group, on=["date", "hour"], how="left")

    # NaN means no trips in that hour
    hourly["pickups"] = hourly["pickups"].fillna(0).astype("float32")
    hourly["dropoffs"] = hourly["dropoffs"].fillna(0).astype("float32")

    hourly["cluster_id"] = cluster_id

    return hourly


# ------------------------------------------------------------
# Dataset supervisado (día D -> día D+1)
# ------------------------------------------------------------

def _pivot_day_by_hour(hourly_df: pd.DataFrame, value_col: str) -> pd.DataFrame:
    """Pivot ``value_col`` into a date-sorted day x hour matrix with columns 0..23."""
    wide = hourly_df.pivot(index="date", columns="hour", values=value_col)
    # Force exactly the 24 hour columns in order; hours missing from the input
    # (whole columns or single cells) count as zero demand.
    wide = wide.reindex(columns=range(24)).fillna(0.0)
    return wide.sort_index()


def build_supervised_from_hourly(
    hourly_df: pd.DataFrame,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, pd.DatetimeIndex]:
    """Build the day-level supervised dataset (day D -> next day).

    Each sample pairs one day ``d`` with the following row ``d+1`` of the
    date-sorted table:
        X  = [24h pickups(d), 24h dropoffs(d), weekday(d+1), month(d+1)]
        Yp = 24h pickups(d+1)
        Yd = 24h dropoffs(d+1)

    Args:
        hourly_df: Long table with columns ``date`` (datetime), ``hour``,
            ``pickups`` and ``dropoffs``, one row per (date, hour).

    Returns:
        Tuple ``(X, Yp, Yd, pred_dates)`` where ``X`` has shape
        ``(n_days - 1, 50)``, ``Yp`` and ``Yd`` have shape ``(n_days - 1, 24)``
        (all float32) and ``pred_dates`` holds the predicted day of each sample.

    Raises:
        ValueError: If fewer than two distinct days are available, since no
            (day, next day) pair can be formed.
    """
    daily_p = _pivot_day_by_hour(hourly_df, "pickups")
    daily_d = _pivot_day_by_hour(hourly_df, "dropoffs")

    dates = pd.DatetimeIndex(daily_p.index)
    if len(dates) < 2:
        raise ValueError(
            "At least two days of hourly data are required to build "
            f"(day D -> day D+1) samples; got {len(dates)}."
        )

    pick_mat = daily_p.to_numpy(dtype="float32")
    drop_mat = daily_d.to_numpy(dtype="float32")

    # The target day of sample i is row i + 1 of the sorted table
    pred_dates = dates[1:].rename(None)

    # Calendar features of the predicted day (weekday: Monday=0)
    calendar = np.column_stack(
        [pred_dates.weekday.to_numpy(), pred_dates.month.to_numpy()]
    ).astype("float32")

    # Features come from rows 0..n-2, targets from rows 1..n-1
    X = np.hstack([pick_mat[:-1], drop_mat[:-1], calendar])
    # Copy so the targets do not share memory with the feature matrices
    Yp = pick_mat[1:].copy()
    Yd = drop_mat[1:].copy()

    return X, Yp, Yd, pred_dates


def split_train_val_test(
    X: np.ndarray,
    Yp: np.ndarray,
    Yd: np.ndarray,
    pred_dates: pd.DatetimeIndex,
    val_start=VAL_START,
    val_end=VAL_END,
    test_start=TEST_START,
):
    """Split the samples chronologically by the day being predicted.

    Partitions (with the module defaults: validation is October 2018 and the
    test set starts on 2018-11-01):
        - train: pred_dates < val_start
        - val  : val_start <= pred_dates < val_end
        - test : pred_dates >= test_start

    Returns:
        A 12-tuple ordered as (train, val, test) slices of ``X``, then of
        ``Yp``, then of ``Yd``, then of ``pred_dates``.
    """
    when = pd.to_datetime(pred_dates)

    in_train = when < val_start
    in_val = (when >= val_start) & (when < val_end)
    in_test = when >= test_start
    partitions = (in_train, in_val, in_test)

    # Outer loop over the arrays, inner loop over the partitions, which yields
    # X_train, X_val, X_test, Yp_train, ... in that order.
    return tuple(
        values[selector]
        for values in (X, Yp, Yd, when)
        for selector in partitions
    )


In [17]:
def build_mlp(random_state=42) -> MLPRegressor:
    """Create an unfitted MLP regressor with this notebook's fixed hyperparameters.

    Args:
        random_state: Seed forwarded to ``MLPRegressor``.

    Returns:
        A new ``MLPRegressor`` configured with hidden layer sizes (64, 32),
        ReLU activation, the Adam solver and early stopping enabled.
    """
    settings = dict(
        hidden_layer_sizes=(64, 32),
        activation="relu",
        solver="adam",
        learning_rate_init=0.001,
        max_iter=500,
        random_state=random_state,
        # Early-stopping configuration
        early_stopping=True,
        n_iter_no_change=10,
        validation_fraction=0.1,
    )
    return MLPRegressor(**settings)

def fit_model_with_scaler(X_train: np.ndarray, Y_train: np.ndarray, random_state=42):
    """Fit a feature scaler and an MLP on the given training data.

    The scaler is fitted on ``X_train`` only, and the model is trained on the
    scaled features.

    Returns:
        Tuple ``(model, scaler)`` with the fitted regressor and the fitted
        ``StandardScaler``.
    """
    feature_scaler = StandardScaler().fit(X_train)
    regressor = build_mlp(random_state=random_state)
    regressor.fit(feature_scaler.transform(X_train), Y_train)
    return regressor, feature_scaler

def predict_with(model, scaler, X):
    """Scale ``X`` with ``scaler`` and return ``model``'s predictions for it."""
    scaled_features = scaler.transform(X)
    return model.predict(scaled_features)

def to_long_df(dates, cluster_id, Yp_true, Yp_pred, Yd_true, Yd_pred, split_name: str):
    """Flatten day x hour truth/prediction matrices into one row per (date, hour).

    Args:
        dates: Predicted days, one per matrix row.
        cluster_id: Value written to the ``cluster_id`` column of every row.
        Yp_true, Yp_pred: True and predicted pickups, flattened row by row.
        Yd_true, Yd_pred: True and predicted dropoffs, flattened row by row.
        split_name: Value written to the ``split`` column of every row.

    Returns:
        DataFrame with the identifying columns, the four flattened value
        columns, the absolute error per target and their mean.
    """
    day_count = len(dates)

    columns = {
        "date": np.repeat(dates, 24),
        "hour": np.tile(np.arange(24), day_count),
        "cluster_id": cluster_id,
        "split": split_name,
    }
    for key, matrix in (
        ("y_true_pickups", Yp_true),
        ("y_pred_mlp_pickups", Yp_pred),
        ("y_true_dropoffs", Yd_true),
        ("y_pred_mlp_dropoffs", Yd_pred),
    ):
        columns[key] = matrix.reshape(-1)
    long_df = pd.DataFrame(columns)

    # Per-target absolute errors
    pickup_err = (long_df["y_true_pickups"] - long_df["y_pred_mlp_pickups"]).abs()
    dropoff_err = (long_df["y_true_dropoffs"] - long_df["y_pred_mlp_dropoffs"]).abs()
    long_df["ae_pickups_mlp"] = pickup_err
    long_df["ae_dropoffs_mlp"] = dropoff_err

    # Combined absolute error (average of both targets)
    long_df["ae_mean_mlp"] = 0.5 * (pickup_err + dropoff_err)
    return long_df

def evaluate_cluster_with_mlp_and_export(df: pd.DataFrame, cluster_id: int) -> Dict:
    """Train, validate and export MLP pickup/dropoff models for one cluster.

    Steps:
        1. Build the hourly demand table and the day D -> day D+1 dataset,
           then split it into train / validation / test by predicted date.
        2. Fit one model for pickups and one for dropoffs on the training
           split and compute their MAE on the validation split.
        3. Refit both models on train + validation and save them, together
           with their scalers, the validation MAEs and the split dates, as a
           joblib file under ``ARTIFACTS_DIR``.
        4. Write per-hour predictions and ground truth to a parquet file
           under ``PRED_DIR``: validation rows come from the train-only
           models, test rows (if any) from the refitted models.

    Args:
        df: Trip-level data passed to ``build_hourly_demand_for_cluster``.
        cluster_id: Cluster to model.

    Returns:
        Dict with ``cluster_id``, the three validation MAE values
        (``val_mae_pickups``, ``val_mae_dropoffs``, ``val_mae_mean``) and the
        paths of the written files (``joblib_path``, ``parquet_path``).

    Raises:
        ValueError: If the validation split is empty. Errors raised by the
            dataset-building helpers are not caught here.
    """
    print("\n" + "=" * 80)
    print(f"CLUSTER {cluster_id} - building hourly demand ...")
    hourly = build_hourly_demand_for_cluster(df, cluster_id)

    print("[INFO] building supervised dataset (day D -> day D+1) ...")
    X, Yp, Yd, pred_dates = build_supervised_from_hourly(hourly)

    (
        X_train, X_val, X_test,
        Yp_train, Yp_val, Yp_test,
        Yd_train, Yd_val, Yd_test,
        dates_train, dates_val, dates_test,
    ) = split_train_val_test(X, Yp, Yd, pred_dates)

    print(f"[INFO] Samples - train: {len(dates_train)}, val(Oct): {len(dates_val)}, test(Nov-Dec): {len(dates_test)}")
    # An empty validation split is fatal (no MAE can be computed); an empty
    # test split only skips the test export further below.
    if len(dates_val) == 0:
        raise ValueError("Validation set is empty (October). Check date ranges in your dataset.")
    if len(dates_test) == 0:
        print("[WARN] Test set is empty (Nov-Dec). Will still compute validation MAE and save artifacts.")

    # =========================
    # 1) Train on TRAIN, evaluate on VAL (MAE for weighting)
    # =========================
    print("[INFO] Training (train-only) -> validating on October to compute MAE ...")

    # Separate model + scaler per target (pickups / dropoffs), same seed
    mlp_p_tr, sc_p_tr = fit_model_with_scaler(X_train, Yp_train, random_state=42)
    mlp_d_tr, sc_d_tr = fit_model_with_scaler(X_train, Yd_train, random_state=42)

    Yp_val_pred = predict_with(mlp_p_tr, sc_p_tr, X_val)
    Yd_val_pred = predict_with(mlp_d_tr, sc_d_tr, X_val)

    # MAE over all (day, hour) cells of the validation split
    mae_val_pickups  = mean_absolute_error(Yp_val.reshape(-1), Yp_val_pred.reshape(-1))
    mae_val_dropoffs = mean_absolute_error(Yd_val.reshape(-1), Yd_val_pred.reshape(-1))
    mae_val_mean = 0.5 * (mae_val_pickups + mae_val_dropoffs)

    print(f"[VAL MAE] Cluster {cluster_id}: pickups={mae_val_pickups:.4f} | dropoffs={mae_val_dropoffs:.4f} | mean={mae_val_mean:.4f}")

    # =========================
    # 2) Retrain final model on TRAIN+VAL (Jan–Oct), then predict TEST
    # =========================
    print("[INFO] Training final models on train+val (Jan–Oct) ...")
    X_trval = np.vstack([X_train, X_val])
    Yp_trval = np.vstack([Yp_train, Yp_val])
    Yd_trval = np.vstack([Yd_train, Yd_val])

    mlp_p_final, sc_p_final = fit_model_with_scaler(X_trval, Yp_trval, random_state=42)
    mlp_d_final, sc_d_final = fit_model_with_scaler(X_trval, Yd_trval, random_state=42)

    # Save artifacts
    # NOTE: the stored MAE values come from the train-only models above, while
    # the stored models are the ones refitted on train+val.
    artifact = {
        "cluster_id": cluster_id,
        "val_mae_pickups": float(mae_val_pickups),
        "val_mae_dropoffs": float(mae_val_dropoffs),
        "val_mae_mean": float(mae_val_mean),
        "model_pickups": mlp_p_final,
        "scaler_pickups": sc_p_final,
        "model_dropoffs": mlp_d_final,
        "scaler_dropoffs": sc_d_final,
        "splits": {
            "val_start": str(VAL_START.date()),
            "val_end": str(VAL_END.date()),
            "test_start": str(TEST_START.date()),
        }
    }
    joblib_path = ARTIFACTS_DIR / f"mlp_cluster_{cluster_id}.joblib"
    joblib.dump(artifact, joblib_path)
    print(f"[INFO] Saved joblib: {joblib_path}")

    # =========================
    # 3) Export per-hour predictions + ground truths for VAL and TEST
    # =========================
    dfs = []

    # VAL preds (use train-only models, so the parquet reflects the same setup used for MAE)
    df_val = to_long_df(
        dates_val, cluster_id,
        Yp_val, Yp_val_pred,
        Yd_val, Yd_val_pred,
        split_name="val"
    )
    dfs.append(df_val)

    # TEST preds (use final models trained on train+val)
    if len(dates_test) > 0:
        Yp_test_pred = predict_with(mlp_p_final, sc_p_final, X_test)
        Yd_test_pred = predict_with(mlp_d_final, sc_d_final, X_test)

        df_test = to_long_df(
            dates_test, cluster_id,
            Yp_test, Yp_test_pred,
            Yd_test, Yd_test_pred,
            split_name="test"
        )
        dfs.append(df_test)

    df_preds = pd.concat(dfs, ignore_index=True)

    parquet_path = PRED_DIR / f"mlp_cluster_{cluster_id}_preds.parquet"
    df_preds.to_parquet(parquet_path, index=False)
    print(f"[INFO] Saved parquet: {parquet_path}  shape={df_preds.shape}")

    # Return summary (useful to aggregate later)
    return {
        "cluster_id": cluster_id,
        "val_mae_pickups": float(mae_val_pickups),
        "val_mae_dropoffs": float(mae_val_dropoffs),
        "val_mae_mean": float(mae_val_mean),
        "joblib_path": str(joblib_path),
        "parquet_path": str(parquet_path),
    }


In [18]:
import json

# Driver cell: load the cleaned trips, run the per-cluster pipeline and derive
# a single inverse-MAE weight for the MLP model.
print("=" * 80)
print("MLP DEMAND PREDICTION (Train/Val/Test + Export + Weight)")
print("=" * 80)

if not DATA_PATH.exists():
    raise FileNotFoundError(f"Missing {DATA_PATH}. Run clustering notebook first.")

df = pd.read_parquet(DATA_PATH)
print(f"[INFO] DataFrame shape: {df.shape}")

summaries = []
for cluster_id in CLUSTERS_TO_MODEL:
    try:
        summary = evaluate_cluster_with_mlp_and_export(df, cluster_id)
    except ValueError as e:
        # A cluster without usable data is skipped; the others still run
        print(f"[WARN] Cluster {cluster_id}: {e}")
    else:
        summaries.append(summary)

df_summary = pd.DataFrame(summaries)

# Everything that reads summary columns sits behind this guard: with no
# successful cluster the frame has no columns and selecting them would raise
# KeyError before the warning below could be printed.
if summaries:
    print("\nPer-cluster October validation MAE summary:")
    print(df_summary[["cluster_id", "val_mae_pickups", "val_mae_dropoffs", "val_mae_mean"]])

    # ---- A single "general MAE" for the MLP model across your selected clusters ----
    general_mae_mlp = float(df_summary["val_mae_mean"].mean())
    eps = 1e-6  # guards the inverse against a zero MAE
    general_weight_mlp = 1.0 / (general_mae_mlp + eps)  # unnormalized weight; normalize later across models

    weights_info = {
        "model": "MLP",
        "clusters": CLUSTERS_TO_MODEL,
        # Clusters that actually contributed to the averaged MAE (skipped
        # clusters are excluded)
        "clusters_evaluated": [int(c) for c in df_summary["cluster_id"]],
        "general_mae_val_oct": general_mae_mlp,
        "general_weight_raw": general_weight_mlp,
        "val_start": str(VAL_START.date()),
        "val_end": str(VAL_END.date()),
    }
    weights_path = ARTIFACTS_DIR / "mlp_general_weight.json"
    weights_path.write_text(json.dumps(weights_info, indent=2), encoding="utf-8")
    print(f"\n[INFO] MLP general October MAE: {general_mae_mlp:.6f}")
    print(f"[INFO] MLP raw weight (inverse MAE): {general_weight_mlp:.6f}")
    print(f"[INFO] Saved weight info: {weights_path}")
else:
    print("[WARN] No clusters produced results; cannot compute general MAE/weight.")


MLP DEMAND PREDICTION (Train/Val/Test + Export + Weight)
[INFO] DataFrame shape: (17531179, 22)

CLUSTER 0 - building hourly demand ...
[INFO] building supervised dataset (day D -> day D+1) ...
[INFO] Samples - train: 272, val(Oct): 31, test(Nov-Dec): 62
[INFO] Training (train-only) -> validating on October to compute MAE ...
[VAL MAE] Cluster 0: pickups=13.0729 | dropoffs=9.6840 | mean=11.3784
[INFO] Training final models on train+val (Jan–Oct) ...
[INFO] Saved joblib: artifacts_mlp/mlp_cluster_0.joblib
[INFO] Saved parquet: preds_mlp/mlp_cluster_0_preds.parquet  shape=(2232, 11)

CLUSTER 1 - building hourly demand ...
[INFO] building supervised dataset (day D -> day D+1) ...
[INFO] Samples - train: 272, val(Oct): 31, test(Nov-Dec): 61
[INFO] Training (train-only) -> validating on October to compute MAE ...
[VAL MAE] Cluster 1: pickups=33.9725 | dropoffs=33.0744 | mean=33.5235
[INFO] Training final models on train+val (Jan–Oct) ...
[INFO] Saved joblib: artifacts_mlp/mlp_cluster_1.jobl