# MLP Demand Prediction (Task 2 & Task 3)

Este notebook carga el fichero `bike_data_clean.parquet` generado en `clustering_parquet_file.ipynb`
y ejecuta el modelo **MLP (Multi-Layer Perceptron)** para predecir, en distintos clusters,
la demanda horaria (pickups y dropoffs) de las próximas 24 horas; además calcula
el número de bicicletas requeridas al inicio del día (Task 3).

Requisitos previos:
- Ejecutar antes el notebook `clustering_parquet_file.ipynb` para que exista `bike_data_clean.parquet`.
- Tener instaladas las librerías: `pandas`, `numpy`, `matplotlib`, `scikit-learn`, `pyarrow`.


In [None]:

from pathlib import Path
from typing import List, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Path of the Parquet file produced by the clustering notebook
DATA_PATH = Path("bike_data_clean.parquet")

# Clusters to model (edit this list to evaluate other clusters)
CLUSTERS_TO_MODEL = [0, 1]

# Mostrar todas las figuras inline
%matplotlib inline


In [None]:

# ------------------------------------------------------------
# Utilidades para construir la demanda horaria por cluster
# ------------------------------------------------------------

def _count_trips_per_hour(
    trips: pd.DataFrame, date_col: str, hour_col: str, value_name: str
) -> pd.DataFrame:
    """Count trips per (calendar day, hour).

    Returns a frame with columns ``date`` (datetime normalized to midnight),
    ``hour`` and ``value_name``, holding one row per (date, hour) that has
    at least one trip.
    """
    if trips.empty:
        # Explicit empty result with stable dtypes, so the caller never has
        # to deal with NaT bounds or object-typed merge keys.
        return pd.DataFrame(
            {
                "date": pd.Series(dtype="datetime64[ns]"),
                "hour": pd.Series(dtype="int64"),
                value_name: pd.Series(dtype="int64"),
            }
        )

    counts = (
        trips.groupby([date_col, hour_col])
        .size()
        .rename(value_name)
        .reset_index()
        .rename(columns={date_col: "date", hour_col: "hour"})
    )
    counts["date"] = pd.to_datetime(counts["date"]).dt.normalize()

    # If the source date column carried a time of day, several groups collapse
    # onto the same day after normalization; re-aggregate so that each
    # (date, hour) pair appears exactly once.
    return counts.groupby(["date", "hour"], as_index=False)[value_name].sum()


def build_hourly_demand_for_cluster(df: pd.DataFrame, cluster_id: int) -> pd.DataFrame:
    """Build the hourly demand table for a single cluster.

    Args:
        df: Trip table. Reads the columns ``gmm20_cluster``,
            ``end_gmm20_cluster``, ``start_date``, ``start_hour``,
            ``stop_date`` and ``stop_hour``.
        cluster_id: Cluster whose demand is aggregated.

    Returns:
        DataFrame with one row per (date, hour) over the full span of days
        that have data, with columns:
            - date (datetime, normalized to midnight)
            - hour (0..23)
            - pickups  (trips starting in this cluster, float32)
            - dropoffs (trips ending in this cluster, float32)
            - cluster_id
        Hours without trips are filled with 0.

    Raises:
        ValueError: If the cluster has neither pickups nor dropoffs.
    """
    df_pick = df[df["gmm20_cluster"] == cluster_id]
    df_drop = df[df["end_gmm20_cluster"] == cluster_id]

    if df_pick.empty and df_drop.empty:
        raise ValueError(f"No hay datos de viajes para el cluster {cluster_id}")

    pick_group = _count_trips_per_hour(df_pick, "start_date", "start_hour", "pickups")
    drop_group = _count_trips_per_hour(df_drop, "stop_date", "stop_hour", "dropoffs")

    # Only non-empty sides contribute to the date span: an empty side would
    # yield NaT, and min()/max() against NaT silently depends on argument order.
    dated = [grp["date"] for grp in (pick_group, drop_group) if not grp.empty]
    if not dated:
        # All trips had null dates and were dropped by the grouping.
        raise ValueError(f"No hay datos de viajes para el cluster {cluster_id}")
    min_date = min(col.min() for col in dated)
    max_date = max(col.max() for col in dated)

    # Full (date, hour) grid so that hours without trips appear explicitly.
    all_dates = pd.date_range(min_date, max_date, freq="D")
    idx = pd.MultiIndex.from_product([all_dates, np.arange(24)], names=["date", "hour"])
    hourly = pd.DataFrame(index=idx).reset_index()

    hourly = hourly.merge(pick_group, on=["date", "hour"], how="left")
    hourly = hourly.merge(drop_group, on=["date", "hour"], how="left")

    # NaN after the left joins means "no trips in that hour".
    hourly["pickups"] = hourly["pickups"].fillna(0).astype("float32")
    hourly["dropoffs"] = hourly["dropoffs"].fillna(0).astype("float32")

    hourly["cluster_id"] = cluster_id

    return hourly


# ------------------------------------------------------------
# Dataset supervisado (día D -> día D+1)
# ------------------------------------------------------------

def build_supervised_from_hourly(
    hourly_df: pd.DataFrame,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, pd.DatetimeIndex]:
    """Build the day-level supervised dataset (day d -> next day).

    For each pair of consecutive rows (d, d_next) in the date-sorted table:
        X    = [24h pickups(d), 24h dropoffs(d), weekday(d_next), month(d_next)]
        Yp   = 24h pickups(d_next)
        Yd   = 24h dropoffs(d_next)

    Pairs are formed between adjacent dates present in ``hourly_df``; the
    function does not check that they are exactly one day apart.

    Args:
        hourly_df: Table with columns ``date`` (datetime), ``hour``,
            ``pickups`` and ``dropoffs``, one row per (date, hour).

    Returns:
        ``(X, Yp, Yd, pred_dates)`` where ``X`` has shape (n, 50), ``Yp`` and
        ``Yd`` have shape (n, 24) (all float32) and ``pred_dates`` holds the
        n target dates.

    Raises:
        ValueError: If fewer than two distinct dates are available, so no
            (day, next day) pair can be formed.
    """
    hour_labels = list(range(24))

    def _day_by_hour(value_col: str) -> pd.DataFrame:
        # Day x hour matrix with fixed columns 0..23; hours absent from the
        # input (whole column or single cell) count as zero demand.
        table = hourly_df.pivot(index="date", columns="hour", values=value_col)
        return table.reindex(columns=hour_labels).fillna(0.0).sort_index()

    daily_p = _day_by_hour("pickups")
    daily_d = _day_by_hour("dropoffs").loc[daily_p.index]
    dates = pd.DatetimeIndex(daily_p.index)

    if len(dates) < 2:
        raise ValueError(
            "Se necesitan al menos dos días de datos para construir el "
            "dataset supervisado día -> día+1"
        )

    pickups = daily_p.to_numpy(dtype="float32")
    dropoffs = daily_d.to_numpy(dtype="float32")

    # Row i of the inputs is day i; its target is day i + 1.
    target_dates = dates[1:]
    calendar = np.column_stack(
        [target_dates.weekday, target_dates.month]
    ).astype("float32")

    X = np.hstack([pickups[:-1], dropoffs[:-1], calendar])
    Yp = pickups[1:].copy()
    Yd = dropoffs[1:].copy()
    pred_dates = pd.to_datetime(list(target_dates))

    return X, Yp, Yd, pred_dates


def split_train_test(
    X: np.ndarray,
    Yp: np.ndarray,
    Yd: np.ndarray,
    pred_dates: pd.DatetimeIndex,
    cutoff="2018-11-01",
) -> Tuple[
    np.ndarray,
    np.ndarray,
    np.ndarray,
    np.ndarray,
    np.ndarray,
    np.ndarray,
    pd.DatetimeIndex,
    pd.DatetimeIndex,
]:
    """Temporal train/test split on the predicted (target) date.

    - Train: predicted dates <  ``cutoff``
    - Test : predicted dates >= ``cutoff``

    With the default cutoff (2018-11-01) and 2018 data this gives
    Jan-Oct for training and Nov-Dec for testing.

    Args:
        X: Feature matrix, one row per sample.
        Yp: Pickup targets, row-aligned with ``X``.
        Yd: Dropoff targets, row-aligned with ``X``.
        pred_dates: Target date of each sample, row-aligned with ``X``.
        cutoff: First date of the test period; anything accepted by
            ``pd.to_datetime``.

    Returns:
        ``(X_train, X_test, Yp_train, Yp_test, Yd_train, Yd_test,
        dates_train, dates_test)``.
    """
    cutoff_ts = pd.to_datetime(cutoff)
    train_mask = pred_dates < cutoff_ts
    test_mask = pred_dates >= cutoff_ts

    return (
        X[train_mask],
        X[test_mask],
        Yp[train_mask],
        Yp[test_mask],
        Yd[train_mask],
        Yd[test_mask],
        pred_dates[train_mask],
        pred_dates[test_mask],
    )


In [None]:

# ------------------------------------------------------------
# MLP + Task 3 (bicis requeridas)
# ------------------------------------------------------------

def train_mlp(X_train: np.ndarray, Y_train: np.ndarray):
    """Fit a multi-output MLPRegressor on standardized features.

    The features are standardized with a ``StandardScaler`` fitted on
    ``X_train``; the targets are used as given.

    Returns:
        ``(model, scaler)``: the fitted regressor and the fitted scaler. The
        same scaler must be applied to any features passed to ``model``.
    """
    feature_scaler = StandardScaler()

    # Fixed architecture/optimizer settings; random_state pins the run.
    hyperparams = dict(
        hidden_layer_sizes=(64, 32),
        activation="relu",
        solver="adam",
        learning_rate_init=0.001,
        max_iter=500,
        random_state=42,
        early_stopping=True,
        n_iter_no_change=10,
        validation_fraction=0.1,
    )
    model = MLPRegressor(**hyperparams)

    model.fit(feature_scaler.fit_transform(X_train), Y_train)
    return model, feature_scaler


def compute_required_bikes_series(Yp: np.ndarray, Yd: np.ndarray) -> np.ndarray:
    """Task 3: bikes required at the start of each day.

    For every day (one row of ``Yp`` paired with the same row of ``Yd``):

        net_t          = dropoffs_t - pickups_t
        cum_net_k      = sum_{t=0..k} net_t
        required_bikes = max(0, -min_k cum_net_k)

    i.e. the deepest cumulative deficit reached during the day, or 0 if the
    running balance never goes negative.

    Returns:
        float32 array with one value per day.
    """
    deficits = [
        max(0.0, -float(np.cumsum(day_drop - day_pick).min()))
        for day_pick, day_drop in zip(Yp, Yd)
    ]
    return np.array(deficits, dtype="float32")


def _plot_hourly_comparison(hours, actual, predicted, kind, cluster_id, day) -> None:
    """Draw actual vs. predicted hourly demand for one day on a new figure.

    ``kind`` is the lower-case series name ("pickups" or "dropoffs"); it is
    used in the legend labels and, capitalized, in the title.
    """
    plt.figure(figsize=(9, 5))
    plt.plot(hours, actual, marker="o", label=f"Actual {kind}")
    plt.plot(hours, predicted, marker="x", label=f"Predicted {kind}")
    plt.xlabel("Hour")
    plt.ylabel("Demand")
    plt.title(f"Cluster {cluster_id} - {kind.capitalize()} - {day}")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()


def evaluate_cluster_with_mlp(
    df: pd.DataFrame,
    cluster_id: int,
) -> None:
    """Run the full pipeline for one cluster and report the results.

    Steps:
      - Hourly demand table
      - Supervised dataset day -> next day
      - Temporal train/test split
      - Training of two MLPs (pickups and dropoffs)
      - Evaluation (RMSE / MAE over all test hours)
      - Required bikes at start of day (Task 3), from predictions and truth
      - Main plots

    Results are printed and plotted; nothing is returned. If the train or
    the test split is empty, a warning is printed and the cluster is skipped.

    Raises:
        ValueError: Propagated from the data-building helpers (for example
            when the cluster has no trips).
    """
    print("\n" + "=" * 80)
    print(f"CLUSTER {cluster_id} - Construyendo demanda horaria ...")
    hourly = build_hourly_demand_for_cluster(df, cluster_id)
    print(f"[INFO] hourly shape: {hourly.shape}")

    print("[INFO] Construyendo dataset supervisado día -> día+1 ...")
    X, Yp, Yd, pred_dates = build_supervised_from_hourly(hourly)
    print(f"[INFO] Número de muestras (días-1): {X.shape[0]}")

    print("[INFO] Split temporal (train = ene–oct, test = nov–dic) ...")
    (
        X_train,
        X_test,
        Yp_train,
        Yp_test,
        Yd_train,
        Yd_test,
        dates_train,
        dates_test,
    ) = split_train_test(X, Yp, Yd, pred_dates)

    print(f"  Train samples: {X_train.shape[0]}")
    print(f"  Test samples : {X_test.shape[0]}")

    if X_test.shape[0] == 0:
        print("[WARN] No hay días en el test set para este cluster (nov–dic). Se omite.")
        return

    # Same treatment for an empty train split: without it the scaler/MLP fit
    # fails later with an error that does not mention the actual cause.
    if X_train.shape[0] == 0:
        print("[WARN] No hay días en el train set para este cluster (ene–oct). Se omite.")
        return

    # --------------------- MLP training ---------------------
    print("[INFO] Entrenando MLP para pickups ...")
    mlp_pickups, scaler_p = train_mlp(X_train, Yp_train)
    Yp_pred_test = mlp_pickups.predict(scaler_p.transform(X_test))

    print("[INFO] Entrenando MLP para dropoffs ...")
    mlp_dropoffs, scaler_d = train_mlp(X_train, Yd_train)
    Yd_pred_test = mlp_dropoffs.predict(scaler_d.transform(X_test))

    # --------------------- Metrics ---------------------
    # Flattened so that every (day, hour) cell weighs the same.
    rmse_p = np.sqrt(mean_squared_error(Yp_test.ravel(), Yp_pred_test.ravel()))
    mae_p = mean_absolute_error(Yp_test.ravel(), Yp_pred_test.ravel())
    rmse_d = np.sqrt(mean_squared_error(Yd_test.ravel(), Yd_pred_test.ravel()))
    mae_d = mean_absolute_error(Yd_test.ravel(), Yd_pred_test.ravel())

    print(f"\n[RESULTADOS] Cluster {cluster_id}")
    print(f"  Pickups  - RMSE={rmse_p:.3f}, MAE={mae_p:.3f}")
    print(f"  Dropoffs - RMSE={rmse_d:.3f}, MAE={mae_d:.3f}")

    # --------------------- Task 3: required bikes ---------------------
    print("[INFO] Calculando bicis requeridas (Task 3) con predicciones y datos reales ...")
    required_pred = compute_required_bikes_series(Yp_pred_test, Yd_pred_test)
    required_true = compute_required_bikes_series(Yp_test, Yd_test)

    df_req = pd.DataFrame(
        {
            "date": dates_test,
            "required_bikes_true": required_true,
            "required_bikes_pred": required_pred,
        }
    )

    print("\n[Task 3] Ejemplo de bicis requeridas (primeros 10 días de test):")
    print(df_req.head(10))

    # --------------------- Plots ---------------------
    hours = np.arange(24)

    # 1) Example test day (the first one)
    idx0 = 0
    date0 = dates_test[idx0].date()
    _plot_hourly_comparison(
        hours, Yp_test[idx0], Yp_pred_test[idx0], "pickups", cluster_id, date0
    )
    _plot_hourly_comparison(
        hours, Yd_test[idx0], Yd_pred_test[idx0], "dropoffs", cluster_id, date0
    )

    # 2) Required bikes over the first test days (at most 10, for legibility)
    shown = df_req.head(10)
    x = np.arange(len(shown))

    plt.figure(figsize=(10, 5))
    plt.bar(
        x - 0.15,
        shown["required_bikes_true"].values,
        width=0.3,
        label="True",
    )
    plt.bar(
        x + 0.15,
        shown["required_bikes_pred"].values,
        width=0.3,
        label="Predicted",
    )

    # Date labels on the x axis
    date_labels = shown["date"].dt.strftime("%Y-%m-%d").values
    plt.xticks(x, date_labels, rotation=45)

    plt.ylabel("Required bikes at start of day")
    plt.title(f"Cluster {cluster_id} - Required bikes (Task 3) - First test days")
    plt.legend()
    plt.tight_layout()

    plt.show()


In [None]:

# ------------------------------------------------------------
# Run the MLP pipeline
# ------------------------------------------------------------

print("=" * 80)
print("MLP DEMAND PREDICTION (Task 2 & Task 3)")
print("=" * 80)
print(f"[INFO] Leyendo datos desde {DATA_PATH.resolve()} ...")

# Fail early with an actionable message if the clustering notebook has not
# produced the Parquet file yet.
if not DATA_PATH.exists():
    raise FileNotFoundError(
        f"No se ha encontrado {DATA_PATH}. "
        "Ejecuta antes el notebook de clustering para generar bike_data_clean.parquet."
    )

df = pd.read_parquet(DATA_PATH)

print(f"[INFO] DataFrame shape: {df.shape}")
print("[INFO] Columnas disponibles (primeras 20):")
print(df.columns[:20])

# Evaluate each configured cluster independently; a ValueError raised for one
# cluster is reported as a warning and does not stop the remaining clusters.
for cluster_id in CLUSTERS_TO_MODEL:
    try:
        evaluate_cluster_with_mlp(df, cluster_id)
    except ValueError as e:
        print(f"[WARN] Cluster {cluster_id}: {e}")
