# 03. Model Baseline
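Implementation of naive baseline models for per-stop arrival-delay prediction (last-trip delay and mean of the past N trips), evaluated with MAE and R².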

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import utils
import os

# Load the processed trip data and split it into train/test sets by service date.
processed_file = 'data/processed_data/processed_trip_data.csv'
if not os.path.exists(processed_file):
    print(f"File not found: {processed_file}. Please run 02_process_data.ipynb first.")
    # Later cells check this sentinel before doing any work.
    df_process = None
else:
    df_process = pd.read_csv(processed_file)
    # Dates are compared and sorted as strings from here on.
    df_process['start_date'] = df_process['start_date'].astype(str)
    print(f"Loaded processed data from {processed_file}")

    # The first 80% of the distinct (string-sorted) dates form the training
    # period; the remaining dates form the test period.
    unique_dates = sorted(df_process['start_date'].unique())
    split_idx = int(len(unique_dates) * 0.8)
    train_dates, test_dates = unique_dates[:split_idx], unique_dates[split_idx:]

    df_train = df_process.loc[df_process['start_date'].isin(train_dates)].copy()
    df_test = df_process.loc[df_process['start_date'].isin(test_dates)].copy()

In [None]:
def create_trip_based_sequences_multi_route(df, n_past_trips=5, stops_dict=None):
    """Build sliding-window samples of per-stop delays for every route-direction.

    For each ``route_direction_key`` the trips are ordered by their earliest
    ``scheduled_arrival_time``.  Every trip that has ``n_past_trips``
    predecessors on the same route-direction yields one sample: the delay
    rows of those predecessors as input and its own delay row as target.
    Delay rows are right-padded with zeros to a common width so that all
    route-directions can be stacked into one array.

    Args:
        df: Stop-level DataFrame.  Must contain ``route_direction_key``,
            ``trip_key``, ``stop_sequence``, ``scheduled_arrival_time``,
            ``arrival_delay_agg``, ``route_direction_encoded`` and the
            columns listed in ``feature_cols`` below.
        n_past_trips: Number of preceding trips per sample (must be >= 1).
        stops_dict: Optional mapping ``route_direction_key -> ordered list of
            stop_sequence values``.  When omitted it is derived from ``df``.
            Keys missing from a supplied mapping fall back to the stops
            observed in ``df``.

    Returns:
        A tuple ``(X_delays, X_features, X_agg, y, all_meta, stops_dict,
        max_stops)``:

        * ``X_delays``: ``(N, n_past_trips, max_stops)`` past delay rows.
        * ``X_features``: ``(N, n_features)`` features of the target trip
          (``feature_cols`` followed by ``route_direction_encoded``).
        * ``X_agg``: ``(N, 4)`` mean, std, trend and max of the past window.
        * ``y``: ``(N, max_stops)`` delay row of the target trip.
        * ``all_meta``: list with the ``trip_key`` of each target trip.
        * ``stops_dict``: the mapping that was supplied or derived.
        * ``max_stops``: padded width of the stop axis.

        When no sample can be built the arrays are empty but keep these
        ranks (``N == 0``).

    Raises:
        ValueError: If ``n_past_trips`` is smaller than 1.
    """
    if n_past_trips < 1:
        raise ValueError(f"n_past_trips must be >= 1, got {n_past_trips}")

    # NOTE(review): 'arrival_delay_agg' is part of the target-trip features,
    # so X_features carries the first non-null arrival_delay_agg value of the
    # very trip whose delays form y.  Confirm this is intended; it is kept
    # here so the feature layout stays unchanged.
    feature_cols = [
        'hour_of_day',
        'arrival_delay_agg',
        'day_of_week',
        'time_of_day',
        'time_sin',
        'time_cos',
        'is_weekend',
        'is_rush_hour',
        'has_active_alert',
        'has_detour',
        'has_police_alert',
    ]
    trip_feature_cols = feature_cols + ['route_direction_encoded']

    def _observed_stops(rd_key):
        # Sorted stop_sequence values present in df for one route-direction.
        mask = df['route_direction_key'] == rd_key
        return sorted(df.loc[mask, 'stop_sequence'].unique())

    # Process each route_direction_key separately.
    rd_keys = sorted(df['route_direction_key'].unique())
    print(f"Processing {len(rd_keys)} route-direction combinations")

    if stops_dict is None:
        stops_dict = {rd_key: _observed_stops(rd_key) for rd_key in rd_keys}

    # Route-directions present in df but absent from a supplied stops_dict
    # use their observed stops; these must be counted in max_stops as well,
    # otherwise their rows could be wider than the padded width.
    fallback_stops = {
        rd_key: _observed_stops(rd_key)
        for rd_key in rd_keys
        if rd_key not in stops_dict
    }

    # Common stop count across all route-directions (used for padding).
    stop_counts = [len(stops) for stops in stops_dict.values()]
    stop_counts += [len(stops) for stops in fallback_stops.values()]
    max_stops = max(stop_counts, default=0)
    print(f"Max stops across all route-directions: {max_stops}")

    all_X_delays = []
    all_X_features = []
    all_X_agg = []  # aggregate features of the past window
    all_y = []
    all_meta = []

    for rd_key in rd_keys:
        # Read-only slice; nothing below mutates it, so no copy is needed.
        rd_df = df[df['route_direction_key'] == rd_key]
        stops = stops_dict[rd_key] if rd_key in stops_dict else fallback_stops[rd_key]
        n_stops = len(stops)

        # Order trips chronologically by their earliest scheduled arrival.
        trip_order = (
            rd_df.groupby('trip_key')['scheduled_arrival_time']
            .min()
            .sort_values()
            .index.tolist()
        )

        # A sample needs n_past_trips predecessors plus the target trip.
        if len(trip_order) <= n_past_trips:
            continue

        # 1. Delay pattern (trip x stop).  Missing stops are forward-filled
        #    along the stop axis; whatever is still missing becomes 0.
        delay_pivot = rd_df.pivot_table(
            index='trip_key', columns='stop_sequence',
            values='arrival_delay_agg', aggfunc='first'
        )
        delay_pivot = (
            delay_pivot.reindex(index=trip_order, columns=stops)
            .ffill(axis=1)
            .fillna(0)
        )

        # Right-pad the stop axis with zeros up to the common width.
        if n_stops < max_stops:
            padding = np.zeros((len(delay_pivot), max_stops - n_stops))
            delay_values = np.concatenate([delay_pivot.values, padding], axis=1)
        else:
            delay_values = delay_pivot.values

        # 2. Per-trip features (first non-null value per trip) plus
        #    route_direction_encoded.
        trip_features = rd_df.groupby('trip_key')[trip_feature_cols].first()
        trip_features = trip_features.reindex(index=trip_order).fillna(0)

        # Build one sample per trip that has enough predecessors.
        for i in range(n_past_trips, len(trip_order)):
            # Delay rows of the previous N trips (same route + direction only).
            past_delays = delay_values[i - n_past_trips:i]  # (n_past_trips, max_stops)

            # Features of the trip being predicted.
            target_features = trip_features.iloc[i].values  # (n_features,)

            # Aggregate features of the past window.
            # NOTE(review): these statistics are taken over the padded array,
            # so zero padding of shorter routes is included in them.
            past_mean = past_delays.mean()
            past_std = past_delays.std()
            past_trend = past_delays[-1].mean() - past_delays[0].mean()
            past_max = past_delays.max()
            agg_features = np.array([past_mean, past_std, past_trend, past_max])

            # Delay row of the trip being predicted.
            target_delay = delay_values[i]  # (max_stops,)

            all_X_delays.append(past_delays)
            all_X_features.append(target_features)
            all_X_agg.append(agg_features)
            all_y.append(target_delay)
            all_meta.append(trip_order[i])

    if all_y:
        X_delays = np.array(all_X_delays)  # (N, n_past_trips, max_stops)
        X_features = np.array(all_X_features)  # (N, n_features)
        X_agg = np.array(all_X_agg)  # (N, 4)
        y = np.array(all_y)  # (N, max_stops)
    else:
        # Keep the documented ranks even without samples, so that callers
        # slicing e.g. X_delays[:, -1, :] do not fail on a shape-(0,) array.
        X_delays = np.empty((0, n_past_trips, max_stops))
        X_features = np.empty((0, len(trip_feature_cols)))
        X_agg = np.empty((0, 4))
        y = np.empty((0, max_stops))

    print(f"\nTotal X_delays shape: {X_delays.shape}")
    print(f"Total X_features shape: {X_features.shape}")
    print(f"Total X_agg shape: {X_agg.shape}")
    print(f"Total y shape: {y.shape}")

    return X_delays, X_features, X_agg, y, all_meta, stops_dict, max_stops

In [None]:
# Create Sequences
if df_process is not None:
    n_past_trips = 5

    # Stop lists are derived from the full dataset so that the train and test
    # calls below receive the same stops_dict.
    stops_dict = {
        rd_key: sorted(
            df_process.loc[
                df_process['route_direction_key'] == rd_key, 'stop_sequence'
            ].unique()
        )
        for rd_key in df_process['route_direction_key'].unique()
    }

    print("Creating sequences...")
    X_delays_test, X_features_test, X_agg_test, y_test, meta_test, *_ = (
        create_trip_based_sequences_multi_route(
            df_test, n_past_trips, stops_dict=stops_dict
        )
    )
    # We need train sequences for Baseline 3 (history average)
    X_delays_train, *_ = create_trip_based_sequences_multi_route(
        df_train, n_past_trips, stops_dict=stops_dict
    )

In [None]:
# Baseline 1: Last Trip Delay
if df_process is not None:
    # Prediction = per-stop delays of the most recent trip in each past window.
    # NOTE(review): y_test and the predictions are zero-padded up to the widest
    # route by create_trip_based_sequences_multi_route, so padded positions are
    # scored as exact matches — confirm this is intended.
    y_pred_baseline1 = X_delays_test[:, -1]
    mae_bl1 = mean_absolute_error(y_test.reshape(-1), y_pred_baseline1.reshape(-1))
    r2_bl1 = r2_score(y_test.reshape(-1), y_pred_baseline1.reshape(-1))

    print(f"Baseline 1 (Last Trip) MAE: {mae_bl1:.2f}, R2: {r2_bl1:.4f}")

In [None]:
# Baseline 2: Mean of Past N Trips
if df_process is not None:
    # Prediction = per-stop average over the past trips in each window
    # (axis 1 is the past-trip axis of X_delays_test).
    y_pred_baseline2 = np.mean(X_delays_test, axis=1)
    mae_bl2 = mean_absolute_error(y_test.reshape(-1), y_pred_baseline2.reshape(-1))
    r2_bl2 = r2_score(y_test.reshape(-1), y_pred_baseline2.reshape(-1))

    print(f"Baseline 2 (Mean Past N) MAE: {mae_bl2:.2f}, R2: {r2_bl2:.4f}")