# バス到着予測モデル（Trip-based アプローチ）

## 概要
個別のバス（trip_id）単位で停留所順の時系列シーケンスを作成し、上流停留所の遅延を特徴量として活用して到着遅延を予測する。

## Route-based vs Trip-based

| アプローチ | グループ化 | シーケンス | 活用できる特徴 | 期待R² |
|----------|-----------|----------|-------------|-------|
| Route-based | route_id + direction_id | 8時間 | ルート全体の遅延傾向 | 0.50-0.60 |
| **Trip-based** | trip_id | 8停留所 | **上流停留所遅延（相関0.84）** | **0.65-0.75** |

## 前提条件
- データ: `delay_analysis_improved_v2.csv`（prev_stop_delay含む）
- 特徴量グループ: `improved_feature_groups.py`

In [None]:
%load_ext autoreload
%autoreload 2

# Third-party libraries used throughout the notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

import warnings
# Silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Make the project's own modules importable
import sys
from pathlib import Path

# Parent of the current working directory; presumably the repository root
# when the notebook is launched from a sub-folder -- TODO confirm.
rootPath = Path.cwd().parent
sys.path.append(str(rootPath))

# Trip-based sequence creator plus the rest of the project pipeline
from src.timeseries_processing.trip_sequence_creator import TripSequenceCreator
from src.timeseries_processing import DataSplitter, DataStandardizer
from src.timeseries_processing.improved_feature_groups import feature_groups
from src.model_training import DelayPredictionModel
from src.evaluation import ModelEvaluator, ModelVisualizer

# Do not truncate columns when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Load the improved dataset (Phase 1-3 features) from <rootPath>/data.
print("Loading improved dataset (Phase 1-3 features)...")
# Build the path with pathlib so the separator is correct on every platform.
delay_features = pd.read_csv(Path(rootPath) / 'data' / 'delay_analysis_improved_v2.csv')

print(f"Total records: {len(delay_features):,}")
print(f"Unique trips: {delay_features['trip_id'].nunique():,}")
print(f"Columns: {len(delay_features.columns)}")
print(f"\nKey features:")
# NaN compares as "!= 0", so a plain `(col != 0).sum()` would count missing
# values as non-zero delays. Exclude them explicitly.
prev_delay_col = delay_features['prev_stop_delay']
non_zero_prev_delay = int((prev_delay_col.notna() & prev_delay_col.ne(0)).sum())
print(f"  - prev_stop_delay: {non_zero_prev_delay:,} non-zero")
print(f"  - arrival_delay mean: {delay_features['arrival_delay'].mean():.2f} sec")

# GPU configuration (best effort: never aborts the notebook).
print(f"\nTensorFlow version: {tf.__version__}")
try:
    gpus = tf.config.list_physical_devices('GPU')
except Exception as e:
    # Device discovery itself failed, so fall back to the CPU message.
    print(f"✓ Running on CPU (GPU setup skipped: {e})")
else:
    if not gpus:
        print("✓ Running on CPU")
    else:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
        except Exception as e:
            # A failure here (e.g. devices already initialised) does not
            # remove the GPU, so report it as a warning rather than
            # claiming the run is CPU-only.
            print(f"⚠ GPU memory growth could not be enabled: {e}")
        print(f"✓ GPU detected: {[gpu.name for gpu in gpus]}")

## 1. Trip-based シーケンス作成

個別のバス（trip_id）単位で時系列シーケンスを作成

In [None]:
# Build the trip-based sequence creator
# (input: 8 stops, output: predict 3 stops ahead).
trip_sequence_creator = TripSequenceCreator(
    feature_groups=feature_groups,
    input_timesteps=8,
    output_timesteps=3,
)

# Create the sequences; only trips with at least 12 stops are used
# (8 input + 3 output + 1 = 12).
sequence_outputs = trip_sequence_creator.create_trip_sequences(
    delay_features,
    min_stops=12,
    target_col='arrival_delay',
    spatial_organization=True,
)
X_delay, y_delay, trip_info, used_features, feature_group_info = sequence_outputs

# Summary of what was produced
print("\n=== Sequence Summary ===")
print(f"Used features: {used_features}")
print(f"Sequence shapes: X={X_delay.shape}, y={y_delay.shape}")
print("\nTrip info:")
print(trip_info.head())

## 2. データ分割（Trip-aware）

同じトリップのシーケンスが訓練データとテストデータの両方に含まれないよう、trip_id 単位で分割する。

In [None]:
# Trip-aware split: all sequences of a given trip go to exactly one side.
# `splitter` is not used for the split itself; it is created here because
# the ConvLSTM reshape step calls splitter.reshape_for_convlstm.
splitter = DataSplitter()

# The split indexes X_delay / y_delay by row position of trip_info, so the
# two must line up one-to-one.
if len(trip_info) != len(X_delay):
    raise ValueError(
        f"trip_info has {len(trip_info)} rows but X_delay has {len(X_delay)} "
        "sequences; they must align one-to-one for a trip-aware split"
    )

# Shuffle the unique trip ids with a fixed seed and cut at 90 %.
unique_trips = trip_info['trip_id'].unique()
np.random.seed(42)
np.random.shuffle(unique_trips)

train_size = int(len(unique_trips) * 0.9)
train_trips = set(unique_trips[:train_size])
test_trips = set(unique_trips[train_size:])

# Use positional indices rather than DataFrame index labels: the labels
# only equal the row positions when trip_info has a default RangeIndex.
train_idx = np.flatnonzero(trip_info['trip_id'].isin(train_trips).to_numpy())
test_idx = np.flatnonzero(trip_info['trip_id'].isin(test_trips).to_numpy())

X_train = X_delay[train_idx]
X_test = X_delay[test_idx]
y_train = y_delay[train_idx]
y_test = y_delay[test_idx]

print(f"=== Trip-aware Split Results ===")
print(f"Train trips: {len(train_trips):,}")
print(f"Test trips: {len(test_trips):,}")
print(f"Train sequences: {len(X_train):,}")
print(f"Test sequences: {len(X_test):,}")
print(f"Train/Test ratio: {len(X_train)/len(X_delay)*100:.1f}% / {len(X_test)/len(X_delay)*100:.1f}%")

# Leak check: the two id sets come from disjoint slices of the unique ids,
# so this should always report 0.
overlap = train_trips & test_trips
print(f"\nData leak check: {len(overlap)} overlapping trips (should be 0)")

## 3. データ標準化

In [None]:
# Standardize the input features only: the scaler is fitted on the training
# set and then applied to the test set. Targets are left unscaled.
standardizer = DataStandardizer()
X_train_scaled = standardizer.fit_transform_features(X_train)
X_test_scaled = standardizer.transform_features(X_test)

print("=== Standardization Results ===")
print(f"X_train_scaled: {X_train_scaled.shape}")
print(f"X_test_scaled: {X_test_scaled.shape}")

# Raw target statistics (seconds)
target_min = y_train.min()
target_max = y_train.max()
target_mean = y_train.mean()
target_std = y_train.std()
print("\ny_train statistics (no standardization):")
print(f"  Min: {target_min:.2f} sec")
print(f"  Max: {target_max:.2f} sec")
print(f"  Mean: {target_mean:.2f} sec ({target_mean/60:.2f} min)")
print(f"  Std: {target_std:.2f} sec")

## 4. ConvLSTM用にReshape

In [None]:
# Reshape for ConvLSTM: the feature axis is laid out as a 1 x N grid,
# where N is the number of features after standardization.
actual_feature_count = X_train_scaled.shape[2]
convlstm_grid = {'target_height': 1, 'target_width': actual_feature_count}

X_train_reshaped = splitter.reshape_for_convlstm(X_train_scaled, **convlstm_grid)
X_test_reshaped = splitter.reshape_for_convlstm(X_test_scaled, **convlstm_grid)

train_shape = X_train_reshaped.shape
print("=== ConvLSTM Reshape Results ===")
print(f"Training: {train_shape} (samples, timesteps, height, width, channels)")
print(f"Test: {X_test_reshaped.shape}")
print("\nInterpretation:")
print(f"  - timesteps: {train_shape[1]} stops")
print(f"  - width: {train_shape[3]} features (spatial dimension)")

## 5. モデル構築

In [None]:
# Build the model
INPUT_TIMESTEPS = 8
OUTPUT_TIMESTEPS = 3

model_trainer = DelayPredictionModel(
    output_timesteps=OUTPUT_TIMESTEPS,
    input_timesteps=INPUT_TIMESTEPS,
)

# (timesteps, height, width, channels) as passed to build_model
input_shape = (INPUT_TIMESTEPS, 1, actual_feature_count, 1)
delay_model = model_trainer.build_model(input_shape)

# Report the configuration and the targets this run is measured against
for report_line in (
    "=== Model Configuration ===",
    f"Input shape: {input_shape}",
    f"Feature count: {actual_feature_count}",
    f"Output shape: (batch_size, {OUTPUT_TIMESTEPS})",
    "\nExpected improvement:",
    "  - prev_stop_delay correlation: 0.84 (extremely strong)",
    "  - Expected R²: 0.65-0.75",
    "  - Expected MAE: 70-85 seconds",
):
    print(report_line)

## 6. モデル訓練

In [None]:
# Train the model
from datetime import datetime

print("=== Trip-based Model Training Started ===")

# Smoke test: push two samples through the untrained model to confirm the
# shapes are compatible before starting the full run.
print("\nTesting model with small sample...")
test_input, test_output = X_train_reshaped[:2], y_train[:2]
test_pred = model_trainer.model.predict(test_input, verbose=0)
print("Small sample test successful:")
print(f"  Input: {test_input.shape}")
print(f"  Output: {test_output.shape}")
print(f"  Predicted: {test_pred.shape}")

# Run the training; the checkpoint file name carries a timestamp
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
model_path = 'best_delay_model_trip_based_' + timestamp + '.h5'

history = model_trainer.train_model(
    X_train_reshaped,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    model_path=model_path,
)

if history is None:
    print("\n✗ Training failed")
else:
    print("\n✓ Training completed successfully!")
    print(f"  Model saved: {model_path}")

## 7. モデル評価

In [None]:
# Evaluate the model
print("=== Trip-based Model Evaluation ===")

# Predict on the held-out test sequences
predictions = model_trainer.predict(X_test_reshaped, batch_size=32)

print(f"Prediction shape: {predictions.shape}")
print(f"Actual shape: {y_test.shape}")

# Keep only the last output step when the arrays have more than one
# dimension; 1-D arrays are used as they are.
if predictions.ndim > 1:
    delay_pred_final = predictions[:, -1]
else:
    delay_pred_final = predictions

if y_test.ndim > 1:
    y_test_final = y_test[:, -1]
else:
    y_test_final = y_test

# Overall metrics and the per-delay-level breakdown
evaluator = ModelEvaluator()
overall_metrics = evaluator.calculate_delay_metrics(y_test_final, delay_pred_final)
delay_level_analysis = evaluator.analyze_by_delay_level(y_test_final, delay_pred_final)

# Print the evaluation summary
evaluator.print_evaluation_summary(overall_metrics, delay_level_analysis)

# Plot predictions against actual values
visualizer = ModelVisualizer()
visualizer.plot_prediction_analysis(y_test_final, delay_pred_final, overall_metrics)

## 8. 詳細分析

In [None]:
# Detailed analysis
print("=== Detailed Analysis ===")

# Breakdown by delay level
visualizer.plot_delay_level_analysis(y_test_final, delay_pred_final, delay_level_analysis)

# Training curves (only when training produced a history)
if history is not None:
    visualizer.plot_training_history(history)

# Save the final model under a timestamped name
final_model_path = 'delay_prediction_model_trip_based_final_' + timestamp + '.h5'
model_trainer.save_model(final_model_path)

# Headline metrics, in seconds with minutes in parentheses
print("\n=== Final Results (Trip-based) ===")
mae_sec = overall_metrics['mae']
print(f"• MAE: {mae_sec:.2f} sec ({mae_sec/60:.2f} min)")
rmse_sec = overall_metrics['rmse']
print(f"• RMSE: {rmse_sec:.2f} sec ({rmse_sec/60:.2f} min)")
r2_value = overall_metrics['r2']
print(f"• R² Score: {r2_value:.4f}")
print(f"• Direction accuracy: {overall_metrics['direction_accuracy']*100:.1f}%")

# Compare against the R² range expected for the trip-based approach
print("\n=== Expected vs Actual ===")
print("Expected R²: 0.65-0.75")
print(f"Actual R²: {r2_value:.4f}")
target_met = r2_value >= 0.65
print("✓ Target achieved!" if target_met else "⚠ Below target - check prev_stop_delay utilization")