In [None]:
import pandas as pd
import sys
from pathlib import Path
# Make the parent of the notebook's working directory importable so that the
# project-local `src` package resolves.
rootPath = Path.cwd().parent
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if str(rootPath) not in sys.path:
    sys.path.append(str(rootPath))

# Project-local (custom) modules; these imports must stay below the sys.path
# setup above.
from src.timeseries_processing import SequenceCreator, DataSplitter, DataStandardizer
from src.data_connection import DatabaseConnector

db_connector = DatabaseConnector()
# Dataset from which the evaluation sequences are built.
delay_features = pd.read_csv(f"{rootPath}/data/merged_dataset.csv")

In [None]:
# Disabled on purpose: when uncommented, this opens a connection through
# `db_connector` with autocommit enabled and CALLs two procedures in the
# `gtfs_realtime` schema (refresh_gtfs_views_staged, refresh_regional_views).
# NOTE(review): presumably only needed when the database views have to be
# refreshed before the dataset is re-exported — confirm before enabling.
# with db_connector.get_connection() as conn:
#     conn.autocommit = True
#     with conn.cursor() as cur:
#         cur.execute("CALL gtfs_realtime.refresh_gtfs_views_staged()")
#         cur.execute("CALL gtfs_realtime.refresh_regional_views();")

In [None]:
# Feature-group definition for the ConvLSTM input (customizable).
# Key order is significant only insofar as dict insertion order is preserved.
# NOTE(review): 'distance_from_downtown_km' appears twice in the 'region'
# group. It is kept as-is here because dropping it could change the input
# width the saved model expects — confirm whether a different column was
# intended.
feature_groups = dict(
    temporal=[
        'hour_sin',
        'hour_cos',
        'day_sin',
        'day_cos',
        'is_peak_hour',
        'is_weekend',
        'arrival_delay',
    ],
    region=[
        'region_id_encoded',
        'area_type_encoded',
        'distance_from_downtown_km',
        'distance_from_downtown_km',
    ],
    weather=[
        'weather_sunny',
        'weather_cloudy',
        'weather_rainy',
        'temp',
        'precipitation',
    ],
    target=['arrival_delay'],
)

# Build the time-series sequence creator (project module) with the custom
# feature_groups defined above: 8 input timesteps, 3 output timesteps.
sequence_creator = SequenceCreator(
    feature_groups=feature_groups,
    input_timesteps=8,
    output_timesteps=3,
)

# Create the sequences with the spatial arrangement for ConvLSTM enabled.
# No feature/target columns are passed explicitly; per the original note they
# are derived automatically by the creator.
sequence_outputs = sequence_creator.create_route_direction_aware_sequences(
    delay_features,
    spatial_organization=True,
)
X, y, route_direction_info, used_features, feature_group_info = sequence_outputs

print(f"使用された特徴量: {used_features}")
print(f"シーケンス形状: X={X.shape}, y={y.shape}")

# Show which slice of the width dimension each feature group occupies.
if feature_group_info:
    print("\n=== ConvLSTM Feature Groups (Width Dimension) ===")
    for group_name, info in feature_group_info.items():
        group_label = group_name.capitalize()
        width_slice = f"[{info['start_idx']}:{info['end_idx']}]"
        print(f"{group_label}: {info['features']} -> width indices {width_slice}")
        print(f"  Size: {info['size']} features")

# Standardize the input features only; the targets in `y` are not scaled here.
# NOTE(review): the scaler is fitted on this evaluation data itself — confirm
# this matches how the inputs were scaled when the model was trained.
standardizer = DataStandardizer()
X_train_scaled = standardizer.fit_transform_features(X)

# Reshape for ConvLSTM with height 1 and width equal to the size of axis 2 of
# the scaled array (the layout label is printed below).
splitter = DataSplitter()
actual_feature_count = X_train_scaled.shape[2]
X_reshaped = splitter.reshape_for_convlstm(
    X_train_scaled,
    target_height=1,
    target_width=actual_feature_count,
)

print("\n=== ConvLSTM Reshape Results ===")
print(f"Training data: {X_reshaped.shape} (samples, timesteps, height, width, channels)")

In [None]:
# モデルと評価クラスのインポート
import tensorflow as tf
from tensorflow import keras
from src.evaluation.model_evaluator import ModelEvaluator
import numpy as np

# Location of the saved model file.
model_path = f'{rootPath}/files/model/delay_prediction_final_region.h5'

# Custom objects so that the metric names stored in the older Keras format
# ('mse' / 'mae') can be resolved while loading.
custom_objects = dict(
    mse=tf.keras.metrics.MeanSquaredError(),
    mae=tf.keras.metrics.MeanAbsoluteError(),
)

# Load the model.
model = keras.models.load_model(model_path, custom_objects=custom_objects)

print(f"モデルを読み込みました: {model_path}")

In [None]:
# Run the prediction on the reshaped inputs, then flatten the model output
# into a 1-D array.
raw_predictions = model.predict(X_reshaped)
y_pred = raw_predictions.flatten()

In [None]:
# Initialize the evaluator.
evaluator = ModelEvaluator()

# Flatten the targets once and reuse them for both analyses.
y_true_flat = y.flatten()

# Overall metrics plus the breakdown by delay level.
overall_metrics = evaluator.calculate_delay_metrics(y_true_flat, y_pred)
delay_level_analysis = evaluator.analyze_by_delay_level(y_true_flat, y_pred)

# Show the results.
evaluator.print_evaluation_summary(overall_metrics, delay_level_analysis)

In [None]:
# Evaluate prediction accuracy separately for every region.
#
# For each `region_id` with enough rows, sequences are built from that
# region's rows only, scaled, reshaped for the ConvLSTM model, predicted, and
# scored with `evaluator.calculate_delay_metrics`. One record per region is
# collected into `region_results_df`, sorted by MAE (ascending).

# Regions with fewer rows than this are skipped (too few samples).
MIN_REGION_SAMPLES = 10

# Column layout of the per-region result table. Passing it explicitly keeps
# the DataFrame sortable by 'mae' even when no region qualifies.
REGION_RESULT_COLUMNS = [
    'region_id', 'area_type', 'sample_count',
    'mae', 'rmse', 'r2',
    'direction_accuracy', 'ontime_accuracy',
    'within_1min', 'within_2min', 'within_5min',
]

region_results = []
# The reshaping helper is created once, outside the loop.
region_splitter = DataSplitter()

print("=" * 80)
print("地域ごとの予測精度評価")
print("=" * 80)

# sort=False keeps the regions in order of first appearance in the data.
for region, region_data in delay_features.groupby('region_id', sort=False):
    if len(region_data) < MIN_REGION_SAMPLES:
        continue

    # Build the sequences from THIS region's rows. Passing the full
    # `delay_features` here would make every region report identical metrics.
    # Region-scoped names are used so the notebook-level X / y / X_reshaped
    # from the overall evaluation are not overwritten.
    X_region, y_region, _, _, _ = sequence_creator.create_route_direction_aware_sequences(
        region_data,
        spatial_organization=True,  # enable the spatial arrangement for ConvLSTM
    )

    # A region may have enough rows but still yield no complete sequence.
    if X_region is None or X_region.shape[0] == 0:
        continue

    # NOTE(review): the scaler is re-fitted on each region's own data, as in
    # the original cell. If the project exposes a transform-only method,
    # reusing the scaler fitted for the overall evaluation would keep the
    # inputs on one common scale — confirm.
    X_region_scaled = DataStandardizer().fit_transform_features(X_region)

    # NOTE(review): assumes the feature axis (axis 2) has the same size for a
    # regional subset as for the full dataset, otherwise the model input shape
    # will not match — confirm against SequenceCreator.
    X_region_reshaped = region_splitter.reshape_for_convlstm(
        X_region_scaled,
        target_height=1,
        target_width=X_region_scaled.shape[2],
    )
    y_pred_region = model.predict(X_region_reshaped, verbose=0).flatten()

    # Compute the evaluation metrics for this region.
    metrics = evaluator.calculate_delay_metrics(y_region.flatten(), y_pred_region)
    range_accuracies = metrics['range_accuracies']

    region_type = region_data['area_type'].iloc[0]
    # sample_count is the number of raw rows for the region (not sequences).
    sample_count = len(region_data)

    region_results.append({
        'region_id': region,
        'area_type': region_type,
        'sample_count': sample_count,
        'mae': metrics['mae'],
        'rmse': metrics['rmse'],
        'r2': metrics['r2'],
        'direction_accuracy': metrics['direction_accuracy'],
        'ontime_accuracy': metrics['ontime_accuracy'],
        'within_1min': range_accuracies['Within 1min'],
        'within_2min': range_accuracies['Within 2min'],
        'within_5min': range_accuracies['Within 5min'],
    })

    # Print the detailed result for this region.
    print(f"\n【{region} ({region_type})】")
    print(f"  サンプル数: {sample_count}")
    print(f"  MAE: {metrics['mae']:.2f} 秒")
    print(f"  RMSE: {metrics['rmse']:.2f} 秒")
    print(f"  R²: {metrics['r2']:.3f}")
    print(f"  方向予測精度: {metrics['direction_accuracy']*100:.1f}%")
    print(f"  定時予測精度: {metrics['ontime_accuracy']*100:.1f}%")
    print(f"  1分以内精度: {range_accuracies['Within 1min']*100:.1f}%")
    print(f"  2分以内精度: {range_accuracies['Within 2min']*100:.1f}%")

# Convert the collected records into a DataFrame sorted by MAE.
region_results_df = pd.DataFrame(
    region_results, columns=REGION_RESULT_COLUMNS
).sort_values('mae')

print("\n" + "=" * 80)

In [None]:
# Per-region accuracy summary table (rows follow region_results_df, which is
# sorted by MAE).
table_rule = "=" * 120

print("\n地域別予測精度サマリー（MAEの低い順）")
print(table_rule)
header_line = (
    f"{'Region Name':<30} {'Type':<12} {'Count':<8} "
    f"{'MAE':<8} {'RMSE':<8} {'R²':<8} {'Dir%':<8} {'1min%':<8}"
)
print(header_line)
print("-" * 120)

for _, region_row in region_results_df.iterrows():
    # Identity columns, error metrics, then accuracies shown as percentages.
    identity_part = (
        f"{region_row['region_id']:<30} {region_row['area_type']:<12} "
        f"{region_row['sample_count']:<8} "
    )
    error_part = (
        f"{region_row['mae']:<7.1f} {region_row['rmse']:<7.1f} "
        f"{region_row['r2']:<7.3f} "
    )
    accuracy_part = (
        f"{region_row['direction_accuracy']*100:<7.1f} "
        f"{region_row['within_1min']*100:<7.1f}"
    )
    print(identity_part + error_part + accuracy_part)

print(table_rule)

In [None]:
# Aggregate the per-region results by area type.
print("\n地域タイプ別の平均精度")
print("=" * 80)

# Sample counts are summed; every metric column is averaged across regions.
mean_columns = ['mae', 'rmse', 'r2', 'direction_accuracy', 'within_1min', 'within_2min']
aggregation_spec = {'sample_count': 'sum'}
aggregation_spec.update({column: 'mean' for column in mean_columns})

regions_by_type = region_results_df.groupby('area_type')
type_summary = regions_by_type.agg(aggregation_spec).round(3)

# Number of regions contributing to each area type.
type_summary['region_count'] = regions_by_type.size()

print(type_summary)
print("=" * 80)

In [None]:
# 可視化: 地域別MAEの比較
import matplotlib.pyplot as plt

# 2x2 figure comparing the per-region results.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax1, ax2), (ax3, ax4) = axes

# 1. MAE per region: the 20 regions with the lowest MAE.
top_20 = region_results_df.nsmallest(20, 'mae')
top_positions = range(len(top_20))
ax1.barh(top_positions, top_20['mae'], color='steelblue')
ax1.set_yticks(top_positions)
ax1.set_yticklabels(top_20['region_id'], fontsize=8)
ax1.set(xlabel='MAE (seconds)', title='Top 20 Areas by MAE (Lower is Better)')
ax1.invert_yaxis()  # lowest MAE at the top
ax1.grid(axis='x', alpha=0.3)

# 2. Mean MAE per area type, ascending.
type_mae = region_results_df.groupby('area_type')['mae'].mean().sort_values()
type_positions = range(len(type_mae))
ax2.bar(type_positions, type_mae.values, color='coral')
ax2.set_xticks(type_positions)
ax2.set_xticklabels(type_mae.index, rotation=45, ha='right')
ax2.set(ylabel='Average MAE (seconds)', title='Average MAE by Area Type')
ax2.grid(axis='y', alpha=0.3)

# 3. Scatter of MAE against sample count, colored by R².
scatter = ax3.scatter(
    region_results_df['sample_count'],
    region_results_df['mae'],
    c=region_results_df['r2'],
    cmap='viridis',
    s=100,
    alpha=0.6,
)
ax3.set(
    xlabel='Sample Count',
    ylabel='MAE (seconds)',
    title='MAE vs Sample Count (colored by R²)',
)
ax3.grid(alpha=0.3)
plt.colorbar(scatter, ax=ax3, label='R² Score')

# 4. Distribution of the within-1-minute accuracy, with the mean marked.
within_1min_pct = region_results_df['within_1min'] * 100
mean_within_1min_pct = region_results_df['within_1min'].mean() * 100
ax4.hist(within_1min_pct, bins=20, color='lightgreen', edgecolor='black')
ax4.set(
    xlabel='Accuracy within 1 minute (%)',
    ylabel='Number of Regions',
    title='Distribution of 1-Minute Accuracy Across Regions',
)
ax4.axvline(
    mean_within_1min_pct,
    color='red',
    linestyle='--',
    linewidth=2,
    label=f'Mean: {mean_within_1min_pct:.1f}%',
)
ax4.legend()
ax4.grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Five most accurate and five least accurate regions, ranked by MAE.
def _print_region_details(regions_frame):
    """Print sample count, error metrics and accuracies for each row."""
    for _, region_row in regions_frame.iterrows():
        print(f"\n{region_row['region_id']} ({region_row['area_type']})")
        print(f"  サンプル数: {region_row['sample_count']}")
        print(f"  MAE: {region_row['mae']:.2f}秒, RMSE: {region_row['rmse']:.2f}秒, R²: {region_row['r2']:.3f}")
        print(f"  1分以内精度: {region_row['within_1min']*100:.1f}%, 方向精度: {region_row['direction_accuracy']*100:.1f}%")


section_rule = "=" * 120

# Lowest MAE first.
print("\n最も精度が高い地域（MAEが低い）トップ5")
print(section_rule)
best_regions = region_results_df.nsmallest(5, 'mae')
_print_region_details(best_regions)

print("\n" + section_rule)

# Highest MAE first.
print("\n最も精度が低い地域（MAEが高い）トップ5")
print(section_rule)
worst_regions = region_results_df.nlargest(5, 'mae')
_print_region_details(worst_regions)

print(section_rule)

In [None]:
# Overview of the per-region result DataFrame (kept for follow-up analysis):
# row count, column names, and descriptive statistics of the metric columns.
metric_columns = ['mae', 'rmse', 'r2', 'direction_accuracy', 'within_1min', 'within_2min']
column_names = region_results_df.columns.tolist()

print("\n地域別評価結果のDataFrame:")
print(f"Total Regions: {len(region_results_df)}")
print(f"\nColumns: {column_names}")
print("\n統計サマリー:")
print(region_results_df[metric_columns].describe())