In [None]:
%load_ext autoreload
%autoreload 2

import warnings
# Suppress all warnings for the rest of the session so cell output stays compact.
# NOTE(review): this also hides deprecation/dtype warnings from pandas — confirm that is intended.
warnings.filterwarnings('ignore')

# Import the project's custom modules
import sys
# /app must be on the module search path before the `src.*` imports below run;
# presumably /app is the project root inside the container — confirm.
sys.path.append('/app')

from src.data_connection import DatabaseConnector, WeatherDataRetriever
from src.data_connection.gtfs_data_retriever_v2 import GTFSDataRetrieverV2
from src.data_preprocessing import DataPreprocessor, DataAggregator, FeatureEngineer

## 1. connect to database and retrieve data

### 1.1 connect to database

In [None]:
# Database connection setup (uses the project's custom connector module)
db_connector = DatabaseConnector()

# Connection smoke test: print one status line for either outcome.
# Note that a failed test is only reported; execution continues afterwards.
print(
    "Connecting to the database succeeded"
    if db_connector.test_connection()
    else "Database connection failed"
)

In [None]:
# Fetch GTFS data through GTFSDataRetrieverV2, reading from the Analytics MV
gtfs_retriever = GTFSDataRetrieverV2(db_connector)

# Only route 6618 is currently enabled. Route ids that were listed but disabled:
#   6840, 6620, 37807, 6613, 6617, 6614, 6635, 6637
# use_analytics_mv=True pulls from the Analytics MV, which (per the original
# note) already carries precomputed statistical and time-series features.
gtfs_data = gtfs_retriever.get_gtfs_data(
    route_id=['6618'],
    start_date='20250818',
    use_analytics_mv=True,
)

# Quick sanity report: row count, covered period, and available columns
print(
    f"Records retrieved: {len(gtfs_data)}",
    f"Data period: {gtfs_data['datetime'].min()} ～ {gtfs_data['datetime'].max()}",
    f"Available columns: {gtfs_data.columns.tolist()}",
    sep="\n",
)

# Keep a raw snapshot on disk, then show the first rows as the cell output
gtfs_data.to_csv('/app/data/raw/gtfs_data_sample.csv', index=False)
gtfs_data.head()

### 1.3 retrieve weather data

In [None]:
# Weather data retrieval (uses the project's custom retriever module)
weather_retriever = WeatherDataRetriever(db_connector)
weather_data = weather_retriever.get_weather_data()

# Quick sanity report: row count and covered period
print(
    f"Retrieved record count: {len(weather_data)}",
    f"Data period: {weather_data['datetime'].min()} ～ {weather_data['datetime'].max()}",
    sep="\n",
)

# Keep a raw snapshot on disk, then show the first rows as the cell output
weather_data.to_csv('/app/data/raw/weather_data_sample.csv', index=False)
weather_data.head()

## 2. preprocess data

### 2.1 check missing values

In [None]:
# Data preprocessing (uses the project's custom DataPreprocessor module).
# `preprocessor` is reused by the filtering cell further down.
preprocessor = DataPreprocessor()

# Missing-value summary for the GTFS data. The heading is printed before the
# call in case the helper emits output of its own (not visible from here).
print("=== GTFS data ===")
gtfs_missing_summary = preprocessor.show_missing_data_summary(gtfs_data)
print(gtfs_missing_summary)

# Same summary for the weather data; the leading "\n" separates the two reports.
print("\n=== Weather data ===")
weather_missing_summary = preprocessor.show_missing_data_summary(weather_data)
print(weather_missing_summary)

### 2.2 delete missing values and outliers

In [None]:
# Drop rows with a missing travel_time_duration.
# (Per the original note, the Analytics MV has already applied
# travel_time_duration handling and basic filtering.)
filtered_gtfs_data = preprocessor.delete_missing_values(gtfs_data, ['travel_time_duration'])

# Re-check missing values after the drop
filtered_missing_summary = preprocessor.show_missing_data_summary(filtered_gtfs_data)
print(filtered_missing_summary)

# Outlier removal: the Analytics MV is already coarsely filtered, so this adds
# the asymmetric-threshold pass on top. The cleaner flags rows in a
# `should_exclude` column; keep only the rows that are NOT flagged.
filtered_gtfs_data = (
    preprocessor.clean_gtfs_with_asymmetric_thresholds(filtered_gtfs_data)
    .loc[lambda flagged: ~flagged['should_exclude']]
    .copy()
)

# Report how many records survived the whole filtering step
print(
    f"=== Filter Results ===",
    f"Delay prediction filtered data: {len(gtfs_data)} -> {len(filtered_gtfs_data)} records",
    sep="\n",
)

### 2.3 create features

### 2.4 aggregate data

In [None]:
# Data aggregation helpers
aggregator = DataAggregator()
feature_engineer = FeatureEngineer()

# Per the original notes, the Analytics MV already ships the statistical
# features (delay_mean_by_route_hour, travel_mean_by_route_hour) and the
# geographic features, so generate_statistical_features() is not called here.

# Aggregate the delay data (60-minute time buckets, per the original note)
delay_aggregated = aggregator.create_delay_aggregation(filtered_gtfs_data)

# Aggregate the weather data
weather_aggregated = aggregator.create_weather_aggregation(weather_data)

# Report record counts before/after aggregation and the resulting columns
print(
    f"=== Optimized Aggregation Results (Analytics MV) ===",
    f"Delay prediction aggregated data: {len(filtered_gtfs_data)} -> {len(delay_aggregated)} records",
    f"Weather aggregated data: {len(weather_data)} -> {len(weather_aggregated)} records",
    f"\n=== Aggregated Data Quality Check ===",
    f"Delay aggregation features: {delay_aggregated.columns.tolist()}",
    f"Weather aggregation features: {weather_aggregated.columns.tolist()}",
    sep="\n",
)

# Confirm that the statistical features survived the aggregation
if 'delay_mean_by_route_hour' in delay_aggregated.columns:
    print("\n✓ Statistical features (delay_mean_by_route_hour) are present from Analytics MV")
if 'travel_mean_by_route_hour' in delay_aggregated.columns:
    print("✓ Statistical features (travel_mean_by_route_hour) are present from Analytics MV")

# Confirm the geographic features (only region_id is actually checked) and
# show their summary via the feature engineer
if 'region_id' in delay_aggregated.columns:
    print("✓ Geographic features (region_id, lat/lon, area_type) are present from Analytics MV")
    feature_engineer.get_geographic_feature_summary(delay_aggregated)

### 2.5 merge gtfs and weather data

In [None]:
# Build the delay feature table in two steps:
#   1. regenerate time features on the aggregated data (the Analytics MV has
#      some of them, but they are rebuilt for the aggregated rows);
#   2. encode the geographic features (region_id / area_type label encoding,
#      per the original note), fitting the encoders here (fit=True).
delay_features = feature_engineer.encode_geographic_features(
    feature_engineer.generate_time_features(delay_aggregated),
    fit=True,
)

# Join the GTFS delay features with the aggregated weather data
merged_data = feature_engineer.merge_features(delay_features, weather_aggregated)

# Post-merge quality report: record counts and the final column list
print(
    f"\n=== Feature Merge Analysis ===",
    f"Before merge - Delay data: {len(delay_aggregated)} records",
    f"Before merge - Weather data: {len(weather_aggregated)} records",
    f"After merge:                {len(merged_data)} records",
    f"\n=== Final Feature Columns ===",
    f"Total columns: {len(merged_data.columns)}",
    f"Columns: {merged_data.columns.tolist()}",
    sep="\n",
)

# Confirm the encoded geographic columns made it through the merge
if 'region_id_encoded' in merged_data.columns:
    print(f"\n✓ region_id encoded: {merged_data['region_id_encoded'].nunique()} unique values")
if 'area_type_encoded' in merged_data.columns:
    print(f"✓ area_type encoded: {merged_data['area_type_encoded'].nunique()} unique values")

merged_data.head()

In [None]:
# Persist the merged GTFS + weather dataset as CSV, without the index column.
# Note: this goes to /app/data/, not /app/data/raw/ like the raw snapshots above.
merged_data.to_csv('/app/data/merged_dataset.csv', index=False)