In [None]:
import warnings
# Suppress all warnings for the rest of the session; this applies to every cell run afterwards.
warnings.filterwarnings('ignore')

# Import the project's custom modules
import sys
# Extend the module search path so the `src` package imported below can be resolved
# (assumes `src` lives under this directory — TODO confirm).
sys.path.append('/workspace/GTFS')

from src.data_connection import DatabaseConnector, GTFSDataRetriever, WeatherDataRetriever
from src.data_preprocessing import DataPreprocessor, DataAggregator, FeatureEngineer

## 1. connect to database and retrieve data

### 1.1 connect to database

In [None]:
# Database connection setup (uses the project's custom connector)
db_connector = DatabaseConnector()

# Connection test: probe once, then report the outcome.
# NOTE(review): a failed test is only reported here; later cells still run.
connection_ok = db_connector.test_connection()
print("Connecting to the database succeeded" if connection_ok else "Database connection failed")

In [None]:
# GTFS data retrieval (uses the project's custom retriever)
gtfs_retriever = GTFSDataRetriever(db_connector)

# Routes to request; only '6618' is enabled.
# Route ids the author left disabled: 6840, 6620, 37807, 6613, 6617, 6614, 6635, 6637
target_route_ids = ['6618']
gtfs_data = gtfs_retriever.get_gtfs_data(route_id=target_route_ids, start_date='20250818')

# Report how many rows came back and the span of the 'datetime' column
print(f"Records retrieved: {len(gtfs_data)}")
gtfs_period_start = gtfs_data['datetime'].min()
gtfs_period_end = gtfs_data['datetime'].max()
print(f"Data period: {gtfs_period_start} ～ {gtfs_period_end}")

# Write a CSV snapshot of the retrieved rows, then preview the first rows as the cell output
gtfs_data.to_csv('/workspace/GTFS/data/raw/gtfs_data_sample.csv', index=False)
gtfs_data.head()

### 1.3 retrieve weather data

In [None]:
# Weather data retrieval (uses the project's custom retriever)
weather_retriever = WeatherDataRetriever(db_connector)
weather_data = weather_retriever.get_weather_data()

# Report how many rows came back and the span of the 'datetime' column
print(f"Retrieved record count: {len(weather_data)}")
weather_period_start = weather_data['datetime'].min()
weather_period_end = weather_data['datetime'].max()
print(f"Data period: {weather_period_start} ～ {weather_period_end}")

# Write a CSV snapshot of the retrieved rows, then preview the first rows as the cell output
weather_data.to_csv('/workspace/GTFS/data/raw/weather_data_sample.csv', index=False)
weather_data.head()

## 2. データの前処理

### 2.1 check missing values

In [None]:
# Data preprocessing (uses the project's custom module).
# Builds one preprocessor and prints a missing-data summary for each raw dataset.
# NOTE(review): show_missing_data_summary may print on its own as well — keep the
# header/summary statement order as is until that is confirmed.
preprocessor = DataPreprocessor()

print("=== GTFS data ===")
gtfs_missing_summary = preprocessor.show_missing_data_summary(gtfs_data)
print(gtfs_missing_summary)

print("\n=== Weather data ===")
weather_missing_summary = preprocessor.show_missing_data_summary(weather_data)
print(weather_missing_summary)

### 2.2 delete missing values and outliers

In [None]:
# Delete missing values, checked against the two travel-time columns
required_columns = ['travel_time_duration', 'travel_time_raw_seconds']
filtered_gtfs_data = preprocessor.delete_missing_values(gtfs_data, required_columns)

# Re-check the missing-value summary after the deletion
filtered_missing_summary = preprocessor.show_missing_data_summary(filtered_gtfs_data)
print(filtered_missing_summary)

# Delete outliers: the cleaning step supplies a 'should_exclude' column,
# and only the rows where it is not set are kept.
filtered_gtfs_data = preprocessor.clean_gtfs_with_asymmetric_thresholds(filtered_gtfs_data)
keep_mask = ~filtered_gtfs_data['should_exclude']
filtered_gtfs_data = filtered_gtfs_data[keep_mask]

# Row counts before any filtering vs. after both filtering steps
print("=== Filter Results ===")
print(f"Delay prediction filtered data: {len(gtfs_data)} -> {len(filtered_gtfs_data)} records")

### 2.3 create features

### 2.4 aggregate data

In [None]:
# Data aggregation: helpers from the project's preprocessing module
aggregator = DataAggregator()
feature_engineer = FeatureEngineer()

# Statistical features are generated from the filtered GTFS rows first;
# the delay aggregation consumes that result, while weather is aggregated from the raw frame.
processed_gtfs_data = feature_engineer.generate_statistical_features(filtered_gtfs_data)
delay_aggregated = aggregator.create_delay_aggregation(processed_gtfs_data)
weather_aggregated = aggregator.create_weather_aggregation(weather_data)

# Record counts before -> after aggregation
delay_rows_in, delay_rows_out = len(processed_gtfs_data), len(delay_aggregated)
weather_rows_in, weather_rows_out = len(weather_data), len(weather_aggregated)
print("=== Optimized Aggregation Results ===")
print(f"Delay prediction aggregated data: {delay_rows_in} -> {delay_rows_out} records")
print(f"Weather aggregated data: {weather_rows_in} -> {weather_rows_out} records")

# Column names produced by each aggregation
delay_columns = delay_aggregated.columns.tolist()
weather_columns = weather_aggregated.columns.tolist()
print("\n=== Aggregated Data Quality Check ===")
print(f"Delay aggregation features: {delay_columns}")
print(f"Weather aggregation features: {weather_columns}")

### 2.5 merge gtfs and weather data

In [None]:
# Generate time features on the aggregated delay data, then merge with the weather aggregates
delay_features = feature_engineer.generate_time_features(delay_aggregated)
merged_data = feature_engineer.merge_features(delay_features, weather_aggregated)

# Data quality check after the merge: record counts before and after
delay_record_count = len(delay_aggregated)
weather_record_count = len(weather_aggregated)
merged_record_count = len(merged_data)
print("\n=== 特徴量結合後の分析 ===")
print(f"結合前 - 遅延データ: {delay_record_count} レコード")
print(f"結合前 - 気象データ: {weather_record_count} レコード")
print(f"結合後:             {merged_record_count} レコード")

# Preview the first rows as the cell output
merged_data.head()

In [None]:
# Write the merged dataset to CSV without the index column
merged_output_path = '/workspace/GTFS/data/merged_dataset.csv'
merged_data.to_csv(merged_output_path, index=False)