In [1]:
import warnings
# Silences every warning for the rest of the session (no category filter is given).
warnings.filterwarnings('ignore')

# Import the project-local modules.
import sys
# Put the project root on the import path so the `src` package resolves.
# Assumes the repository lives at /workspace/GTFS.
sys.path.append('/workspace/GTFS')

from src.data_connection import DatabaseConnector, GTFSDataRetriever, WeatherDataRetriever
from src.data_preprocessing import DataPreprocessor, DataAggregator, FeatureEngineer

2025-09-21 01:06:57.917983: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-09-21 01:06:57.918543: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-21 01:06:57.999651: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-09-21 01:07:00.491969: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off,

## 1. connect to database and retrieve data

### 1.1 connect to database

In [2]:
# Database connection setup (uses the project's DatabaseConnector).
db_connector = DatabaseConnector()

# Connection check. Every later cell queries through `db_connector`, so stop
# here on failure instead of printing and letting "Run All" continue into
# cells that would then fail with a less obvious error.
# NOTE(review): the recorded output of this cell shows the environment name,
# database name and host being echoed — clear the output before sharing.
if not db_connector.test_connection():
    raise RuntimeError("Database connection failed")
print("Connecting to the database succeeded")

Database connection successful
Environment: production
Database: db5q08f70eh8ap
Host: c80eji844tr0op.cluster-czrs8kj4isg7.us-east-1.rds.amazonaws.com
Connecting to the database succeeded


### 1.2 retrieve gtfs data

In [13]:
# Fetch GTFS data (uses the project's GTFSDataRetriever).
gtfs_retriever = GTFSDataRetriever(db_connector)
# NOTE(review): start_date is 20250818, yet the recorded output of this cell
# starts at 2025-09-14 — confirm whether the source only holds recent data or
# the stored output is stale.
gtfs_data = gtfs_retriever.get_gtfs_data(route_id='6612', start_date='20250818')

# Report the row count and the earliest/latest value of the `datetime` column.
print(f"Records retrieved: {len(gtfs_data)}")
print(f"Data period: {gtfs_data['datetime'].min()} ～ {gtfs_data['datetime'].max()}")
# Save a raw snapshot to disk (index column omitted), then preview the first rows.
gtfs_data.to_csv('/workspace/GTFS/data/raw/gtfs_data_sample.csv', index=False)
gtfs_data.head()

Retrieving Vancouver delay prediction GTFS data for route 6612...
Records retrieved: 22669
Data period: 2025-09-14 01:22:30-07:00 ～ 2025-09-20 18:28:20-07:00


Unnamed: 0,datetime,day_of_week,line_direction_link_order,trip_id,stop_id,start_date,route_id,direction_id,travel_time_raw_seconds,arrival_delay,hour_of_day,datetime_60,time_period_basic,travel_time_duration
0,2025-09-14 06:28:36-07:00,7.0,16,14672431,61,20250914,6612,0,,27,6.0,2025-09-14 06:00:00-07:00,1,
1,2025-09-14 06:29:56-07:00,7.0,17,14672431,63,20250914,6612,0,80.0,61,6.0,2025-09-14 06:00:00-07:00,1,80.0
2,2025-09-14 06:30:13-07:00,7.0,18,14672431,64,20250914,6612,0,17.0,57,6.0,2025-09-14 06:00:00-07:00,1,17.0
3,2025-09-14 06:31:02-07:00,7.0,19,14672431,12040,20250914,6612,0,49.0,42,6.0,2025-09-14 06:00:00-07:00,1,49.0
4,2025-09-14 06:31:44-07:00,7.0,20,14672431,10542,20250914,6612,0,42.0,37,6.0,2025-09-14 06:00:00-07:00,1,42.0


### 1.3 retrieve weather data

In [4]:
# Fetch weather data (uses the project's WeatherDataRetriever).
weather_retriever = WeatherDataRetriever(db_connector)
weather_data = weather_retriever.get_weather_data()

# Report the row count and the earliest/latest value of the `datetime` column.
print(f"Retrieved record count: {len(weather_data)}")
print(f"Data period: {weather_data['datetime'].min()} ～ {weather_data['datetime'].max()}")

# Save a raw snapshot to disk (index column omitted), then preview the first rows.
weather_data.to_csv('/workspace/GTFS/data/raw/weather_data_sample.csv', index=False)
weather_data.head()

Retrieving weather data...
Retrieved record count: 10949
Data period: 2024-06-20 23:00:00-07:00 ～ 2025-09-20 03:00:00-07:00


Unnamed: 0,datetime,temp,relative_humidity,pressure_sea,wind_speed,visibility,humidex,cloud_cover_8,weather_sunny,weather_cloudy,weather_rainy,precipitation
0,2024-06-20 23:00:00-07:00,16.7,84.0,101.46,3,24100.0,20.13,3.0,0,1,0,0.0
1,2024-06-21 00:00:00-07:00,16.0,90.0,101.46,3,24100.0,19.61,2.0,1,0,0,0.0
2,2024-06-21 01:00:00-07:00,16.9,81.0,101.45,7,24100.0,20.1,1.0,1,0,0,0.0
3,2024-06-21 02:00:00-07:00,15.8,61.0,101.46,6,19300.0,16.38,1.0,1,0,0,0.0
4,2024-06-21 03:00:00-07:00,15.5,68.0,101.47,4,19300.0,16.65,1.0,1,0,0,0.0


## 2. data preprocessing

### 2.1 check missing values

In [5]:
# Data preprocessing (uses the project's DataPreprocessor).
# `preprocessor` is reused by the following cells.
preprocessor = DataPreprocessor()

# Per-column missing-value summary for the GTFS data.
print("=== GTFS data ===")
gtfs_missing_summary = preprocessor.show_missing_data_summary(gtfs_data)
print(gtfs_missing_summary)

# Same summary for the weather data.
print("\n=== Weather data ===")
weather_missing_summary = preprocessor.show_missing_data_summary(weather_data)
print(weather_missing_summary)

=== GTFS data ===
                           Missing Count Missing Percentage
datetime                               0               0.0%
day_of_week                            0               0.0%
line_direction_link_order              0               0.0%
trip_id                                0               0.0%
stop_id                                0               0.0%
start_date                             0               0.0%
route_id                               0               0.0%
direction_id                           0               0.0%
travel_time_raw_seconds             1210               5.5%
arrival_delay                          0               0.0%
hour_of_day                            0               0.0%
datetime_60                            0               0.0%
time_period_basic                      0               0.0%
travel_time_duration                1210               5.5%

=== Weather data ===
                   Missing Count Missing Percentage
datetime

### 2.2 delete missing values and outliers

In [6]:
# Remove missing values for the two travel-time columns (uses the project's
# DataPreprocessor); the result is kept under a new name so `gtfs_data` stays intact.
filtered_gtfs_data = preprocessor.delete_missing_values(
    gtfs_data, ['travel_time_duration', 'travel_time_raw_seconds']
)

# Re-check missing values after the deletion.
filtered_missing_summary = preprocessor.show_missing_data_summary(filtered_gtfs_data)
print(filtered_missing_summary)

# Outlier removal is currently disabled: the call below is commented out, so
# this step only handles missing values.
# filtered_gtfs_data = preprocessor.remove_outliers_mad(filtered_gtfs_data)

                           Missing Count Missing Percentage
datetime                               0               0.0%
day_of_week                            0               0.0%
line_direction_link_order              0               0.0%
trip_id                                0               0.0%
stop_id                                0               0.0%
start_date                             0               0.0%
route_id                               0               0.0%
direction_id                           0               0.0%
travel_time_raw_seconds                0               0.0%
arrival_delay                          0               0.0%
hour_of_day                            0               0.0%
datetime_60                            0               0.0%
time_period_basic                      0               0.0%
travel_time_duration                   0               0.0%


### 2.3 create features

In [7]:
# Advanced preprocessing (uses the project's DataPreprocessor).
print("Applying sophisticated preprocessing to GTFS data...")

# Work on a copy of the filtered frame, then apply the time-feature step
# followed by the statistical-feature step (inner call runs first).
processed_gtfs_data = preprocessor.generate_statistical_features(
    preprocessor.generate_time_features(filtered_gtfs_data.copy())
)

Applying sophisticated preprocessing to GTFS data...


### 2.4 aggregate data

In [8]:
# Data aggregation (uses the project's DataAggregator).
aggregator = DataAggregator()

# Aggregate the feature-enriched GTFS rows and the raw weather rows separately;
# both results are consumed by the merge step that follows.
delay_aggregated = aggregator.create_delay_aggregation(processed_gtfs_data)
weather_aggregated = aggregator.create_weather_aggregation(weather_data)

# Row counts before -> after aggregation. Plain string for the banner: it has
# no placeholders, so an f-string prefix is unnecessary.
print("=== Optimized Aggregation Results ===")
print(f"Delay prediction aggregated data: {len(processed_gtfs_data)} -> {len(delay_aggregated)} records")
print(f"Weather aggregated data: {len(weather_data)} -> {len(weather_aggregated)} records")

# Column lists of both aggregated frames, as a quick schema check.
print("\n=== Aggregated Data Quality Check ===")
print(f"Delay aggregation features: {delay_aggregated.columns.tolist()}")
print(f"Weather aggregation features: {weather_aggregated.columns.tolist()}")

=== Optimized Aggregation Results ===
Delay prediction aggregated data: 20955 -> 6226 records
Weather aggregated data: 10949 -> 10949 records

=== Aggregated Data Quality Check ===
Delay aggregation features: ['route_id', 'direction_id', 'stop_id', 'line_direction_link_order', 'time_bucket', 'day_of_week', 'arrival_delay', 'observation_count', 'travel_time_duration', 'hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'delay_mean_by_route_hour', 'delay_deviation']
Weather aggregation features: ['time_bucket', 'temp', 'precipitation', 'humidex', 'wind_speed', 'weather_sunny', 'weather_cloudy', 'weather_rainy']


### 2.5 merge gtfs and weather data

In [9]:
feature_engineer = FeatureEngineer()

# Combine the aggregated delay rows with the aggregated weather rows.
# NOTE(review): the recorded output reports 6226 delay rows before the merge
# and 5739 after, so some delay rows are lost here — presumably those without
# a matching weather record (the weather data ends 2025-09-20 03:00 while the
# GTFS data runs until 18:28 that day). Confirm this is intended.
delay_features = feature_engineer.merge_features(delay_aggregated, weather_aggregated)

# Preview the merged frame.
delay_features.head()


=== 特徴量結合後の分析 ===
結合前 - 遅延データ: 6226 レコード
結合前 - 気象データ: 10949 レコード
結合後:             5739 レコード


Unnamed: 0,route_id,direction_id,stop_id,line_direction_link_order,time_bucket,day_of_week,arrival_delay,observation_count,travel_time_duration,hour_sin,...,day_cos,delay_mean_by_route_hour,delay_deviation,temp,precipitation,humidex,wind_speed,weather_sunny,weather_cloudy,weather_rainy
0,6612,0,10542,10,2025-09-15 15:00:00-07:00,1.0,-145.0,2,55.0,-0.707107,...,0.62349,-268.263,123.263,18.5,0.0,21.3,14.0,1.0,0.0,0.0
1,6612,0,10542,10,2025-09-17 11:00:00-07:00,3.0,-352.0,2,57.5,0.258819,...,-0.900969,-351.273,-0.727,20.3,0.0,23.38,9.0,0.0,0.0,1.0
2,6612,0,10542,10,2025-09-18 15:00:00-07:00,4.0,-138.5,2,54.5,-0.707107,...,-0.900969,-268.263,129.763,18.1,0.0,21.24,25.0,1.0,0.0,0.0
3,6612,0,10542,20,2025-09-14 06:00:00-07:00,7.0,18.5,2,46.5,1.0,...,1.0,-305.543,324.043,16.8,0.5,20.06,17.0,0.0,0.0,1.0
4,6612,0,10542,20,2025-09-14 07:00:00-07:00,7.0,-216.666667,3,42.666667,0.965926,...,1.0,-313.266,96.599333,16.6,0.5,19.86,17.0,0.0,0.0,1.0


In [10]:
# Persist the merged delay + weather feature table as CSV (index column omitted).
delay_features.to_csv('/workspace/GTFS/data/merged_dataset.csv', index=False)