# 02. Process Data
Weather processing, merge, feature engineering.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import utils

# Load the raw data. The None check below suggests utils.load_data() can
# return None, in which case no filtered frame is created.
base_df = utils.load_data()
if base_df is not None:
    # Keep rows whose arrival_delay lies in [-600, 1800], both ends inclusive.
    base_df_filtered = base_df[base_df['arrival_delay'].between(-600, 1800)]

In [None]:
# Number of missing values in each column of the raw data.
base_df.isna().sum()

同じ日の同一tripでも、最寄りのバス停が2分以上変わらないパターンが存在する（始発での停車、長距離の移動、終点など）。加工して１つに制限する必要がある。

In [None]:
# Rows per (start_date, trip_id, stop_sequence); display only the
# combinations that occur more than once.
counts = base_df.groupby(by=['start_date', 'trip_id', 'stop_sequence']).size()
counts.loc[counts > 1]

同じ日のtripで、2分間のうちに複数のバス停を通過すると、その間のstop_sequenceの値は穴あき（欠番）になる。それを調査する。

In [None]:
# Check whether the stop_sequence numbering has gaps
def check_sequence_gaps(df):
    """Report the stop_sequence values missing from each trip.

    A trip is one (start_date, trip_id) pair. For each trip the observed
    stop_sequence values are compared with the full integer range between
    the smallest and the largest observed value.

    Args:
        df: DataFrame with 'start_date', 'trip_id' and 'stop_sequence'
            columns. stop_sequence must hold integers because the bounds
            are passed to range().

    Returns:
        DataFrame with one row per trip that has at least one missing
        value, with the columns start_date, trip_id, min_seq, max_seq,
        missing_seqs (sorted list) and gap_count. The columns are present
        even when no trip has a gap, so callers can rely on the schema.
    """
    columns = ['start_date', 'trip_id', 'min_seq', 'max_seq', 'missing_seqs', 'gap_count']
    gaps = []
    for (date, trip_id), group in df.groupby(['start_date', 'trip_id']):
        seen = set(group['stop_sequence'].unique())
        # Compute the bounds once; they are reused for the range and the report.
        low, high = min(seen), max(seen)
        missing = sorted(set(range(low, high + 1)) - seen)
        if missing:
            gaps.append({
                'start_date': date,
                'trip_id': trip_id,
                'min_seq': low,
                'max_seq': high,
                'missing_seqs': missing,
                'gap_count': len(missing),
            })
    # Passing columns keeps the result schema stable when gaps is empty.
    return pd.DataFrame(gaps, columns=columns)

# Run the gap check on the raw data, print how many trips have gaps, and
# preview the first 10 of them.
gaps_df = check_sequence_gaps(base_df)
print(f"穴あきがあるtrip数: {len(gaps_df)}")
gaps_df.head(10)

In [None]:
# Prepare Trip Data (Aggregation)
def prepare_trip_data(df, direction_id=None):
    """
    Collapse the input to one row per (trip_key, stop_sequence).

    trip_key is built from start_date, route_id, direction_id and trip_id.
    Rows sharing a trip_key and stop_sequence are reduced to one arrival
    delay: the maximum at the trip's lowest stop_sequence, the minimum at
    its highest stop_sequence, and the first non-null value everywhere
    else. A row that matches both bounds (a trip with a single
    stop_sequence) is treated as the last stop. Delays still missing after
    that are filled by linear interpolation within the trip, in both
    directions.

    Args:
        df: Input DataFrame with start_date, route_id, direction_id,
            trip_id, stop_sequence and arrival_delay columns. It is
            copied, not modified.
        direction_id: Direction to keep (None keeps every direction).

    Returns:
        DataFrame sorted by trip_key and stop_sequence, holding the two
        key columns, the first non-null value of every other column, and
        arrival_delay_agg as float32.
    """
    key_cols = ['trip_key', 'stop_sequence']

    if direction_id is None:
        data = df.copy()
    else:
        data = df[df['direction_id'] == direction_id].copy()

    # String forms shared by both composite keys.
    route_txt = data['route_id'].astype(str)
    direction_txt = data['direction_id'].astype(str)

    # Unique trip identifier: start_date + route_id + direction_id + trip_id.
    data['trip_key'] = (
        data['start_date'].astype(str) + '_' + route_txt + '_'
        + direction_txt + '_' + data['trip_id'].astype(str)
    )

    # Route/direction identifier: route_id + direction_id.
    data['route_direction_key'] = route_txt + '_' + direction_txt

    # Lowest and highest stop_sequence observed in each trip.
    seq_range = (
        data.groupby('trip_key')['stop_sequence']
        .agg(['min', 'max'])
        .rename(columns={'min': 'seq_min', 'max': 'seq_max'})
        .reset_index()
    )
    data = data.merge(seq_range, on='trip_key', how='left')

    on_first = data['stop_sequence'] == data['seq_min']
    on_last = data['stop_sequence'] == data['seq_max']

    # One reduction per stop type; the last-stop test wins over the first.
    reductions = (
        (on_first & ~on_last, 'max'),       # first stop
        (~(on_first | on_last), 'first'),   # middle stops
        (on_last, 'min'),                   # last stop
    )
    delay_parts = []
    for rows, how in reductions:
        reduced = data[rows].groupby(key_cols)['arrival_delay'].agg(how).reset_index()
        delay_parts.append(reduced.rename(columns={'arrival_delay': 'arrival_delay_agg'}))
    agg_delays = pd.concat(delay_parts)

    # Every remaining column contributes its first non-null value per stop.
    skipped = {'arrival_delay', 'stop_type', 'seq_min', 'seq_max', *key_cols}
    carried_cols = [name for name in data.columns if name not in skipped]
    data_unique = data.groupby(key_cols, as_index=False)[carried_cols].first()
    data_unique = data_unique.merge(agg_delays, on=key_cols, how='left').sort_values(key_cols)

    # Fill missing delays by linear interpolation inside each trip.
    def _interpolate(delays):
        return delays.interpolate(method='linear', limit_direction='both')

    filled = data_unique.groupby('trip_key')['arrival_delay_agg'].transform(_interpolate)

    # Store the result as float32.
    data_unique['arrival_delay_agg'] = filled.astype('float32')

    return data_unique

if base_df is not None:
    # Build the per-stop trip table from the delay-filtered rows, keeping
    # every direction, then report how many distinct trips it contains.
    print("Preparing trip data...")
    df_process = prepare_trip_data(base_df_filtered, direction_id=None)
    print(f"Processed trips: {df_process['trip_key'].nunique()}")

# Feature Engineering

In [None]:
def process_features(df_process):
    """Add time, calendar and alert features to the trip table.

    The columns are added to ``df_process`` in place and the same object is
    returned.

    Args:
        df_process: DataFrame with 'scheduled_arrival_time', 'start_date'
            (formatted as YYYYMMDD) and 'route_direction_key' columns.
            'alert_effect_detour' and 'alert_police_activity' are optional;
            when one is absent its flag column is set to 0.

    Returns:
        The input DataFrame with these columns added: time_of_day, hour,
        time_sin, time_cos, day_of_week, is_weekend, is_rush_hour,
        has_detour, has_police_alert and route_direction_encoded.
    """
    # NOTE(review): utc=True makes hour / time_of_day UTC clock values, so
    # the 14-18 rush-hour window below is expressed in UTC as well --
    # confirm this matches the intended local-time window.
    scheduled_time = pd.to_datetime(df_process['scheduled_arrival_time'], utc=True)
    df_process['time_of_day'] = scheduled_time.dt.hour + scheduled_time.dt.minute / 60
    df_process['hour'] = scheduled_time.dt.hour

    # Encode the time of day on a 24-hour circle so 23:59 and 00:00 are close.
    angle = 2 * np.pi * df_process['time_of_day'] / 24
    df_process['time_sin'] = np.sin(angle)
    df_process['time_cos'] = np.cos(angle)

    df_process['day_of_week'] = pd.to_datetime(df_process['start_date'], format='%Y%m%d').dt.dayofweek
    # dayofweek runs from 0 (Monday) to 6 (Sunday), so the weekend is 5 and 6.
    # The previous threshold of 6 flagged Sunday only.
    df_process['is_weekend'] = (df_process['day_of_week'] >= 5).astype(int)

    # v2 features
    df_process['is_rush_hour'] = ((df_process['hour'] >= 14) & (df_process['hour'] <= 18)).astype(int)

    # Binary alert flags: 1 when the source column is positive, 0 otherwise
    # (including when the source column does not exist).
    alert_flags = (
        ('alert_effect_detour', 'has_detour'),
        ('alert_police_activity', 'has_police_alert'),
    )
    for source_col, flag_col in alert_flags:
        if source_col in df_process.columns:
            df_process[flag_col] = (df_process[source_col] > 0).astype(int)
        else:
            df_process[flag_col] = 0

    # Integer code per route/direction pair. The fitted encoder is not kept,
    # so the mapping back to keys is only recoverable from the two columns.
    rd_encoder = LabelEncoder()
    df_process['route_direction_encoded'] = rd_encoder.fit_transform(df_process['route_direction_key'])

    return df_process

if base_df is not None:
    # Add the engineered columns to the trip table.
    print("Applying feature engineering...")
    df_process = process_features(df_process)

    # Share of rows with each binary flag set, printed as a percentage.
    print("New features created:")
    print(f"  is_rush_hour: {df_process['is_rush_hour'].mean():.1%}")
    print(f"  has_detour: {df_process['has_detour'].mean():.1%}")
    print(f"  has_police_alert: {df_process['has_police_alert'].mean():.1%}")

In [None]:
# Preview the first five rows of the engineered table.
df_process.head(5)

In [None]:
# Columns kept for the processed dataset, in output order.
df_process_selected = df_process.loc[:, [
    'trip_key', 'stop_sequence',
    'route_id', 'trip_id', 'start_date', 'direction_id', 'stop_id', 'region_id',
    'scheduled_arrival_time', 'actual_arrival_time', 'time_bucket',
    'hour_of_day', 'day_of_week', 'has_active_alert',
    'route_direction_key', 'arrival_delay_agg',
    'time_of_day', 'hour', 'time_sin', 'time_cos', 'is_weekend',
    'is_rush_hour', 'has_detour', 'has_police_alert',
    'route_direction_encoded',
]]

In [None]:
# Save to CSV
if base_df is not None:
    from pathlib import Path

    output_file = 'data/processed_data/processed_trip_data.csv'
    # to_csv does not create missing directories, so make sure the target
    # folder exists before writing.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    df_process_selected.to_csv(output_file, index=False)
    print(f"Saved processed data to {output_file}")