# 01. Explore Data
Load data, basic stats, weather exploration.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utils
import seaborn as sns

# Load the base dataset through the project helper and report its size and schema.
# Nothing is printed when the loader returns None.
base_df = utils.load_data()
if base_df is not None:
    record_count = len(base_df)
    column_names = base_df.columns.tolist()
    print(f"Total records: {record_count:,}")
    print(f"Columns: {column_names}")


In [None]:
# Basic Stats: list the routes and date range, then build the filtered frame
# that the later cells plot from.
if base_df is not None:
    print(f"Routes: {sorted(base_df['route_id'].unique())}")
    print(f"Date range: {base_df['start_date'].min()} to {base_df['start_date'].max()}")

    # Filter outliers: keep arrival delays within [-600, 1800] and rows whose
    # detour indicator is non-negative (NaN fails every comparison, so rows
    # with missing values in either column are dropped as before).
    delay_in_range = (base_df['arrival_delay'] >= -600) & (base_df['arrival_delay'] <= 1800)
    detour_valid = base_df['alert_effect_detour'] >= 0

    # Take an explicit copy: a later cell adds a column to base_df_filtered,
    # and writing into a slice of base_df triggers SettingWithCopyWarning.
    base_df_filtered = base_df[delay_in_range & detour_valid].copy()
    print(f"Records after filtering outliers: {len(base_df_filtered):,}")

In [None]:
# Histogram of arrival delays: 50 bins over the -300..900 window.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(base_df_filtered['arrival_delay'], bins=50, range=(-300, 900))
ax.set_title('Distribution of Arrival Delays')
ax.set_xlabel('Delay (seconds)')
ax.set_ylabel('Count')
plt.show()

## Timeline Metrics

In [None]:
# Bar chart of the mean arrival delay for each hour of the day.
hourly_mean_delay = base_df_filtered.groupby('hour_of_day')['arrival_delay'].mean()
ax = hourly_mean_delay.plot(kind='bar', figsize=(10, 5))
ax.set_title('Average Arrival Delay by Hour of Day')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Average Delay (seconds)')
plt.xticks(rotation=0)  # keep the hour labels horizontal
plt.show()

In [None]:
# Bar chart of the mean arrival delay per weekday.
# day_of_week uses ISODOW numbering (1=Monday ... 7=Sunday); translate it to names.
day_map = dict(zip(
    range(1, 8),
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
))

daily_delays = base_df_filtered.groupby('day_of_week')['arrival_delay'].mean()
daily_delays.index = daily_delays.index.map(day_map)

ax = daily_delays.plot(kind='bar', figsize=(10, 5))
ax.set_title('Average Arrival Delay by Day of Week')
ax.set_xlabel('Day of Week')
ax.set_ylabel('Average Delay (seconds)')
plt.xticks(rotation=0)  # keep the weekday labels horizontal
plt.show()

In [None]:
# Heatmap of mean arrival delay: hours as rows, weekdays as columns.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# ISODOW numbering: 1=Monday ... 7=Sunday
day_map = dict(zip(range(1, 8), weekday_order))

hour_by_day = base_df_filtered.pivot_table(
    index='hour_of_day',
    columns='day_of_week',
    values='arrival_delay',
    aggfunc='mean',
)
hour_by_day.columns = hour_by_day.columns.map(day_map)

# reindex (rather than plain column selection) tolerates weekdays absent from the data
pivot_data = hour_by_day.reindex(columns=weekday_order)

plt.figure(figsize=(12, 6))
sns.heatmap(pivot_data, cmap='coolwarm', center=0, annot=False)
plt.title('Average Arrival Delay by Hour and Day')
plt.show()

## Region Metrics

In [None]:
# Read the trips table and display the distinct route / direction / headsign combinations.
route_path = '/workspace/GTFS/data_analysis/google_transit/trips.txt'
route_df = pd.read_csv(route_path)
headsign_columns = ['route_id', 'direction_id', 'trip_headsign']
route_df.loc[:, headsign_columns].drop_duplicates()

In [None]:
# Top 10 route/direction pairs by mean arrival delay, labelled by trip headsign.

# Calculate mean delay per (route, direction)
top_delays = base_df_filtered.groupby(['route_id', 'direction_id'])['arrival_delay'].mean().reset_index()

# Get unique route names (taking the first headsign for each route/direction combination)
route_names = (
    route_df[['route_id', 'direction_id', 'trip_headsign']]
    .drop_duplicates()
    .groupby(['route_id', 'direction_id'])
    .first()
    .reset_index()
)

# Ensure route_id is string in both frames so the merge keys compare equal
top_delays['route_id'] = top_delays['route_id'].astype(str)
route_names['route_id'] = route_names['route_id'].astype(str)

# Left merge keeps every route/direction that has delay data, even without a headsign
top_delays = top_delays.merge(route_names, on=['route_id', 'direction_id'], how='left')

# Create label: use the headsign when present. Without a fallback an unmatched
# row would be labelled with the literal string "nan", so fall back to the
# route and direction identifiers instead.
fallback_label = (
    'Route ' + top_delays['route_id']
    + ' (dir ' + top_delays['direction_id'].astype(str) + ')'
)
has_headsign = top_delays['trip_headsign'].notna()
top_delays['label'] = top_delays['trip_headsign'].astype(str).where(has_headsign, fallback_label)

# Plot the ten largest mean delays
worst_ten = top_delays.sort_values('arrival_delay', ascending=False).head(10)
worst_ten.set_index('label')['arrival_delay'].plot(kind='bar', figsize=(12, 6))
plt.xticks(rotation=45, ha='right')
plt.title('Top 10 Routes by Average Arrival Delay')
plt.xlabel('Route (Headsign)')
plt.ylabel('Average Delay (s)')
plt.show()

In [None]:
# Bar chart of mean arrival delay per region, largest delay first.
region_mean_delay = base_df_filtered.groupby('region_id')['arrival_delay'].mean()
ax = region_mean_delay.sort_values(ascending=False).plot(kind='bar', figsize=(10, 5))
ax.set_title('Average Arrival Delay by Region')
ax.set_xlabel('Region ID')
ax.set_ylabel('Average Delay (seconds)')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Heatmap of mean arrival delay: hours as rows, regions as columns,
# with regions ordered from highest to lowest overall mean delay.
region_mean_delay = base_df_filtered.groupby('region_id')['arrival_delay'].mean()
region_order = region_mean_delay.sort_values(ascending=False).index

hour_by_region = base_df_filtered.pivot_table(
    index='hour_of_day',
    columns='region_id',
    values='arrival_delay',
    aggfunc='mean',
)

# Reorder columns based on region_order
pivot_data = hour_by_region.reindex(columns=region_order)

plt.figure(figsize=(12, 6))
sns.heatmap(pivot_data, cmap='coolwarm', center=0, annot=False)
plt.title('Average Arrival Delay by Hour and Region')
plt.show()

In [None]:
import geopandas as gpd
from shapely import wkt
import folium
from folium.plugins import HeatMap
# Choropleth of mean arrival delay per region, with hover tooltips.

# Load regions data
regions_df = pd.read_csv('gtfs_static.regions.csv')

# Clean WKT (remove SRID prefix if present): keep only the text after the last ';'
regions_df['boundary_clean'] = regions_df['boundary'].apply(
    lambda x: x.split(';')[-1] if isinstance(x, str) and ';' in x else x
)

# Convert to GeoDataFrame, dropping rows whose boundary is missing
regions_df = regions_df.dropna(subset=['boundary_clean'])
regions_df['geometry'] = regions_df['boundary_clean'].apply(wkt.loads)
gdf_regions = gpd.GeoDataFrame(regions_df, geometry='geometry')

# Set CRS to EPSG:4326 (WGS 84) as indicated by SRID=4326 in the CSV
gdf_regions.set_crs(epsg=4326, inplace=True)

# Calculate mean delay by region
region_delays = base_df_filtered.groupby('region_id')['arrival_delay'].mean().reset_index()

# Merge; regions without any observation keep NaN as their delay.
# They are deliberately NOT filled with 0: a zero would be drawn and reported
# as a real "0 s average delay" instead of "no data".
gdf_regions = gdf_regions.merge(region_delays, on='region_id', how='left')

# Create Map
m_regions = folium.Map(location=[49.2827, -123.1207], zoom_start=10)

# Add Choropleth; NaN regions are drawn with the dedicated no-data colour
folium.Choropleth(
    geo_data=gdf_regions,
    name='choropleth',
    data=gdf_regions,
    columns=['region_id', 'arrival_delay'],
    key_on='feature.properties.region_id',
    fill_color='YlOrRd',
    fill_opacity=0.6,
    line_opacity=0.2,
    nan_fill_color='lightgray',
    nan_fill_opacity=0.4,
    legend_name='Average Arrival Delay (s)'
).add_to(m_regions)

# Add tooltips via a fully transparent overlay so only the hover text is visible
folium.GeoJson(
    gdf_regions,
    tooltip=folium.GeoJsonTooltip(fields=['region_name', 'arrival_delay'], aliases=['Region:', 'Avg Delay:']),
    style_function=lambda x: {'color': 'transparent', 'fillColor': 'transparent', 'weight': 0}
).add_to(m_regions)

folium.LayerControl().add_to(m_regions)

m_regions

In [None]:
# Heatmap of mean arrival delay per stop.
#
# Calculate Lat/Lon from sin/cos components
# Assuming lat_sin = sin(lat_rad), lat_cos = cos(lat_rad)
# lat_rad = arctan2(lat_sin, lat_cos)

# Group by stop_id to get unique stops and their average delay
stop_stats = base_df_filtered.groupby('stop_id').agg({
    'arrival_delay': 'mean',
    'lat_sin': 'first',
    'lat_cos': 'first',
    'lon_sin': 'first',
    'lon_cos': 'first'
}).reset_index()

# Recover Lat/Lon (in degrees)
stop_stats['stop_lat'] = np.degrees(np.arctan2(stop_stats['lat_sin'], stop_stats['lat_cos']))
stop_stats['stop_lon'] = np.degrees(np.arctan2(stop_stats['lon_sin'], stop_stats['lon_cos']))

# Create Map centered on Vancouver
vancouver_stop_map = folium.Map(location=[49.2827, -123.1207], zoom_start=11)

# Prepare data for HeatMap: [lat, lon, weight]
# - rows with a missing coordinate or delay are dropped, since HeatMap rejects NaN input
# - the weight is clipped at 0: a mean delay can be negative (early arrival),
#   and a negative heat weight has no sensible meaning, so early stops add no heat
stop_heat_points = (
    stop_stats[['stop_lat', 'stop_lon', 'arrival_delay']]
    .dropna()
    .assign(arrival_delay=lambda points: points['arrival_delay'].clip(lower=0))
)
stop_heat_data = stop_heat_points.values.tolist()

# Add HeatMap
HeatMap(stop_heat_data, radius=10, max_zoom=13).add_to(vancouver_stop_map)

vancouver_stop_map

## Alert Metrics

In [None]:
# Analyze delay difference with and without detour for each trip
# Create a binary flag for detour
base_df_filtered['has_detour'] = base_df_filtered['alert_effect_detour'] > 0

# Calculate mean delay for each trip_id, separated by detour status
# We group by route_id and direction_id as well to keep context
trip_detour_analysis = (
    base_df_filtered
    .groupby(['route_id', 'direction_id', 'trip_id', 'has_detour'])['arrival_delay']
    .mean()
    .unstack()
    # Rename columns for clarity (False -> No Detour, True -> With Detour)
    .rename(columns={False: 'No Detour', True: 'With Detour'})
    # Guarantee both columns exist. Assigning a two-name list to .columns raises
    # a length-mismatch error when the data holds only one detour state;
    # reindex adds the absent column filled with NaN instead.
    .reindex(columns=['No Detour', 'With Detour'])
    .rename_axis(columns=None)
)

# Filter trips that have data for both conditions (to make a valid comparison)
valid_trips = trip_detour_analysis.dropna()

print(f"Number of trips with both detour and no-detour data: {len(valid_trips)}")

if not valid_trips.empty:
    # Correlation between each trip's mean delay without and with a detour
    corr = valid_trips['No Detour'].corr(valid_trips['With Detour'])

    # Visualization 1: Scatter Plot
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=valid_trips, x='No Detour', y='With Detour', alpha=0.6)

    # Add diagonal line (y=x); points above it are trips that were slower with a detour
    max_val = max(valid_trips['No Detour'].max(), valid_trips['With Detour'].max())
    min_val = min(valid_trips['No Detour'].min(), valid_trips['With Detour'].min())
    plt.plot([min_val, max_val], [min_val, max_val], 'r--', label='Equal Delay')

    plt.title(f'Average Arrival Delay per Trip: With vs Without Detour (Corr: {corr:.2f})')
    plt.xlabel('Average Delay (No Detour) [s]')
    plt.ylabel('Average Delay (With Detour) [s]')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
else:
    print("Not enough data to compare trips with and without detours (no trips have both conditions).")

In [None]:
# Analyze delay difference with and without an active alert for each trip
trip_detour_analysis = (
    base_df_filtered
    .groupby(['route_id', 'direction_id', 'trip_id', 'has_active_alert'])['arrival_delay']
    .mean()
    .unstack()
    # Rename columns for clarity (False -> No alert, True -> With alert).
    # The mapping also matches 0/1 flags, because 0 == False and 1 == True as dict keys.
    .rename(columns={False: 'No alert', True: 'With alert'})
    # Guarantee both columns exist. Assigning a two-name list to .columns raises
    # a length-mismatch error when the data holds only one alert state;
    # reindex adds the absent column filled with NaN instead.
    .reindex(columns=['No alert', 'With alert'])
    .rename_axis(columns=None)
)

# Filter trips that have data for both conditions (to make a valid comparison)
valid_trips = trip_detour_analysis.dropna()

print(f"Number of trips with both alert and no-alert data: {len(valid_trips)}")

if not valid_trips.empty:
    # Calculate correlation
    corr = valid_trips['No alert'].corr(valid_trips['With alert'])

    # Visualization 1: Scatter Plot
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=valid_trips, x='No alert', y='With alert', alpha=0.6)

    # Add diagonal line (y=x); points above it are trips that were slower under an alert
    max_val = max(valid_trips['No alert'].max(), valid_trips['With alert'].max())
    min_val = min(valid_trips['No alert'].min(), valid_trips['With alert'].min())
    plt.plot([min_val, max_val], [min_val, max_val], 'r--', label='Equal Delay')

    plt.title(f'Average Arrival Delay per Trip: With vs Without Alert (Corr: {corr:.2f})')
    plt.xlabel('Average Delay (No alert) [s]')
    plt.ylabel('Average Delay (With alert) [s]')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
else:
    print("Not enough data to compare trips with and without alerts (no trips have both conditions).")