# 🚊 GTFS Transit Analysis Dashboard

## Comprehensive Analysis of Public Transit Data

This notebook demonstrates a complete analysis pipeline for GTFS (General Transit Feed Specification) data, including:

- 📊 **Data Processing & Feature Engineering**
- 🗺️ **Network Analysis & Routing Algorithms**
- 🤖 **Machine Learning for Delay Prediction**
- 📈 **Demand Forecasting**
- 📱 **Interactive Visualizations & Dashboard**

---

## 🛠️ Setup and Imports

In [None]:
# Standard library
import sys
import warnings

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# The project modules live in ../src, which is not on the import path by
# default. The directory must be registered BEFORE the project imports below;
# otherwise a fresh kernel (Restart & Run All) fails with ModuleNotFoundError.
# The membership check keeps re-running this cell from piling up duplicates.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Import custom modules (resolved through SRC_DIR)
from data_processing import GTFSProcessor  # noqa: E402
from prediction import DelayPredictor, DemandForecaster  # noqa: E402
from routing import TransitRouter  # noqa: E402
from visualization import TransitVisualizer  # noqa: E402

# NOTE(review): this silences every warning for the whole notebook, including
# numerical ones such as "mean of empty slice" — consider narrowing it.
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✅ All imports successful!")

## 📂 Data Loading and Initial Exploration

In [None]:
# Point the processor at the directory that holds the GTFS feed
data_path = '../data'
gtfs = GTFSProcessor(data_path)

# Read every GTFS file the processor can find; the result is iterated below
# as a mapping of name -> table (anything that supports len()).
loaded_data = gtfs.load_data()

file_count = len(loaded_data)
print(f"\n📊 Loaded {file_count} GTFS data files")
for table_name, table in loaded_data.items():
    record_count = len(table)
    print(f"  - {table_name}: {record_count:,} records")

In [None]:
# Get summary statistics
stats = gtfs.get_summary_statistics()

print("\n📈 Dataset Summary:")
print("=" * 40)
for key, value in stats.items():
    if isinstance(value, dict):
        # Nested statistics are listed one entry per line under their key.
        print(f"{key}:")
        for k, v in value.items():
            print(f"  - {k}: {v}")
    elif (isinstance(value, (int, float, np.integer, np.floating))
          and not isinstance(value, bool)):
        # The ',' thousands separator is only valid for numeric values.
        print(f"{key}: {value:,}")
    else:
        # Strings, None, lists, booleans, ...: the ',' format spec would raise
        # (ValueError/TypeError), so fall back to the plain representation.
        print(f"{key}: {value}")

## 🔧 Feature Engineering and Data Processing

In [None]:
# Derive the engineered feature table from the loaded feed
features_df = gtfs.create_features()

print("\n🔍 Sample of Engineered Features:")
print("=" * 50)

# Only a handful of columns are previewed to keep the output readable
preview_columns = [
    'stop_id', 'trip_id', 'route_id', 'hour', 'is_rush_hour',
    'time_period', 'route_type_name', 'is_first_stop', 'is_last_stop',
]
feature_preview = features_df[preview_columns].head(10)
display(feature_preview)

In [None]:
# Compute per-segment travel times between stops
travel_times_df = gtfs.calculate_travel_times()

print("\n⏱️ Travel Time Analysis:")
print("=" * 40)
print(f"Total segments analyzed: {len(travel_times_df):,}")

# (label, column, unit) for each average reported below
average_rows = (
    ("Average travel time", 'travel_time_minutes', "minutes"),
    ("Average distance", 'distance_km', "km"),
    ("Average speed", 'speed_kmh', "km/h"),
)
for label, column, unit in average_rows:
    print(f"{label}: {travel_times_df[column].mean():.2f} {unit}")

# Preview the first rows of the segment table
display(travel_times_df.head())

## 🗺️ Network Analysis and Routing

In [None]:
# Create the router on top of the loaded feed
router = TransitRouter(gtfs)

# Build the stop network, weighting edges by travel time
network_graph = router.build_network(weight_type='travel_time')

# Collect summary statistics for the network
network_stats = router.get_network_statistics()

print("\n🕸️ Network Analysis Results:")
print("=" * 40)
for stat_name, stat_value in network_stats.items():
    # Floats are shown with three decimals; everything else is printed as-is
    rendered = f"{stat_value:.3f}" if isinstance(stat_value, float) else f"{stat_value}"
    print(f"{stat_name}: {rendered}")

In [None]:
# Compute the three centrality measures for every stop in the network
centrality_measures = router.analyze_centrality(['degree', 'betweenness', 'closeness'])

print("\n🎯 Most Important Stops by Centrality:")
print("=" * 50)

for measure, values in centrality_measures.items():
    # Skip measures that came back empty
    if not values:
        continue

    ranked = sorted(values.items(), key=lambda item: item[1], reverse=True)
    print(f"\n{measure.capitalize()} Centrality:")

    for rank, (stop_id, score) in enumerate(ranked[:5], start=1):
        # Resolve a human-readable name; fall back to "Unknown" when the stops
        # table is missing or has no row for this stop_id.
        stop_name = "Unknown"
        if gtfs.stops is not None:
            matches = gtfs.stops[gtfs.stops['stop_id'] == stop_id]
            if len(matches) > 0:
                first_match = matches.iloc[0]
                stop_name = first_match.get('stop_name', 'Unknown')
        print(f"  {rank}. {stop_name} ({stop_id}): {score:.4f}")

In [None]:
# Demonstrate shortest path finding
if gtfs.stops is not None and len(gtfs.stops) > 1:
    # Deterministic demo pair (not random): the first stop in the table and
    # the stop at index 10, or the last stop when fewer than 11 stops exist.
    stops = gtfs.stops['stop_id'].tolist()
    start_stop = stops[0]
    end_stop = stops[min(len(stops)-1, 10)]

    print("\n🎯 Shortest Path Analysis:")
    print("=" * 40)
    print(f"From: {start_stop}")
    print(f"To: {end_stop}")

    # Find shortest path
    try:
        path, cost = router.find_shortest_path(start_stop, end_stop)
        if path:
            print("\n✅ Path found!")
            print(f"Number of stops: {len(path)}")
            print(f"Total cost: {cost:.2f} minutes")
            # stop_id values may be numeric (e.g. when parsed as integers), and
            # str.join only accepts strings, so convert each element first.
            preview = ' → '.join(str(stop) for stop in path[:5])
            print(f"Path: {preview}{'...' if len(path) > 5 else ''}")
        else:
            print("❌ No path found between these stops")
    except Exception as e:
        # Best-effort demo: report the problem instead of aborting the notebook.
        print(f"⚠️ Error finding path: {e}")
else:
    # Previously this case produced no output at all, which hid the reason
    # the demo was skipped.
    print("⚠️ Shortest path demo skipped: fewer than two stops are loaded")

## 🤖 Machine Learning: Delay Prediction

In [None]:
# Create the delay predictor on top of the loaded feed
delay_predictor = DelayPredictor(gtfs)

# Build the training table; delays are simulated rather than observed
training_data = delay_predictor.prepare_training_data(simulate_delays=True)

print("\n🎲 Training Data for Delay Prediction:")
print("=" * 45)
print(f"Training samples: {len(training_data):,}")

# Every column except the target counts as a feature
feature_count = sum(1 for col in training_data.columns if col != 'delay_minutes')
print(f"Features: {feature_count}")

print("\nDelay Statistics:")
delay_column = training_data['delay_minutes']
delay_summaries = (
    ("Mean", delay_column.mean),
    ("Median", delay_column.median),
    ("Max", delay_column.max),
    ("Min", delay_column.min),
)
for label, summarize in delay_summaries:
    print(f"  {label} delay: {summarize():.2f} minutes")

# Preview the first rows of the training table
display(training_data.head())

In [None]:
# Fit a random-forest delay model on the prepared training table
training_results = delay_predictor.train_model(training_data, model_type='random_forest')

print("\n🎯 Model Training Results:")
print("=" * 40)

# (label, result key, unit suffix) for each metric, in display order
metric_rows = (
    ("Training MAE", 'train_mae', " minutes"),
    ("Test MAE", 'test_mae', " minutes"),
    ("Training RMSE", 'train_rmse', " minutes"),
    ("Test RMSE", 'test_rmse', " minutes"),
    ("Training R²", 'train_r2', ""),
    ("Test R²", 'test_r2', ""),
)
for label, result_key, suffix in metric_rows:
    print(f"{label}: {training_results[result_key]:.3f}{suffix}")
print(f"CV MAE: {training_results['cv_mae']:.3f} ± {training_results['cv_mae_std']:.3f}")

# Feature importances are optional in the results
if 'feature_importance' in training_results:
    print("\n🔍 Top 5 Most Important Features:")
    top_features = training_results['feature_importance'][:5]
    for rank, entry in enumerate(top_features, start=1):
        print(f"  {rank}. {entry['feature']}: {entry['importance']:.4f}")

In [None]:
# Run the trained delay model on a few hand-written scenarios
print("\n🔮 Delay Prediction Examples:")
print("=" * 40)

# Each scenario is a flat dict passed to the predictor unchanged.
# route_type codes used here: 3 = bus, 1 = subway, 0 = tram.
# weather_condition levels used here: 0 = normal, 1 = mild issues, 2 = severe.
scenarios = [
    dict(
        name='Morning Rush Hour - Bus',
        hour=8,
        is_rush_hour=1,
        route_type=3,
        stop_sequence=5,
        weather_condition=0,
    ),
    dict(
        name='Midday - Subway',
        hour=14,
        is_rush_hour=0,
        route_type=1,
        stop_sequence=10,
        weather_condition=1,
    ),
    dict(
        name='Evening Rush - Tram',
        hour=18,
        is_rush_hour=1,
        route_type=0,
        stop_sequence=3,
        weather_condition=2,
    ),
]

for scenario in scenarios:
    # NOTE(review): the dict still carries the 'name' label when handed to the
    # predictor — presumably ignored by predict_delay; confirm.
    prediction = delay_predictor.predict_delay(scenario)
    print(f"\n{scenario['name']}:")
    print(f"  Predicted delay: {prediction:.2f} minutes")
    print(f"  Scenario: {scenario['hour']:02d}:00, Weather level: {scenario['weather_condition']}")

## 📈 Demand Forecasting

In [None]:
# Create the demand forecaster on top of the loaded feed
demand_forecaster = DemandForecaster(gtfs)

# Generate 30 days of simulated ridership records
ridership_data = demand_forecaster.simulate_ridership_data(days=30)

print("\n📊 Ridership Data Simulation:")
print("=" * 40)
print(f"Total records: {len(ridership_data):,}")

ridership_dates = ridership_data['date']
print(f"Date range: {ridership_dates.min()} to {ridership_dates.max()}")

ridership_counts = ridership_data['ridership']
print(f"Total ridership: {ridership_counts.sum():,}")

# Sum ridership within each date, then average those daily totals
ridership_per_day = ridership_data.groupby('date')['ridership'].sum()
print(f"Average daily ridership: {ridership_per_day.mean():.0f}")

# Preview the first rows of the simulated table
display(ridership_data.head(10))

In [None]:
# Summarize the demand patterns found by the forecaster
demand_patterns = demand_forecaster.get_demand_patterns()

print("\n📈 Demand Pattern Analysis:")
print("=" * 40)

# Average riders for each peak hour
print("\n🕐 Peak Hours:")
peak_hours = demand_patterns['peak_hours']
for hour, ridership in peak_hours.items():
    print(f"  {hour}:00 - {ridership:.0f} average riders")

# Average riders split by the weekend flag
print("\n📅 Weekend vs Weekday:")
weekend_split = demand_patterns['weekend_vs_weekday']
for is_weekend, ridership in weekend_split.items():
    if is_weekend:
        day_type = "Weekend"
    else:
        day_type = "Weekday"
    print(f"  {day_type}: {ridership:.0f} average riders")

# First five entries of the top-stops mapping, in the order it provides
print("\n🚏 Top 5 Busiest Stops:")
busiest_stops = list(demand_patterns['top_stops'].items())[:5]
for rank, (stop_id, total_ridership) in enumerate(busiest_stops, start=1):
    print(f"  {rank}. {stop_id}: {total_ridership:,.0f} total riders")

In [None]:
# Forecast ridership for the next seven days
forecast_data = demand_forecaster.forecast_ridership(forecast_days=7)

print("\n🔮 7-Day Ridership Forecast:")
print("=" * 40)

# Collapse the forecast to one row per date by summing the point forecast
# and both confidence bounds, rounded to whole riders.
summed_columns = ('forecasted_ridership', 'confidence_lower', 'confidence_upper')
aggregation = {column: 'sum' for column in summed_columns}
daily_forecast = forecast_data.groupby('date').agg(aggregation).round(0)

print("Daily Forecast Summary:")
display(daily_forecast)

# Headline totals across the forecast window
daily_point_forecast = daily_forecast['forecasted_ridership']
total_forecast = daily_point_forecast.sum()
avg_daily_forecast = daily_point_forecast.mean()
print(f"\nTotal 7-day forecast: {total_forecast:,.0f} riders")
print(f"Average daily forecast: {avg_daily_forecast:,.0f} riders")

## 📊 Interactive Visualizations

In [None]:
# Initialize visualizer
# The visualizer is built from the GTFS processor and the transit router;
# it is the object used for every plot and for the dashboard.
visualizer = TransitVisualizer(gtfs, router)

print("🎨 Creating Interactive Visualizations...\n")

In [None]:
# Create route analysis plots
# plot_route_analysis() returns a figure object; .show() renders it inline.
route_fig = visualizer.plot_route_analysis()
route_fig.show()

print("✅ Route analysis visualization complete!")

In [None]:
# Create delay prediction visualizations
# The trained predictor is handed to the visualizer, which returns a figure
# object; .show() renders it inline.
delay_fig = visualizer.plot_delay_predictions(delay_predictor)
delay_fig.show()

print("✅ Delay prediction visualization complete!")

In [None]:
# Create network map
from pathlib import Path  # local import so this cell runs on its own

map_path = '../outputs/transit_network_map.html'

try:
    network_map = visualizer.plot_network_map(interactive=True)
    if network_map:
        # The outputs directory may not exist on a fresh checkout. Create it
        # first so a missing folder is not misreported as a map-creation
        # failure (which would also discard the interactive map).
        Path(map_path).parent.mkdir(parents=True, exist_ok=True)

        # Save map
        network_map.save(map_path)
        print("✅ Interactive network map created and saved!")
        print(f"📁 Map saved as: {map_path}")
    else:
        print("⚠️ Could not create interactive map (insufficient coordinate data)")
except Exception as e:
    # Best-effort: fall back to the static rendering when the interactive
    # map cannot be built or written.
    print(f"⚠️ Map creation failed: {e}")
    print("Creating static network visualization instead...")
    visualizer.plot_network_map(interactive=False)

## 📱 Comprehensive Dashboard

In [None]:
# Create comprehensive interactive dashboard
# The trained delay predictor and the demand forecaster are both passed to
# the visualizer; the returned figure is rendered inline with .show().
dashboard_fig = visualizer.create_interactive_dashboard(
    delay_predictor=delay_predictor,
    demand_forecaster=demand_forecaster
)

dashboard_fig.show()

print("✅ Comprehensive dashboard created!")

In [None]:
# Save dashboard as HTML
from pathlib import Path  # local import so this cell runs on its own

dashboard_path = '../outputs/transit_dashboard.html'

try:
    # The outputs directory may not exist on a fresh checkout; create it so
    # the save does not fail merely because the folder is missing.
    Path(dashboard_path).parent.mkdir(parents=True, exist_ok=True)

    visualizer.save_dashboard_html(dashboard_fig, dashboard_path)
    print("💾 Dashboard saved successfully!")
    print(f"📁 Dashboard saved as: {dashboard_path}")
except Exception as e:
    # Best-effort: saving is optional, so report the problem and continue.
    print(f"⚠️ Could not save dashboard: {e}")

## 🎯 Real-time Analysis Demo

In [None]:
# Simulate real-time analysis for current hour
from datetime import datetime

# Delay thresholds in minutes. They drive both the per-route status icon and
# the "high delay" count in the summary, so the two cannot drift apart.
HIGH_DELAY_MINUTES = 5
MODERATE_DELAY_MINUTES = 2

# NOTE: uses the wall clock, so this cell's output changes with the time of day.
current_hour = datetime.now().hour
print(f"\n⏰ Real-time Analysis for {current_hour}:00")
print("=" * 45)

# Get routes with current delays. Each entry is read below as a dict with
# 'route_id', 'predicted_delay' and 'delay_category' keys.
current_delays = visualizer.find_route_with_delay(current_hour, delay_predictor)
has_delays = len(current_delays) > 0

print("\n🚨 Routes with Highest Predicted Delays:")
if not has_delays:
    print("  (no route predictions available for this hour)")
# assumes find_route_with_delay returns entries ordered by delay, highest
# first — TODO confirm; only the first five are shown.
for i, route_data in enumerate(current_delays[:5], 1):
    route_id = route_data['route_id']
    delay = route_data['predicted_delay']
    category = route_data['delay_category']

    if delay > HIGH_DELAY_MINUTES:
        status_emoji = "🔴"
    elif delay > MODERATE_DELAY_MINUTES:
        status_emoji = "🟡"
    else:
        status_emoji = "🟢"
    print(f"  {i}. {status_emoji} {route_id}: {delay:.1f} min delay ({category})")

# Current ridership prediction (0 when the hour has no entry)
current_ridership = demand_forecaster.get_demand_patterns()['hourly'].get(current_hour, 0)
print(f"\n👥 Expected ridership this hour: {current_ridership:.0f} passengers")

# System performance summary
if has_delays:
    avg_delay = np.mean([r['predicted_delay'] for r in current_delays])
    avg_delay_text = f"{avg_delay:.1f} minutes"
else:
    # np.mean([]) would return nan (with its warning suppressed by the setup
    # cell) and print "nan minutes"; report the missing data explicitly.
    avg_delay = float('nan')
    avg_delay_text = "n/a"
high_delay_routes = sum(
    1 for r in current_delays if r['predicted_delay'] > HIGH_DELAY_MINUTES
)

print("\n📊 System Performance Summary:")
print(f"  Average delay: {avg_delay_text}")
print(f"  Routes with high delays: {high_delay_routes}/{len(current_delays)}")
print(f"  Network connectivity: {'Good' if network_stats.get('is_connected', False) else 'Limited'}")
print(f"  Total active routes: {len(gtfs.routes) if gtfs.routes is not None else 'Unknown'}")

## 📋 Summary and Insights

In [None]:
# Closing report: headline numbers from the earlier cells plus fixed lists of
# what the notebook produced and demonstrated.
banner = "=" * 50

print("\n🎉 GTFS Transit Analysis Complete!")
print(banner)

print("\n📊 Key Findings:")
route_count = stats.get('num_routes', 'N/A')
stop_count = stats.get('num_stops', 'N/A')
print(f"  • Analyzed {route_count} routes across {stop_count} stops")
print(f"  • Network has {network_stats['num_nodes']} nodes and {network_stats['num_edges']} connections")
print(f"  • Average travel time: {travel_times_df['travel_time_minutes'].mean():.1f} minutes")
print(f"  • ML model accuracy: {training_results['test_r2']:.3f} R²")
print(f"  • 7-day ridership forecast: {total_forecast:,.0f} passengers")

# Each section is a heading followed by its bullet lines, printed verbatim
closing_sections = (
    ("\n🛠️ Generated Outputs:", (
        "  • Interactive network map",
        "  • Delay prediction model",
        "  • Demand forecasting system",
        "  • Comprehensive dashboard",
        "  • Real-time analysis capability",
    )),
    ("\n🎯 Use Cases Demonstrated:", (
        "  • Route optimization",
        "  • Delay prediction and management",
        "  • Capacity planning",
        "  • Real-time passenger information",
        "  • Network performance monitoring",
    )),
    ("\n✨ This analysis provides a foundation for:", (
        "  📈 Data-driven transit planning",
        "  🤖 Predictive maintenance",
        "  👥 Passenger experience optimization",
        "  📱 Real-time information systems",
        "  🌍 Sustainable transportation insights",
    )),
)
for heading, bullet_lines in closing_sections:
    print(heading)
    for bullet_line in bullet_lines:
        print(bullet_line)

print("\n" + banner)
print("🚊 Thank you for exploring GTFS Transit Analysis! 🚊")
print(banner)