In [1]:
from datetime import datetime
import pandas as pd
from ibb_transportation_forecast.data_loader import DataLoader
from ibb_transportation_forecast.preprocessor import DataPreprocessor
from ibb_transportation_forecast.model import MetroForecastModel
from ibb_transportation_forecast.utils import plot_results, load_config

In [2]:
def load_and_prepare_data() -> pd.DataFrame:
    """Build the combined modelling dataset from every raw source.

    Reads the transport and weather data plus the bank-holiday, football,
    school-term and Ramadan tables through ``DataLoader``. The transport
    data is then cleaned, aggregated and stripped of outliers with
    ``DataPreprocessor`` before the remaining tables are merged onto it.

    Returns:
        pd.DataFrame: The merged dataset. Its shape is printed just before
        it is returned.
    """
    print("\n=== Loading Data ===")
    source = DataLoader()

    # Primary inputs
    raw_transport = source.load_transport_data()
    weather = source.load_weather_data()

    # Calendar / event tables, kept in the positional order the merge call uses
    supporting_tables = (
        source.load_bank_holidays(),
        source.load_football_schedule(),
        source.load_school_terms(),
        source.load_ramadan_dates(),
    )

    print("\n=== Preprocessing Data ===")
    prep = DataPreprocessor()

    # Transport pipeline: clean -> aggregate -> drop outliers
    transport_ready = prep.remove_transport_outliers(
        prep.aggregate_transport_data(
            prep.clean_transport_data(raw_transport)
        )
    )

    # Attach weather and the supporting tables to the prepared transport data
    merged = prep.merge_supporting_data(transport_ready, weather, *supporting_tables)

    print(f"\nFinal combined dataset shape: {merged.shape}")
    return merged

def run_forecast_for_station(data: pd.DataFrame, station: str, model_name: str) -> dict:
    """Fit and evaluate one model on one station, then report the outcome.

    Args:
        data: Combined dataset that is handed to
            ``DataPreprocessor.prepare_station_data``.
        station: Name of the station to forecast.
        model_name: Identifier of the model passed to
            ``MetroForecastModel.forecast``.

    Returns:
        dict: Keys ``'station'``, ``'model'``, ``'forecast'``, ``'actual'``
        and ``'metrics'``; the last three hold whatever the model returned.
    """
    print(f"\n=== Running Forecast for {station} ===")
    station_prep = DataPreprocessor()
    forecaster = MetroForecastModel()

    # Narrow the combined data down to the requested station
    station_frame = station_prep.prepare_station_data(data, station)

    # The model hands back predictions, ground truth and a metrics mapping
    forecast, actual, metrics = forecaster.forecast(model_name, station_frame)

    # Report every metric with two decimals; each key is read right before
    # its line is printed.
    print("\nModel Performance:")
    report_lines = (
        ("- MAE: {:.2f}", 'MAE'),
        ("- MSE: {:.2f}", 'MSE'),
        ("- R²: {:.2f}", 'R2'),
        ("- Duration: {:.2f} seconds", 'duration'),
    )
    for template, metric_key in report_lines:
        print(template.format(metrics[metric_key]))

    # Visual comparison of forecast vs. actual values
    plot_results(station, model_name, forecast, actual)

    outcome = {
        'station': station,
        'model': model_name,
        'forecast': forecast,
        'actual': actual,
        'metrics': metrics,
    }
    return outcome


In [6]:
# Read the run settings used by the cells below
config = load_config()

# Load and prepare data
try:
    combined_data = load_and_prepare_data()
except Exception as e:
    print(f"Error loading data: {str(e)}")
    # Do not continue without data: swallowing the error here would leave
    # `combined_data` undefined (or stale from an earlier run of this cell),
    # so later cells would fail confusingly or silently use old data.
    raise

# Get configuration parameters
# NOTE(review): assuming config['model_settings'] is a plain dict, .get()
# yields None when 'stations' is absent, which the forecast loop cannot
# iterate — confirm the config always defines it.
stations_to_run = config['model_settings'].get('stations')
models_to_run = config['model_settings']['models']



=== Loading Data ===
Loaded transportation data with 404598 records
Loading weather data from S3...
Loaded 21 bank holidays



Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing.



Loaded 404 football matches
Loaded 4 school terms
Loaded 89 Ramadan dates

=== Preprocessing Data ===

Final combined dataset shape: (193407, 21)


In [None]:
# Announce the run *before* any work starts so the timestamp really is the
# start time (it used to be printed after the loop had finished).
print("=== Istanbul Metro Passenger Forecasting ===")
print(f"Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Run forecasts: every configured model for every configured station
results = []
for station in stations_to_run:
    for model_name in models_to_run:
        try:
            result = run_forecast_for_station(combined_data, station, model_name)
            results.append(result)
        except Exception as e:
            # Best effort: report the failing station/model pair and move on
            # so one bad combination does not abort the whole sweep.
            print(f"Error processing {station} with {model_name}: {str(e)}")
            continue

# Optionally save results
if results:
    # One row per (station, model): start from the metrics dictionaries and
    # add the identifying columns alongside them.
    results_df = pd.DataFrame([result['metrics'] for result in results])
    results_df['station'] = [result['station'] for result in results]
    results_df['model'] = [result['model'] for result in results]
    results_df = results_df[['station', 'model', 'MAE', 'MSE', 'R2', 'duration']]

    # Save the metrics table to CSV with the datetime in the filename.
    # The table must not be rebuilt from `results` here: that would discard
    # the column selection above and write the raw forecast/actual objects
    # into the CSV.
    results_df.to_csv(f"results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv", index=False)

print("\nForecasting completed!")


=== Running Forecast for SISLI ===

Model Performance:
- MAE: 1717.78
- MSE: 10239763.31
- R²: 0.70
- Duration: 0.07 seconds



=== Running Forecast for SISLI ===

Model Performance:
- MAE: 495.20
- MSE: 1452067.20
- R²: 0.96
- Duration: 0.25 seconds



=== Running Forecast for SISLI ===
Fitting 3 folds for each of 4 candidates, totalling 12 fits



The total space of parameters 4 is smaller than n_iter=5. Running 4 iterations. For exhaustive searches, use GridSearchCV.




Model Performance:
- MAE: 407.99
- MSE: 898911.18
- R²: 0.97
- Duration: 15.98 seconds



=== Running Forecast for SISLI ===

Model Performance:
- MAE: 643.39
- MSE: 1700592.28
- R²: 0.95
- Duration: 1.50 seconds



=== Running Forecast for SISLI ===
Fitting 3 folds for each of 4 candidates, totalling 12 fits



The total space of parameters 4 is smaller than n_iter=5. Running 4 iterations. For exhaustive searches, use GridSearchCV.




Model Performance:
- MAE: 510.47
- MSE: 1135275.94
- R²: 0.97
- Duration: 1.32 seconds



=== Running Forecast for SISLI ===
Fitting 3 folds for each of 4 candidates, totalling 12 fits



The total space of parameters 4 is smaller than n_iter=5. Running 4 iterations. For exhaustive searches, use GridSearchCV.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000714 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 649
[LightGBM] [Info] Number of data points in the train set: 10706, number of used features: 51
[LightGBM] [Info] Start training from score 4364.586587

Model Performance:
- MAE: 496.49
- MSE: 935789.42
- R²: 0.97
- Duration: 4.89 seconds



=== Running Forecast for TAKSIM ===

Model Performance:
- MAE: 1412.09
- MSE: 4529467.43
- R²: 0.59
- Duration: 0.06 seconds



=== Running Forecast for TAKSIM ===

Model Performance:
- MAE: 402.76
- MSE: 1049028.41
- R²: 0.90
- Duration: 0.30 seconds



=== Running Forecast for TAKSIM ===
Fitting 3 folds for each of 4 candidates, totalling 12 fits



The total space of parameters 4 is smaller than n_iter=5. Running 4 iterations. For exhaustive searches, use GridSearchCV.




Model Performance:
- MAE: 286.37
- MSE: 497096.42
- R²: 0.95
- Duration: 12.44 seconds



=== Running Forecast for TAKSIM ===

Model Performance:
- MAE: 370.30
- MSE: 582243.25
- R²: 0.95
- Duration: 1.59 seconds



=== Running Forecast for TAKSIM ===
Fitting 3 folds for each of 4 candidates, totalling 12 fits



The total space of parameters 4 is smaller than n_iter=5. Running 4 iterations. For exhaustive searches, use GridSearchCV.




Model Performance:
- MAE: 337.61
- MSE: 559069.40
- R²: 0.95
- Duration: 1.21 seconds



=== Running Forecast for TAKSIM ===
Fitting 3 folds for each of 4 candidates, totalling 12 fits



The total space of parameters 4 is smaller than n_iter=5. Running 4 iterations. For exhaustive searches, use GridSearchCV.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002010 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 649
[LightGBM] [Info] Number of data points in the train set: 10597, number of used features: 51
[LightGBM] [Info] Start training from score 2192.846183

Model Performance:
- MAE: 335.45
- MSE: 611600.63
- R²: 0.94
- Duration: 2.44 seconds


=== Istanbul Metro Passenger Forecasting ===
Started at: 2025-03-26 20:10:53

Forecasting completed!


In [None]:
def main():
    """Run the full forecasting pipeline and save the metrics to CSV.

    Steps:
        1. Read the configuration via ``load_config``.
        2. Build the combined dataset via ``load_and_prepare_data``; on
           failure the error is printed and the function returns early.
        3. Run every model in ``config['model_settings']['models']`` for
           every station in ``config['model_settings']['stations']``. A
           failing station/model pair is reported and skipped.
        4. Write one row per successful run to ``forecast_results.csv``,
           containing the station, the model and that run's metrics.

    Returns:
        None
    """
    config = load_config()

    # Load and prepare data
    try:
        combined_data = load_and_prepare_data()
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return

    # Get configuration parameters
    model_settings = config['model_settings']
    stations_to_run = model_settings.get('stations')
    models_to_run = model_settings['models']

    # A missing 'stations' entry makes .get() return None, which would crash
    # the loop below with an opaque TypeError; report it explicitly instead.
    if not stations_to_run:
        print("No stations configured under model_settings['stations']; nothing to forecast.")
        return

    # Run forecasts
    results = []
    for station in stations_to_run:
        for model_name in models_to_run:
            try:
                result = run_forecast_for_station(combined_data, station, model_name)
                results.append(result)
            except Exception as e:
                # Best effort: keep going with the remaining combinations
                print(f"Error processing {station} with {model_name}: {str(e)}")
                continue

    # Optionally save results
    if results:
        # Include station and model in each row; a metrics-only table cannot
        # be attributed to the run that produced each line.
        records = [
            {'station': r['station'], 'model': r['model'], **r['metrics']}
            for r in results
        ]
        pd.DataFrame(records).to_csv('forecast_results.csv', index=False)
        print("\nSaved results to forecast_results.csv")

# Script entry point: print a banner with the start time, run the pipeline,
# then confirm completion.
if __name__ == "__main__":
    print("=== Istanbul Metro Passenger Forecasting ===")
    started_at = datetime.now()
    print(f"Started at: {started_at.strftime('%Y-%m-%d %H:%M:%S')}")
    main()
    print("\nForecasting completed!")

=== Istanbul Metro Passenger Forecasting ===
Started at: 2025-03-26 19:35:38

=== Loading Data ===
Loaded transportation data with 404598 records
Loading weather data from S3...
Loaded 21 bank holidays
Loaded 404 football matches
Loaded 4 school terms


  df['date'] = pd.to_datetime(df['date'])


Loaded 89 Ramadan dates

=== Preprocessing Data ===

Final combined dataset shape: (193407, 21)

=== Running Forecast for SISLI ===

Model Performance:
- MAE: 1717.78
- MSE: 10239763.31
- R²: 0.70
- Duration: 0.12 seconds



=== Running Forecast for SISLI ===

Model Performance:
- MAE: 481.00
- MSE: 1236081.84
- R²: 0.96
- Duration: 0.25 seconds



=== Running Forecast for SISLI ===
Fitting 3 folds for each of 4 candidates, totalling 12 fits



The total space of parameters 4 is smaller than n_iter=5. Running 4 iterations. For exhaustive searches, use GridSearchCV.




Model Performance:
- MAE: 401.16
- MSE: 879197.65
- R²: 0.97
- Duration: 14.98 seconds



=== Running Forecast for SISLI ===

Model Performance:
- MAE: 643.39
- MSE: 1700592.28
- R²: 0.95
- Duration: 1.42 seconds



=== Running Forecast for SISLI ===
Fitting 3 folds for each of 4 candidates, totalling 12 fits



The total space of parameters 4 is smaller than n_iter=5. Running 4 iterations. For exhaustive searches, use GridSearchCV.




Model Performance:
- MAE: 510.47
- MSE: 1135275.94
- R²: 0.97
- Duration: 1.30 seconds



=== Running Forecast for SISLI ===
Fitting 3 folds for each of 4 candidates, totalling 12 fits



The total space of parameters 4 is smaller than n_iter=5. Running 4 iterations. For exhaustive searches, use GridSearchCV.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000561 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 649
[LightGBM] [Info] Number of data points in the train set: 10706, number of used features: 51
[LightGBM] [Info] Start training from score 4364.586587

Model Performance:
- MAE: 496.49
- MSE: 935789.42
- R²: 0.97
- Duration: 4.73 seconds



=== Running Forecast for TAKSIM ===

Model Performance:
- MAE: 1412.09
- MSE: 4529467.43
- R²: 0.59
- Duration: 0.06 seconds



=== Running Forecast for TAKSIM ===

Model Performance:
- MAE: 330.11
- MSE: 631448.08
- R²: 0.94
- Duration: 0.28 seconds



=== Running Forecast for TAKSIM ===
Fitting 3 folds for each of 4 candidates, totalling 12 fits



The total space of parameters 4 is smaller than n_iter=5. Running 4 iterations. For exhaustive searches, use GridSearchCV.




Model Performance:
- MAE: 279.56
- MSE: 490928.92
- R²: 0.96
- Duration: 13.69 seconds



=== Running Forecast for TAKSIM ===

Model Performance:
- MAE: 372.11
- MSE: 588253.27
- R²: 0.95
- Duration: 1.46 seconds



=== Running Forecast for TAKSIM ===
Fitting 3 folds for each of 4 candidates, totalling 12 fits



The total space of parameters 4 is smaller than n_iter=5. Running 4 iterations. For exhaustive searches, use GridSearchCV.




Model Performance:
- MAE: 337.61
- MSE: 559069.40
- R²: 0.95
- Duration: 1.10 seconds



=== Running Forecast for TAKSIM ===
Fitting 3 folds for each of 4 candidates, totalling 12 fits



The total space of parameters 4 is smaller than n_iter=5. Running 4 iterations. For exhaustive searches, use GridSearchCV.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000629 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 649
[LightGBM] [Info] Number of data points in the train set: 10597, number of used features: 51
[LightGBM] [Info] Start training from score 2192.846183

Model Performance:
- MAE: 335.45
- MSE: 611600.63
- R²: 0.94
- Duration: 2.20 seconds



Saved results to forecast_results.csv

Forecasting completed!
