# Building Univariate Prophet Models
## With Grid-Search Hyperparameter Tuning

In [3]:
import pandas as pd

# Read the processed dataset; the first CSV column is used as the index
# and pandas is asked to parse it as dates.
df = pd.read_csv(
    '../data/processed/final_df.csv',
    index_col=0,
    parse_dates=True,
)

# Explicitly convert the index to datetime
df.index = pd.to_datetime(df.index)

# Work on an independent copy so `df` itself stays untouched
df_copy = df.copy()

In [4]:
# Single location used as the worked example
location = 'Little Collins St-Swanston St (East)'

# Build the Prophet input frame from the copied dataset in one chain:
# pick the columns, rename 'Hour' -> 'ds' and the location column -> 'y',
# then convert 'ds' to datetime.
data = (
    df_copy[['Hour', location, 'IsPublicHoliday', 'temp', 'humidity', 'rain_1h', 'clouds_all']]
    .rename(columns={'Hour': 'ds', location: 'y'})
    .assign(ds=lambda frame: pd.to_datetime(frame['ds']))
)

In [6]:
import pandas as pd
import numpy as np
from prophet import Prophet
from sklearn.metrics import mean_squared_error, r2_score
from itertools import product
import joblib

# Define evaluation metrics
def calculate_metrics(y_true, y_pred):
    """Compute RMSE, MAPE (in percent) and R² for a set of predictions.

    Parameters
    ----------
    y_true : 1-D array-like
        Observed values.
    y_pred : 1-D array-like
        Predicted values, aligned element-wise with ``y_true``.

    Returns
    -------
    tuple
        ``(rmse, mape, r2)``. MAPE is averaged only over the observations
        whose true value is non-zero; it is NaN when every true value is zero.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Take the square root explicitly instead of passing `squared=False`:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))

    # Zero targets are replaced by NaN so their percentage error is undefined
    # rather than infinite.
    y_true_nonzero = np.where(y_true == 0, np.nan, y_true)
    # nanmean actually skips those NaN entries; a plain ndarray .mean() would
    # propagate them and turn the whole MAPE into NaN.
    mape = float(np.nanmean(np.abs((y_true - y_pred) / y_true_nonzero)) * 100)

    r2 = r2_score(y_true, y_pred)
    return rmse, mape, r2

# Load the dataset
df = pd.read_csv('../data/processed/final_df.csv', parse_dates=True, index_col=0)
df.index = pd.to_datetime(df.index)
df_copy = df.copy()

# Locations to process
locations = [
    'Little Collins St-Swanston St (East)',
    'Faraday St-Lygon St (West)',
    'Melbourne Central',
    'Chinatown-Lt Bourke St (South)',
    'Lonsdale St (South)'
]

# Number of trailing rows held out for testing (16 * 24 rows).
# NOTE(review): rows with y <= 0 are dropped before the split, so this is the
# last 384 *remaining* rows; it equals 16 days only if the filtered series is
# hourly without gaps -- confirm.
TEST_ROWS = 16 * 24

results = {}  # To store evaluation metrics
best_params = {}  # To store the best hyperparameters for each location

# Define hyperparameter grid
seasonality_modes = ['additive', 'multiplicative']
changepoint_prior_scales = [0.05, 0.1, 0.5, 1.0]  # Flexibility for trend changes
seasonality_prior_scales = [1.0, 10.0]  # Flexibility for seasonal components

param_grid = list(product(seasonality_modes, changepoint_prior_scales, seasonality_prior_scales))

# Process each location
for location in locations:
    print(f"Processing {location}...")

    # Prepare data for the current location
    data = df_copy[['Hour', location]].rename(columns={'Hour': 'ds', location: 'y'})
    # NOTE(review): 'ds' is built from the 'Hour' column while the parsed
    # datetime index is not used. If 'Hour' holds an hour-of-day integer
    # rather than a full timestamp, pd.to_datetime would read it as an offset
    # from the epoch -- confirm the column's contents.
    data['ds'] = pd.to_datetime(data['ds'])
    data = data[data['y'] > 0]  # Remove zero counts for stability

    # Without more rows than the hold-out window the split index would be
    # zero or negative and the train/test slices would be wrong, so skip.
    if len(data) <= TEST_ROWS:
        print(f"Skipping {location}: only {len(data)} usable rows, need more than {TEST_ROWS}.")
        continue

    # Split into training and testing (last TEST_ROWS rows for testing)
    split_index = len(data) - TEST_ROWS
    train_data = data.iloc[:split_index]
    test_data = data.iloc[split_index:]

    # Initialize variables to track the best parameters
    best_rmse = float('inf')
    best_model = None
    best_params_location = None

    # Grid search over hyperparameters
    # NOTE(review): the winning configuration is picked by RMSE on the same
    # hold-out rows that the final metrics are computed on, so the reported
    # metrics are optimistic for the selected model; consider a separate
    # validation window for the selection step.
    for seasonality_mode, changepoint_prior_scale, seasonality_prior_scale in param_grid:
        try:
            # Initialize Prophet model with current hyperparameters
            model = Prophet(
                seasonality_mode=seasonality_mode,
                changepoint_prior_scale=changepoint_prior_scale,
                seasonality_prior_scale=seasonality_prior_scale,
            )
            model.fit(train_data)

            # Predict on the test set
            future = test_data[['ds']]
            forecast = model.predict(future)

            # Evaluate performance
            y_true = test_data['y'].values
            y_pred = forecast['yhat'].values
            rmse, _, _ = calculate_metrics(y_true, y_pred)

            # Update the best model if the current one is better
            if rmse < best_rmse:
                best_rmse = rmse
                best_model = model
                best_params_location = (seasonality_mode, changepoint_prior_scale, seasonality_prior_scale)
        except Exception as e:
            # Best-effort: a failing configuration is reported and skipped so
            # the remaining grid points are still tried.
            print(f"Error with parameters {seasonality_mode}, {changepoint_prior_scale}, {seasonality_prior_scale}: {e}")

    # If every configuration failed (or none produced a finite RMSE) there is
    # no model to save or evaluate; skip instead of pickling None and crashing
    # on best_model.predict below.
    if best_model is None:
        print(f"Skipping {location}: no hyperparameter combination produced a usable model.")
        continue

    # Save the best model and parameters
    model_filename = f'univariate_model_{location.replace(" ", "_").replace("–", "_")}.pkl'
    joblib.dump(best_model, model_filename)
    best_params[location] = best_params_location

    # Predict with the best model on the test set
    forecast = best_model.predict(test_data[['ds']])
    y_true = test_data['y'].values
    y_pred = forecast['yhat'].values
    rmse, mape, r2 = calculate_metrics(y_true, y_pred)

    # Store the metrics
    results[location] = {'RMSE': rmse, 'MAPE': mape, 'R²': r2}
    print(f"Best parameters for {location}: {best_params_location}")
    print(f"Metrics for {location}: RMSE={rmse}, MAPE={mape}, R²={r2}")

# Save the results and best parameters to CSV files
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df.to_csv('univariate_evaluation_metrics.csv', index=True)
print("Evaluation metrics saved to 'univariate_evaluation_metrics.csv'")

best_params_df = pd.DataFrame.from_dict(best_params, orient='index', columns=['Seasonality_Mode', 'Changepoint_Prior_Scale', 'Seasonality_Prior_Scale'])
best_params_df.to_csv('univariate_best_hyperparameters.csv', index=True)
print("Best hyperparameters saved to 'univariate_best_hyperparameters.csv'")

23:25:36 - cmdstanpy - INFO - Chain [1] start processing


Processing Little Collins St-Swanston St (East)...


23:25:38 - cmdstanpy - INFO - Chain [1] done processing
23:25:38 - cmdstanpy - INFO - Chain [1] start processing
23:25:39 - cmdstanpy - INFO - Chain [1] done processing
23:25:39 - cmdstanpy - INFO - Chain [1] start processing
23:25:40 - cmdstanpy - INFO - Chain [1] done processing
23:25:40 - cmdstanpy - INFO - Chain [1] start processing
23:25:41 - cmdstanpy - INFO - Chain [1] done processing
23:25:41 - cmdstanpy - INFO - Chain [1] start processing
23:25:42 - cmdstanpy - INFO - Chain [1] done processing
23:25:42 - cmdstanpy - INFO - Chain [1] start processing
23:25:44 - cmdstanpy - INFO - Chain [1] done processing
23:25:44 - cmdstanpy - INFO - Chain [1] start processing
23:25:45 - cmdstanpy - INFO - Chain [1] done processing
23:25:45 - cmdstanpy - INFO - Chain [1] start processing
23:25:46 - cmdstanpy - INFO - Chain [1] done processing
23:25:46 - cmdstanpy - INFO - Chain [1] start processing
23:25:47 - cmdstanpy - INFO - Chain [1] done processing
23:25:47 - cmdstanpy - INFO - Chain [1] 

Best parameters for Little Collins St-Swanston St (East): ('additive', 0.05, 1.0)
Metrics for Little Collins St-Swanston St (East): RMSE=795.2730664488759, MAPE=544.6278170853279, R²=-0.7230904627869834
Processing Faraday St-Lygon St (West)...


23:25:57 - cmdstanpy - INFO - Chain [1] done processing
23:25:57 - cmdstanpy - INFO - Chain [1] start processing
23:25:58 - cmdstanpy - INFO - Chain [1] done processing
23:25:59 - cmdstanpy - INFO - Chain [1] start processing
23:26:00 - cmdstanpy - INFO - Chain [1] done processing
23:26:00 - cmdstanpy - INFO - Chain [1] start processing
23:26:01 - cmdstanpy - INFO - Chain [1] done processing
23:26:01 - cmdstanpy - INFO - Chain [1] start processing
23:26:02 - cmdstanpy - INFO - Chain [1] done processing
23:26:02 - cmdstanpy - INFO - Chain [1] start processing
23:26:03 - cmdstanpy - INFO - Chain [1] done processing
23:26:03 - cmdstanpy - INFO - Chain [1] start processing
23:26:04 - cmdstanpy - INFO - Chain [1] done processing
23:26:04 - cmdstanpy - INFO - Chain [1] start processing
23:26:05 - cmdstanpy - INFO - Chain [1] done processing
23:26:05 - cmdstanpy - INFO - Chain [1] start processing
23:26:06 - cmdstanpy - INFO - Chain [1] done processing
23:26:07 - cmdstanpy - INFO - Chain [1] 

Best parameters for Faraday St-Lygon St (West): ('additive', 0.05, 1.0)
Metrics for Faraday St-Lygon St (West): RMSE=261.42296446277, MAPE=1379.7792857894726, R²=-1.0129235842459523
Processing Melbourne Central...


23:26:16 - cmdstanpy - INFO - Chain [1] done processing
23:26:16 - cmdstanpy - INFO - Chain [1] start processing
23:26:17 - cmdstanpy - INFO - Chain [1] done processing
23:26:18 - cmdstanpy - INFO - Chain [1] start processing
23:26:19 - cmdstanpy - INFO - Chain [1] done processing
23:26:19 - cmdstanpy - INFO - Chain [1] start processing
23:26:20 - cmdstanpy - INFO - Chain [1] done processing
23:26:21 - cmdstanpy - INFO - Chain [1] start processing
23:26:22 - cmdstanpy - INFO - Chain [1] done processing
23:26:22 - cmdstanpy - INFO - Chain [1] start processing
23:26:23 - cmdstanpy - INFO - Chain [1] done processing
23:26:23 - cmdstanpy - INFO - Chain [1] start processing
23:26:24 - cmdstanpy - INFO - Chain [1] done processing
23:26:24 - cmdstanpy - INFO - Chain [1] start processing
23:26:26 - cmdstanpy - INFO - Chain [1] done processing
23:26:26 - cmdstanpy - INFO - Chain [1] start processing
23:26:27 - cmdstanpy - INFO - Chain [1] done processing
23:26:27 - cmdstanpy - INFO - Chain [1] 

Best parameters for Melbourne Central: ('additive', 0.05, 1.0)
Metrics for Melbourne Central: RMSE=1169.375835229383, MAPE=630.7643235735484, R²=-1.2134189926503254
Processing Chinatown-Lt Bourke St (South)...


23:26:38 - cmdstanpy - INFO - Chain [1] done processing
23:26:38 - cmdstanpy - INFO - Chain [1] start processing
23:26:39 - cmdstanpy - INFO - Chain [1] done processing
23:26:39 - cmdstanpy - INFO - Chain [1] start processing
23:26:41 - cmdstanpy - INFO - Chain [1] done processing
23:26:41 - cmdstanpy - INFO - Chain [1] start processing
23:26:42 - cmdstanpy - INFO - Chain [1] done processing
23:26:42 - cmdstanpy - INFO - Chain [1] start processing
23:26:43 - cmdstanpy - INFO - Chain [1] done processing
23:26:43 - cmdstanpy - INFO - Chain [1] start processing
23:26:45 - cmdstanpy - INFO - Chain [1] done processing
23:26:45 - cmdstanpy - INFO - Chain [1] start processing
23:26:46 - cmdstanpy - INFO - Chain [1] done processing
23:26:46 - cmdstanpy - INFO - Chain [1] start processing
23:26:47 - cmdstanpy - INFO - Chain [1] done processing
23:26:47 - cmdstanpy - INFO - Chain [1] start processing
23:26:48 - cmdstanpy - INFO - Chain [1] done processing
23:26:48 - cmdstanpy - INFO - Chain [1] 

Best parameters for Chinatown-Lt Bourke St (South): ('additive', 0.05, 1.0)
Metrics for Chinatown-Lt Bourke St (South): RMSE=400.7509358210157, MAPE=769.8934787883353, R²=-0.7950815818160903
Processing Lonsdale St (South)...


23:26:59 - cmdstanpy - INFO - Chain [1] done processing
23:26:59 - cmdstanpy - INFO - Chain [1] start processing
23:27:00 - cmdstanpy - INFO - Chain [1] done processing
23:27:00 - cmdstanpy - INFO - Chain [1] start processing
23:27:02 - cmdstanpy - INFO - Chain [1] done processing
23:27:02 - cmdstanpy - INFO - Chain [1] start processing
23:27:03 - cmdstanpy - INFO - Chain [1] done processing
23:27:04 - cmdstanpy - INFO - Chain [1] start processing
23:27:05 - cmdstanpy - INFO - Chain [1] done processing
23:27:05 - cmdstanpy - INFO - Chain [1] start processing
23:27:06 - cmdstanpy - INFO - Chain [1] done processing
23:27:06 - cmdstanpy - INFO - Chain [1] start processing
23:27:07 - cmdstanpy - INFO - Chain [1] done processing
23:27:07 - cmdstanpy - INFO - Chain [1] start processing
23:27:08 - cmdstanpy - INFO - Chain [1] done processing
23:27:09 - cmdstanpy - INFO - Chain [1] start processing
23:27:10 - cmdstanpy - INFO - Chain [1] done processing
23:27:10 - cmdstanpy - INFO - Chain [1] 

Best parameters for Lonsdale St (South): ('additive', 0.05, 1.0)
Metrics for Lonsdale St (South): RMSE=403.3880569475746, MAPE=304.2241753610934, R²=-1.0575131297623082
Evaluation metrics saved to 'univariate_evaluation_metrics.csv'
Best hyperparameters saved to 'univariate_best_hyperparameters.csv'


