In [None]:
import pandas as pd

#Loading

In [None]:
# Load the processed dataset; the first CSV column is used as the (date-parsed) index
df = pd.read_csv(
    '~/Desktop/DATA_PROJECT/HSG_BA_and_DS_Applications/data/processed/final_df.csv',
    index_col=0,
    parse_dates=True,
)
# Explicitly coerce the index to datetime
df.index = pd.to_datetime(df.index)
# The cells below work on this separate copy rather than on `df` itself
df_copy = df.copy(deep=True)

#Exponential Smoothing

In [None]:
# Example for one location
location = 'Little Collins St-Swanston St (East)'

# Build a single-location frame from the copied dataset:
#   'Hour'            -> 'ds' (timestamp, converted to datetime below)
#   <location column> -> 'y'  (target)
# plus the holiday and weather columns.
# NOTE(review): the original comment described this as input "for Prophet"; the
# exponential smoothing cells further down only select 'Hour' and the location column.
data = (
    df_copy[['Hour', location, 'IsPublicHoliday', 'temp', 'humidity', 'rain_1h', 'clouds_all']]
    .rename(columns={'Hour': 'ds', location: 'y'})
    .assign(ds=lambda frame: pd.to_datetime(frame['ds']))
)

#Evaluation metrics and grid search Hyperparameters training

In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [None]:
# Define evaluation metrics
def calculate_metrics(y_true, y_pred):
    """Compute RMSE, MAPE (in percent) and R² for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, in the same order and of the same length as ``y_true``.
        Values are compared by position; any pandas index is ignored.

    Returns
    -------
    tuple
        ``(rmse, mape, r2)``. ``mape`` is averaged only over observations whose
        true value is non-zero; it is NaN if every true value is zero.
    """
    # Work on plain float arrays so Series with different indexes are not
    # silently re-aligned (which would introduce NaNs) during the arithmetic.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # `squared=False` is deprecated/removed in recent scikit-learn releases;
    # taking the square root of the MSE is version-independent.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # Zero targets would divide by zero, so they are masked out as NaN ...
    y_true_nonzero = np.where(y_true == 0, np.nan, y_true)
    # ... and nanmean actually skips them (a plain ndarray .mean() would return NaN).
    mape = np.nanmean(np.abs((y_true - y_pred) / y_true_nonzero)) * 100

    r2 = r2_score(y_true, y_pred)
    return rmse, mape, r2

In [None]:
# Load the dataset (same CSV file as the loading cell at the top of the notebook)
df = pd.read_csv(
    '~/Desktop/DATA_PROJECT/HSG_BA_and_DS_Applications/data/processed/final_df.csv',
    index_col=0,
    parse_dates=True,
)
# Make sure the index is a datetime index, then take the working copy used below
df.index = pd.to_datetime(df.index)
df_copy = df.copy(deep=True)

In [None]:
# Locations to process. Each name is used below as a column label of df_copy
# (selected via df_copy[['Hour', location]]), so it must match the CSV header exactly.
locations = [
    'Little Collins St-Swanston St (East)',
    'Faraday St-Lygon St (West)',
    'Melbourne Central',
    'Chinatown-Lt Bourke St (South)',
    'Lonsdale St (South)'
]

In [None]:
# Per-location outputs of the grid search, keyed by location name:
#   results     -> evaluation metrics
#   best_params -> best hyperparameters found
results, best_params = {}, {}

In [None]:
# Define hyperparameter grid for Exponential Smoothing.
# The grid search below fits one model per (trend, seasonal, seasonal_period) combination.
trend_options = [None, 'add', 'mul']  # Trend component: None, additive, or multiplicative
seasonal_options = [None, 'add', 'mul']  # Seasonal component: None, additive, or multiplicative
seasonal_periods = [24, 7 * 24]  # Season length in rows: 24 and 168 (one day / one week, assuming hourly rows — TODO confirm)

In [None]:
# Process each location: grid-search the Exponential Smoothing settings, keep the
# fit with the lowest RMSE on the hold-out period, pickle it and record its metrics.
# NOTE(review): the hyperparameters are selected on the same hold-out period that is
# reported as the test score, so the stored metrics are likely optimistic.
for location in locations:
    print(f"Processing {location}...")

    # Prepare data for the current location
    data = df_copy[['Hour', location]].rename(columns={'Hour': 'ds', location: 'y'})
    data['ds'] = pd.to_datetime(data['ds'])
    # Remove zero counts for stability (multiplicative components need strictly
    # positive data).
    # NOTE(review): dropping rows leaves gaps in the series, so a seasonal period of
    # 24/168 rows may no longer line up with calendar days/weeks — confirm.
    data = data[data['y'] > 0]

    # Split into training and testing (last 16 * 24 rows, i.e. 16 days of hourly rows)
    split_index = len(data) - 16 * 24
    if split_index <= 0:
        # A non-positive index would silently produce an empty/garbled train set.
        print(f"Skipping {location}: not enough observations for a 16-day test set")
        continue
    train_data = data.iloc[:split_index]
    test_data = data.iloc[split_index:]

    # Extract target variable
    y_train = train_data['y']
    y_test = test_data['y']

    # Track the best candidate found so far
    best_rmse = float('inf')
    best_model = None
    best_params_location = None
    best_metrics = None

    # Grid search over hyperparameters
    for trend in trend_options:
        for seasonal in seasonal_options:
            for seasonal_period in seasonal_periods:
                # Without a seasonal component the period is not used by the model,
                # so only the first period is fitted instead of repeating the same fit.
                if seasonal is None and seasonal_period != seasonal_periods[0]:
                    continue

                try:
                    # Initialize and fit the Exponential Smoothing model
                    model = ExponentialSmoothing(
                        y_train,
                        trend=trend,
                        seasonal=seasonal,
                        seasonal_periods=seasonal_period,
                        initialization_method='estimated'
                    )
                    fitted_model = model.fit()

                    # Forecast on the test set and evaluate
                    y_pred = fitted_model.forecast(steps=len(y_test))
                    rmse, mape, r2 = calculate_metrics(y_test.values, y_pred)
                except Exception as e:
                    # Best effort: report the failing combination and move on
                    print(f"Error with parameters (trend={trend}, seasonal={seasonal}, seasonal_period={seasonal_period}): {e}")
                    continue

                # Strict '<' keeps the first of several equally good candidates
                if rmse < best_rmse:
                    best_rmse = rmse
                    best_model = fitted_model
                    best_params_location = (trend, seasonal, seasonal_period)
                    best_metrics = (rmse, mape, r2)

    # If every combination failed there is nothing to save or report
    if best_model is None:
        print(f"No model could be fitted for {location}; skipping.")
        continue

    # Save the best model and parameters.
    # Spaces and en dashes are replaced by underscores; the prediction cell must
    # build the file name the same way to find the model again.
    model_filename = f'ets_model_{location.replace(" ", "_").replace("–", "_")}.pkl'
    joblib.dump(best_model, model_filename)
    best_params[location] = best_params_location

    # Metrics of the winning model (already computed during the search)
    rmse, mape, r2 = best_metrics

    # Store the metrics
    results[location] = {'RMSE': rmse, 'MAPE': mape, 'R²': r2}
    print(f"Best parameters for {location}: {best_params_location}")
    print(f"Metrics for {location}: RMSE={rmse}, MAPE={mape}, R²={r2}")

In [None]:
# Write the evaluation metrics (one row per location) to CSV
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df.to_csv('ets_evaluation_metrics.csv')
print("Evaluation metrics saved to 'ets_evaluation_metrics.csv'")

# Write the selected hyperparameters (one row per location) to CSV
best_params_df = pd.DataFrame.from_dict(
    best_params,
    orient='index',
    columns=['Trend', 'Seasonal', 'Seasonal_Period'],
)
best_params_df.to_csv('ets_best_hyperparameters.csv')
print("Best hyperparameters saved to 'ets_best_hyperparameters.csv'")

#Predictions

In [None]:
import pandas as pd

In [None]:
# Initialize the DataFrame that will hold, per location, a "<location>_predicted"
# and a "<location>_actual" column (filled in by the prediction loop below)
predictions_df = pd.DataFrame()


In [None]:
# Loop through each location, reload that location's saved best model and store
# its test-period forecast next to the actual values.
# (Using the leftover `best_model` variable here would forecast every location
# with the model of the last location handled by the training loop.)
for location in locations:
    print(f"Processing {location}...")

    # Prepare data for the current location (same steps as in the training loop)
    data = df_copy[['Hour', location]].rename(columns={'Hour': 'ds', location: 'y'})
    data['ds'] = pd.to_datetime(data['ds'])
    data = data[data['y'] > 0]  # Remove zero counts for stability

    # Hold-out period: last 16 * 24 rows (the training part is not needed here)
    split_index = int(len(data) - 16 * 24)
    test_data = data.iloc[split_index:]

    # Extract target variable
    y_test = test_data['y']

    # Load the model pickled by the training loop; the file name is built exactly
    # as it was when saving. Only load pickles this notebook wrote itself.
    model_filename = f'ets_model_{location.replace(" ", "_").replace("–", "_")}.pkl'
    location_model = joblib.load(model_filename)
    if location_model is None:
        print(f"No fitted model stored for {location}; skipping.")
        continue

    # Forecast on the test set using this location's best model
    y_pred = location_model.forecast(steps=len(y_test))

    # Store by position: the forecast may carry its own index, which would
    # otherwise be aligned against the frame's index and produce NaNs.
    predictions_df[f"{location}_predicted"] = np.asarray(y_pred)
    predictions_df[f"{location}_actual"] = y_test.values

In [None]:
# Use the test-period timestamps as the index of the predictions table.
# NOTE(review): `test_data` is whatever the last iteration of the loop above left
# behind, and rows with y <= 0 are dropped per location, so these timestamps may
# not match the rows of the other locations — confirm.
predictions_df.index = test_data['ds'].values  # 'ds' is the timestamp column renamed from 'Hour'

In [None]:
# Write the predictions table, including its index, to CSV
predictions_df.to_csv('ets_predictions.csv')
print("Predictions saved to 'ets_predictions.csv'")