In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [2]:
# Load the datasets.
# All three splits are read from a single directory; to run the notebook on
# another machine, change DATA_DIR only.
DATA_DIR = '/Users/alitahseen/Desktop/FYP-2024/Machine_learning/Datafiles'

train_df = pd.read_csv(f'{DATA_DIR}/Final_Train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/Final_test.csv')
validation_df = pd.read_csv(f'{DATA_DIR}/Final_Validation.csv')

In [3]:
# Function for preprocessing the datasets
def preprocess_data(df):
    """Replace the 'local_time' column with Year/Month/Day/Hour columns.

    The 'local_time' column is parsed with ``pd.to_datetime`` and its year,
    month, day and hour components are stored in the columns 'Year', 'Month',
    'Day' and 'Hour'. 'local_time' itself is not present in the result.

    The input frame is left untouched: a new DataFrame is returned, so the
    function can be called repeatedly on the same raw frame.

    Parameters:
        df: DataFrame containing a 'local_time' column that
            ``pd.to_datetime`` can parse.

    Returns:
        A new DataFrame with every original column except 'local_time',
        plus 'Year', 'Month', 'Day' and 'Hour'.

    Raises:
        KeyError: if ``df`` has no 'local_time' column.
    """
    timestamps = pd.to_datetime(df['local_time'])
    # Column names double as the (capitalised) names of the .dt accessors.
    calendar_parts = {
        unit: getattr(timestamps.dt, unit.lower())
        for unit in ('Year', 'Month', 'Day', 'Hour')
    }
    # drop() and assign() both return new frames, so `df` is never modified.
    return df.drop(columns='local_time').assign(**calendar_parts)

In [4]:
# Preprocessing all the datasets.
# Each name is rebound to its preprocessed frame, which no longer has a
# 'local_time' column. Frames that already lack that column are passed through
# unchanged, so re-running this cell does not raise KeyError.
train_df, test_df, validation_df = (
    preprocess_data(frame) if 'local_time' in frame.columns else frame
    for frame in (train_df, test_df, validation_df)
)

In [5]:
# Prepare the feature sets and target variables
def get_features_targets(df):
    """Split a frame into the model's input columns and its target column.

    Returns a pair ``(X, y)`` where ``X`` holds the columns 'Year', 'Month',
    'Day', 'Hour' and 'Average_Temp' (in that order) and ``y`` is the 'MW'
    column.
    """
    feature_columns = ['Year', 'Month', 'Day', 'Hour', 'Average_Temp']
    return df[feature_columns], df['MW']

# Build the (features, target) pair for each split in one pass:
# train, then validation, then test.
(X_train, y_train), (X_validation, y_validation), (X_test, y_test) = map(
    get_features_targets, (train_df, validation_df, test_df)
)

In [6]:
# Evaluation Function

def calculate_metrics(actual, predicted, lower_bound=0, upper_bound=100, iqr_multiplier=1.5):
    """Return ``(mae, mape, smape, rmse)`` for a set of predictions.

    Inputs are flattened to float arrays and compared positionally, so
    ``actual`` and ``predicted`` may be lists, NumPy arrays or pandas Series
    (whatever their index).

    Steps:
      1. Rows whose actual value is ``<= lower_bound`` are discarded for every
         metric. With the default of 0 this removes zero as well as negative
         actuals, which keeps the MAPE denominator non-zero.
      2. MAE and RMSE are computed on the remaining rows.
      3. MAPE is computed only on rows whose actual value lies inside
         ``[Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR]``; each
         row's percentage error is capped at ``upper_bound`` before averaging.
      4. sMAPE is computed on all rows kept in step 1. A row where actual and
         predicted are both zero contributes 0 instead of producing 0/0.

    Parameters:
        actual: observed values.
        predicted: model outputs, same length as ``actual``.
        lower_bound: actual values must be strictly greater than this to be scored.
        upper_bound: cap (in percent) applied to each row's percentage error in MAPE.
        iqr_multiplier: width of the IQR fence used to exclude outliers from MAPE.

    Returns:
        Tuple of four floats ``(mae, mape, smape, rmse)``. All four are NaN
        when no row survives step 1.

    Raises:
        ValueError: if the inputs differ in length, or if a scored row
            contains a NaN or infinite value.
    """
    actual = np.asarray(actual, dtype=float).reshape(-1)
    predicted = np.asarray(predicted, dtype=float).reshape(-1)
    if actual.size != predicted.size:
        raise ValueError(
            f"actual and predicted must have the same length, "
            f"got {actual.size} and {predicted.size}"
        )

    # Step 1: keep rows with a usable actual value (NaN actuals compare False).
    keep = actual > lower_bound
    actual, predicted = actual[keep], predicted[keep]

    if actual.size == 0:
        # Nothing left to score: report NaN for every metric.
        return (float('nan'),) * 4
    if not (np.isfinite(actual).all() and np.isfinite(predicted).all()):
        raise ValueError("actual and predicted must be finite on the rows being scored")

    # Step 2: MAE and RMSE.
    errors = predicted - actual
    abs_errors = np.abs(errors)
    mae = np.mean(abs_errors)
    rmse = np.sqrt(np.mean(errors ** 2))

    # Step 3: MAPE on IQR inliers, with each percentage error capped.
    q1, q3 = np.percentile(actual, [25, 75])
    iqr = q3 - q1
    fence_low = q1 - (iqr * iqr_multiplier)
    fence_high = q3 + (iqr * iqr_multiplier)
    inliers = (actual >= fence_low) & (actual <= fence_high)
    if inliers.any():
        percentage_errors = np.abs(errors[inliers] / actual[inliers]) * 100
        mape = np.mean(np.clip(percentage_errors, None, upper_bound))
    else:
        mape = np.nan

    # Step 4: sMAPE; where the denominator is zero the term is left at 0.
    denominator = np.abs(actual) + np.abs(predicted)
    ratios = np.divide(
        2 * abs_errors,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    smape = 100 / actual.size * np.sum(ratios)

    return float(mae), float(mape), float(smape), float(rmse)

In [7]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from joblib import dump


# Two-step estimator: standardise the features, then fit a gradient-boosted
# regressor with a fixed seed.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', GradientBoostingRegressor(random_state=42)),
])

# Hyper-parameter grid: 3 x 3 x 3 = 27 candidates. The 'model__' prefix routes
# each parameter to the pipeline step named 'model'.
param_grid = dict(
    model__n_estimators=[100, 200, 300],
    model__learning_rate=[0.01, 0.1, 0.2],
    model__max_depth=[3, 4, 5],
)

# Exhaustive search, 5-fold cross-validation, scored by negative MAE.
# NOTE(review): an integer cv gives unshuffled K-fold splits for a regressor;
# if the training rows are time-ordered, a time-aware splitter may be more
# appropriate - confirm.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Best model retrieval
best_model_gradient = grid_search.best_estimator_

# Persist the winning pipeline to disk
model_filename = '/Users/alitahseen/Desktop/FYP-2024/Machine_learning/notebooks/Trained_Models/best_Gradient_Boost_model.joblib'
dump(best_model_gradient, model_filename)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


['/Users/alitahseen/Desktop/FYP-2024/Machine_learning/notebooks/Trained_Models/best_Gradient_Boost_model.joblib']

In [8]:
from joblib import load
# Same path the training cell writes the fitted pipeline to.
model_filename = '/Users/alitahseen/Desktop/FYP-2024/Machine_learning/notebooks/Trained_Models/best_Gradient_Boost_model.joblib'

# joblib.load unpickles the file, which can execute arbitrary code:
# only load model files you created yourself or otherwise trust.
loaded_best_model_gradient = load(model_filename)

In [9]:
# Score the reloaded model on the validation split.
# calculate_metrics returns (mae, mape, smape, rmse), matching the label order below.
predictions_validation = loaded_best_model_gradient.predict(X_validation)
metrics_validation = calculate_metrics(actual=y_validation, predicted=predictions_validation)
print("Validation Metrics after tuning (MAE, MAPE, sMAPE,RMSE):", metrics_validation)


Validation Metrics after tuning (MAE, MAPE, sMAPE,RMSE): (49.95926375923622, 60.54155952311222, 61.72224857736984, 76.97656701281181)


In [10]:
# Score the reloaded model on the test split.
# calculate_metrics returns (mae, mape, smape, rmse), matching the label order below.
predictions_test = loaded_best_model_gradient.predict(X_test)
metrics_test = calculate_metrics(actual=y_test, predicted=predictions_test)
print("Test Metrics with Gradient Boosting (MAE, MAPE, sMAPE, RMSE):", metrics_test)

Test Metrics with Gradient Boosting (MAE, MAPE, sMAPE, RMSE): (22.417985977090343, 26.61286998284016, 33.21977198571935, 41.16166733794306)
