In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [2]:
# Load the datasets
train_df = pd.read_csv('/Users/alitahseen/Desktop/FYP-2024/Machine_learning/Datafiles/Final_Train.csv')
test_df = pd.read_csv('/Users/alitahseen/Desktop/FYP-2024/Machine_learning/Datafiles/Final_test.csv')
validation_df = pd.read_csv('/Users/alitahseen/Desktop/FYP-2024/Machine_learning/Datafiles/Final_Validation.csv')

In [3]:
# Function to preprocess datasets
def preprocess_data(df):
    df['local_time'] = pd.to_datetime(df['local_time'])
    for time_unit in ['Year', 'Month', 'Day', 'Hour']:
        df[time_unit] = getattr(df['local_time'].dt, time_unit.lower())
    return df.drop('local_time', axis=1)

In [4]:
# Preprocess all datasets
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)
validation_df = preprocess_data(validation_df)

In [5]:
# Prepare the feature sets and target variables
def get_features_targets(df):
    X = df[['Year', 'Month', 'Day', 'Hour', 'Average_Temp']]
    y = df['MW']
    return X, y

X_train, y_train = get_features_targets(train_df)
X_validation, y_validation = get_features_targets(validation_df)
X_test, y_test = get_features_targets(test_df)

In [6]:
# Train Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

In [7]:
# Evaluation Function

def calculate_metrics(actual, predicted, lower_bound=0, upper_bound=100, iqr_multiplier=1.5):
    # Excluding negative actual values if considered invalid
    valid_indices = actual > lower_bound
    actual = actual[valid_indices]
    predicted = predicted[valid_indices]

    # Calculate MAE and RMSE
    mae = mean_absolute_error(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))

    # Thresholding for outlier exclusion based on IQR
    q1, q3 = np.percentile(actual, [25, 75])
    iqr = q3 - q1
    outlier_threshold_upper = q3 + (iqr * iqr_multiplier)
    outlier_threshold_lower = q1 - (iqr * iqr_multiplier)

    valid_indices_for_mape = (actual >= outlier_threshold_lower) & (actual <= outlier_threshold_upper)
    filtered_actual = actual[valid_indices_for_mape]
    filtered_predicted = predicted[valid_indices_for_mape]

    # Calculate Modified MAPE with capped at 100%
    if len(filtered_actual) > 0:
        percentage_errors = np.abs((filtered_predicted - filtered_actual) / filtered_actual) * 100
        percentage_errors = np.clip(percentage_errors, None, upper_bound)  # Cap percentage errors at upper_bound (100%)
        mape = np.mean(percentage_errors)
    else:
        mape = np.nan

    # Calculate sMAPE
    smape = 100/len(actual) * np.sum(2 * np.abs(predicted - actual) / (np.abs(actual) + np.abs(predicted)))

    return mae, mape, smape, rmse

In [8]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import numpy as np
from joblib import dump, load

# Creating a pipeline that first standardizes the dataset
pipeline = make_pipeline(StandardScaler(), Ridge())

alpha_values = np.logspace(-1, 20, 10000)  # Adjust the number of points as needed

# Setting up the GridSearchCV object with 5-fold cross-validation
parameters = {'ridge__alpha': alpha_values}

grid_search = GridSearchCV(pipeline, parameters, cv=5, scoring='neg_mean_absolute_error', verbose=2)

# Fit GridSearchCV on the training data
grid_search.fit(X_train, y_train)

# Retrieving the best model from grid search
best_model_Ridge = grid_search.best_estimator_



# Save the model to a file
model_filename = '/Users/alitahseen/Desktop/FYP-2024/Machine_learning/notebooks/Trained_Models/best_ridge_model.joblib'
dump(best_model_Ridge, model_filename)


Fitting 5 folds for each of 10000 candidates, totalling 50000 fits
[CV] END ...................................ridge__alpha=0.1; total time=   0.0s
[CV] END ...................................ridge__alpha=0.1; total time=   0.0s
[CV] END ...................................ridge__alpha=0.1; total time=   0.0s
[CV] END ...................................ridge__alpha=0.1; total time=   0.0s
[CV] END ...................................ridge__alpha=0.1; total time=   0.0s
[CV] END ...................ridge__alpha=0.10048476241819601; total time=   0.0s
[CV] END ...................ridge__alpha=0.10048476241819601; total time=   0.0s
[CV] END ...................ridge__alpha=0.10048476241819601; total time=   0.0s
[CV] END ...................ridge__alpha=0.10048476241819601; total time=   0.0s
[CV] END ...................ridge__alpha=0.10048476241819601; total time=   0.0s
[CV] END ...................ridge__alpha=0.10097187478241294; total time=   0.0s
[CV] END ...................ridge__alpha=0

['/Users/alitahseen/Desktop/FYP-2024/Machine_learning/notebooks/Trained_Models/best_ridge_model.joblib']

In [9]:
# Predict on the validation set using the best model
best_model_Ridge_loaded = load(model_filename)
predictions_validation = best_model_Ridge_loaded.predict(X_validation)
metrics_validation = calculate_metrics(y_validation, predictions_validation)
print("Validation Metrics after tuning (MAE, MAPE, sMAPE,RMSE):", metrics_validation)

Validation Metrics after tuning (MAE, MAPE, sMAPE,RMSE): (61.312543711645766, 80.09419990590402, 75.82722609801434, 69.09915371925315)


In [10]:
# Evaluation on test dataset.
predictions_test = best_model_Ridge_loaded.predict(X_test)
metrics_test = calculate_metrics(y_test, predictions_test)
print("Test Metrics with Ridge Regression (MAE, MAPE, sMAPE, RMSE):", metrics_test)

Test Metrics with Ridge Regression (MAE, MAPE, sMAPE, RMSE): (62.48264037141814, 89.83410674288425, 75.93957166404067, 65.64436686508466)
