In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [2]:
# Loading the datasets
train_df = pd.read_csv('/Users/alitahseen/Desktop/FYP-2024/Machine_learning/Datafiles/Final_Train.csv')
test_df = pd.read_csv('/Users/alitahseen/Desktop/FYP-2024/Machine_learning/Datafiles/Final_test.csv')
validation_df = pd.read_csv('/Users/alitahseen/Desktop/FYP-2024/Machine_learning/Datafiles/Final_Validation.csv')

In [3]:
# Function to preprocess datasets
def preprocess_data(df):
    df['local_time'] = pd.to_datetime(df['local_time'])
    for time_unit in ['Year', 'Month', 'Day', 'Hour']:
        df[time_unit] = getattr(df['local_time'].dt, time_unit.lower())
    return df.drop('local_time', axis=1)

In [4]:
# Preprocess all datasets
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)
validation_df = preprocess_data(validation_df)

def add_lagged_features(df, n_lags=3):
    for lag in range(1, n_lags + 1):
        df[f'Average_Temp_lag_{lag}'] = df['Average_Temp'].shift(lag)
    return df.dropna().reset_index(drop=True)

train_df = add_lagged_features(train_df)
validation_df = add_lagged_features(validation_df)
test_df = add_lagged_features(test_df)


In [5]:
# Prepare the feature sets and target variables
def get_features_targets(df):
    feature_cols = ['Year', 'Month', 'Day', 'Hour', 'Average_Temp'] + [f'Average_Temp_lag_{lag}' for lag in range(1, 4)]  # Assuming 3 lags
    X = df[feature_cols]
    y = df['MW']
    return X, y


X_train, y_train = get_features_targets(train_df)
X_validation, y_validation = get_features_targets(validation_df)
X_test, y_test = get_features_targets(test_df)

In [6]:
# Evaluation Function

def calculate_metrics(actual, predicted, lower_bound=0, upper_bound=100, iqr_multiplier=1.5):
    # Ignoring negative actual values if considered invalid
    valid_indices = actual > lower_bound
    actual = actual[valid_indices]
    predicted = predicted[valid_indices]

    # Calculate MAE and RMSE
    mae = mean_absolute_error(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))

    # Thresholding for outlier exclusion based on IQR
    q1, q3 = np.percentile(actual, [25, 75])
    iqr = q3 - q1
    outlier_threshold_upper = q3 + (iqr * iqr_multiplier)
    outlier_threshold_lower = q1 - (iqr * iqr_multiplier)

    valid_indices_for_mape = (actual >= outlier_threshold_lower) & (actual <= outlier_threshold_upper)
    filtered_actual = actual[valid_indices_for_mape]
    filtered_predicted = predicted[valid_indices_for_mape]

    # Calculate Modified MAPE with capped at 100%
    if len(filtered_actual) > 0:
        percentage_errors = np.abs((filtered_predicted - filtered_actual) / filtered_actual) * 100
        percentage_errors = np.clip(percentage_errors, None, upper_bound)  # Cap percentage errors at upper_bound (100%)
        mape = np.mean(percentage_errors)
    else:
        mape = np.nan

    # Calculate sMAPE
    smape = 100/len(actual) * np.sum(2 * np.abs(predicted - actual) / (np.abs(actual) + np.abs(predicted)))

    return mae, mape, smape, rmse


In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from joblib import dump

param_grid = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]}


grid_search = GridSearchCV(Lasso(), param_grid, cv=10, scoring='neg_mean_absolute_error', verbose=1)

grid_search.fit(X_train, y_train)

# Extracting the best Lasso model
best_lasso_model = grid_search.best_estimator_

print("Best model parameters:", grid_search.best_params_)
print("Best model score:", grid_search.best_score_)

# Save the model to a file
model_filename = '/Users/alitahseen/Desktop/FYP-2024/Machine_learning/notebooks/Trained_Models/best_lasso_model_V2.joblib'
dump(best_lasso_model, model_filename)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best model parameters: {'alpha': 1}
Best model score: -33.03922415244774


['/Users/alitahseen/Desktop/FYP-2024/Machine_learning/notebooks/Trained_Models/best_lasso_model_V2.joblib']

In [8]:
from joblib import load
model_filename = '/Users/alitahseen/Desktop/FYP-2024/Machine_learning/notebooks/Trained_Models/best_lasso_model_V2.joblib'
best_lasso_model = load(model_filename)

In [9]:
# Making predictions with the best model on the validation dataset
y_validation_pred = best_lasso_model.predict(X_validation)
validation_metrics = calculate_metrics(y_validation, y_validation_pred)
print(f"Validation Metrics: MAE={validation_metrics[0]}, MAPE={validation_metrics[1]}, sMAPE={validation_metrics[2]}, RMSE={validation_metrics[3]}")

Validation Metrics: MAE=71.28286083726182, MAPE=84.00579062161529, sMAPE=82.01137699268766, RMSE=79.26918832397006


In [10]:
# Making predictions with the best model on the test dataset
y_test_pred = best_lasso_model.predict(X_test)
test_metrics = calculate_metrics(y_test, y_test_pred)
print(f"Test Metrics: MAE={test_metrics[0]}, MAPE={test_metrics[1]}, sMAPE={test_metrics[2]}, RMSE={test_metrics[3]}")

Test Metrics: MAE=69.31973372580401, MAPE=92.32680071761845, sMAPE=80.70576560541684, RMSE=72.03742105847527
