In [13]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [14]:
# Load the datasets
train_df = pd.read_csv('/content/drive/MyDrive/FYP_2024/Final_Train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/FYP_2024/Final_test.csv')
validation_df = pd.read_csv('/content/drive/MyDrive/FYP_2024/Final_Validation.csv')

In [15]:
# Function to preprocess datasets
def preprocess_data(df):
    df['local_time'] = pd.to_datetime(df['local_time'])
    for time_unit in ['Year', 'Month', 'Day', 'Hour']:
        df[time_unit] = getattr(df['local_time'].dt, time_unit.lower())
    return df.drop('local_time', axis=1)

In [16]:
# Preprocess all datasets
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)
validation_df = preprocess_data(validation_df)

def add_lagged_features(df, n_lags=3):
    for lag in range(1, n_lags + 1):
        df[f'Average_Temp_lag_{lag}'] = df['Average_Temp'].shift(lag)
    return df.dropna().reset_index(drop=True)

train_df = add_lagged_features(train_df)
validation_df = add_lagged_features(validation_df)
test_df = add_lagged_features(test_df)

In [17]:
train_df.head()


Unnamed: 0,Average_Temp,MW,Year,Month,Day,Hour,Average_Temp_lag_1,Average_Temp_lag_2,Average_Temp_lag_3
0,4.492,32.7913,2021,1,1,3,4.298,4.612,5.186
1,4.066,34.10638,2021,1,1,4,4.492,4.298,4.612
2,4.208,36.87143,2021,1,1,5,4.066,4.492,4.298
3,3.968,40.34799,2021,1,1,6,4.208,4.066,4.492
4,3.684,38.24962,2021,1,1,7,3.968,4.208,4.066


In [18]:
# Preparing the feature sets and target variables
def get_features_targets(df):
    feature_cols = ['Year', 'Month', 'Day', 'Hour', 'Average_Temp'] + [f'Average_Temp_lag_{lag}' for lag in range(1, 4)]  # Assuming 3 lags
    X = df[feature_cols]
    y = df['MW']
    return X, y


X_train, y_train = get_features_targets(train_df)
X_validation, y_validation = get_features_targets(validation_df)
X_test, y_test = get_features_targets(test_df)

In [19]:
# Evaluation Function

def calculate_metrics(actual, predicted, lower_bound=0, upper_bound=100, iqr_multiplier=1.5):
    # Excluding negative actual values if considered invalid
    valid_indices = actual > lower_bound
    actual = actual[valid_indices]
    predicted = predicted[valid_indices]

    # Calculate MAE and RMSE
    mae = mean_absolute_error(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))

    # Thresholding for outlier exclusion based on IQR
    q1, q3 = np.percentile(actual, [25, 75])
    iqr = q3 - q1
    outlier_threshold_upper = q3 + (iqr * iqr_multiplier)
    outlier_threshold_lower = q1 - (iqr * iqr_multiplier)

    valid_indices_for_mape = (actual >= outlier_threshold_lower) & (actual <= outlier_threshold_upper)
    filtered_actual = actual[valid_indices_for_mape]
    filtered_predicted = predicted[valid_indices_for_mape]

    # Calculate Modified MAPE with capped at 100%
    if len(filtered_actual) > 0:
        percentage_errors = np.abs((filtered_predicted - filtered_actual) / filtered_actual) * 100
        percentage_errors = np.clip(percentage_errors, None, upper_bound)  # Cap percentage errors at upper_bound (100%)
        mape = np.mean(percentage_errors)
    else:
        mape = np.nan

    # Calculate sMAPE
    smape = 100/len(actual) * np.sum(2 * np.abs(predicted - actual) / (np.abs(actual) + np.abs(predicted)))

    return mae, mape, smape, rmse

In [8]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
import joblib  # Importing joblib for model saving

# Defining the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', MLPRegressor(max_iter=500))
])


parameter_space = {
    'mlp__hidden_layer_sizes': [(50, 50, 50, 50), (100, 100, 50, 50), (50, 100, 50, 25), (100, 100, 100, 50)],
    'mlp__alpha': np.logspace(-5, 3, 5)
}

# grid search
grid_search = GridSearchCV(pipeline, parameter_space, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Retrieve the best model
best_model_v2 = grid_search.best_estimator_

# Save the trained model to disk
model_save_path_v2 = '/content/drive/MyDrive/Saved_trained_models/best_mlp_model_v2.joblib'
joblib.dump(best_model_v2, model_save_path_v2)
print(f"Model saved to {model_save_path_v2}")


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Model saved to /content/drive/MyDrive/Saved_trained_models/best_mlp_model_v2.joblib


In [20]:
# Load the trained model from disk
best_model_v2 = joblib.load('/content/drive/MyDrive/Saved_trained_models/best_mlp_model_v2.joblib')




In [21]:
# Make predictions on the validation dataset
predictions_validation_v2 = best_model_v2.predict(X_validation)
metrics_validation_v2 = calculate_metrics(y_validation, predictions_validation_v2)
print("Validation Metrics for V2 DNN (MAE, MAPE, sMAPE, RMSE):", metrics_validation_v2)

Validation Metrics for V2 DNN (MAE, MAPE, sMAPE, RMSE): (79.15214503429571, 80.93607452691518, 80.41513789507931, 108.13411072126677)


In [22]:
# Make predictions on the test dataset
predictions_test_v2 = best_model_v2.predict(X_test)
metrics_test_v2 = calculate_metrics(y_test, predictions_test_v2)
print("Test Metrics for V2 DNN (MAE, MAPE, sMAPE, RMSE):", metrics_test_v2)

Test Metrics for V2 DNN (MAE, MAPE, sMAPE, RMSE): (72.13069003527694, 93.72749918088468, 82.74686109317011, 74.38025416237365)
