In [22]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = '/Users/alitahseen/Desktop/FYP-2024/Machine_learning/Datafiles/ML_Dataset.csv'
original_dataset = pd.read_csv(file_path)

# Convert 'Datetime' to datetime and sort by this column
original_dataset['Datetime'] = pd.to_datetime(original_dataset['Datetime'])
original_dataset.sort_values('Datetime', inplace=True)

# Extracting time-related features from 'Datetime'
original_dataset['hour'] = original_dataset['Datetime'].dt.hour
original_dataset['day_of_week'] = original_dataset['Datetime'].dt.dayofweek

# Make two copies of the dataset
df1 = original_dataset.copy()  # For LEAR
df2 = original_dataset.copy()  # For DNN

# --- Preparing df1 for LEAR with lagged features ---
def create_lagged_features(df, target, lag_days):
    for lag in lag_days:
        df[f'{target}_lag_{lag}'] = df[target].shift(lag)
    return df.dropna()

# Creating lagged features for LEAR
lag_days = [1, 2, 7]
df1 = create_lagged_features(df1, 'MW', lag_days)

# Calculate indices for splitting (70% train, 15% validate, 15% test)
split_idx_lear_train = int(len(df1) * 0.7)
split_idx_lear_val = split_idx_lear_train + int(len(df1) * 0.15)

# Splitting df1 for LEAR
feature_columns_lear = ['Average_Temp', 'hour', 'day_of_week', 'MW'] + [f'MW_lag_{lag}' for lag in lag_days]
X_train_lear = df1.iloc[:split_idx_lear_train][feature_columns_lear]
y_train_lear = df1.iloc[:split_idx_lear_train]['MW']
X_val_lear = df1.iloc[split_idx_lear_train:split_idx_lear_val][feature_columns_lear]
y_val_lear = df1.iloc[split_idx_lear_train:split_idx_lear_val]['MW']
X_test_lear = df1.iloc[split_idx_lear_val:][feature_columns_lear]
y_test_lear = df1.iloc[split_idx_lear_val:]['MW']

# --- Preparing df2 for DNN with normalized features ---
feature_columns_dnn = ['Average_Temp', 'hour', 'day_of_week', 'MW']

# Calculate indices for splitting (similarly 70-15-15 split)
split_idx_dnn_train = int(len(df2) * 0.7)
split_idx_dnn_val = split_idx_dnn_train + int(len(df2) * 0.15)

# Splitting df2 for DNN
X_train_dnn = df2.iloc[:split_idx_dnn_train][feature_columns_dnn]
y_train_dnn = df2.iloc[:split_idx_dnn_train]['MW']
X_val_dnn = df2.iloc[split_idx_dnn_train:split_idx_dnn_val][feature_columns_dnn]
y_val_dnn = df2.iloc[split_idx_dnn_train:split_idx_dnn_val]['MW']
X_test_dnn = df2.iloc[split_idx_dnn_val:][feature_columns_dnn]
y_test_dnn = df2.iloc[split_idx_dnn_val:]['MW']

# Normalizing features for DNN
scaler = StandardScaler()
X_train_dnn_scaled = scaler.fit_transform(X_train_dnn)
X_val_dnn_scaled = scaler.transform(X_val_dnn)
X_test_dnn_scaled = scaler.transform(X_test_dnn)

# Print the first few rows of the training datasets for verification
print("LEAR Training Dataset Head:\n", X_train_lear.head())
print("DNN Training Dataset Head:\n", pd.DataFrame(X_train_dnn_scaled, columns=X_train_dnn.columns).head())


LEAR Training Dataset Head:
     Average_Temp  hour  day_of_week        MW  MW_lag_1  MW_lag_2  MW_lag_7
7          3.684     7            4  38.24962  40.34799  36.87143  34.02723
8          3.834     8            4  29.23187  38.24962  40.34799  32.25537
9          4.966     9            4  22.11242  29.23187  38.24962  33.48902
10         6.308    10            4  16.62852  22.11242  29.23187  32.79130
11         7.726    11            4  14.51898  16.62852  22.11242  34.10638
DNN Training Dataset Head:
    Average_Temp      hour  day_of_week        MW
0     -1.213493 -1.660760     0.496702 -0.613609
1     -1.300242 -1.516297     0.496702 -0.641057
2     -1.347697 -1.371834     0.496702 -0.621946
3     -1.318378 -1.227370     0.496702 -0.632755
4     -1.382760 -1.082907     0.496702 -0.612383


In [24]:
import pandas as pd

# Load the dataset
file_path = '/Users/alitahseen/Desktop/FYP-2024/Machine_learning/Datafiles/ML_Dataset.csv'
original_dataset = pd.read_csv(file_path)

# Convert 'Datetime' to datetime and sort by this column
original_dataset['Datetime'] = pd.to_datetime(original_dataset['Datetime'])
original_dataset.sort_values('Datetime', inplace=True)

# Extracting time-related features from 'Datetime'
original_dataset['hour'] = original_dataset['Datetime'].dt.hour
original_dataset['day_of_week'] = original_dataset['Datetime'].dt.dayofweek

# Make a copy of the dataset for LEAR
df1 = original_dataset.copy()

# Function to create lagged features
def create_lagged_features(df, target, lag_days):
    for lag in lag_days:
        df[f'{target}_lag_{lag}'] = df[target].shift(lag)
    return df.dropna()

# Creating lagged features for LEAR
lag_days = [1, 2, 7]
df1 = create_lagged_features(df1, 'MW', lag_days)

# Compare rows from the original and transformed datasets
# Align and compare the original and transformed datasets based on 'Datetime'
comparison = pd.merge(original_dataset, df1, on='Datetime', how='left', suffixes=('_orig', '_transformed'))

# Print the comparison for the first 10 rows
print(comparison.head(10))



       local_time_orig  Average_Temp_orig PNODE_RESMRID_orig GRP_TYPE_orig  \
0  2021-01-01 00:00:00              5.186   TH_NP15_GEN-APND   ALL_APNODES   
1  2021-01-01 01:00:00              4.612   TH_NP15_GEN-APND   ALL_APNODES   
2  2021-01-01 02:00:00              4.298   TH_NP15_GEN-APND   ALL_APNODES   
3  2021-01-01 03:00:00              4.492   TH_NP15_GEN-APND   ALL_APNODES   
4  2021-01-01 04:00:00              4.066   TH_NP15_GEN-APND   ALL_APNODES   
5  2021-01-01 05:00:00              4.208   TH_NP15_GEN-APND   ALL_APNODES   
6  2021-01-01 06:00:00              3.968   TH_NP15_GEN-APND   ALL_APNODES   
7  2021-01-01 07:00:00              3.684   TH_NP15_GEN-APND   ALL_APNODES   
8  2021-01-01 08:00:00              3.834   TH_NP15_GEN-APND   ALL_APNODES   
9  2021-01-01 09:00:00              4.966   TH_NP15_GEN-APND   ALL_APNODES   

   POS_orig   MW_orig  GROUP_orig            Datetime  hour_orig  \
0         0  34.02723           1 2021-01-01 00:00:00          0   
1    

In [25]:
# For LEAR datasets
print("LEAR Train Dataset:\n", X_train_lear.head())
print("LEAR Validation Dataset:\n", X_val_lear.head())
print("LEAR Test Dataset:\n", X_test_lear.head())


LEAR Train Dataset:
     Average_Temp  hour  day_of_week        MW  MW_lag_1  MW_lag_2  MW_lag_7
7          3.684     7            4  38.24962  40.34799  36.87143  34.02723
8          3.834     8            4  29.23187  38.24962  40.34799  32.25537
9          4.966     9            4  22.11242  29.23187  38.24962  33.48902
10         6.308    10            4  16.62852  22.11242  29.23187  32.79130
11         7.726    11            4  14.51898  16.62852  22.11242  34.10638
LEAR Validation Dataset:
        Average_Temp  hour  day_of_week        MW  MW_lag_1  MW_lag_2  MW_lag_7
18398        10.816    14            0  29.71000  29.51000  33.01000  68.42792
18399        10.982    15            0  34.73755  29.71000  29.51000  49.85000
18400        10.658    16            0  56.15052  34.73755  29.71000  46.33000
18401        10.172    17            0  81.13960  56.15052  34.73755  41.29164
18402         8.580    18            0  82.17708  81.13960  56.15052  38.76815
LEAR Test Dataset:
    

In [26]:
# For DNN datasets
print("DNN Train Dataset:\n", pd.DataFrame(X_train_dnn_scaled, columns=X_train_dnn.columns).head())
print("DNN Validation Dataset:\n", pd.DataFrame(X_val_dnn_scaled, columns=X_val_dnn.columns).head())
print("DNN Test Dataset:\n", pd.DataFrame(X_test_dnn_scaled, columns=X_test_dnn.columns).head())


DNN Train Dataset:
    Average_Temp      hour  day_of_week        MW
0     -1.213493 -1.660760     0.496702 -0.613609
1     -1.300242 -1.516297     0.496702 -0.641057
2     -1.347697 -1.371834     0.496702 -0.621946
3     -1.318378 -1.227370     0.496702 -0.632755
4     -1.382760 -1.082907     0.496702 -0.612383
DNN Validation Dataset:
    Average_Temp      hour  day_of_week        MW
0     -0.675465  0.072797     -1.50184 -0.629367
1     -0.503175  0.217260     -1.50184 -0.683586
2     -0.362622  0.361723     -1.50184 -0.680488
3     -0.337535  0.506186     -1.50184 -0.602605
4     -0.386501  0.650649     -1.50184 -0.270893
DNN Test Dataset:
    Average_Temp      hour  day_of_week        MW
0      2.082382  0.939575    -0.002933  1.338711
1      1.634127  1.084038    -0.002933  0.713017
2      1.375692  1.228501    -0.002933  0.291818
3      1.301940  1.372964    -0.002933  0.035805
4      1.179221  1.517427    -0.002933 -0.079935


In [27]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

# Initialize the Lasso model
lear_model = Lasso(alpha=0.1)  # Adjust the alpha parameter as needed

# Train the model on the training set
lear_model.fit(X_train_lear, y_train_lear)

# Make predictions on the test set
lear_predictions = lear_model.predict(X_test_lear)

# Calculate the Root Mean Squared Error (RMSE) for the LEAR model
lear_rmse = mean_squared_error(y_test_lear, lear_predictions, squared=False)
print(f"LEAR Model RMSE: {lear_rmse}")


LEAR Model RMSE: 0.01604265112202082


In [28]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define the DNN model architecture
dnn_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_dnn_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer
])

# Compile the DNN model
dnn_model.compile(optimizer='adam', loss='mean_squared_error')

# Train the DNN model
dnn_history = dnn_model.fit(
    X_train_dnn_scaled, 
    y_train_dnn, 
    validation_data=(X_val_dnn_scaled, y_val_dnn), 
    epochs=100, 
    batch_size=32
)

# Evaluate the DNN model on the test set
dnn_loss = dnn_model.evaluate(X_test_dnn_scaled, y_test_dnn)
print(f"DNN Model Loss: {dnn_loss}")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
from sklearn.metrics import mean_absolute_error

# Calculate MAE for LEAR model
lear_mae = mean_absolute_error(y_test_lear, lear_predictions)
print(f"LEAR Model MAE: {lear_mae}")


LEAR Model MAE: 0.005854369392270171


In [29]:
# Function to calculate MAPE and sMAPE with a safeguard for zero values
def mape(y_true, y_pred, epsilon=1e-8):
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

def smape(y_true, y_pred, epsilon=1e-8):
    masked_y_true = y_true + epsilon
    return 2.0 * np.mean(np.abs(y_pred - y_true) / (np.abs(masked_y_true) + np.abs(y_pred))) * 100

# Calculate metrics for LEAR model
lear_mae = mean_absolute_error(y_test_lear, lear_predictions)
lear_rmse = np.sqrt(mean_squared_error(y_test_lear, lear_predictions))
lear_mape = mape(y_test_lear, lear_predictions)
lear_smape = smape(y_test_lear, lear_predictions)

# Make predictions with the DNN model
dnn_predictions = dnn_model.predict(X_test_dnn_scaled).flatten()

# Calculate metrics for DNN model
dnn_mae = mean_absolute_error(y_test_dnn, dnn_predictions)
dnn_rmse = np.sqrt(mean_squared_error(y_test_dnn, dnn_predictions))
dnn_mape = mape(y_test_dnn, dnn_predictions)
dnn_smape = smape(y_test_dnn, dnn_predictions)

# Print metrics for both models
print(f"LEAR Model MAE: {lear_mae}, RMSE: {lear_rmse}, MAPE: {lear_mape}%, sMAPE: {lear_smape}%")
print(f"DNN Model MAE: {dnn_mae}, RMSE: {dnn_rmse}, MAPE: {dnn_mape}%, sMAPE: {dnn_smape}%")


LEAR Model MAE: 0.005118332699650697, RMSE: 0.01604265112202082, MAPE: 0.007529578709421786%, sMAPE: 0.007529439343802482%
DNN Model MAE: 0.008061230883779729, RMSE: 0.010026841746419856, MAPE: 0.016359422832727632%, sMAPE: 0.01636049810801308%


In [32]:
# Combine the date, actual values, and predictions into a DataFrame
lear_comparison_df = pd.DataFrame({
    'Date': test_dates,
    'Actual LMP': y_test_lear,
    'Predicted LMP by LEAR': lear_predictions
})

dnn_comparison_df = pd.DataFrame({
    'Date': test_dates,
    'Actual LMP': y_test_dnn,
    'Predicted LMP by DNN': dnn_predictions
})

# Display the first few rows of the DataFrame
print("LEAR Model Predictions vs Actual:")
print(lear_comparison_df.head())

print("\nDNN Model Predictions vs Actual:")
print(dnn_comparison_df.head())


LEAR Model Predictions vs Actual:
                     Date  Actual LMP  Predicted LMP by LEAR
22338 2023-07-20 18:00:00   160.05493             160.025553
22339 2023-07-20 19:00:00   119.66468             119.734409
22340 2023-07-20 20:00:00    92.47507              92.467893
22341 2023-07-20 21:00:00    75.94872              75.942562
22342 2023-07-20 22:00:00    68.47739              68.471584

DNN Model Predictions vs Actual:
                     Date  Actual LMP  Predicted LMP by DNN
22338 2023-07-20 18:00:00   160.05493            160.047012
22339 2023-07-20 19:00:00   119.66468            119.658501
22340 2023-07-20 20:00:00    92.47507             92.469872
22341 2023-07-20 21:00:00    75.94872             75.945580
22342 2023-07-20 22:00:00    68.47739             68.478813


In [33]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso

# Initialize the Lasso model with the same alpha as before
lear_model_cv = Lasso(alpha=0.1)

# Perform cross-validation
cross_val_scores = cross_val_score(lear_model_cv, df1[feature_columns_lear], df1['MW'], cv=5, scoring='neg_mean_squared_error')

# Convert scores to positive values and take the square root to get RMSE
rmse_scores = np.sqrt(-cross_val_scores)
print(f"Cross-validated RMSE scores for LEAR: {rmse_scores}")
print(f"Mean RMSE: {np.mean(rmse_scores)}")


Cross-validated RMSE scores for LEAR: [0.02279702 0.01045968 0.02358765 0.01057559 0.01450298]
Mean RMSE: 0.016384583576223046


In [34]:
from sklearn.utils import shuffle

# Shuffle the test features
X_test_lear_shuffled = shuffle(X_test_lear, random_state=42)

# Make predictions on the shuffled test set
lear_predictions_shuffled = lear_model.predict(X_test_lear_shuffled)

# Compare with actual values
shuffled_comparison_df = pd.DataFrame({
    'Actual LMP': y_test_lear,
    'Predicted LMP on Shuffled Data by LEAR': lear_predictions_shuffled
})
print(shuffled_comparison_df.head())


       Actual LMP  Predicted LMP on Shuffled Data by LEAR
22338   160.05493                               62.387002
22339   119.66468                               59.284289
22340    92.47507                               48.256013
22341    75.94872                               69.080195
22342    68.47739                               48.901083
