In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Loading the datasets
train_df = pd.read_csv('/content/drive/MyDrive/FYP_2024/Final_Train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/FYP_2024/Final_test.csv')
validation_df = pd.read_csv('/content/drive/MyDrive/FYP_2024/Final_Validation.csv')

In [4]:

# Assuming df is your DataFrame
null_values = validation_df[validation_df.isnull().any(axis=1)]
print(null_values)


Empty DataFrame
Columns: [local_time, Average_Temp, MW]
Index: []


In [5]:
def preprocess_data_for_arima(df):
    df['local_time'] = pd.to_datetime(df['local_time'])
    df = df.set_index('local_time')  # Set the datetime as the index
    return df[['MW']]  # Return only the target variable

# Preprocessing all datasets
train_df = preprocess_data_for_arima(train_df)
test_df = preprocess_data_for_arima(test_df)
validation_df = preprocess_data_for_arima(validation_df)


In [None]:
train_df.head()


Unnamed: 0_level_0,MW
local_time,Unnamed: 1_level_1
2021-01-01 00:00:00,34.02723
2021-01-01 01:00:00,32.25537
2021-01-01 02:00:00,33.48902
2021-01-01 03:00:00,32.7913
2021-01-01 04:00:00,34.10638


In [6]:
# No explicit feature extraction needed for ARIMA
y_train = train_df['MW']
y_validation = validation_df['MW']
y_test = test_df['MW']


In [None]:
# Assuming df is your DataFrame
null_values = y_validation[validation_df.isnull().any(axis=1)]
print(null_values)

Series([], Name: MW, dtype: float64)


In [7]:
# Evaluation Function

def calculate_metrics(actual, predicted, lower_bound=0, upper_bound=100, iqr_multiplier=1.5):
    # Excluding negative actual values if considered invalid
    valid_indices = actual > lower_bound
    actual = actual[valid_indices]
    predicted = predicted[valid_indices]

    # MAE and RMSE calculations
    mae = mean_absolute_error(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))

    # Thresholding for outlier exclusion based on IQR
    q1, q3 = np.percentile(actual, [25, 75])
    iqr = q3 - q1
    outlier_threshold_upper = q3 + (iqr * iqr_multiplier)
    outlier_threshold_lower = q1 - (iqr * iqr_multiplier)

    valid_indices_for_mape = (actual >= outlier_threshold_lower) & (actual <= outlier_threshold_upper)
    filtered_actual = actual[valid_indices_for_mape]
    filtered_predicted = predicted[valid_indices_for_mape]

    # MAPE will be capped at 100 if it goes above 100%
    if len(filtered_actual) > 0:
        percentage_errors = np.abs((filtered_predicted - filtered_actual) / filtered_actual) * 100
        percentage_errors = np.clip(percentage_errors, None, upper_bound)
        mape = np.mean(percentage_errors)
    else:
        mape = np.nan

    # sMAPE calculation
    smape = 100/len(actual) * np.sum(2 * np.abs(predicted - actual) / (np.abs(actual) + np.abs(predicted)))

    return mae, mape, smape, rmse

In [9]:
!pip install pmdarima


Collecting pmdarima
  Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/2.1 MB[0m [31m3.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/2.1 MB[0m [31m15.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pmdarima
Successfully installed pmdarima-2.0.4


In [11]:
import pandas as pd
import numpy as np
from pmdarima import auto_arima
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Finding the best ARIMA model parameters for the training data
best_model = auto_arima(y_train, start_p=1, start_q=1,
                        test='adf',
                        max_p=3, max_q=3,
                        m=1,
                        d=None,
                        seasonal=False,
                        start_P=0,
                        D=0,
                        trace=True,
                        error_action='ignore',
                        suppress_warnings=True,
                        stepwise=True)


print(best_model.summary())

# Save your model
model_save_path = '/content/drive/MyDrive/Saved_trained_models/arima_model_today.joblib'
dump(best_model, model_save_path)


Performing stepwise search to minimize aic
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=153080.712, Time=1.05 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=209420.543, Time=0.22 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=156417.272, Time=0.31 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=188373.478, Time=0.98 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=153021.415, Time=1.91 sec
 ARIMA(2,0,0)(0,0,0)[0]             : AIC=153534.803, Time=0.46 sec
 ARIMA(3,0,1)(0,0,0)[0]             : AIC=151014.012, Time=4.61 sec
 ARIMA(3,0,0)(0,0,0)[0]             : AIC=152813.305, Time=0.48 sec
 ARIMA(3,0,2)(0,0,0)[0]             : AIC=151026.259, Time=6.46 sec
 ARIMA(2,0,2)(0,0,0)[0]             : AIC=151656.087, Time=5.30 sec
 ARIMA(3,0,1)(0,0,0)[0] intercept   : AIC=151006.210, Time=30.81 sec
 ARIMA(2,0,1)(0,0,0)[0] intercept   : AIC=152501.719, Time=9.41 sec
 ARIMA(3,0,0)(0,0,0)[0] intercept   : AIC=152386.027, Time=2.41 sec
 ARIMA(3,0,2)(0,0,0)[0] intercept   : AIC=150960.208, Time=38.44 sec
 AR

['/content/drive/MyDrive/Saved_trained_models/arima_model_today.joblib']

In [14]:
from joblib import dump, load
# Save your model
model_save_path = '/content/drive/MyDrive/Saved_trained_models/arima_model_today.joblib'
# To load your model
loaded_model = load(model_save_path)

In [15]:
# For validation set
n_periods_validation = len(y_validation)
predictions_validation = loaded_model.predict(n_periods=n_periods_validation)

# For test set
n_periods_test = len(y_test)
predictions_test = loaded_model.predict(n_periods=n_periods_test)


  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(
  return get_prediction_index(


In [18]:
print(y_validation.isnull())

local_time
2022-12-31 23:00:00    False
2023-01-01 00:00:00    False
2023-01-01 01:00:00    False
2023-01-01 02:00:00    False
2023-01-01 03:00:00    False
                       ...  
2023-12-31 19:00:00    False
2023-12-31 20:00:00    False
2023-12-31 21:00:00    False
2023-12-31 22:00:00    False
2023-12-31 23:00:00    False
Name: MW, Length: 8761, dtype: bool


In [16]:
# For validation set
predictions_validation_series = pd.Series(predictions_validation.values, index=y_validation.index)
print("Predictions for validation set:", predictions_validation_series)

# For test set
predictions_test_series = pd.Series(predictions_test.values, index=y_test.index)
print("Predictions for test set:", predictions_test_series)

# Check for missing values in predictions
nan_check_validation = pd.isnull(predictions_validation_series).any()
nan_check_test = pd.isnull(predictions_test_series).any()
print("Missing values in predictions for validation set:", nan_check_validation)
print("Missing values in predictions for test set:", nan_check_test)

# Handle missing values
if nan_check_validation or nan_check_test:
    print("Missing values detected. Handle them appropriately.")
else:
    # Calculate metrics for validation set
    metrics_validation = calculate_metrics(y_validation, predictions_validation_series)
    print("Validation Metrics (MAE, MAPE, sMAPE, RMSE):", metrics_validation)

    # Calculate metrics for test set
    metrics_test = calculate_metrics(y_test, predictions_test_series)
    print("Test Metrics (MAE, MAPE, sMAPE, RMSE):", metrics_test)


Predictions for validation set: local_time
2022-12-31 23:00:00    113.605504
2023-01-01 00:00:00    108.760623
2023-01-01 01:00:00    105.669337
2023-01-01 02:00:00    104.197735
2023-01-01 03:00:00    104.115486
                          ...    
2023-12-31 19:00:00     72.283162
2023-12-31 20:00:00     72.283162
2023-12-31 21:00:00     72.283162
2023-12-31 22:00:00     72.283162
2023-12-31 23:00:00     72.283162
Length: 8761, dtype: float64
Predictions for test set: local_time
2024-01-01 00:00:00    113.605504
2024-01-01 01:00:00    108.760623
2024-01-01 02:00:00    105.669337
2024-01-01 03:00:00    104.197735
2024-01-01 04:00:00    104.115486
                          ...    
2024-02-29 19:00:00     72.286867
2024-02-29 20:00:00     72.286842
2024-02-29 21:00:00     72.286818
2024-02-29 22:00:00     72.286794
2024-02-29 23:00:00     72.286770
Length: 1440, dtype: float64
Missing values in predictions for validation set: False
Missing values in predictions for test set: False
Validati

In [17]:
import pandas as pd
import matplotlib.pyplot as plt

# Loading the 'local_time' column from the test dataset
test_df_for_plotting = pd.read_csv('/content/drive/MyDrive/FYP_2024/Final_test.csv', usecols=['local_time'])
test_local_time = pd.to_datetime(test_df_for_plotting['local_time'])

if len(test_local_time) > len(predictions_test_series):
    test_local_time = test_local_time[:len(predictions_test_series)]

# Convert 'test_local_time' to datetime if it's not already
test_local_time = pd.to_datetime(test_local_time)

# Filter for January
january_start = "2024-01-01"
january_end = "2024-01-31"
is_january = (test_local_time >= january_start) & (test_local_time <= january_end)
january_times = test_local_time[is_january]
january_actual = y_test[is_january]
january_predictions = predictions_test_series[is_january]

# Filter for February
february_start = "2024-02-01"
february_end = "2024-02-28"
is_february = (test_local_time >= february_start) & (test_local_time <= february_end)
february_times = test_local_time[is_february]
february_actual = y_test[is_february]
february_predictions = predictions_test_series[is_february]

# Set up the plot layout
plt.figure(figsize=(12, 12))
gs = gridspec.GridSpec(3, 2)  # Define a grid of 3 rows and 2 columns

# Main plot spanning the first row
ax0 = plt.subplot(gs[0, :])  # This subplot takes up the first row
ax0.plot(test_local_time, y_test, label='Actual $/MW', color='blue')
ax0.plot(test_local_time, predictions_test_series, label='Predicted $/MW', color='red', linestyle='--')
ax0.set_title('Actual vs Predicted $/MW')
ax0.set_xlabel('Local Time')
ax0.set_ylabel('$/MW')
ax0.legend()
ax0.tick_params(axis='x', rotation=45)

# January subplot in the second row, first column
ax1 = plt.subplot(gs[1, 0])  # Bottom left
ax1.plot(january_times, january_actual, label='Actual $/MW', color='blue')
ax1.plot(january_times, january_predictions, label='Predicted $/MW', color='red', linestyle='--')
ax1.set_title('January: Actual vs Predicted')
ax1.set_xlabel('Local Time')
ax1.set_ylabel('$/MW')
ax1.tick_params(axis='x', rotation=45)

# February subplot in the second row, second column
ax2 = plt.subplot(gs[1, 1])  # Bottom right
ax2.plot(february_times, february_actual, label='Actual $/MW', color='blue')
ax2.plot(february_times, february_predictions, label='Predicted $/MW', color='red', linestyle='--')
ax2.set_title('February: Actual vs Predicted')
ax2.set_xlabel('Local Time')
ax2.set_ylabel('$/MW')
ax2.tick_params(axis='x', rotation=45)

plt.tight_layout()  # Adjust layout to make room for all elements
plt.show()






IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [None]:
import matplotlib.pyplot as plt

# Assuming test_metrics is structured as [MAE, MAPE, sMAPE, RMSE]
mape = test_metrics[1]
accuracy = 100 - mape  # Calculating accuracy as the complement of MAPE

# Data for plotting
labels = ['MAPE', 'Accuracy']
sizes = [mape, accuracy]
colors = ['#FFA07A', '#ADD8E6']
explode = (0.1, 0)  # Only explode the MAPE slice

plt.figure(figsize=(8, 8))
wedges, texts, autotexts = plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140, pctdistance=0.85)
plt.axis('equal')
plt.title('Model Performance Visualization (MAPE)')
plt.show()