In [None]:
import datetime
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.tseries.offsets import DateOffset
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Input location. Despite the name, DATA_DIR is the path of the CSV file itself:
# it is passed directly to pd.read_csv when the data is loaded.
DATA_DIR = "./load.csv"

In [None]:
# Load the raw series, parse the timestamp column and rescale the load column.
# NOTE(review): the factor 4 is not explained in the notebook; presumably it converts
# 15-minute readings to an hourly rate -- confirm against the data source.
data = pd.read_csv(DATA_DIR).assign(
    Timestamp=lambda frame: pd.to_datetime(frame['Timestamp'], format='%Y/%m/%d %H:%M'),
    Load=lambda frame: frame['Load'] * 4,
)


In [None]:
# Min-max scale the load column in place. The fitted scaler is kept because the
# plotting cell uses it to map values back to the original units.
# NOTE(review): the scaler is fitted on the whole series (held-out periods included),
# and re-running this cell refits it on already-scaled values -- run once per fresh load.
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data[['Load']].to_numpy())
data['Load'] = data_scaled

In [None]:
# Build the 15-minute grid of anchor timestamps used to cut sliding windows from the data.
def generate_timestamps(data) -> pd.DatetimeIndex:
    """Return every 15-minute timestamp between the usable start and end of ``data``.

    The grid starts 7 days after the earliest ``data['Timestamp']`` and stops
    3 hours before the latest one (both bounds inclusive when they fall on the grid).
    """
    first_anchor = data['Timestamp'].min() + DateOffset(days=7)
    last_anchor = data['Timestamp'].max() - DateOffset(hours=3)
    return pd.date_range(start=first_anchor, end=last_anchor, freq='15min')


In [None]:
# Build the anchor timestamps for the sliding windows and report how many there are.
timestamps = generate_timestamps(data)
print(timestamps.shape)

In [None]:
def generate_sets_for_all_timestamps(timestamps, data, step='15min'):
    """Cut one (history, target) sliding-window sample per anchor timestamp.

    For an anchor ``t`` the history window covers ``[t - 2d 23h 45min, t]`` and the
    target window covers ``[t + 15min, t + 2h 30min]``; both ends are inclusive.
    At the default 15-minute ``step`` this is 288 history points and 10 target points.

    Only complete windows are kept: an anchor whose history or target window does
    not contain exactly the expected number of rows (because it runs past the edge
    of ``data`` or crosses a gap or duplicate) is skipped. This guarantees the
    returned arrays are rectangular; keeping partial windows would make
    ``np.array`` receive rows of different lengths.

    Parameters
    ----------
    timestamps : iterable of pandas.Timestamp
        Anchor timestamps (the last observed point of each sample).
    data : pandas.DataFrame
        Must contain a datetime ``'Timestamp'`` column and a ``'Load'`` column.
        Rows are ordered by ``'Timestamp'`` internally; rows with a missing
        timestamp are ignored.
    step : str or pandas.Timedelta, default '15min'
        Sampling interval of ``data``; used only to derive the expected number of
        rows per window.

    Returns
    -------
    training_sets : numpy.ndarray, shape (n_samples, n_history)
    target_sets : numpy.ndarray, shape (n_samples, n_target)
    training_sets_time : numpy.ndarray of pandas.Timestamp, same shape as training_sets
    target_sets_time : numpy.ndarray of pandas.Timestamp, same shape as target_sets
        When no anchor yields a complete sample, all four arrays are empty.
    """
    # Window geometry relative to the anchor timestamp.
    history_back = DateOffset(days=2, hours=23, minutes=45)
    target_first = DateOffset(minutes=15)
    target_last = DateOffset(hours=2, minutes=30)

    # Number of rows a complete window holds at the given sampling step.
    step_td = pd.Timedelta(step)
    history_len = pd.Timedelta(days=2, hours=23, minutes=45) // step_td + 1
    target_len = (pd.Timedelta(hours=2, minutes=30) - pd.Timedelta(minutes=15)) // step_td + 1

    # Sort once so every window is a contiguous slice that binary search can locate,
    # instead of scanning the whole column four times per anchor.
    ordered = data.dropna(subset=['Timestamp']).sort_values('Timestamp', kind='mergesort')
    times = ordered['Timestamp'].reset_index(drop=True)
    load_values = ordered['Load'].to_numpy()
    time_values = list(times)  # pandas.Timestamp objects, as the original lists held

    def _bounds(first, last):
        """Return (lo, hi) such that rows lo:hi satisfy first <= Timestamp <= last."""
        lo = int(times.searchsorted(first, side='left'))
        hi = int(times.searchsorted(last, side='right'))
        return lo, hi

    windows = []
    for timestamp in timestamps:
        h_lo, h_hi = _bounds(timestamp - history_back, timestamp)
        t_lo, t_hi = _bounds(timestamp + target_first, timestamp + target_last)
        # Skip anchors whose windows are truncated or irregular.
        if h_hi - h_lo == history_len and t_hi - t_lo == target_len:
            windows.append((h_lo, h_hi, t_lo, t_hi))

    training_sets = np.array([load_values[h_lo:h_hi] for h_lo, h_hi, _, _ in windows])
    target_sets = np.array([load_values[t_lo:t_hi] for _, _, t_lo, t_hi in windows])
    training_sets_time = np.array([time_values[h_lo:h_hi] for h_lo, h_hi, _, _ in windows])
    target_sets_time = np.array([time_values[t_lo:t_hi] for _, _, t_lo, t_hi in windows])

    return training_sets, target_sets, training_sets_time, target_sets_time


In [None]:
# Build the input/target windows, and their matching timestamps, for every anchor timestamp.
training_sets, target_sets, training_sets_time, target_sets_time = generate_sets_for_all_timestamps(timestamps, data)

In [None]:
# Hold-out split: the sample range is cut into N_TEST_BLOCKS equal segments and the
# last TEST_BLOCK_LEN samples of each segment become test data; the rest stays for
# training. With anchors every 15 minutes, 192 samples span two days.
# NOTE: this cell reassigns training_sets / target_sets (and their time arrays) to the
# reduced training portion, so it must run exactly once after the windows are built.
N_TEST_BLOCKS = 24
TEST_BLOCK_LEN = 192

MONTH_TIME_STEP = math.floor(timestamps.shape[0] / N_TEST_BLOCKS)
if MONTH_TIME_STEP < TEST_BLOCK_LEN:
    # A segment shorter than the hold-out block would give negative slice starts,
    # which silently select the wrong rows.
    raise ValueError(
        f"Need at least {N_TEST_BLOCKS * TEST_BLOCK_LEN} anchor timestamps to hold out "
        f"{TEST_BLOCK_LEN} samples from each of {N_TEST_BLOCKS} segments; "
        f"got {timestamps.shape[0]}."
    )

X_test = []
y_test = []
X_test_time = []
y_test_time = []
# Not used in the visible notebook; kept in case other cells rely on these names.
minList = []
maxList = []

# Mark held-out rows in the coordinates of the full arrays and remove them in one
# step afterwards, rather than re-concatenating (and re-indexing) a shrinking array.
held_out = np.zeros(len(training_sets), dtype=bool)
for i in range(N_TEST_BLOCKS):
    end = (i + 1) * MONTH_TIME_STEP
    start = end - TEST_BLOCK_LEN
    X_test.append(training_sets[start:end])
    y_test.append(target_sets[start:end])
    X_test_time.append(training_sets_time[start:end])
    y_test_time.append(target_sets_time[start:end])
    held_out[start:end] = True

training_sets = training_sets[~held_out]
target_sets = target_sets[~held_out]
training_sets_time = training_sets_time[~held_out]
target_sets_time = target_sets_time[~held_out]


In [None]:
# Stack the per-segment hold-out blocks into single arrays (run once: the inputs
# are lists of arrays only right after the split cell).
X_test = np.concatenate(X_test)
y_test = np.concatenate(y_test)
X_test_time = np.concatenate(X_test_time)
y_test_time = np.concatenate(y_test_time)

In [None]:
# Name the samples that remain after the hold-out split as the training set.
# X_test / y_test already hold the held-out samples, so they are not reassigned here.
X_train = training_sets
y_train = target_sets
X_train_time = training_sets_time
y_train_time = target_sets_time


In [None]:
# Sanity check: print the shapes of the train/test inputs and targets, in that order.
for split_array in (X_train, X_test, y_train, y_test):
    print(np.array(split_array).shape)


In [None]:
# Fit one extra-trees regressor per forecast step (MultiOutputRegressor clones the
# base estimator for every target column). The seed is fixed so that metrics and
# plots are reproducible across kernel restarts.
RANDOM_STATE = 42
etr = ExtraTreesRegressor(n_jobs=-1, random_state=RANDOM_STATE)
multioutput_etr = MultiOutputRegressor(etr, n_jobs=-1)
multioutput_etr.fit(X_train, y_train)

In [None]:
# Forecast every held-out window with the fitted multi-output model.
y_pred = multioutput_etr.predict(X_test)

In [None]:
# Error metrics over all held-out windows and forecast steps. They are computed on
# the min-max scaled values, not on the original load units.
# (r2_score is imported with the other sklearn metrics in the notebook's import cell.)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("-" * 86)
for metric_name, metric_value in (('mse', mse), ('rmse', rmse), ('mae', mae), ('r2', r2)):
    print(f'{metric_name}: {metric_value:.4f}')
print("-" * 86)

In [None]:
# Output folder for the per-sample forecast plots. exist_ok=True makes the cell safe
# to re-run and avoids the check-then-create race of testing for existence first.
TEST_PLOT_DIR = "./test_plots/etr_sliding_window/"
os.makedirs(TEST_PLOT_DIR, exist_ok=True)

In [None]:
# Save one figure per held-out sample: the tail of its input window, the actual
# continuation and the predicted continuation, all in original load units.

def _to_original_units(values):
    """Undo the min-max scaling for an array of scaled load values of any shape.

    The scaler was fitted on a single column, so the values are passed to it as one
    column and reshaped back afterwards, rather than relying on broadcasting when
    the number of columns differs from what the scaler was fitted on.
    """
    values = np.asarray(values)
    return scaler.inverse_transform(values.reshape(-1, 1)).reshape(values.shape)


N_CONTEXT_STEPS = 30  # trailing input points drawn before the forecast for context

pred_data = _to_original_units(y_pred)
actual_data = _to_original_units(y_test)
previous_data = _to_original_units(X_test)

for i in range(actual_data.shape[0]):
    fig, ax = plt.subplots(figsize=(12, 6))

    horizon_time = y_test_time[i]
    # Context trace: last input points followed by the actual future values.
    context_time = np.concatenate((X_test_time[i][-N_CONTEXT_STEPS:], horizon_time))
    context_load = np.concatenate((previous_data[i][-N_CONTEXT_STEPS:], actual_data[i]))

    ax.set_title(f"Time Series {i+1} prediction")
    ax.plot(context_time, context_load, '--', color='#98afc7')
    ax.plot(horizon_time, pred_data[i], label='Predict')
    ax.plot(horizon_time, actual_data[i], label='Actual')
    ax.scatter(horizon_time, pred_data[i])
    ax.scatter(horizon_time, actual_data[i])
    # Vertical marker at the first forecast step, whatever the horizon length is.
    ax.axvline(horizon_time[0], color='#4863a0', alpha=0.5)
    # NOTE(review): fixed y-range; values above 100 would be clipped -- confirm it suits the data.
    ax.set_ylim(0, 100)
    ax.set_xlabel('Time step')
    ax.set_ylabel('Usage (kWh)')
    ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
    fig.tight_layout()
    fig.savefig(os.path.join(TEST_PLOT_DIR, f"Time_Series_{i+1}.png"))
    # Close explicitly: one figure per sample would otherwise accumulate in memory.
    plt.close(fig)