In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import xgboost as xgb
import os
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

In [2]:
# Filesystem locations used by this notebook.
# NOTE: despite the *_DIR suffix, DATA_DIR and TRAINING_HISTORY_DIR hold file paths;
# only TEST_PLOT_DIR names a directory.
DATA_DIR = "./load.csv"  # input CSV loaded with pd.read_csv
TEST_PLOT_DIR = "./test_plots/svm_xgboost/"  # receives one PNG per test window
TRAINING_HISTORY_DIR = "./training_history/svm_xgboost.png"  # not referenced by the cells visible here

In [3]:
# Window sizes, counted in samples, that are passed to create_dataset.
# The data preview shows 15-minute timestamps, so 96 samples span one day and 288 span three days.
PREDICT_STEP = 96  # forecast horizon (look_forward)
INPUT_STEP = 288  # length of the history window used as model input (look_back)

In [4]:
# Create every output directory up front. exist_ok=True keeps the cell safe to
# re-run and removes the check-then-create race of os.path.exists + os.makedirs.
for output_dir in (TEST_PLOT_DIR, "./model", "./training_history"):
    os.makedirs(output_dir, exist_ok=True)

In [5]:
def split_data(df, train_frac=0.70, test_frac=0.25):
    """Split every ``year_month`` group of ``df`` into three consecutive slices.

    Rows are taken in the order they already have inside each group (the
    function does not sort), so the caller should sort ``df`` chronologically
    first. Per group of ``n`` rows:

    * the first ``int(train_frac * n)`` rows form the training slice,
    * the next ``int(test_frac * n)`` rows form the middle slice,
    * whatever is left forms the remainder slice.

    NOTE on naming: for backward compatibility the parameter is still called
    ``test_frac`` and the return order is unchanged, but ``test_frac`` sizes
    the *middle* slice, which is returned LAST (the caller names it ``val``).
    The remainder slice is returned SECOND (the caller names it ``test``).

    Args:
        df: DataFrame containing a ``year_month`` column to group by.
        train_frac: Fraction of each group assigned to the training slice.
        test_frac: Fraction of each group assigned to the middle slice.

    Returns:
        Tuple ``(train, remainder, middle)`` of DataFrames. Each one is the
        concatenation of its per-group slices with a fresh 0..n-1 index and an
        added ``time_idx`` column holding the same 0..n-1 counter.
    """
    train_parts, middle_parts, remainder_parts = [], [], []

    for _, group in df.groupby('year_month'):
        n = len(group)
        train_end = int(train_frac * n)
        middle_end = train_end + int(test_frac * n)
        train_parts.append(group.iloc[:train_end])
        middle_parts.append(group.iloc[train_end:middle_end])
        remainder_parts.append(group.iloc[middle_end:])

    def _stack(parts):
        # Concatenate once instead of growing a frame inside the loop (which is
        # quadratic and concatenates onto an empty frame); an input with no
        # groups yields an empty frame, as before.
        frame = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()
        frame['time_idx'] = np.arange(len(frame))
        return frame

    return _stack(train_parts), _stack(remainder_parts), _stack(middle_parts)

In [6]:
def create_dataset(dataset, look_back, look_forward):
    """Cut a series into sliding (input, target) windows with stride 1.

    Args:
        dataset: Array of shape (n_samples, n_features); a 1-D array is
            treated as a single feature column. Column 0 is the target.
        look_back: Number of consecutive samples in each input window.
        look_forward: Number of samples, immediately following the input
            window, that make up each target.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape
        (n_windows, look_back * n_features) -- each window flattened row by
        row -- and ``y`` has shape (n_windows, look_forward), with
        ``n_windows = n_samples - look_back - look_forward + 1``.
        If the series is too short for a single window, both arrays are
        returned with zero rows instead of raising.
    """
    dataset = np.asarray(dataset)
    if dataset.ndim == 1:
        dataset = dataset.reshape(-1, 1)

    n_features = dataset.shape[1]
    n_windows = len(dataset) - look_back - look_forward + 1
    if n_windows <= 0:
        # Not enough samples for even one window: keep the column counts so
        # downstream shape handling still works.
        return (
            np.empty((0, look_back * n_features), dtype=dataset.dtype),
            np.empty((0, look_forward), dtype=dataset.dtype),
        )

    dataX = np.stack([dataset[i:i + look_back] for i in range(n_windows)])
    dataY = np.stack([
        dataset[i + look_back:i + look_back + look_forward, 0]
        for i in range(n_windows)
    ])
    # Flatten (look_back, n_features) per window; for one feature this is the
    # same (n_windows, look_back) matrix as before.
    return dataX.reshape(n_windows, look_back * n_features), dataY

In [7]:
data = pd.read_csv(DATA_DIR)
# Parse the 'Timestamp' column into datetimes (source format: YYYY/MM/DD HH:MM)
data['Timestamp'] = pd.to_datetime(data['Timestamp'], format='%Y/%m/%d %H:%M')

# Order chronologically; split_data slices each month by row position, so the
# rows must already be in time order. Reassigning (instead of inplace=True)
# keeps the cell idempotent on re-run.
data = data.sort_values('Timestamp')

# Month key used by split_data to split every month separately
data['year_month'] = data['Timestamp'].dt.to_period('M')

# split_data returns its frames in the order (train, test, val)
train_df, test_df, val_df = split_data(data)

# Fit the scaler on the TRAINING loads only. Fitting on the full series would
# leak the min/max of the validation and test periods into training.
# Values outside the training range simply map outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(np.array(train_df["Load"]).reshape(-1, 1))

In [8]:
# Preview the training split (the rendered table is truncated to its first and last rows)
display(train_df)

Unnamed: 0,Timestamp,Load,year_month,time_idx
0,2023-01-01 00:00:00,10.89,2023-01,0
1,2023-01-01 00:15:00,10.44,2023-01,1
2,2023-01-01 00:30:00,10.89,2023-01,2
3,2023-01-01 00:45:00,10.29,2023-01,3
4,2023-01-01 01:00:00,10.34,2023-01,4
...,...,...,...,...
24517,2023-12-22 15:30:00,6.57,2023-12,24517
24518,2023-12-22 15:45:00,8.22,2023-12,24518
24519,2023-12-22 16:00:00,7.25,2023-12,24519
24520,2023-12-22 16:15:00,7.05,2023-12,24520


In [9]:
# Scale each split's 'Load' column with the shared scaler, then cut it into
# (INPUT_STEP-long input, PREDICT_STEP-long target) windows.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    create_dataset(
        scaler.transform(np.array(split['Load']).reshape(-1, 1)),
        INPUT_STEP,
        PREDICT_STEP,
    )
    for split in (train_df, val_df, test_df)
)

In [10]:
# Report the shape of every windowed array between two 86-character rules
rule = "-" * 86
shape_report = [
    f"X_train: {X_train.shape}",
    f"y_train: {y_train.shape}",
    f"X_val: {X_val.shape}",
    f"y_val: {y_val.shape}",
    f"X_test: {X_test.shape}",
    f"y_test: {y_test.shape}",
]
print(rule, *shape_report, rule, sep="\n")

--------------------------------------------------------------------------------------
X_train: (24139, 288)
y_train: (24139, 96)
X_val: (8377, 288)
y_val: (8377, 96)
X_test: (1375, 288)
y_test: (1375, 96)
--------------------------------------------------------------------------------------


In [11]:
# Stage 1: one RBF-kernel SVR per forecast step (y_train has one column per step).
# The per-step fits are independent of each other, so n_jobs=-1 runs them on all
# available cores; the fitted models are the same as with the sequential default.
svr = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
multioutput_svr = MultiOutputRegressor(svr, n_jobs=-1)
multioutput_svr.fit(X_train, y_train)

In [None]:
# SVR forecasts for the validation windows (data the SVR was not fitted on);
# used below to compute the residuals.
y_pred = multioutput_svr.predict(X_val)

In [None]:
# Residual of the SVR on the validation windows: actual minus predicted, in
# scaled units. A positive value means the SVR under-predicted that step.
res = y_val - y_pred


In [None]:
# Settings for the gradient-boosted trees used as the second-stage model
xgb_params = dict(
    objective='reg:squarederror',  # squared-error regression loss
    n_estimators=2000,             # boosting rounds, i.e. number of trees
    learning_rate=0.001,           # shrinkage applied to every tree's contribution
    max_depth=7,                   # depth limit per tree
    colsample_bytree=0.8,          # share of feature columns sampled per tree
    subsample=0.8,                 # share of training rows sampled per tree
)
regressor = xgb.XGBRegressor(**xgb_params)

In [None]:
# Stage 2: learn to predict the SVR's residual from the SVR's own forecast.
# fit() needs both features and targets -- calling it with the residuals alone
# raises a TypeError. The features are the validation forecasts (y_pred), which
# matches how the model is queried at test time (it is given the SVR forecast).
# The target has one column per forecast step, which requires an XGBoost
# release with multi-output regression support (1.6 or newer).
regressor.fit(y_pred, res)

In [None]:
# Stage 1: SVR forecast for the test windows.
prediction = multioutput_svr.predict(X_test)
# Stage 2: the XGBoost model receives the SVR forecast as its input and returns an estimated residual.
residual = regressor.predict(prediction)

# Final forecast = SVR forecast plus the estimated residual.
output = prediction + residual

In [None]:
# Test MSE, computed on the min-max scaled values and averaged over all
# windows and forecast steps.
loss = mean_squared_error(y_test, output)
print("-" * 86)
print(f'Test Loss: {loss:.4f}')
print("-" * 86)


def _to_original_units(scaled):
    """Undo the min-max scaling of a 2-D (windows, steps) array, keeping its shape.

    The scaler was fitted on a single column, so the values are passed through
    as one column and reshaped back rather than relying on broadcasting.
    """
    scaled = np.asarray(scaled)
    return scaler.inverse_transform(scaled.reshape(-1, 1)).reshape(scaled.shape)


# X_test is already 2-D (windows, INPUT_STEP): create_dataset flattens the
# feature axis, so indexing a third axis (X_test[:, :, 0]) raises IndexError.
history = _to_original_units(X_test)

# Each row = input history followed by the forecast (or the ground truth),
# so both curves share the same x axis.
pred_data = np.concatenate([history, _to_original_units(output)], axis=-1)
actual_data = np.concatenate([history, _to_original_units(y_test)], axis=-1)

# Loop-local names are deliberately distinct from notebook-level variables
# (the validation predictions live in y_pred and must not be overwritten).
time_steps = np.arange(1, actual_data.shape[1] + 1, 1)
for i in range(actual_data.shape[0]):
    fig = plt.figure(figsize=(16, 6))
    plt.title(f"Time Series {i+1} prediction result")
    plt.plot(time_steps, pred_data[i], label='Predict')
    plt.plot(time_steps, actual_data[i], label='Actual')
    plt.ylim(0, 30)
    plt.xlabel('Time step')
    plt.ylabel('Usage (kWh)')
    plt.legend()
    plt.savefig(TEST_PLOT_DIR+f"Time_Series_{i+1}.png")
    # Close explicitly: one figure is created per test window.
    plt.close(fig)