In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
import os

In [2]:
df=pd.read_csv("CH22_Demand_XY_Train.csv")
print(f"the length of the dataframe is {len(df)}")

the length of the dataframe is 41932


In [3]:
df.head

<bound method NDFrame.head of                   DateTime        X1     X2      X3      X4             Y
0      2022-01-01 00:00:00  2.186333  13.76  0.0663  0.1547  521163.83540
1      2022-01-01 00:10:00  2.138000  13.90  0.0910  0.1105  449066.62018
2      2022-01-01 00:20:00  2.104333  13.90  0.0806  0.1300  437394.72159
3      2022-01-01 00:30:00  2.040333  14.00  0.1183  0.1248  422107.63292
4      2022-01-01 00:40:00  1.973667  14.14  0.0624  0.1105  406923.83540
...                    ...       ...    ...     ...     ...           ...
41927  2022-10-19 03:50:00  5.856667  17.66  0.1092  0.1391  365929.91028
41928  2022-10-19 04:00:00  5.860000  17.66  0.1183  0.1495  368822.51417
41929  2022-10-19 04:10:00  5.846667  17.68  0.1001  0.1976  373857.78769
41930  2022-10-19 04:20:00  5.856667  17.66  0.1183  0.1391  373536.38739
41931  2022-10-19 04:30:00  5.876667  17.68  0.0767  0.1690  376643.25826

[41932 rows x 6 columns]>

### Feature engineering

In [4]:
# steps:
# 1. use the train_data set to do feature engineer and get f1, f2, f3, f4
# 2. normalize the f1, f2, f3, f4 by mean_val, log_scale the y (save the mean_val)
# 3. split the train_data to train_set, val_set, test_set

In [5]:
# f1: the same time of last 7 days (extract from y);  shape=(7*1,)
# f2: the datetime as a categorical value (hh:mm); shape=(1,)
# f3: the previous two hour x1, x2, x3, x4 multi-value time series;  shape=(2*6,5)
# f4: the previous two hour y; shape=(2*6,1)

In [6]:
# feed f1, f3, f4 to two different GRU, get o1, o2, o3
# feed f2, o1, o2, o3 to MLP and get final output

In [7]:
X, y = df[['X1','X2','X3','X4']],df['Y']
time_point_week=7*24*6

In [8]:
# get the f1
f1=[]
for i in range(time_point_week,len(df)):
    last_week_demand = []
    for j in range(7):
        last_week_demand.append(y[i-time_point_week+j*24*6])
    f1.append(last_week_demand)
f1=np.array(f1)

In [9]:
# get the f2
time_label=[t[-8:-3] for t in df['DateTime']]
time2catg=dict(zip(time_label, range(len(set(time_label)))))
f2=[time2catg[t] for t in time_label[time_point_week:]]
f2=np.array(f2)

In [10]:
# get the f3, f4
f3=[]
f4=[]
for i in range(time_point_week,len(df)):
    last_two_hour = X[i-12:i].values
    last_two_hour_demand =y[i-12:i]
    f3.append(last_two_hour)
    f4.append(last_two_hour_demand)
f3=np.array(f3)
f4=np.array(f4)

In [11]:
def train_test_split(array, test_size):
    num_samples = len(array[0])
    permutation = np.random.permutation(range(num_samples))
    train_split,test_split = [], []
    test_set_size = int(num_samples*test_size)
    for arr in array:
        train_split.append(arr[permutation][:-test_set_size])
        test_split.append(arr[permutation][-test_set_size:])
    return train_split, test_split

In [12]:
def normalize(array):
    normalize_res=[]
    mu_arr=[]
    theta_arr=[]
    for arr in array:
        mu, theta =np.mean(arr,axis=0),np.max(arr,axis=0)-np.min(arr,axis=0)
        normalize_res.append((arr-mu)/theta)
        mu_arr.append(mu)
        theta_arr.append(theta)
    return tuple(normalize_res), mu_arr, theta_arr

In [13]:
(f1, f3, f4), mu_arr, theta_arr = normalize([f1, f3, f4])

In [14]:
# save 10% data as test set 
train_split, test_split = train_test_split(
      [np.log(y[time_point_week:]).values, f1, f2, f3, f4],test_size=0.1)

In [15]:
train_split, val_split = train_test_split(
      train_split,test_size=0.33)

In [16]:
os.mkdir("./processed/")

In [17]:
for name, arr in zip(["train","val","test"],[train_split, val_split, test_split]):
    for col,i in zip(['y','f1','f2','f3','f4'],arr):
        print(f"{name}_{col}: {i.shape}")
        np.save(f"./processed/{name}_{col}.npy",i)

train_y:(24678,)
train_f1:(24678, 7)
train_f2:(24678,)
train_f3:(24678, 12, 4)
train_f4:(24678, 12)
val_y:(12154,)
val_f1:(12154, 7)
val_f2:(12154,)
val_f3:(12154, 12, 4)
val_f4:(12154, 12)
test_y:(4092,)
test_f1:(4092, 7)
test_f2:(4092,)
test_f3:(4092, 12, 4)
test_f4:(4092, 12)


In [18]:
# print(r2_score(y_true, y_pred))