In [1]:
import scipy.io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from plotnine import *
from datetime import datetime, timedelta
from tqdm import tqdm

Load Data

In [2]:
# Which pre-split fold to load (files live under share_info/5-fold/).
fold = 1

# Train / test CSVs of the selected fold; paths are relative to the working directory.
data_train = pd.read_csv(
    f'share_info/5-fold/train_fold{fold}.csv'
)
data_test = pd.read_csv(
    f'share_info/5-fold/test_fold{fold}.csv'
)

# Add feature

Time stamp

In [3]:
# Parse the DATETIME column of both splits into datetime values.
for frame in (data_train, data_test):
    frame['DATETIME'] = pd.DatetimeIndex(frame['DATETIME'])

In [4]:
# Derive calendar columns (month, day, hour, minute) from DATETIME for both splits.
for frame in (data_train, data_test):
    stamps = pd.DatetimeIndex(frame['DATETIME'])
    for part in ('month', 'day', 'hour', 'minute'):
        frame[part] = getattr(stamps, part)

EMA

In [5]:
# Exponentially weighted mean of P(t) with smoothing factor alpha = 0.8,
# computed independently on each split.
for frame in (data_train, data_test):
    frame['EMA_08'] = frame['P(t)'].ewm(alpha=0.8).mean()

I_clear sky

In [6]:
# Iclr.ashrae = K.*exp(-B./sza);  K = 1663.52224574355; B = 0.739550805574049;
# coefs. are from fitting with selected clearsky data
def clearsky_cal(x, K=1663.52224574355, B=0.739550805574049):
    """Clear-sky irradiance from the fitted model ``K * exp(-B / x)``.

    Parameters
    ----------
    x : float
        Value taken from a ``sza(t+k)`` column.
        NOTE(review): the formula divides by ``x``, so ``x`` is presumably a
        cosine-like quantity rather than an angle in degrees -- confirm.
    K, B : float, optional
        Fitted coefficients; the defaults are the full-precision values quoted
        in the comment above (previously truncated in the code).

    Returns
    -------
    float or int
        ``K * exp(-B / x)`` when ``x > 0``; otherwise ``0``. NaN also falls into
        the ``0`` branch because ``nan > 0`` is False.
    """
    if x > 0:
        # Use the exact exponential instead of the 2.71828 approximation of e.
        return float(K * np.exp(-B / x))
    else:
        return 0

In [7]:
# Add one clear-sky column per forecast step (t+1 ... t+8) to both splits.
for step in range(1, 9):
    sza_col = f'sza(t+{step})'
    clr_col = f'I_clr(t+{step})'
    for frame in (data_train, data_test):
        frame[clr_col] = frame[sza_col].apply(clearsky_cal)

# Prepare X, y

In [8]:
# Full candidate feature list, assembled group by group (order matters for
# the column order of frames sliced with it).
feature_cols = (
    ['P(t)'] + [f'P(t-{lag})' for lag in range(1, 8)]          # current + 7 lags of P
    + ['I(t)'] + [f'I(t-{lag})' for lag in range(1, 8)]        # current + 7 lags of I
    + [f'P^(d-1)(t+{step})' for step in range(1, 9)]           # P^(d-1) at each step
    + [f'sza(t+{step})' for step in range(1, 9)]               # sza at each step
    + ['day', 'month', 'minute', 'EMA_08']                     # calendar + EMA
    + [f'I_clr(t+{step})' for step in range(1, 9)]             # clear-sky value at each step
)

In [9]:
# Target columns: P at forecast steps t+1 ... t+8.
targets = [f'P(t+{step})' for step in range(1, 9)]

In [10]:
# Index the training split by timestamp (between_time() below needs a datetime index).
# Note: this rebinds data_train, so the cell cannot be re-run without reloading.
data_train = data_train.set_index('DATETIME')

# Complete-case copy restricted to the feature and target columns.
train_model_columns = feature_cols + targets
data_train_dropna = data_train[train_model_columns].dropna()

In [11]:
# Index the test split by timestamp (between_time() below needs a datetime index).
# Note: this rebinds data_test, so the cell cannot be re-run without reloading.
data_test = data_test.set_index('DATETIME')

# Complete-case copy restricted to the feature, target and 'time' columns.
test_model_columns = feature_cols + targets + ['time']
data_test_dropna = data_test[test_model_columns].dropna()

# Model

In [12]:
from sklearn import preprocessing
def normalizer_std(X):
    """Fit a ``StandardScaler`` on ``X`` and return the fitted scaler object."""
    return preprocessing.StandardScaler().fit(X)

In [13]:
def pre_processing(train_data,test_data,feature,label, std = True):
    """Extract feature/label arrays from the train and test frames.

    Parameters
    ----------
    train_data, test_data : DataFrame
        Frames containing the ``feature`` and ``label`` columns.
    feature, label : list of str
        Column names used as model inputs and outputs.
    std : bool, default True
        When truthy, the features are standardised with a scaler fitted on the
        training features only (the same scaler is applied to the test features).

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` as NumPy arrays.
    """
    x_train, y_train = train_data[feature].values, train_data[label].values
    x_test, y_test = test_data[feature].values, test_data[label].values

    if not std:
        return x_train, x_test, y_train, y_test

    # Fit on train only, then apply to both splits.
    x_scaler = normalizer_std(x_train)
    return x_scaler.transform(x_train), x_scaler.transform(x_test), y_train, y_test

In [14]:
from sklearn.metrics import mean_squared_error as MSE
from sklearn.svm import SVR 
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

## Prepare feature for each model
Three time-of-day model groups: morning, afternoon, evening

## morning 6:00 - 9:30
<p> Features: focus on features that indicate the trend at the forecast time: Iclr(t+k), I(d-1)(t+k), sza(t+k)
<p>I(t+1) : execution time = 5.30 to 9.00 -- include I(t)
<p>I(t+2) : execution time = 5.30 to 8.30
<p>I(t+3) : execution time = 5.30 to 8.00
<p>I(t+4) : execution time = 5.30 to 7.30
<p>I(t+5) : execution time = 5.30 to 7.00
<p>I(t+6) : execution time = 5.30 to 6.30
<p>I(t+7) : execution time = 5.30 to 6.00
<p>I(t+8) : execution time = 5.30

In [15]:
# Map each forecast horizon (1..8) to the feature columns and the label column
# used by the morning models.
labels = [f'P(t+{step})' for step in range(1, 9)]
feature_dict_m = {
    f'feature_{step}': [f'sza(t+{step})', f'P^(d-1)(t+{step})', f'I_clr(t+{step})', 'EMA_08', 'P(t)']
    for step in range(1, 9)
}
label_dict_m = {f'label_{step}': [labels[step - 1]] for step in range(1, 9)}

In [16]:
# Morning training sets: every window opens at 5:30 and closes 30 minutes
# earlier for each additional forecast step.
(train_m1, train_m2, train_m3, train_m4,
 train_m5, train_m6, train_m7, train_m8) = [
    data_train.between_time('5:30', window_end)[
        feature_dict_m[f'feature_{step}'] + label_dict_m[f'label_{step}']
    ]
    for step, window_end in enumerate(
        ['9:00', '8:30', '8:00', '7:30', '7:00', '6:30', '6:00', '5:30'], start=1)
]

In [17]:
# Morning test sets: same execution-time windows as the morning training sets.
(test_m1, test_m2, test_m3, test_m4,
 test_m5, test_m6, test_m7, test_m8) = [
    data_test.between_time('5:30', window_end)[
        feature_dict_m[f'feature_{step}'] + label_dict_m[f'label_{step}']
    ]
    for step, window_end in enumerate(
        ['9:00', '8:30', '8:00', '7:30', '7:00', '6:30', '6:00', '5:30'], start=1)
]

In [18]:
# Per-horizon results of the morning models; entry k-1 belongs to step t+k.
yhat_train_m = []   # predictions on the training windows
yhat_test_m = []    # predictions on the test windows
idx_train_m = []    # timestamps of the training rows
idx_test_m = []     # timestamps of the test rows
y_train_m = []      # measured training targets, shape (n, 1)
y_test_m = []       # measured test targets, shape (n, 1)

train_list_m = [train_m1, train_m2, train_m3, train_m4, train_m5, train_m6, train_m7, train_m8]
test_list_m = [test_m1, test_m2, test_m3, test_m4, test_m5, test_m6, test_m7, test_m8]

# One random forest per forecast horizon; `i` is the 1-based horizon.
for i, (train, test) in enumerate(
        tqdm(zip(train_list_m, test_list_m), total=len(train_list_m)), start=1):

    x_train, x_test, y_train, y_test = pre_processing(train, test, feature_dict_m[f'feature_{i}'],
                                                      label_dict_m[f'label_{i}'], std = False)
    reg = RandomForestRegressor(n_estimators = 1000, max_depth=10,
                                random_state = 1, min_samples_split=34, min_samples_leaf=16)
    # y_train is a column vector (n, 1); flatten it for fit(), as the
    # afternoon and evening loops do, so sklearn receives a 1-D target.
    reg.fit(x_train, y_train.ravel())

    yhat_train_m.append(reg.predict(x_train))
    yhat_test_m.append(reg.predict(x_test))

    y_train_m.append(y_train)
    y_test_m.append(y_test)

    idx_train_m.append(train.index)
    idx_test_m.append(test.index)

8it [00:41,  5.16s/it]


In [19]:
# Wrap the morning training predictions as single-column frames indexed by timestamp.
(yhat_train_m1, yhat_train_m2, yhat_train_m3, yhat_train_m4,
 yhat_train_m5, yhat_train_m6, yhat_train_m7, yhat_train_m8) = [
    pd.DataFrame(yhat_train_m[k], columns=[f'P(t+{k + 1})'], index=idx_train_m[k])
    for k in range(8)
]

# Same for the measured morning training targets.
(y_train_m1, y_train_m2, y_train_m3, y_train_m4,
 y_train_m5, y_train_m6, y_train_m7, y_train_m8) = [
    pd.DataFrame(y_train_m[k], columns=[f'P(t+{k + 1})'], index=idx_train_m[k])
    for k in range(8)
]

In [20]:
# Wrap the morning test predictions as single-column frames indexed by timestamp.
(yhat_test_m1, yhat_test_m2, yhat_test_m3, yhat_test_m4,
 yhat_test_m5, yhat_test_m6, yhat_test_m7, yhat_test_m8) = [
    pd.DataFrame(yhat_test_m[k], columns=[f'P(t+{k + 1})'], index=idx_test_m[k])
    for k in range(8)
]

# Same for the measured morning test targets.
(y_test_m1, y_test_m2, y_test_m3, y_test_m4,
 y_test_m5, y_test_m6, y_test_m7, y_test_m8) = [
    pd.DataFrame(y_test_m[k], columns=[f'P(t+{k + 1})'], index=idx_test_m[k])
    for k in range(8)
]

## afternoon 10:00 - 15:30
Features: focus on both features that indicate the trend (Iclr(t+k), I(d-1)(t+k), sza(t+k)) and dynamic features (I(t), RH(t), ...)
<p>I(t+1) : execution time = 9.30 to 15.00
<p>I(t+2) : execution time = 9.00 to 14.30
<p>I(t+3) : execution time = 8.30 to 14.00
<p>I(t+4) : execution time = 8.00 to 13.30
<p>I(t+5) : execution time = 7.30 to 13.00
<p>I(t+6) : execution time = 7.00 to 12.30
<p>I(t+7) : execution time = 6.30 to 12.00
<p>I(t+8) : execution time = 6.00 to 11.30

In [21]:
# Afternoon models: trend features for the forecast step plus the EMA and
# the five most recent P measurements.
labels = [f'P(t+{step})' for step in range(1, 9)]
recent_power_cols = ['EMA_08', 'P(t)', 'P(t-1)', 'P(t-2)', 'P(t-3)', 'P(t-4)']
feature_dict_a = {
    f'feature_{step}': [f'sza(t+{step})', f'P^(d-1)(t+{step})', f'I_clr(t+{step})'] + recent_power_cols
    for step in range(1, 9)
}
label_dict_a = {f'label_{step}': [labels[step - 1]] for step in range(1, 9)}

In [22]:
# Afternoon training sets: the execution-time window slides 30 minutes
# earlier for each additional forecast step.
(train_a1, train_a2, train_a3, train_a4,
 train_a5, train_a6, train_a7, train_a8) = [
    data_train.between_time(window_start, window_end)[
        feature_dict_a[f'feature_{step}'] + label_dict_a[f'label_{step}']
    ]
    for step, (window_start, window_end) in enumerate(
        [('9:30', '15:00'), ('9:00', '14:30'), ('8:30', '14:00'), ('8:00', '13:30'),
         ('7:30', '13:00'), ('7:00', '12:30'), ('6:30', '12:00'), ('6:00', '11:30')], start=1)
]

In [23]:
# Afternoon test sets: same execution-time windows as the afternoon training sets.
(test_a1, test_a2, test_a3, test_a4,
 test_a5, test_a6, test_a7, test_a8) = [
    data_test.between_time(window_start, window_end)[
        feature_dict_a[f'feature_{step}'] + label_dict_a[f'label_{step}']
    ]
    for step, (window_start, window_end) in enumerate(
        [('9:30', '15:00'), ('9:00', '14:30'), ('8:30', '14:00'), ('8:00', '13:30'),
         ('7:30', '13:00'), ('7:00', '12:30'), ('6:30', '12:00'), ('6:00', '11:30')], start=1)
]

In [24]:
# Per-horizon results of the afternoon models; entry k-1 belongs to step t+k.
yhat_train_a, yhat_test_a = [], []   # predictions
idx_train_a, idx_test_a = [], []     # row timestamps
y_train_a, y_test_a = [], []         # measured targets, shape (n, 1)

train_list_a = [train_a1, train_a2, train_a3, train_a4, train_a5, train_a6, train_a7, train_a8]
test_list_a = [test_a1, test_a2, test_a3, test_a4, test_a5, test_a6, test_a7, test_a8]

# One random forest per forecast horizon; `i` is the 1-based horizon.
for i, (train, test) in enumerate(tqdm(zip(train_list_a, test_list_a)), start=1):
    x_train, x_test, y_train, y_test = pre_processing(
        train, test,
        feature_dict_a[f'feature_{i}'], label_dict_a[f'label_{i}'],
        std=False,
    )
    reg = RandomForestRegressor(
        n_estimators=1000, max_depth=10, random_state=1,
        min_samples_split=34, min_samples_leaf=16,
    )
    # fit() gets the flattened (1-D) target.
    reg.fit(x_train, y_train.ravel())

    yhat_train_a.append(reg.predict(x_train))
    yhat_test_a.append(reg.predict(x_test))

    y_train_a.append(y_train)
    y_test_a.append(y_test)

    idx_train_a.append(train.index)
    idx_test_a.append(test.index)

  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
8it [03:17, 24.67s/it]


In [25]:
# Wrap the afternoon training predictions as single-column frames indexed by timestamp.
(yhat_train_a1, yhat_train_a2, yhat_train_a3, yhat_train_a4,
 yhat_train_a5, yhat_train_a6, yhat_train_a7, yhat_train_a8) = [
    pd.DataFrame(yhat_train_a[k], columns=[f'P(t+{k + 1})'], index=idx_train_a[k])
    for k in range(8)
]

# Same for the measured afternoon training targets.
(y_train_a1, y_train_a2, y_train_a3, y_train_a4,
 y_train_a5, y_train_a6, y_train_a7, y_train_a8) = [
    pd.DataFrame(y_train_a[k], columns=[f'P(t+{k + 1})'], index=idx_train_a[k])
    for k in range(8)
]

In [26]:
# Wrap the afternoon test predictions as single-column frames indexed by timestamp.
(yhat_test_a1, yhat_test_a2, yhat_test_a3, yhat_test_a4,
 yhat_test_a5, yhat_test_a6, yhat_test_a7, yhat_test_a8) = [
    pd.DataFrame(yhat_test_a[k], columns=[f'P(t+{k + 1})'], index=idx_test_a[k])
    for k in range(8)
]

# Same for the measured afternoon test targets.
(y_test_a1, y_test_a2, y_test_a3, y_test_a4,
 y_test_a5, y_test_a6, y_test_a7, y_test_a8) = [
    pd.DataFrame(y_test_a[k], columns=[f'P(t+{k + 1})'], index=idx_test_a[k])
    for k in range(8)
]

## evening 16:00 - 18:00
Features: focus on features that indicate the trend: Iclr(t+k), I(d-1)(t+k), sza(t+k)
<p>I(t+1) : execution time = 15.30 to 17.30
<p>I(t+2) : execution time = 15.00 to 17.30
<p>I(t+3) : execution time = 14.30 to 17.30
<p>I(t+4) : execution time = 14.00 to 17.30
<p>I(t+5) : execution time = 13.30 to 17.30
<p>I(t+6) : execution time = 13.00 to 17.30
<p>I(t+7) : execution time = 12.30 to 17.30
<p>I(t+8) : execution time = 12.00 to 17.30

In [27]:
# Evening models: trend features for the forecast step plus EMA_08 and P(t).
labels = [f'P(t+{step})' for step in range(1, 9)]
feature_dict_e = {}
label_dict_e = {}
for step, label in enumerate(labels, start=1):
    feature_dict_e[f'feature_{step}'] = [
        f'sza(t+{step})', f'P^(d-1)(t+{step})', f'I_clr(t+{step})', 'EMA_08', 'P(t)',
    ]
    label_dict_e[f'label_{step}'] = [label]

# Only the one-step-ahead evening model additionally receives I(t).
feature_dict_e['feature_1'].append('I(t)')

In [28]:
# Evening training sets: every window closes at 17:30 and opens 30 minutes
# earlier for each additional forecast step.
(train_e1, train_e2, train_e3, train_e4,
 train_e5, train_e6, train_e7, train_e8) = [
    data_train.between_time(window_start, '17:30')[
        feature_dict_e[f'feature_{step}'] + label_dict_e[f'label_{step}']
    ]
    for step, window_start in enumerate(
        ['15:30', '15:00', '14:30', '14:00', '13:30', '13:00', '12:30', '12:00'], start=1)
]

In [29]:
# Evening test sets: same execution-time windows as the evening training sets.
(test_e1, test_e2, test_e3, test_e4,
 test_e5, test_e6, test_e7, test_e8) = [
    data_test.between_time(window_start, '17:30')[
        feature_dict_e[f'feature_{step}'] + label_dict_e[f'label_{step}']
    ]
    for step, window_start in enumerate(
        ['15:30', '15:00', '14:30', '14:00', '13:30', '13:00', '12:30', '12:00'], start=1)
]

In [30]:
# Per-horizon results of the evening models; entry k-1 belongs to step t+k.
yhat_train_e, yhat_test_e = [], []   # predictions
idx_train_e, idx_test_e = [], []     # row timestamps
y_train_e, y_test_e = [], []         # measured targets, shape (n, 1)

train_list_e = [train_e1, train_e2, train_e3, train_e4, train_e5, train_e6, train_e7, train_e8]
test_list_e = [test_e1, test_e2, test_e3, test_e4, test_e5, test_e6, test_e7, test_e8]

# One random forest per forecast horizon; `i` is the 1-based horizon.
for i, (train, test) in enumerate(tqdm(zip(train_list_e, test_list_e)), start=1):
    x_train, x_test, y_train, y_test = pre_processing(
        train, test,
        feature_dict_e[f'feature_{i}'], label_dict_e[f'label_{i}'],
        std=False,
    )
    reg = RandomForestRegressor(
        n_estimators=1000, max_depth=10, random_state=1,
        min_samples_split=34, min_samples_leaf=16,
    )
    # fit() gets the flattened (1-D) target.
    reg.fit(x_train, y_train.ravel())

    yhat_train_e.append(reg.predict(x_train))
    yhat_test_e.append(reg.predict(x_test))

    y_train_e.append(y_train)
    y_test_e.append(y_test)

    idx_train_e.append(train.index)
    idx_test_e.append(test.index)

  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
8it [01:05,  8.21s/it]


In [31]:
# Wrap the evening training predictions as single-column frames indexed by timestamp.
(yhat_train_e1, yhat_train_e2, yhat_train_e3, yhat_train_e4,
 yhat_train_e5, yhat_train_e6, yhat_train_e7, yhat_train_e8) = [
    pd.DataFrame(yhat_train_e[k], columns=[f'P(t+{k + 1})'], index=idx_train_e[k])
    for k in range(8)
]

# Same for the measured evening training targets.
(y_train_e1, y_train_e2, y_train_e3, y_train_e4,
 y_train_e5, y_train_e6, y_train_e7, y_train_e8) = [
    pd.DataFrame(y_train_e[k], columns=[f'P(t+{k + 1})'], index=idx_train_e[k])
    for k in range(8)
]

In [32]:
# Wrap the evening test predictions as single-column frames indexed by timestamp.
(yhat_test_e1, yhat_test_e2, yhat_test_e3, yhat_test_e4,
 yhat_test_e5, yhat_test_e6, yhat_test_e7, yhat_test_e8) = [
    pd.DataFrame(yhat_test_e[k], columns=[f'P(t+{k + 1})'], index=idx_test_e[k])
    for k in range(8)
]

# Same for the measured evening test targets.
(y_test_e1, y_test_e2, y_test_e3, y_test_e4,
 y_test_e5, y_test_e6, y_test_e7, y_test_e8) = [
    pd.DataFrame(y_test_e[k], columns=[f'P(t+{k + 1})'], index=idx_test_e[k])
    for k in range(8)
]

## get yhat dataframe : merge all yhat

In [33]:
from functools import reduce

In [34]:
# For every forecast step, stack the morning / afternoon / evening frames
# and order the rows by timestamp.

# Training predictions.
(yhat_train1, yhat_train2, yhat_train3, yhat_train4,
 yhat_train5, yhat_train6, yhat_train7, yhat_train8) = [
    pd.concat(list(parts)).sort_index()
    for parts in zip(
        (yhat_train_m1, yhat_train_m2, yhat_train_m3, yhat_train_m4,
         yhat_train_m5, yhat_train_m6, yhat_train_m7, yhat_train_m8),
        (yhat_train_a1, yhat_train_a2, yhat_train_a3, yhat_train_a4,
         yhat_train_a5, yhat_train_a6, yhat_train_a7, yhat_train_a8),
        (yhat_train_e1, yhat_train_e2, yhat_train_e3, yhat_train_e4,
         yhat_train_e5, yhat_train_e6, yhat_train_e7, yhat_train_e8),
    )
]

# Training measurements.
(y_train1, y_train2, y_train3, y_train4,
 y_train5, y_train6, y_train7, y_train8) = [
    pd.concat(list(parts)).sort_index()
    for parts in zip(
        (y_train_m1, y_train_m2, y_train_m3, y_train_m4,
         y_train_m5, y_train_m6, y_train_m7, y_train_m8),
        (y_train_a1, y_train_a2, y_train_a3, y_train_a4,
         y_train_a5, y_train_a6, y_train_a7, y_train_a8),
        (y_train_e1, y_train_e2, y_train_e3, y_train_e4,
         y_train_e5, y_train_e6, y_train_e7, y_train_e8),
    )
]

# Test predictions.
(yhat_test1, yhat_test2, yhat_test3, yhat_test4,
 yhat_test5, yhat_test6, yhat_test7, yhat_test8) = [
    pd.concat(list(parts)).sort_index()
    for parts in zip(
        (yhat_test_m1, yhat_test_m2, yhat_test_m3, yhat_test_m4,
         yhat_test_m5, yhat_test_m6, yhat_test_m7, yhat_test_m8),
        (yhat_test_a1, yhat_test_a2, yhat_test_a3, yhat_test_a4,
         yhat_test_a5, yhat_test_a6, yhat_test_a7, yhat_test_a8),
        (yhat_test_e1, yhat_test_e2, yhat_test_e3, yhat_test_e4,
         yhat_test_e5, yhat_test_e6, yhat_test_e7, yhat_test_e8),
    )
]

# Test measurements.
(y_test1, y_test2, y_test3, y_test4,
 y_test5, y_test6, y_test7, y_test8) = [
    pd.concat(list(parts)).sort_index()
    for parts in zip(
        (y_test_m1, y_test_m2, y_test_m3, y_test_m4,
         y_test_m5, y_test_m6, y_test_m7, y_test_m8),
        (y_test_a1, y_test_a2, y_test_a3, y_test_a4,
         y_test_a5, y_test_a6, y_test_a7, y_test_a8),
        (y_test_e1, y_test_e2, y_test_e3, y_test_e4,
         y_test_e5, y_test_e6, y_test_e7, y_test_e8),
    )
]

In [35]:
def _outer_join_on_index(left, right):
    """Outer-join two frames on their indexes."""
    return pd.merge(left, right, left_index=True, right_index=True, how='outer')


# Per-step frames, grouped by split and by predicted / measured.
dfhat_train = [yhat_train1, yhat_train2, yhat_train3, yhat_train4,
               yhat_train5, yhat_train6, yhat_train7, yhat_train8]
dfhat_test = [yhat_test1, yhat_test2, yhat_test3, yhat_test4,
              yhat_test5, yhat_test6, yhat_test7, yhat_test8]

df_train = [y_train1, y_train2, y_train3, y_train4,
            y_train5, y_train6, y_train7, y_train8]
df_test = [y_test1, y_test2, y_test3, y_test4,
           y_test5, y_test6, y_test7, y_test8]

# Fold the eight single-column frames into one wide frame per group;
# timestamps missing for some step become NaN in that step's column.
y_hat_train = reduce(_outer_join_on_index, dfhat_train)
y_hat_test = reduce(_outer_join_on_index, dfhat_test)

y_train = reduce(_outer_join_on_index, df_train)
y_test = reduce(_outer_join_on_index, df_test)

In [36]:
# Keep only timestamps that have a value for all eight forecast steps
# (each frame is filtered in place, independently of the others).
for merged in (y_hat_train, y_hat_test, y_train, y_test):
    merged.dropna(inplace=True)

# Error analysis

In [37]:
def RMSETable_1( y_train, y_hat_train, y_test, y_hat_test, targets, train = True ):
    """RMSE per execution time (5:30 ... 17:30, every 30 min) and per target.

    Parameters
    ----------
    y_train, y_hat_train : DataFrame
        Measured / predicted training values with a datetime index. They are
        only read when ``train`` is truthy.
    y_test, y_hat_test : DataFrame
        Measured / predicted test values with a datetime index.
    targets : iterable of str
        Column names to evaluate.
    train : bool, default True
        When truthy, a ``<target>RMSE_train`` column is added before each
        ``<target>RMSE_test`` column.

    Returns
    -------
    DataFrame
        One row per execution time; the RMSE is divided by 8 and multiplied by
        100 (presumably percent of an 8-unit capacity -- TODO confirm).
    """
    time_list = ['5:30'] + [f'{hour}:{minute:02d}' for hour in range(6, 18) for minute in (0, 30)]

    def _slot_rmse(actual, predicted, slot):
        # RMSE over the rows whose time of day equals `slot`.
        return np.sqrt(MSE(actual.between_time(slot, slot), predicted.between_time(slot, slot)))/8*100

    table = pd.DataFrame({'Execution Time': time_list})
    for target in targets:
        # Skip the training metric entirely when it is not requested, so the
        # training frames are neither required nor evaluated.
        if train:
            table[target+'RMSE_train'] = [
                _slot_rmse(y_train[target], y_hat_train[target], slot) for slot in time_list
            ]
        table[target+'RMSE_test'] = [
            _slot_rmse(y_test[target], y_hat_test[target], slot) for slot in time_list
        ]
    return table

In [38]:
def RMSETable_2( y_hat_train, y_hat_test, train_on = True):
    """RMSE per forecasted time of day (6:00 ... 17:30) and per forecast step.

    Both inputs need a datetime index, columns ``Step-1`` ... ``Step-8`` and a
    ``P_mea`` column; each step column is compared against ``P_mea`` at the
    same timestamps, ignoring NaN differences. ``y_hat_train`` is only read
    when ``train_on == True``, in which case a ``Step-k_train`` column precedes
    each ``Step-k_test`` column.

    The RMSE is divided by 8 and multiplied by 100 (presumably percent of an
    8-unit capacity -- TODO confirm).
    """
    slots = [f'{hour}:{minute:02d}' for hour in range(6, 18) for minute in (0, 30)]

    def _slot_rmse(frame, column, slot):
        # NaN-aware RMSE between `column` and P_mea at one time of day.
        error = frame[column].between_time(slot, slot) - frame['P_mea'].between_time(slot, slot)
        return np.sqrt(np.nanmean(error**2))/8*100

    result = pd.DataFrame({'Time of forecasted value P(t)': slots})
    for step in range(1, 9):
        step_col = 'Step-{}'.format(step)
        if train_on == True:
            result[step_col + '_train'] = [_slot_rmse(y_hat_train, step_col, slot) for slot in slots]
        result[step_col + '_test'] = [_slot_rmse(y_hat_test, step_col, slot) for slot in slots]
    return result

In [39]:
def RMSESummary_1( y_train, y_hat_train, y_test, y_hat_test, targets):
    """Overall train / test RMSE for every target column.

    Parameters
    ----------
    y_train, y_hat_train, y_test, y_hat_test : DataFrame
        Measured and predicted values containing every column in ``targets``.
    targets : sequence of str
        Column names to evaluate; they also populate the ``Step`` column, so
        the rows always line up with the metrics (the loop used to be fixed to
        ``P(t+1)`` ... ``P(t+8)`` regardless of ``targets``).

    Returns
    -------
    DataFrame
        Columns ``Step``, ``train``, ``test``. The RMSE is divided by 8 and
        multiplied by 100 (presumably percent of an 8-unit capacity -- TODO
        confirm).
    """
    def _scaled_rmse(actual, predicted):
        return np.sqrt(MSE(actual, predicted))/8*100

    table = pd.DataFrame({'Step': targets})
    table['train'] = [_scaled_rmse(y_train[target], y_hat_train[target]) for target in targets]
    table['test'] = [_scaled_rmse(y_test[target], y_hat_test[target]) for target in targets]
    return table

In [40]:
def table_transform( table_input ):
    """Re-index step-ahead values by the time they refer to.

    For each step ``i`` in 1..8 the ``P(t+i)`` column is shifted down by ``i``
    rows within each calendar day, so that row ``t`` of ``Step-i`` holds the
    value that was produced ``i`` rows earlier on the same day. The first
    ``i`` rows of every day are NaN.

    NOTE(review): the shift is row-based, so it assumes the rows of a day are
    equally spaced with no gaps -- confirm after the upstream dropna.

    Parameters
    ----------
    table_input : DataFrame
        Datetime-indexed frame with columns ``P(t+1)`` ... ``P(t+8)``. It is
        not modified (the previous implementation added a ``DATE`` column to
        the caller's frame as a side effect).

    Returns
    -------
    DataFrame
        Columns ``Step-1`` ... ``Step-8``, indexed by
        ``'Time of forecasted value'``.
    """
    # Group by calendar day without writing a helper column into the input.
    by_day = table_input.groupby(table_input.index.date)
    table = pd.DataFrame({'Time of forecasted value': table_input.index})
    for i in range(1,9):
        table['Step-{}'.format(i)] = by_day['P(t+{})'.format(i)].shift(i).values
    table = table.set_index('Time of forecasted value')
    return table

In [41]:
# Test-only RMSE per execution time and target.
RMSE_table_1 = RMSETable_1(
    y_train, y_hat_train, y_test, y_hat_test, targets, train=False
)
# Overall train / test RMSE per target.
RMSE_table_sum = RMSESummary_1(
    y_train, y_hat_train, y_test, y_hat_test, targets
)

In [42]:
# Re-index measured and predicted values by the time they refer to.
Y_train_T, y_hat_train_T, Y_test_T, y_hat_test_T = (
    table_transform(frame)
    for frame in (y_train, y_hat_train, y_test, y_hat_test)
)

In [43]:
# Attach the measured P(t) (aligned on the timestamp index) to each transformed table.
for transformed in (Y_train_T, y_hat_train_T):
    transformed['P_mea'] = data_train['P(t)']
for transformed in (Y_test_T, y_hat_test_T):
    transformed['P_mea'] = data_test['P(t)']

In [44]:
# Test-only RMSE per forecasted time of day and forecast step.
RMSE_table_2 = RMSETable_2(
    y_hat_train_T, y_hat_test_T, train_on=False
)

  # Remove the CWD from sys.path while we load stuff.


## save