In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [10]:
def batch_generator(batch_size, sequence_length, num_x_signals, num_y_signals, num_train, x_train_scaled, y_train_scaled):
    """Yield an endless stream of randomly positioned training batches.

    Every batch contains ``batch_size`` windows of ``sequence_length``
    consecutive rows. Each window is cut at the same random start index from
    ``x_train_scaled`` and ``y_train_scaled`` so inputs and targets stay aligned.

    Parameters
    ----------
    batch_size : int
        Number of windows per batch.
    sequence_length : int
        Number of consecutive rows in each window.
    num_x_signals : int
        Size of the last axis of the input batch.
    num_y_signals : int
        Size of the last axis of the target batch.
    num_train : int
        Number of leading rows of the training arrays that windows may cover.
    x_train_scaled, y_train_scaled : array-like
        Input and target arrays, indexed by row along the first axis.

    Yields
    ------
    tuple
        ``(x_batch, y_batch)``: newly allocated ``float16`` arrays of shape
        ``(batch_size, sequence_length, num_x_signals)`` and
        ``(batch_size, sequence_length, num_y_signals)``.

    Raises
    ------
    ValueError
        When the first batch is requested and ``sequence_length`` exceeds
        ``num_train`` (no complete window fits).
    """
    # Number of distinct start indices for which a full window fits inside
    # the first num_train rows. The "+ 1" makes the final window (ending
    # exactly at row num_train - 1) reachable; randint's upper bound is
    # exclusive.
    num_windows = num_train - sequence_length + 1
    if num_windows < 1:
        raise ValueError(
            "sequence_length ({}) must not exceed num_train ({})".format(
                sequence_length, num_train))

    x_shape = (batch_size, sequence_length, num_x_signals)
    y_shape = (batch_size, sequence_length, num_y_signals)

    # Infinite loop: the consumer decides how many batches to draw.
    while True:
        # Allocate fresh arrays for every batch so previously yielded
        # batches are never overwritten. Values are stored as float16.
        x_batch = np.zeros(shape=x_shape, dtype=np.float16)
        y_batch = np.zeros(shape=y_shape, dtype=np.float16)

        # Fill the batch with random sequences of data.
        for i in range(batch_size):
            # Random start index somewhere in the training data.
            idx = np.random.randint(num_windows)

            # Copy the sequences of data starting at this index.
            x_batch[i] = x_train_scaled[idx:idx+sequence_length]
            y_batch[i] = y_train_scaled[idx:idx+sequence_length]

        yield (x_batch, y_batch)

def organize_dat_v3(df, shift_steps=96, sequence_length=96*7*2, train_split=0.9, batch_size=256):
    """Scale a load DataFrame and split it into training batches and validation arrays.

    All columns of ``df`` are used as input features. The target is the
    ``'Load (kW)'`` column moved ``shift_steps`` rows towards the past, so
    the target on row ``t`` is the load found on row ``t + shift_steps``.
    Inputs and targets are each scaled with their own ``MinMaxScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Load (kW)'`` column.
    shift_steps : int, default 96
        Forecast horizon in rows. Must satisfy ``0 <= shift_steps < len(df)``.
    sequence_length : int, default 96*7*2
        Window length handed to ``batch_generator``.
    train_split : float, default 0.9
        Fraction of the usable rows assigned to the training set; the
        remainder forms the validation set.
    batch_size : int, default 256
        Number of windows per training batch.

    Returns
    -------
    tuple
        ``(num_x_signals, num_y_signals, generator, validation_data,
        load_scaler, df_validation)`` where

        * ``num_x_signals`` / ``num_y_signals`` are the column counts of
          the scaled input and target arrays,
        * ``generator`` is the ``batch_generator`` over the training rows
          (one batch has already been drawn from it),
        * ``validation_data`` is ``(x_test, y_test)``, each with a leading
          axis of length 1 added,
        * ``load_scaler`` is the scaler fitted on the targets,
        * ``df_validation`` holds the rows of ``df`` that correspond to the
          validation arrays.

    Raises
    ------
    ValueError
        If ``shift_steps`` is negative or not smaller than ``len(df)``.
    """
    if not 0 <= shift_steps < len(df):
        raise ValueError(
            "shift_steps must be in [0, {}], got {}".format(len(df) - 1, shift_steps))

    # Scalers for the inputs and for the target column.
    feature_scaler = MinMaxScaler()
    load_scaler = MinMaxScaler()

    # Shift for forecast: row t of the target holds the load of row t + shift_steps.
    df_targets = df['Load (kW)'].shift(-shift_steps)

    # The last shift_steps rows have no target, so only the first num_data
    # rows are usable. An explicit end index is used instead of
    # [:-shift_steps], which would produce empty arrays for shift_steps=0.
    num_data = len(df) - shift_steps

    # NOTE(review): both scalers are fitted before the train/validation
    # split, so the scaling ranges also reflect the validation rows.
    # Confirm this is intended.
    x_data = feature_scaler.fit_transform(df.values)[:num_data]
    y_data = load_scaler.fit_transform(df_targets.values[:num_data, np.newaxis])

    num_train = int(train_split * num_data)

    x_train, x_test = x_data[:num_train], x_data[num_train:]
    y_train, y_test = y_data[:num_train], y_data[num_train:]

    num_x_signals = x_data.shape[1]
    num_y_signals = y_data.shape[1]

    generator = batch_generator(batch_size,
                                sequence_length,
                                num_x_signals,
                                num_y_signals,
                                num_train,
                                x_train,
                                y_train)

    # Draw one batch right away: the generator body only runs on the first
    # next(), so this surfaces a bad sequence_length or shape mismatch here
    # rather than later in the caller.
    next(generator)

    # Add a leading axis of length 1 to the validation arrays.
    validation_data = (np.expand_dims(x_test, axis=0),
                       np.expand_dims(y_test, axis=0))

    # Rows of the original frame that line up with the validation arrays.
    df_validation = df.iloc[num_train:num_data, :]
    return (num_x_signals, num_y_signals, generator, validation_data, load_scaler, df_validation)

def rmse(y_true, y_pred):
    """Return the root-mean-square error between two equally shaped inputs.

    The element-wise difference ``y_true - y_pred`` is squared, averaged
    over all elements, and the square root of that average is returned.
    """
    residual = y_true - y_pred
    mean_squared = np.mean(np.square(residual))
    return np.sqrt(mean_squared)

In [4]:
# Read the load series from CSV: lines starting with '#' are skipped, the
# first column becomes the index and is parsed as dates.
df = pd.read_csv(
    'data/HabitatZEH_60min_processed2_mjw.csv',
    index_col=0,
    parse_dates=True,
    comment='#',
)
df

Unnamed: 0_level_0,Load (kW)
Datetime MT,Unnamed: 1_level_1
2005-12-06 00:00:00,0.980
2005-12-06 01:00:00,0.980
2005-12-06 02:00:00,0.980
2005-12-06 03:00:00,0.980
2005-12-06 04:00:00,0.980
...,...
2022-05-03 05:00:00,1.639
2022-05-03 06:00:00,1.637
2022-05-03 07:00:00,0.639
2022-05-03 08:00:00,0.318


In [5]:
# Prepare the data with a 24-row forecast shift and 24-row training windows.
# dfv receives the rows of df that correspond to the validation arrays.
(
    num_x_signals,
    num_y_signals,
    generator,
    validation_data,
    load_scaler,
    dfv,
) = organize_dat_v3(df, shift_steps=24, sequence_length=24)

In [6]:
# Unpack the validation inputs and targets, then show the target shape.
(xv, yv) = validation_data
yv.shape

(1, 14380, 1)

In [7]:
# Display the frame returned as the last element of organize_dat_v3.
dfv

Unnamed: 0_level_0,Load (kW)
Datetime MT,Unnamed: 1_level_1
2020-09-10 06:00:00,0.136
2020-09-10 07:00:00,0.130
2020-09-10 08:00:00,0.133
2020-09-10 09:00:00,0.381
2020-09-10 10:00:00,0.400
...,...
2022-05-02 05:00:00,1.650
2022-05-02 06:00:00,2.543
2022-05-02 07:00:00,1.676
2022-05-02 08:00:00,1.499


In [14]:
# Data points per day, looked up from the sampling frequency pandas infers
# for the index. Both the legacy aliases ('H', '15T', 'T') and the spellings
# reported by pandas >= 2.2 ('h', '15min', 'min') are accepted.
dppd = {
    'H': 24, 'h': 24,
    '15T': 96, '15min': 96,
    'T': 1440, 'min': 1440,
}.get(df.index.inferred_freq)
if dppd is None:
    # inferred_freq is None when no regular frequency can be inferred;
    # any other unlisted value is a sampling interval not handled here.
    raise ValueError(
        "Unsupported or irregular index frequency: {!r}".format(df.index.inferred_freq))

# Naive persistence errors: compare each sample with the one observed
# one day earlier and with the one observed seven days earlier.
d = df['Load (kW)'].values.flatten()
rmse_np1d = rmse(d[(dppd*1):], d[:-(dppd*1)])
rmse_np7d = rmse(d[(dppd*7):], d[:-(dppd*7)])

# Keep the lag (in days) with the smaller error; ties go to seven days.
np_days = 1 if rmse_np1d < rmse_np7d else 7

In [18]:
# RMSE between the series and a copy of itself moved by dppd*np_days samples.
# The copy is padded to full length with the series' own last dppd*np_days
# values, so those trailing positions contribute zero error.
# NOTE(review): because of that zero-error padding this figure comes out
# lower than the unpadded rmse_np1d / rmse_np7d values; confirm the padding
# is intended.
rmse(
    d,
    np.concatenate((
        d[dppd * np_days:],
        d[-(dppd * np_days):],
    )),
)

0.7314991402154167