In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, file_name) for file_name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import sys
# Put the "transformersscript" input directory on the import path.
# NOTE(review): presumably this is where the `timeseries_preprocessing` and `transformers`
# modules imported in later cells live - confirm against the attached dataset.
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '/kaggle/input/transformersscript/')

### Investigating Transformers vs RNNs (GRU) using the Power Consumption Dataset

- We start off with importing the dataset, setting a time-series index and converting all values to numeric type

In [None]:
import pandas as pd
import tensorflow as tf
# Enable memory growth on every GPU TensorFlow can see (no-op when the list is empty).
gpus = tf.config.experimental.list_physical_devices('GPU')
for device in gpus:
    tf.config.experimental.set_memory_growth(device, True)
import numpy as np
from timeseries_preprocessing import timeseries_dataset_from_array
# Load the raw, semicolon-delimited household power consumption file.
df1 = pd.read_csv(
    "/kaggle/input/electric-power-consumption-data-set/household_power_consumption.txt",
    delimiter=";",
)

# Build the timestamps with one vectorised string concatenation instead of a row-wise
# apply, and parse them with an explicit day-first format so the result does not depend
# on format inference (the `infer_datetime_format` flag is deprecated in recent pandas).
timestamps = pd.to_datetime(df1["Date"] + " " + df1["Time"], format="%d/%m/%Y %H:%M:%S")

# Use the timestamps as the index (named "datetime", as before) and drop the text columns.
df1 = df1.drop(["Date", "Time"], axis=1)
df1.index = pd.DatetimeIndex(timestamps, name="datetime")

# Convert every remaining column to numeric; values that cannot be parsed become NaN.
df1 = df1.apply(lambda x: pd.to_numeric(x, errors='coerce'))

### If you look at the dataset, you will notice that the original sampling interval is 1 minute
- To simplify the dataset, we resample it to 30-minute intervals
- We also use this opportunity to normalise the dataset using MinMaxScaler

In [None]:
# Show the parsed frame (datetime index, numeric columns) as the cell output.
df1

In [None]:
# Fill the NaNs left by the numeric coercion so the bin means use a complete series.
df1 = df1.interpolate()
# Downsample to 30-minute means. The "min" alias is used because the single-letter "T"
# alias is deprecated in recent pandas releases; "30min" is accepted by all versions.
df2 = df1.resample('30min').mean()
# A bin without any observation would be NaN after the mean; interpolate those as well.
df2 = df2.interpolate()
# Plain NumPy view of the resampled values, used as input for the scaler.
df2_data = df2.values


from sklearn.preprocessing import MinMaxScaler
# Rescale every column with a MinMaxScaler, fitting and transforming in one call.
# NOTE(review): the scaler is fitted on the whole series, i.e. before the train/test
# split performed in a later cell, so test-set extremes influence the scaling -
# confirm this is intended.
scaler = MinMaxScaler()
df2_data = scaler.fit_transform(df2_data)

# Write the scaled values back into the frame, one column at a time.
for name, scaled_column in zip(df2.columns, df2_data.T):
    df2[name] = scaled_column

### We can then split the data into training and testing set
- Shuffle is False to preserve the sequential information (If it is set to True, the rows will be scrambled and the temporal information will be destroyed)

In [None]:
from sklearn.model_selection import train_test_split
# Inputs are every column except the last three; the last three columns are the targets.
# shuffle=False keeps the rows in their original order, so the final 5% is the test set.
feature_values = df2.iloc[:, :-3].values
target_values = df2.iloc[:, -3:].values
X_train, X_test, y_train, y_test = train_test_split(
    feature_values, target_values, test_size=0.05, shuffle=False
)

### We Convert the training data to tf.float32
- We are going to train the GRU and Transformer models with the tf.GradientTape method
- casting the data to tensorflow datatype is therefore required

In [None]:
# Cast both training arrays to float32 tensors in one pass.
data, targets = (tf.cast(array, tf.float32) for array in (X_train, y_train))

### Converting to rolling window dataset
- 24 historical samples of the first 4 columns (Global_active_power, Global_reactive_power, Voltage, Global_intensity) will predict the future 6 samples of the last 3 columns (Sub_metering_1, Sub_metering_2, Sub_metering_3)
- The method "timeseries_dataset_from_array" is copied from https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/timeseries_dataset_from_array
- I placed it in a separate file because I am using TensorFlow 2.0 on my personal computer and "timeseries_dataset_from_array" is not present in older versions

In [None]:
# Length of each input window, also used as the stride between consecutive windows.
sample_length = 24

# Options shared by both datasets: batches of 256 windows, windows start every
# `sample_length` rows.
window_options = dict(batch_size=256, sequence_stride=sample_length)

# NOTE(review): both datasets start their windows on the same rows (0, 24, 48, ...),
# so - assuming the helper behaves like tf.keras' timeseries_dataset_from_array - each
# 6-step target window covers the first 6 rows of its 24-step input window rather than
# the 6 rows that follow it. Confirm whether `targets` should be offset by sample_length.
input_dataset = timeseries_dataset_from_array(
    data, None, sequence_length=sample_length, **window_options
)
target_dataset = timeseries_dataset_from_array(
    targets, None, sequence_length=6, **window_options
)

### Defining the transformer model
- The transformer model is neatly imported and initiated as shown below
- This transformer model is modified from https://www.tensorflow.org/text/tutorials/transformer#create_the_transformer
- The original version is made for processing NLP tokens
- I replaced the Embedding layers (meant for discrete sequential data) with TimeDistributed layers to process continuous variables instead
- Here I defined the model so that the number of parameters closely match the GRU model below
- Instead of going deeper by using 4 layers as shown below, you can go wider by using 2 layers and a larger dff, head, d_model size

In [None]:
from transformers import Transformer
# Instantiate the transformer, then run one dummy forward pass (batch of 1, 24x4 encoder
# input, 6x3 decoder input, training=False) so its weights exist before summary().
xformer = Transformer(3, num_layers=4, num_heads=4, dff=256, d_model=32)
dummy_encoder_input = tf.random.normal((1, 24, 4))
dummy_decoder_input = tf.random.normal((1, 6, 3))
pred = xformer(dummy_encoder_input, dummy_decoder_input, False)
# xformer.load_weights("xformer.h5")
print(pred.shape)
xformer.summary()

### Because the number of parameters is similar, this should be a close fight!
- I'm trying to measure the efficiency of the models instead

In [None]:
from tensorflow.keras.layers import GRU,Dense,Input,TimeDistributed,RepeatVector
from tensorflow.keras.models import Model

# Encoder: a GRU reduces the (24, 4) input window to a single 128-unit vector.
in1 = Input(shape=(24, 4))
encoder_state = GRU(128)(in1)

# Decoder: repeat that vector for 6 steps, run a second GRU over the repeated sequence
# and project every step to 3 outputs.
decoder_steps = RepeatVector(6)(encoder_state)
decoder_steps = GRU(128, return_sequences=True)(decoder_steps)
decoder_output = TimeDistributed(Dense(3))(decoder_steps)

model = Model(inputs=in1, outputs=decoder_output)
model.summary()

- Custom schedule as recommended by https://www.tensorflow.org/text/tutorials/transformer#create_the_transformer

In [None]:
# Model width handed to the learning-rate schedule below.
d_model = 32


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with a warm-up phase followed by inverse-sqrt decay.

    The rate returned for a given step is

        d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

    Args:
        d_model: Model width; cast to float32 and used as the overall scale.
        warmup_steps: Value that sets the slope of the ``step * warmup_steps ** -1.5``
            branch of the minimum.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for optimizer step ``step``."""
        # The optimizer may pass its iteration counter as an integer tensor; rsqrt and
        # the float multiplication below need a floating-point step.
        step = tf.cast(step, tf.float32)

        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


learning_rate = CustomSchedule(d_model)

# `optimizer` (scheduled learning rate) updates the transformer; `optimizer2` (Adam's
# default learning rate) updates the GRU model in the training loop.
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,epsilon=1e-9)
optimizer2 = tf.keras.optimizers.Adam(beta_1=0.9, beta_2=0.98,epsilon=1e-9)

In [None]:
# Count the batches in the input dataset with one explicit pass over it.
len_data = 0
for _batch in input_dataset:
    len_data += 1

### We train for 200 epochs
- In the original implementation the transformer model requires a "start" token
- Here this "start" token is a vector filled with -0.1

In [None]:
# Train both models side by side on the same batches for 200 passes over the data.
for e in range(200):
    for i, (in1, tar) in enumerate(zip(input_dataset, target_dataset)):

        # The final batch is skipped, as before: it can be smaller than the others and
        # the input/target datasets are not guaranteed to yield the same size for it.
        if i >= len_data - 1:
            continue

        # Teacher forcing: the decoder input is the start token (-0.1) followed by the
        # targets shifted right by one step. The start token is built from the batch
        # itself instead of a hard-coded batch size of 256.
        start_token = -0.1 * tf.ones_like(tar[:, :1])
        tar_inp = tf.concat([start_token, tar[:, :-1]], axis=1)

        # Transformer update. Because tar_inp is already shifted by the start token,
        # output step k must be compared with target step k. Shifting the targets a
        # second time (tar[:, 1:] vs predictions[:, :-1]) would never train the first
        # target step and would leave the last output untrained, which disagrees with
        # how predict() consumes the outputs.
        with tf.GradientTape() as tape:
            predictions = xformer(in1, tar_inp, True)
            loss = tf.reduce_mean(tf.square(tar - predictions))
        gradients = tape.gradient(loss, xformer.trainable_variables)
        optimizer.apply_gradients(zip(gradients, xformer.trainable_variables))

        # GRU update: predicts all 6 target steps directly from the input window.
        with tf.GradientTape() as tape:
            pred2 = model(in1)
            loss2 = tf.reduce_mean(tf.square(tar - pred2))
        gradients = tape.gradient(loss2, model.trainable_variables)
        optimizer2.apply_gradients(zip(gradients, model.trainable_variables))

    # Report the losses of the last processed batch of this epoch.
    print(f"epoch {e}| Transformer:{loss.numpy()} | GRU:{loss2.numpy()}")

- The code below can be used after training to predict the output of a transformer
- notice that each element of the output is iteratively concatenated
- the GRU model does not require this step

In [None]:
def predict(inp):
    """Autoregressively generate 6 output steps with the global ``xformer`` model.

    Args:
        inp: Array-like that can be reshaped to ``(n_batch, 24, 4)``; a single
            ``(24, 4)`` window is treated as a batch of one.

    Returns:
        NumPy array of shape ``(n_batch, 6, 3)`` holding the generated steps (the
        start token is stripped).
    """
    # np.asarray lets the function accept tensors as well as NumPy arrays.
    inp = np.asarray(inp).reshape(-1, 24, 4)
    n_batch = inp.shape[0]

    # Decoder input starts with the same -0.1 start token used during training.
    start = np.ones((n_batch, 1, 3)) * -0.1

    for _ in range(6):
        pred = xformer(inp, start, False)
        # Keep only the newest step and restore the time axis at position 1 so the
        # result is (n_batch, 1, 3). Inserting the axis at position 0 would give
        # (1, n_batch, 3), which only concatenates correctly when n_batch == 1.
        pred = np.expand_dims(pred[:, -1, :], 1)
        start = np.concatenate([start, pred], axis=1)

    # Drop the start token; what remains are the 6 generated steps.
    return start[:, 1:]

In [None]:
# Window the held-out split the same way as the training data, but with batch_size=1,
# and materialise each dataset as a list of NumPy arrays.
# NOTE(review): `y_test` is rebound from the raw array to the list of windows, so this
# cell cannot be re-run without re-running the train/test split first.
# NOTE(review): input and target windows start on the same rows here too - see whether
# the targets should be offset by sample_length to be "future" samples.
test_window_options = dict(batch_size=1, sequence_stride=sample_length)
x_test = list(
    timeseries_dataset_from_array(
        X_test, None, sequence_length=sample_length, **test_window_options
    ).as_numpy_iterator()
)
y_test = list(
    timeseries_dataset_from_array(
        y_test, None, sequence_length=6, **test_window_options
    ).as_numpy_iterator()
)

In [None]:
# Collapse the lists of single-window batches into (n_windows, 24, 4) inputs and
# (n_windows, 6, 3) targets.
x_test2 = np.reshape(np.asarray(x_test), (-1, 24, 4))
y_test2 = np.reshape(np.asarray(y_test), (-1, 6, 3))

In [None]:
# Run the autoregressive transformer prediction on each test window, one at a time.
preds_xform = list(map(predict, x_test2))

### Finally, we test both models for their MSE on the Test Set
- Both models behave similarly in terms of MSE with the transformer being slightly better with 2 transformer layers (test this out by yourself!)
- In my experiments shown in this notebook, 4 transformer layers (Deeper) will destroy GRUs lol

- I wouldn't use transformers for very long sequences (self-attention has O(n^2) computational and memory complexity in the sequence length n)

In [None]:
# Transformer: squared error per test window, averaged over every element.
squared_errors_xform = [
    (y_true - y_pred) ** 2 for y_true, y_pred in zip(y_test2, preds_xform)
]
err_xform = np.mean(squared_errors_xform)

# GRU: a single forward pass over all test windows, then the same mean squared error.
pred_gru = model(x_test2)
squared_errors_gru = (pred_gru - y_test2) ** 2
err_gru = np.mean(squared_errors_gru)

# Cell output: (transformer MSE, GRU MSE).
err_xform, err_gru