In [1]:
import pandas as pd
import numpy as np
import pickle
from datetime import timedelta
import os

import math
import tensorflow as tf
import random

import keras
from keras.layers import LSTM, Dense

from keras.models import Sequential 
from keras.layers import Dense, Dropout
from keras.optimizers import Adam, RMSprop
from keras.callbacks import EarlyStopping 
from scikeras.wrappers import KerasRegressor
from keras.models import load_model
from sklearn.model_selection import KFold

random.seed(123)
np.random.seed(123)
tf.random.set_seed(123)
import gc
import pickle

from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import  mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

from sklearn.ensemble import RandomForestRegressor

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

def CVRMSE(y_pred, y_true):
    y_pred, y_true = np.array(y_pred), np.array(y_true)
    return mean_squared_error(y_true,y_pred, squared=False)/np.mean(y_true)*100

# Stack data 

In [2]:
# Read the kc dates and its values
fechas_kc = pd.read_excel('./kc_2023.xlsx', usecols='B:D')
fechas_kc.dropna(inplace=True)
fechas_kc

Unnamed: 0,Start,End,Kc
0,2023-04-10,2023-04-16,0.55
1,2023-04-17,2023-04-23,0.55
2,2023-04-24,2023-04-30,0.55
3,2023-05-01,2023-05-07,0.55
4,2023-05-08,2023-05-14,0.55
5,2023-05-15,2023-05-21,0.55
6,2023-05-22,2023-05-28,0.55
7,2023-05-29,2023-06-04,0.55
8,2023-06-05,2023-06-11,0.58
9,2023-06-12,2023-06-18,0.62


In [3]:
# Find the corresponding kc value of the date x
def getKc(x):
    for i in range(0,len(fechas_kc)):
        if (x >= fechas_kc['Start'][i]) & (x < fechas_kc['End'][i] + timedelta(days=1)):
            return fechas_kc['Kc'][i]

In [4]:
plots = os.listdir('./datos 2023/')
all_plots = []

# Read plots one by one and save them in a array
for p in plots:
    # Read data
    df_fp = pd.read_excel('./datos 2023/' + p, usecols='E,F,J,N,R,W, Y:AE')
    df_fp['DateTime'] = pd.to_datetime(df_fp['DateTime'])

    # Use the hour as one of input variables (applying sin to avoid jump from 23 to 00)
    df_fp['hour_sin'] = df_fp['DateTime'].apply(lambda x: math.sin( (2 * math.pi * x.hour) / 24) )
    df_fp.set_index('DateTime',inplace=True)

    # Resample the FP values
    df_fp = df_fp.resample('1h').mean() # Resampling de 1h
    df_fp.columns = ['FP', 'HR5_25', 'HR35_55', 'HR65_85', 'Riego', 'TMED', 'PREC', 'HR', 'RAD', 'DPV', 'VV', 'ETO', 'Hour_sin']
    
    # Use only HR_35_55 and data from 2023-6-1
    df_fp.drop(columns=['HR5_25', 'HR65_85', ], inplace=True)
    df_fp = df_fp[df_fp.index >= '2023-6-1'].copy()
    df_fp = df_fp[df_fp.index < '2023-10-1'].copy()

    # Set the kc value for each date (row) using the function getKc()
    kcs = df_fp.reset_index()['DateTime'].apply(lambda x: getKc(x))
    df_fp['Kc'] = kcs.values

    # Categorical input variable (name of the plot)
    df_fp['ID'] = p.split('_')[0]

    df_fp.dropna(inplace=True)

    
    if '4.2' in p.split('_')[0]:
        df_fp = df_fp[df_fp.index >= '2023-07-26 12:00:00']
    
    all_plots.append(df_fp)
all_plots

[                            FP    HR35_55     Riego       TMED  PREC   
 DateTime                                                               
 2023-07-21 09:00:00  -3.645243  62.354863  0.000000  26.946250   0.0  \
 2023-07-21 10:00:00  -3.614804  59.446942  1.554878  29.925000   0.0   
 2023-07-21 11:00:00  -2.526165  58.095185  0.762195  31.916250   0.0   
 2023-07-21 12:00:00  -3.653391  57.229654  0.000000  33.187500   0.0   
 2023-07-21 13:00:00  -8.117500  56.622889  0.000000  34.270000   0.0   
 ...                        ...        ...       ...        ...   ...   
 2023-09-30 19:00:00 -14.142137  44.335860  0.000000  25.678750   0.0   
 2023-09-30 20:00:00 -12.522581  44.345736  0.000000  22.625000   0.0   
 2023-09-30 21:00:00 -10.869316  44.353496  0.000000  17.862917   0.0   
 2023-09-30 22:00:00 -10.028503  44.363735  0.000000  16.011250   0.0   
 2023-09-30 23:00:00  -9.456029  44.379830  0.000000  15.433333   0.0   
 
                             HR        RAD       

In [5]:
df_all_plots = pd.concat(all_plots)
numerical_features = ['HR35_55', 'Riego', 'TMED', 'PREC', 'HR', 'RAD', 'DPV', 'VV', 'ETO', 'Hour_sin', 'Kc']
scaler_x = MinMaxScaler().fit(df_all_plots[numerical_features])
scaler_y = MinMaxScaler().fit(df_all_plots[['FP']])

In [25]:
fig = go.Figure()
for p in all_plots:
    fig.add_trace(go.Scatter(x=p.index, y=p['FP']*0.1,
                    name=p['ID'][0], mode='lines'))
fig.show()

In [7]:
train = pd.DataFrame()
test = pd.DataFrame()
for p in all_plots:
    train = pd.concat([train, p.iloc[:int(len(p)*0.7),:]])
    test = pd.concat([test, p.iloc[int(len(p)*0.7):,:]])
    print(p['ID'][0] + ':', len(p), 'train:', len(train), ', test:', len(test))


T1.1.: 1719 train: 1203 , test: 516
T1.2.: 2928 train: 3252 , test: 1395
T2.1.: 2928 train: 5301 , test: 2274
T2.2.: 2928 train: 7350 , test: 3153
T3.1.: 2928 train: 9399 , test: 4032
T4.1.: 2928 train: 11448 , test: 4911
T4.2.: 1596 train: 12565 , test: 5390


In [8]:
train

Unnamed: 0_level_0,FP,HR35_55,Riego,TMED,PREC,HR,RAD,DPV,VV,ETO,Hour_sin,Kc,ID
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2023-07-21 09:00:00,-3.645243,62.354863,0.000000,26.946250,0.0,30.140417,527.15625,2.497250,1.173625,0.396445,7.071068e-01,0.68,T1.1.
2023-07-21 10:00:00,-3.614804,59.446942,1.554878,29.925000,0.0,25.113000,734.73850,3.183800,1.044950,0.553319,5.000000e-01,0.68,T1.1.
2023-07-21 11:00:00,-2.526165,58.095185,0.762195,31.916250,0.0,21.367500,863.03250,3.729250,1.068750,0.659422,2.588190e-01,0.68,T1.1.
2023-07-21 12:00:00,-3.653391,57.229654,0.000000,33.187500,0.0,18.329643,949.59750,4.158536,1.264286,0.739059,1.224647e-16,0.68,T1.1.
2023-07-21 13:00:00,-8.117500,56.622889,0.000000,34.270000,0.0,15.195000,988.50000,4.580750,1.341000,0.779942,-2.588190e-01,0.68,T1.1.
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-10 20:00:00,-12.285186,70.488051,0.000000,24.480000,0.0,53.927500,2.61250,1.438125,0.908125,0.060283,-8.660254e-01,0.49,T4.2.
2023-09-10 21:00:00,-11.066156,70.401948,0.000000,22.856250,0.0,65.673750,0.00000,0.963750,0.849750,0.036168,-7.071068e-01,0.49,T4.2.
2023-09-10 22:00:00,-10.182823,70.347922,0.000000,21.865000,0.0,70.256250,0.00000,0.780750,1.022750,0.036593,-5.000000e-01,0.49,T4.2.
2023-09-10 23:00:00,-9.456375,70.254918,0.000000,20.642917,0.0,71.587917,0.00000,0.694208,0.645583,0.027003,-2.588190e-01,0.49,T4.2.


In [9]:
test

Unnamed: 0_level_0,FP,HR35_55,Riego,TMED,PREC,HR,RAD,DPV,VV,ETO,Hour_sin,Kc,ID
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2023-09-09 12:00:00,-13.954909,47.867593,3.636760,26.102857,0.0,49.861429,818.484286,1.705857,1.303286,0.569368,1.224647e-16,0.49,T1.1.
2023-09-09 13:00:00,-14.827903,48.830717,0.514482,27.418750,0.0,45.186250,850.793750,2.006125,1.619125,0.607774,-2.588190e-01,0.49,T1.1.
2023-09-09 14:00:00,-16.106366,49.292000,0.000000,28.536000,0.0,41.448000,822.189000,2.293700,1.877200,0.605895,-5.000000e-01,0.49,T1.1.
2023-09-09 15:00:00,-16.158395,49.587850,0.000000,30.018333,0.0,33.210417,712.335000,2.842625,1.984250,0.552530,-7.071068e-01,0.49,T1.1.
2023-09-09 16:00:00,-16.440883,49.737060,0.000000,30.351250,0.0,30.151250,593.298750,3.025250,1.939000,0.472046,-8.660254e-01,0.49,T1.1.
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-30 19:00:00,-15.735960,68.474847,0.000000,25.813214,0.0,33.000357,24.374286,2.240357,0.841714,0.095524,-9.659258e-01,0.46,T4.2.
2023-09-30 20:00:00,-14.322500,68.563167,0.000000,22.253750,0.0,42.021250,0.000000,1.610250,0.587625,0.044934,-8.660254e-01,0.46,T4.2.
2023-09-30 21:00:00,-12.811929,68.643967,0.000000,17.947143,0.0,56.278571,0.000000,0.918143,0.579429,0.033200,-7.071068e-01,0.46,T4.2.
2023-09-30 22:00:00,-11.750043,68.675542,0.000000,16.011250,0.0,63.022500,0.000000,0.675750,0.489875,0.026003,-5.000000e-01,0.46,T4.2.


In [10]:
from sklearn.preprocessing import LabelEncoder

# Encode Observation_ID
encoder = LabelEncoder()
train['ID_encoded'] = encoder.fit_transform(train['ID'])
test['ID_encoded'] = encoder.fit_transform(test['ID'])
train

Unnamed: 0_level_0,FP,HR35_55,Riego,TMED,PREC,HR,RAD,DPV,VV,ETO,Hour_sin,Kc,ID,ID_encoded
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2023-07-21 09:00:00,-3.645243,62.354863,0.000000,26.946250,0.0,30.140417,527.15625,2.497250,1.173625,0.396445,7.071068e-01,0.68,T1.1.,0
2023-07-21 10:00:00,-3.614804,59.446942,1.554878,29.925000,0.0,25.113000,734.73850,3.183800,1.044950,0.553319,5.000000e-01,0.68,T1.1.,0
2023-07-21 11:00:00,-2.526165,58.095185,0.762195,31.916250,0.0,21.367500,863.03250,3.729250,1.068750,0.659422,2.588190e-01,0.68,T1.1.,0
2023-07-21 12:00:00,-3.653391,57.229654,0.000000,33.187500,0.0,18.329643,949.59750,4.158536,1.264286,0.739059,1.224647e-16,0.68,T1.1.,0
2023-07-21 13:00:00,-8.117500,56.622889,0.000000,34.270000,0.0,15.195000,988.50000,4.580750,1.341000,0.779942,-2.588190e-01,0.68,T1.1.,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-10 20:00:00,-12.285186,70.488051,0.000000,24.480000,0.0,53.927500,2.61250,1.438125,0.908125,0.060283,-8.660254e-01,0.49,T4.2.,6
2023-09-10 21:00:00,-11.066156,70.401948,0.000000,22.856250,0.0,65.673750,0.00000,0.963750,0.849750,0.036168,-7.071068e-01,0.49,T4.2.,6
2023-09-10 22:00:00,-10.182823,70.347922,0.000000,21.865000,0.0,70.256250,0.00000,0.780750,1.022750,0.036593,-5.000000e-01,0.49,T4.2.,6
2023-09-10 23:00:00,-9.456375,70.254918,0.000000,20.642917,0.0,71.587917,0.00000,0.694208,0.645583,0.027003,-2.588190e-01,0.49,T4.2.,6


In [11]:
train_scaled = train.copy()
train_scaled[numerical_features] = scaler_x.transform(train[numerical_features])
train_scaled['FP'] = scaler_y.transform(train[['FP']])
train_scaled

Unnamed: 0_level_0,FP,HR35_55,Riego,TMED,PREC,HR,RAD,DPV,VV,ETO,Hour_sin,Kc,ID,ID_encoded
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2023-07-21 09:00:00,0.897742,0.585796,0.000000,0.510532,0.0,0.250270,0.504765,0.322311,0.158033,0.465367,0.853553,1.000000,T1.1.,0
2023-07-21 10:00:00,0.898768,0.528868,0.258228,0.601499,0.0,0.194319,0.703530,0.412610,0.140469,0.654406,0.750000,1.000000,T1.1.,0
2023-07-21 11:00:00,0.935445,0.502404,0.126582,0.662310,0.0,0.152634,0.826375,0.484351,0.143718,0.782263,0.629410,1.000000,T1.1.,0
2023-07-21 12:00:00,0.897468,0.485460,0.000000,0.701132,0.0,0.118825,0.909263,0.540813,0.170408,0.878227,0.500000,1.000000,T1.1.,0
2023-07-21 13:00:00,0.747069,0.473581,0.000000,0.734190,0.0,0.083938,0.946513,0.596346,0.180879,0.927492,0.370590,1.000000,T1.1.,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-10 20:00:00,0.606657,0.745018,0.000000,0.435216,0.0,0.515004,0.002502,0.183008,0.121793,0.060282,0.066987,0.136364,T4.2.,6
2023-09-10 21:00:00,0.647727,0.743333,0.000000,0.385628,0.0,0.645732,0.000000,0.120615,0.113825,0.031224,0.146447,0.136364,T4.2.,6
2023-09-10 22:00:00,0.677487,0.742275,0.000000,0.355357,0.0,0.696732,0.000000,0.096546,0.137439,0.031735,0.250000,0.136364,T4.2.,6
2023-09-10 23:00:00,0.701961,0.740454,0.000000,0.318036,0.0,0.711553,0.000000,0.085163,0.085957,0.020179,0.370590,0.136364,T4.2.,6


In [12]:
test_scaled = test.copy()
test_scaled[numerical_features] = scaler_x.transform(test[numerical_features])
test_scaled.dropna(inplace=True)
test_scaled

Unnamed: 0_level_0,FP,HR35_55,Riego,TMED,PREC,HR,RAD,DPV,VV,ETO,Hour_sin,Kc,ID,ID_encoded
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2023-09-09 12:00:00,-13.954909,0.302180,0.603978,0.484776,0.0,0.469752,0.783719,0.218222,0.175732,0.673745,0.500000,0.136364,T1.1.,0
2023-09-09 13:00:00,-14.827903,0.321035,0.085443,0.524962,0.0,0.417720,0.814656,0.257715,0.218843,0.720025,0.370590,0.136364,T1.1.,0
2023-09-09 14:00:00,-16.106366,0.330065,0.000000,0.559081,0.0,0.376116,0.787266,0.295539,0.254069,0.717761,0.250000,0.136364,T1.1.,0
2023-09-09 15:00:00,-16.158395,0.335857,0.000000,0.604350,0.0,0.284437,0.682078,0.367737,0.268681,0.653454,0.146447,0.136364,T1.1.,0
2023-09-09 16:00:00,-16.440883,0.338778,0.000000,0.614517,0.0,0.250391,0.568098,0.391757,0.262505,0.556469,0.066987,0.136364,T1.1.,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-30 19:00:00,-15.735960,0.705606,0.000000,0.475931,0.0,0.282100,0.023339,0.288523,0.112728,0.102749,0.017037,0.000000,T4.2.,6
2023-09-30 20:00:00,-14.322500,0.707335,0.000000,0.367229,0.0,0.382496,0.000000,0.205647,0.078046,0.041786,0.066987,0.000000,T4.2.,6
2023-09-30 21:00:00,-12.811929,0.708917,0.000000,0.235710,0.0,0.541170,0.000000,0.114617,0.076927,0.027647,0.146447,0.000000,T4.2.,6
2023-09-30 22:00:00,-11.750043,0.709535,0.000000,0.176590,0.0,0.616225,0.000000,0.082736,0.064703,0.018974,0.250000,0.000000,T4.2.,6


In [13]:

# Assuming a sequence length of 12
sequence_length = 12
sequences = []
for obs_id in train_scaled['ID'].unique():
    obs_data = train_scaled[train_scaled['ID'] == obs_id]
    for i in range(len(obs_data) - sequence_length + 1):
        seq = obs_data.iloc[i:i + sequence_length]
        sequences.append(seq)

sequences

[                           FP   HR35_55     Riego      TMED  PREC        HR   
 DateTime                                                                      
 2023-07-21 09:00:00  0.897742  0.585796  0.000000  0.510532   0.0  0.250270  \
 2023-07-21 10:00:00  0.898768  0.528868  0.258228  0.601499   0.0  0.194319   
 2023-07-21 11:00:00  0.935445  0.502404  0.126582  0.662310   0.0  0.152634   
 2023-07-21 12:00:00  0.897468  0.485460  0.000000  0.701132   0.0  0.118825   
 2023-07-21 13:00:00  0.747069  0.473581  0.000000  0.734190   0.0  0.083938   
 2023-07-21 14:00:00  0.696243  0.463268  0.000000  0.755262   0.0  0.069832   
 2023-07-21 15:00:00  0.671406  0.450686  0.000000  0.780037   0.0  0.061207   
 2023-07-21 16:00:00  0.663177  0.437766  0.000000  0.776105   0.0  0.100117   
 2023-07-21 17:00:00  0.657693  0.420762  0.000000  0.729173   0.0  0.225416   
 2023-07-21 18:00:00  0.674666  0.405462  0.000000  0.678228   0.0  0.333930   
 2023-07-21 19:00:00  0.707853  0.392006

In [14]:
X_train = np.array([seq.drop(columns=['FP','ID']).values for seq in sequences])
y_train = np.array([seq['FP'].values[-1] for seq in sequences])

In [15]:
# Split data
X_features = X_train[:, :, :-1]  # numerical Features
X_obs_ids = X_train[:, :, -1]      # Observation_ID_encoded

## hyperparameter tuning

In [16]:
from keras_tuner import HyperModel
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Concatenate

class LSTMHyperModel(HyperModel):
    def build(self, hp):
        # Inputs
        numerical_input = Input(shape=(sequence_length, len(numerical_features)), name='Numerical_Input')
        categorical_input = Input(shape=(sequence_length,), name='Categorical_Input')
        
        # Embedding layer for categorical feature
        embedding_dim = hp.Int('embedding_dim', min_value=4, max_value=20, step=4)  # Tune embedding dimension
        num_observations = train_scaled['ID_encoded'].nunique()
        categorical_embedding = Embedding(input_dim=num_observations, 
                                          output_dim=embedding_dim, 
                                          input_length=sequence_length)(categorical_input)
        
        # Concatenate inputs
        concatenated = Concatenate()([numerical_input, categorical_embedding])
        
        # LSTM layers
        lstm_units_1 = hp.Int('lstm_units_1', min_value=16, max_value=128, step=8)  # Tune units in 1st LSTM
        lstm_out_1 = LSTM(lstm_units_1, activation="relu", return_sequences=True)(concatenated)

        lstm_units_2 = hp.Int('lstm_units_2', min_value=8, max_value=64, step=4)  # Tune units in 2nd LSTM
        lstm_out_2 = LSTM(lstm_units_2, activation="relu", return_sequences=False)(lstm_out_1)
        

        # Dense layer for final output
        output = Dense(1, activation='linear')(lstm_out_2)
        
        # Model
        model = Model(inputs=[numerical_input, categorical_input], outputs=output)
        
        # Compile
        learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')  # Tune learning rate
        model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), 
                      loss='mse', 
                      metrics=['mae'])
        
        return model

Using TensorFlow backend


In [17]:
X_features_train, X_features_val, X_obs_ids_train, X_obs_ids_val, y_train_train, y_train_val = train_test_split(X_features, X_obs_ids, y_train, test_size=0.2, shuffle=True, random_state=123)

In [18]:
from keras_tuner import RandomSearch, BayesianOptimization

# Initialize the tuner
tuner = BayesianOptimization(
    LSTMHyperModel(),
    objective='val_loss',  # Optimize for validation loss
    max_trials=20,         # Number of hyperparameter combinations to try
    executions_per_trial=1,  # Average results over 2 runs for each configuration
    directory='tuner_logs',  # Directory to save tuning logs
    project_name='lstm_hyperparameter_tuning_bayesian_noLags',
    seed=1234
)

tuner.search(
    x=[X_features_train, X_obs_ids_train],
    y=y_train_train,
    validation_data=([X_features_val, X_obs_ids_val], y_train_val),
    epochs=70,
    batch_size=32,
    #callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)]  # Early stopping
)

Reloading Tuner from tuner_logs\lstm_hyperparameter_tuning_bayesian_noLags\tuner0.json


In [19]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
Best hyperparameters:
- Embedding dimension: {best_hps.get('embedding_dim')}
- LSTM units (1st layer): {best_hps.get('lstm_units_1')}
- LSTM units (2nd layer): {best_hps.get('lstm_units_2')}
- Learning rate: {best_hps.get('learning_rate')}
""")


Best hyperparameters:
- Embedding dimension: 12
- LSTM units (1st layer): 128
- LSTM units (2nd layer): 64
- Learning rate: 0.001468734101447334



In [20]:
lstm_model = tuner.get_best_models()[0]
lstm_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 Categorical_Input (InputLa  [(None, 12)]                 0         []                            
 yer)                                                                                             
                                                                                                  
 Numerical_Input (InputLaye  [(None, 12, 11)]             0         []                            
 r)                                                                                               
                                                                                                  
 embedding (Embedding)       (None, 12, 12)               84        ['Categorical_Input[0][0]']   
                                                                                              

In [None]:
r2s = []
maes = []
cvrmses = []
for id in test_scaled['ID'].unique():

    # Get df of corresponding plot
    df_plot = test_scaled[test_scaled['ID'] == id]

    # create sequences
    sequences = []
    for obs_id in df_plot['ID'].unique():
        obs_data = df_plot[df_plot['ID'] == obs_id]
        for i in range(len(obs_data) - sequence_length + 1):
            seq = obs_data.iloc[i:i + sequence_length]
            sequences.append(seq)

    # Create input/output 
    X_test = np.array([seq.drop(columns=['FP','ID']).values for seq in sequences])
    y_test = np.array([seq['FP'].values[-1] for seq in sequences])
    
    # Split numerical/categorical input
    X_features = X_test[:, :, :-1]  # numerical Features
    X_obs_ids = X_test[:, :, -1]      # Observation_ID_encoded

    # Make predictions on the test set
    y_pred = lstm_model.predict([X_features, X_obs_ids])
    y_pred = scaler_y.inverse_transform(y_pred).ravel()
    
    # Convert the units bars to MPa
    y_pred = y_pred * 0.1
    y_test = y_test * 0.1

    # Metrics
    print('Plot:', id)
    r2 = np.corrcoef(y_test, y_pred)[0][1]**2
    r2s.append(r2)
    print('R2:', r2)
    mae = mean_absolute_error(y_true=y_test,y_pred=y_pred)
    maes.append(mae)
    print('MAE:', mae)
    cvrmse = CVRMSE(y_true=y_test,y_pred=y_pred)
    cvrmses.append(cvrmse)
    print('CVRMSE:', cvrmse)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df_plot.index, y=y_test,
                        name='Real', mode='lines'))
    fig.add_trace(go.Scatter(x=df_plot.index, y=y_pred,
                        name='Prediction', mode='lines'))
    fig.update_layout(
    xaxis=dict(
            title_text="Date",
        ),
    yaxis=dict(
            title_text="TWP (MPa)",
        )
    )
    fig.show()
    pio.write_image(fig, './images/LSTM-SinLags-'+id + '-pred_real.png',scale=3, width=900, height=400)

Plot: T1.1.
R2: 0.6122810790779061
MAE: 0.21131259304895594
CVRMSE: -23.174819021389723


Plot: T1.2.
R2: 0.7130239269972859
MAE: 0.1576997584031481
CVRMSE: -15.170354534862568


Plot: T2.1.
R2: 0.7475167400245447
MAE: 0.19259164302383883
CVRMSE: -15.673346098743245


Plot: T2.2.
R2: 0.7681874486339526
MAE: 0.20258548764410958
CVRMSE: -18.66044065488618


Plot: T3.1.
R2: 0.774559628476187
MAE: 0.11898100203707547
CVRMSE: -13.314193185673917


Plot: T4.1.
R2: 0.8047298776218753
MAE: 0.29710254651526985
CVRMSE: -31.677892991329248


Plot: T4.2.
R2: 0.86086521999382
MAE: 0.15467558441018914
CVRMSE: -16.882839789468836


In [24]:
# Overall metrics
print('R2:', np.mean(r2s))
print('MAE:',np.mean(maes))
print('CVRMSE:',np.mean(cvrmses))

R2: 0.7544519886893674
MAE: 0.19070694501179813
CVRMSE: -19.221983753764817
