# RNN : LSTM


In [None]:
# import
import os
import pickle

import numpy as np

from keras.callbacks import EarlyStopping
from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Sequential
from keras.optimizers import Adam
from keras.utils import set_random_seed

from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold


# Reproducibility: seed Python's `random`, NumPy and the Keras backend in a
# single call. Seeding NumPy alone leaves the backend generator unseeded, so
# weight initialisation and dropout masks would still change between runs.
set_random_seed(42)

In [14]:
# Choose which window size this run uses, so the rest of the notebook does not
# have to be duplicated for each size.
taille_fenetre_to_run = "courte"

# Validate with an explicit exception instead of `assert`: assertions are
# removed when Python runs with -O, which would let a typo through silently.
if taille_fenetre_to_run not in ("courte", "moyenne", "longue"):
    raise ValueError(
        "taille_fenetre_to_run must be 'courte', 'moyenne' or 'longue', "
        f"got {taille_fenetre_to_run!r}"
    )

In [15]:
# Pick the dataset file that matches the selected window size.
if taille_fenetre_to_run == "courte":
    data_path = "Data/donnees_courte.pkl"
elif taille_fenetre_to_run == "moyenne":
    data_path = "Data/donnees_moyenne.pkl"
else:
    data_path = "Data/donnees_longue.pkl"

# NOTE(review): unpickling can execute arbitrary code — only load files that
# were produced by this project.
# The context manager closes the file handle even if unpickling fails.
with open(data_path, "rb") as data_file:
    data = pickle.load(data_file)

In [16]:
# Display the keys available in the loaded `data` mapping.
data.keys()

dict_keys(['X_np_label', 'X_np_binary', 'y_np', 'X_df_label', 'X_df_binary', 'y_df'])

In [17]:
# Pull the two entries used for the rest of the notebook out of `data`:
# the model inputs (`X_np_binary`) and the targets (`y_np`).
X_np_binary, y_np = data["X_np_binary"], data["y_np"]

In [19]:
# Check that every value of X_np_binary lies between 0 and 1 (inclusive).
# One vectorised comparison replaces the element-by-element triple loop and
# works for any number of dimensions; NaN values still fail the check because
# both comparisons evaluate to False for them.
assert ((X_np_binary >= 0) & (X_np_binary <= 1)).all(), (
    "X_np_binary contains values outside [0, 1]"
)

In [6]:
def create_lstm(input_shape, units=100, dropout_rate=0.2, activation='tanh', learning_rate=0.001):
    """Build and compile a single-layer LSTM regression model.

    Args:
        input_shape: Shape of one sample without the batch axis. The caller in
            this notebook passes ``(X.shape[1], X.shape[2])``.
        units: Number of LSTM units. A larger value lets the model capture more
            complex relationships, at a higher computational cost.
        dropout_rate: Rate given to the ``Dropout`` layer placed after the
            LSTM to limit overfitting.
        activation: Activation function of the LSTM layer (``tanh`` is the
            classic choice for LSTMs).
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        A compiled ``Sequential`` model with a single output unit, optimised
        with Adam on the mean squared error.
    """
    model = Sequential([
        # Declare the input with an explicit Input layer rather than passing
        # `input_shape=` to the LSTM: Keras 3 deprecates that argument on
        # layers and warns about it every time the model is built.
        Input(shape=input_shape),
        LSTM(units, activation=activation),
        # Regularisation against overfitting.
        Dropout(dropout_rate),
        # Single linear unit: the model predicts one scalar per sample.
        Dense(1),
    ])

    # MSE is the usual loss for a regression problem.
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')

    return model

In [7]:
# Early-stopping callback shared by the training runs: it watches the
# validation loss with a patience of 10 epochs and is configured to restore
# the best weights.
early_stopping = EarlyStopping(
    restore_best_weights=True,
    patience=10,
    monitor='val_loss',
)

In [8]:
# K-fold cross-validation of the LSTM: for each fold a fresh model is trained
# on the training part, evaluated on the held-out part and saved to disk.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# One entry per fold; these lists are read by the reporting cells below.
mse_scores = []
mae_scores = []
mape_scores = []
rmse_scores = []

# Per-sample shape handed to the model: axes 1 and 2 of the input array.
input_shape = (X_np_binary.shape[1], X_np_binary.shape[2])

# Create the output directory up front so that a missing folder cannot make
# `model.save` fail after a fold has already been trained.
os.makedirs("Models", exist_ok=True)

for fold, (train_index, test_index) in enumerate(kf.split(X_np_binary)):
    print(f"Running fold {fold+1}/{n_splits}")

    X_train, X_test = X_np_binary[train_index], X_np_binary[test_index]
    y_train, y_test = y_np[train_index], y_np[test_index]

    # A new model per fold so no weights leak from one fold to the next.
    model = create_lstm(input_shape)

    # 20 % of the training part is held out as validation data; it provides
    # the `val_loss` that the early-stopping callback monitors.
    model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, callbacks=[early_stopping], verbose=1)

    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    mse_scores.append(mse)
    mae_scores.append(mae)
    mape_scores.append(mape)
    rmse_scores.append(rmse)

    # The file name uses the 0-based fold index (the progress message above
    # is 1-based).
    model.save(f"Models/lstm_{taille_fenetre_to_run}_fold_{fold}.h5")

Running fold 1/5
Epoch 1/100


  super().__init__(**kwargs)


[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 7782.5713 - val_loss: 5624.9023
Epoch 2/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 5164.4272 - val_loss: 4255.4058
Epoch 3/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 3894.7910 - val_loss: 3202.9841
Epoch 4/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 2914.9492 - val_loss: 2376.8118
Epoch 5/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 2153.5349 - val_loss: 1734.1029
Epoch 6/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 1576.5184 - val_loss: 1241.5673
Epoch 7/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 1107.2367 - val_loss: 872.1495
Epoch 8/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 774.7626 - val_loss: 600.3729
Epoch 9



Running fold 2/5
Epoch 1/100


  super().__init__(**kwargs)


[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 7705.2129 - val_loss: 5599.2354
Epoch 2/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 5156.5781 - val_loss: 4235.5513
Epoch 3/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 3871.4385 - val_loss: 3192.1313
Epoch 4/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 2918.1467 - val_loss: 2373.0215
Epoch 5/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 2155.5688 - val_loss: 1734.9990
Epoch 6/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 1572.1409 - val_loss: 1245.6970
Epoch 7/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 1125.5126 - val_loss: 876.8120
Epoch 8/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 792.0654 - val_loss: 605.0386
Epoch 9



Running fold 3/5
Epoch 1/100


  super().__init__(**kwargs)


[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 7686.6807 - val_loss: 5495.7354
Epoch 2/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 5044.0293 - val_loss: 4144.5303
Epoch 3/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 3788.7913 - val_loss: 3102.5352
Epoch 4/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 2825.9617 - val_loss: 2290.8752
Epoch 5/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 2072.8669 - val_loss: 1662.6162
Epoch 6/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - loss: 1505.1759 - val_loss: 1184.4567
Epoch 7/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 1066.5900 - val_loss: 826.9332
Epoch 8/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 737.5116 - val_loss: 566.1404
Epoch 9



Running fold 4/5
Epoch 1/100


  super().__init__(**kwargs)


[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 7659.5854 - val_loss: 5606.9321
Epoch 2/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 5146.4556 - val_loss: 4265.4053
Epoch 3/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 3916.0095 - val_loss: 3216.8020
Epoch 4/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 2912.5278 - val_loss: 2394.8284
Epoch 5/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 2172.0869 - val_loss: 1753.6310
Epoch 6/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 1584.5419 - val_loss: 1260.7627
Epoch 7/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 1137.1929 - val_loss: 889.4667
Epoch 8/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 788.7096 - val_loss: 615.5594
Epoch 9



Running fold 5/5
Epoch 1/100


  super().__init__(**kwargs)


[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 7663.4536 - val_loss: 5364.7827
Epoch 2/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 4924.0342 - val_loss: 4034.5632
Epoch 3/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 3701.8228 - val_loss: 3015.0259
Epoch 4/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 2769.1335 - val_loss: 2219.7212
Epoch 5/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 2008.9950 - val_loss: 1606.1367
Epoch 6/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 1459.2872 - val_loss: 1139.1721
Epoch 7/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 1028.5632 - val_loss: 792.1630
Epoch 8/100
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 712.6514 - val_loss: 539.8787
Epoch 9



In [9]:
# Report each metric averaged over the cross-validation folds, one per line.
print(
    f"Mean MSE: {np.mean(mse_scores)}",
    f"Mean MAE: {np.mean(mae_scores)}",
    f"Mean MAPE: {np.mean(mape_scores)}",
    f"Mean RMSE: {np.mean(rmse_scores)}",
    sep="\n",
)

Mean MSE: 17.42037830836697
Mean MAE: 2.764274539932343
Mean MAPE: 0.02997391833519527
Mean RMSE: 3.5595542960126956


Le MSE mesure l'erreur quadratique moyenne entre les prédictions et les vraies valeurs. Plus cette valeur est faible, mieux c'est. Ici, une moyenne de 17.42 semble élevée.

Le MAE mesure l'erreur absolue moyenne ; il est plus facile à interpréter que le MSE car il s'exprime dans la même unité que la cible. Un MAE d'environ 2.76 signifie que les prédictions s'écartent en moyenne de 2.76 unités des vraies valeurs, soit environ 2.76 points de pourcentage de SOH.

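Le MAPE est une mesure relative : la moyenne des erreurs absolues rapportées aux vraies valeurs. scikit-learn la renvoie sous forme de fraction et non de pourcentage ; la valeur 0.030 obtenue ici correspond donc à une erreur moyenne d'environ 3 %, ce qui est raisonnable.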

Le RMSE est la racine carrée du MSE ; il s'exprime dans la même unité que la cible et est plus sensible aux grandes erreurs que le MAE. Une valeur moyenne de 3.56 peut être acceptable.

In [10]:
# Show the per-fold values of every metric, one metric per line.
print(
    f'MSE scores: {mse_scores}',
    f'MAE scores: {mae_scores}',
    f'MAPE scores: {mape_scores}',
    f'RMSE scores: {rmse_scores}',
    sep="\n",
)

MSE scores: [26.695000867637678, 28.15828992730544, 30.620667568082734, 0.6175233503730538, 1.0104098284359413]
MAE scores: [4.07977892255286, 4.1658830077216065, 4.214898347605092, 0.6006953167391171, 0.760117105043041]
MAPE scores: [0.04408459509317971, 0.04519266061626506, 0.04605803489568272, 0.00642714095196455, 0.0081071601188843]
RMSE scores: [np.float64(5.166720513791866), np.float64(5.306438535148168), np.float64(5.53359445280215), np.float64(0.7858265396212155), np.float64(1.0051914387000822)]


Les scores sont très variables entre les différentes itérations de validation croisée :
- Les deux derniers folds ont des erreurs nettement plus faibles (MSE d'environ 0.6 et 1.0, contre 27 à 31 pour les trois premiers) : la performance y est donc bien meilleure.

Les faibles scores de certains folds montrent que le modèle a très bien fonctionné sur ces splits. En revanche, les scores élevés des autres folds indiquent soit un surapprentissage, soit une distribution des données très variable d'un split à l'autre.

In [11]:
# scores = []

# param_grid = {
#     'model__units': [50, 100, 200],
#     'dropout_rate': [0.2, 0.3, 0.5],
#     'model__activation': ['tanh', 'relu'],
#     'learning_rate': [0.001, 0.01, 0.1], 
#     'batch_size': [32, 64, 128], 
#     'epochs': [100, 200, 300]
# }

# for i, split in enumerate(run_split):
#     print(f"Split_{i}")

#     X_train, y_train = split['X_train_np_binary'], split['y_train_np']
#     X_test, y_test = split['X_test_np_binary'], split['y_test_np']

#     input_shape = X_train.shape[1:]
#     print(f'Input shape: {input_shape}')

#     model = KerasRegressor(build_fn=create_lstm, input_shape=input_shape, verbose=1) # , units=100, dropout_rate=0.2, activation='tanh', learning_rate=0.001, epochs=100, batch_size=32, verbose=1)
#     # model = create_lstm(input_shape)

#     gridsearch = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1) # attention j'ai déjà de la validation croisée moi, donc pas de cv ici  

#     gridsearch.fit(X_train, y_train, validation_data=(X_test, y_test), verbose=1) # callbacks=[early_stopping], verbose=1)

#     # model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, callbacks=[early_stopping], verbose=1)

#     # model_filename = f"Data/lstm_model_{taille_fenetre_to_run}_split_{i}.h5"
#     # model.save(model_filename)

#     # score = model.evaluate(X_test, y_test, verbose=0)
#     # scores.append(score)

#     print(f"Best params: {gridsearch.best_params_}")
#     print(f"Best score: {gridsearch.best_score_}")

#     best_model = gridsearch.best_estimator_
#     best_model_filename = f"Data/lstm_model_{taille_fenetre_to_run}_split_{i}.h5"
#     best_model.model.save(best_model_filename)

#     score = best_model.score(X_test, y_test)
#     scores.append(score)

In [12]:

# random forest : label encoding
# régression linéaire : binary encoding