In [1]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# Read recipe inputs
data = dataiku.Dataset("new_train_data")
df = data.get_dataframe()

In [2]:
import math

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import Sequence
from datetime import timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

import numpy as np
import pandas as pd
import time

import os

# Split en dataset de Train et Test

In [4]:
# Split into training, test datasets.
# Since it's timeseries we should do it by date.
test_cutoff_date = df['date'].max() - timedelta(days=7)

test_df = df[df['date'] > test_cutoff_date]
train_df = df[df['date'] <= test_cutoff_date]

#check out the datasets
print('Test dates: {} to {}'.format(test_df['date'].min(), test_df['date'].max()))
print('Train dates: {} to {}'.format(train_df['date'].min(), train_df['date'].max()))

Test dates: 2021-08-18 23:15:00+00:00 to 2021-08-25 23:00:00+00:00
Train dates: 2021-06-25 06:15:00+00:00 to 2021-08-18 23:00:00+00:00


In [5]:
train_df  = train_df.set_index('date')
test_df = test_df.set_index('date')

In [6]:
train = train_df
scalers={}
for i in train_df.columns:
    scaler = MinMaxScaler(feature_range=(-1,1))
    s_s = scaler.fit_transform(train[i].values.reshape(-1,1))
    s_s=np.reshape(s_s,len(s_s))
    scalers['scaler_'+ i] = scaler
    train[i]=s_s
test = test_df
for i in test_df.columns:
    scaler = scalers['scaler_'+i]
    s_s = scaler.transform(test[i].values.reshape(-1,1))
    s_s=np.reshape(s_s,len(s_s))
    scalers['scaler_'+i] = scaler
    test[i]=s_s

# Construction des mini-batch

In [7]:
def split_series(series, n_past, n_future):
  #
  # n_past ==> no of past observations
  #
  # n_future ==> no of future observations
  #
  X, y = list(), list()
  for window_start in range(len(series)):
    past_end = window_start + n_past
    future_end = past_end + n_future
    if future_end > len(series):
      break
    # slicing the past and future parts of the window
    past, future = series[window_start:past_end, :], series[past_end:future_end, :]
    X.append(past)
    y.append(future)
  return np.array(X), np.array(y)

In [8]:
n_past = 10
n_future = 5
n_features = 10

In [9]:
X_train, y_train = split_series(train.values,n_past, n_future)
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1],n_features))
y_train = y_train.reshape((y_train.shape[0], y_train.shape[1], n_features))
X_test, y_test = split_series(test.values,n_past, n_future)
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1],n_features))
y_test = y_test.reshape((y_test.shape[0], y_test.shape[1], n_features))

# Création du modèle de forecasting

In [10]:
encoder_inputs = tf.keras.layers.Input(shape=(n_past, n_features))
encoder = tf.keras.layers.LSTM(100, return_state=True)
encoder_outputs = encoder(encoder_inputs)

encoder_states = encoder_outputs[1:]

#
decoder_inputs = tf.keras.layers.RepeatVector(n_future)(encoder_outputs[0])

#
decoder = tf.keras.layers.LSTM(100, return_sequences=True)(decoder_inputs,initial_state = encoder_states)
decoder_outputs = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(n_features))(decoder)

#
model = tf.keras.models.Model(encoder_inputs,decoder_outputs)

#
model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


  tensor_proto.tensor_content = nparray.tostring()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 10, 10)]     0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 100), (None, 44400       input_1[0][0]                    
__________________________________________________________________________________________________
repeat_vector (RepeatVector)    (None, 5, 100)       0           lstm[0][0]                       
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 5, 100)       80400       repeat_vector[0][0]              
                                                                 lstm[0][1]                   

# Apprentissage du modèle

In [11]:
reduce_lr = tf.keras.callbacks.LearningRateScheduler(lambda x: 1e-3 * 0.90 ** x)
model.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.Huber(), metrics=[tf.keras.metrics.CosineSimilarity(), tf.keras.metrics.MeanAbsoluteError()])
history = model.fit(X_train,y_train,epochs=25,validation_data=(X_test,y_test),batch_size=32,verbose=0,callbacks=[reduce_lr])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


  tensor_proto.tensor_content = nparray.tostring()
  tensor_proto.tensor_content = nparray.tostring()


In [12]:
print(history.history.keys())
history.history["val_mean_absolute_error"]

data_metrics = {}

for key, val in history.history.items():
    data_metrics[key] = [val[-1]]

print(data_metrics)

dict_keys(['loss', 'cosine_similarity', 'mean_absolute_error', 'val_loss', 'val_cosine_similarity', 'val_mean_absolute_error', 'lr'])
{'loss': [0.0006516715799273057], 'cosine_similarity': [0.99830264], 'mean_absolute_error': [0.011575467], 'val_loss': [0.003760342661752573], 'val_cosine_similarity': [0.99280125], 'val_mean_absolute_error': [0.023268443], 'lr': [7.976644e-05]}


In [13]:
for index,i in enumerate(train_df.columns):
    scaler = scalers['scaler_'+i]
    y_train[:,:,index]=scaler.inverse_transform(y_train[:,:,index])
    y_test[:,:,index]=scaler.inverse_transform(y_test[:,:,index])

# Stockage du modèle dans le folder

In [14]:
model_json = model.to_json()

In [15]:
# Write recipe outputs
model_folder = dataiku.Folder("dHoUQGRB")
model_folder_info = model_folder.get_info()

now = time.time()

model_folder.write_json(str(now)+"/model_json", model_json)

In [16]:
model_folder.list_paths_in_partition()

['/1630336795.623466/model_json',
 '/1630336889.8789692/model_json',
 '/1630338570.2025137/model_json',
 '/1630338600.8940783/model_json',
 '/1630338765.6485183/model_json',
 '/1630339593.7991953/model_json',
 '/1630340919.9128914/model_json',
 '/1630397807.9655938/model_json',
 '/actual/model_json']

# Stockage des métriques dans un dataframe

In [0]:
metrics = dataiku.Dataset("Metrics")
df_metrics = metrics.get

In [48]:
if data_metrics["val_mean_absolute_error"] < df_metrics[df_metrics["used"]==True]["val_mean_absolute_error"].all():
    
    data_metrics["used"] = ["True"]
    df_metrics = df_metrics["used"].where(df_metrics["used"]=="True", "False")
    model_folder.write_json("actual/model_json", model_json)
else:
    
    data_metrics["used"] = ["False"]
    
df_metrics = df_metrics.append(pd.DataFrame(data_metrics))

  res_values = method(rvalues)


In [49]:
df_metrics

Unnamed: 0,0,cosine_similarity,loss,lr,mean_absolute_error,time,used,val_cosine_similarity,val_loss,val_mean_absolute_error
0,False,,,,,,,,,
0,,0.998303,0.000652,8e-05,0.011575,1630398000.0,True,0.992801,0.00376,0.023268


In [0]:
metrics.write_with_schema(df_metrics)

In [0]:
model_folder.write_json("actual/model_json", model_json)