In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [None]:
# Load the sample data set; the file is semicolon-delimited.
df = pd.read_csv('./data/sample.csv',sep=';')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Column holding one sequence per row, stored as a '|'-separated string
# (it is split on '|' when the training windows are built).
seqs = df['PARCOURS_BENEF_ID']
seqs

In [None]:
# Parameters
# history_depth: number of consecutive tokens used as model input when
# predicting the token that follows (i.e. the sliding-window length).
history_depth = 1

In [None]:
# Create sequences
#
# Every '|'-separated sequence is cut into sliding windows: `history_depth`
# consecutive tokens form the input (NX) and the token right after them is the
# target (NY). PAD stands for "before the start" and "after the end" of a
# sequence. It is a string on purpose: the tokens are strings, and padding with
# the integer 0 only worked because numpy silently coerced it to '0'.
PAD = '0'

NX=[]
NY=[]
Nseq = []

# Missing values (NaN) cannot be split, so those rows are skipped; astype(str)
# protects against cells that pandas parsed as numbers instead of text.
for idx,row in seqs.dropna().astype(str).items():
    seq = row.split('|')
    Nseq.append(seq)

    # Left-pad so the first real token is predicted from an all-PAD history,
    # and append one PAD so the end of the sequence is a target as well.
    padded = [PAD] * history_depth + seq + [PAD]
    for x in range(len(padded)-history_depth):
        NX.append(padded[x:x+history_depth])
        NY.append(padded[x+history_depth])

X = np.array(NX)
Y = np.array(NY)

In [None]:
# Create csv file for CPT algo trial
# Opened in 'w' mode: with 'a', every re-run of this cell appended the whole
# data set again and silently duplicated the sequences in the export.
with open('./data/output_seqs.csv','w') as f:
    # One line per sequence, tokens separated by commas.
    f.writelines(','.join(row) + '\n' for row in Nseq)
        

In [None]:
# Dump the input windows X to CSV: one window per row, no header, no index column.
pd.DataFrame(X).to_csv("./data/output_seqs2.csv",header=False,index=False)

In [None]:
# Each row of X is one input window, so the row count is the number of samples.
n_records = len(X)
print("%d sequences generated" % n_records)

In [None]:
# Vocabulary: the sorted distinct tokens found in X, each mapped to its rank,
# which serves as the integer code of that category.
cat = np.unique(X)
nb_cat = cat.size
cat_to_int = {c: i for i, c in enumerate(cat)}

In [None]:
# Display the category -> integer code mapping.
cat_to_int

In [None]:
# Encode X and Y as integer codes using the dictionary (arrays are flattened first)
XX = [cat_to_int[tok] for tok in X.ravel()]
YY = [cat_to_int[tok] for tok in Y.ravel()]

In [None]:
# Arrange the codes as (samples, history_depth, 1) and divide by the number of
# categories, which scales every code into the range [0, 1).
X = np.asarray(XX).reshape(-1, history_depth, 1) / float(nb_cat)

In [None]:
# one hot encode Y
# `keras.utils.np_utils` is no longer available in recent Keras releases, while
# `to_categorical` is exported from `keras.utils` directly in old and new ones.
from keras.utils import to_categorical
# Pin the width to nb_cat so it always matches the Dense(nb_cat) output layer,
# even if the highest category code never occurs as a target.
y = to_categorical(YY, num_classes=nb_cat)

In [None]:
# Start from a clean slate: drop the reference to any model left over from a
# previous run and reset the global state Keras keeps between models.
from keras.backend import clear_session

model = None
clear_session()

In [None]:
# Define model
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

# Two stacked LSTM layers followed by a softmax over the nb_cat categories.
model = Sequential()

# The first LSTM returns the whole sequence so that the second LSTM receives
# one vector per time step rather than only the final state.
model.add(LSTM(100,input_shape=(history_depth,1),return_sequences=True))
model.add(LSTM(50))
model.add(Dropout(0.2))
model.add(Dense(nb_cat, activation='softmax'))
# categorical_crossentropy goes with the one-hot encoded targets.
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# added a stray "None" line below the table.
model.summary()

In [None]:
# Train model
from keras.callbacks import EarlyStopping
# Stop once val_loss has not improved for 3 consecutive epochs and roll the
# model back to its best epoch. Without restore_best_weights the model keeps
# the weights of the last, already degraded, epoch, and those are the weights
# that end up being saved and used for prediction.
es = EarlyStopping(monitor='val_loss', mode='min', patience=3, verbose=1,
                   restore_best_weights=True)

# validation_split=0.2 holds out 20% of the samples for the val_loss monitor.
history = model.fit(X,y,epochs=10, batch_size=128, validation_split=0.2, callbacks=[es])

In [None]:
# Learning curves: one line per metric recorded in history.history, with a grid.
pd.DataFrame(history.history).plot(figsize=(8, 5)).grid(True)
plt.show()

In [None]:
# save cat_dict
filename = './cat_dict.pkl'
# The context manager guarantees the file is flushed and closed; the bare
# open() handed to pickle.dump left the handle open until garbage collection.
with open(filename, 'wb') as f:
    pickle.dump(cat_to_int, f)

# Save LSTM model
model.save('./model.h5')

In [None]:
# Show the window length currently in use.
print(history_depth)

In [None]:
# Sample sequence used to predict the next element; every token must be a key
# of cat_to_int, because the preprocessing step looks each one up there.
xtest = ['26','18','0']

In [None]:
# preprocess
# The network takes exactly `history_depth` tokens per sample. Keep only the
# most recent ones and left-pad with the '0' boundary token when xtest is
# shorter; otherwise the reshape below fails whenever
# len(xtest) != history_depth.
xtest_window = (['0'] * history_depth + list(xtest))[-history_depth:]
# Same encoding and scaling as applied to the training inputs.
xtest_dict = np.array([cat_to_int[i] for i in xtest_window])
xtest_norm = xtest_dict / float(len(cat_to_int))
x = xtest_norm.reshape((1,history_depth,1))

In [None]:
# prediction: model output for the single preprocessed sample x
p = model.predict(x)

In [None]:
# Reverse dictionary: integer code -> category label
int_to_cat = dict(zip(cat_to_int.values(), cat_to_int.keys()))

# Decode the most probable class and report its probability as a percentage
print('next in sequence: [%s] -- with probability %.1f %%' % (int_to_cat[p.argmax()], p.max()*100))

In [None]:
# Class index with the highest predicted probability.
# Sequential.predict_classes() was removed from Keras (TensorFlow >= 2.6); the
# argmax over predict() yields the same indices for a softmax output layer.
np.argmax(model.predict(x), axis=-1)