## Import

In [None]:
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
import pickle
from gensim.models.phrases import Phrases, Phraser

from keras.preprocessing.sequence import pad_sequences
import sys
sys.path.append('../../..')
from smartFAQ.src.tokenPad import tokenization_padding
import os



## Constantes

In [None]:
# --- Training hyper-parameters ---
BATCH_SIZE = 64
EPOCHS = 10
LSTM_NODES = 256  # LSTM width; also reused as the decoder embedding size

# --- Corpus / vocabulary limits ---
NUM_SENTENCES = 2000  # starting small for the first iteration
MAX_SENTENCE_LENGTH = 50
MAX_NUM_WORDS = 20000  # vocabulary cap handed to the tokenizers
EMBEDDING_SIZE = 100  # width of the GloVe vectors loaded further down

## Data

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): pickle.load can run arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Train / validation splits, features (X_*) and labels (y_*).
X_train = _load_pickle('../../data/x_train.pickle')
y_train = _load_pickle('../../data/y_train.pickle')
X_val = _load_pickle('../../data/x_val.pickle')
y_val = _load_pickle('../../data/y_val.pickle')

## Data processing

In [None]:
# Encoder-side texts: the 'answer' column of the training split.
input_sentences = X_train['answer']
# Decoder-side texts: the 'answer' column of the validation split.
# NOTE(review): inputs and outputs come from different splits, so row i of one
# is not obviously paired with row i of the other — confirm this is intended.
output_sentences = X_val['answer']

## Tokenization et Padding

In [None]:
# Fit a word-level tokenizer on the encoder texts, then turn every text into
# a list of integer word ids.
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(input_sentences)
input_integer_seq = input_tokenizer.texts_to_sequences(input_sentences)

# Word -> id mapping learned by the tokenizer.
word2idx_inputs = input_tokenizer.word_index
print('Total unique words in the input: %s' % len(word2idx_inputs))

# Longest encoded text; reused below as the encoder padding length.
max_input_len = max(map(len, input_integer_seq))
print("Length of longest sentence in input: %g" % max_input_len)

Total unique words in the input: 30369
Length of longest sentence in input: 1688


In [None]:
# Display the decoder-side texts (rich repr of the whole object).
output_sentences

402052    Tkinter import class Application Frame def ini...
840653    generate generator version map map lambda x x ...
936066    In Python Python use bisect written comments R...
370840    edit I 'll leave code I guess codec problem I ...
311412    usr bin env python says script run Python inte...
                                ...                        
9736      I wanted small clean also explicit version bas...
448661    If want keep existing format whatever reason t...
626232    Apparently cb solids set edgecolor face culpri...
416449    Fixed Had missed part earlier code file stream...
599966    I first time hours ago What turn python script...
Name: answer, Length: 10000, dtype: object

In [None]:
# Fit a separate tokenizer on the decoder texts. filters='' means no
# character is stripped before splitting on this side.
output_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
output_tokenizer.fit_on_texts(output_sentences)

# Integer-encode every decoder text.
output_integer_seq = output_tokenizer.texts_to_sequences(output_sentences)

# Word -> id mapping learned by the tokenizer (contains every word seen).
word2idx_outputs = output_tokenizer.word_index
print('Total unique words in the output: %s' % len(word2idx_outputs))

# word_index lists the full vocabulary, but because the tokenizer was built
# with num_words=MAX_NUM_WORDS, texts_to_sequences only emits ids below that
# cap. Size the decoder vocabulary accordingly (same rule as `num_words` on
# the encoder side) so the layers are not built for ids that never occur.
num_words_output = min(MAX_NUM_WORDS, len(word2idx_outputs) + 1)
# Longest encoded text; reused below as the decoder padding length.
max_out_len = max(len(sen) for sen in output_integer_seq)
print("Length of longest sentence in the output: %g" % max_out_len)

Total unique words in the output: 29972
Length of longest sentence in the output: 2434


In [None]:
# Show the length of the longest decoder-side sequence.
max_out_len

2434

In [None]:
# Pad every encoder sequence to the length of the longest one (no `padding`
# argument is given, so the pad_sequences default side is used).
encoder_input_sequences = pad_sequences(input_integer_seq, maxlen=max_input_len)
print("encoder_input_sequences.shape:", encoder_input_sequences.shape)
# Spot-check one row; the label names the row that is actually printed.
print("encoder_input_sequences[72]:", encoder_input_sequences[72])

encoder_input_sequences.shape: (10000, 1688)
encoder_input_sequences[172]: [  0   0   0 ... 135 183  86]


In [None]:
# Pad every decoder sequence at the end ('post') up to the longest one.
decoder_input_sequences = pad_sequences(output_integer_seq, maxlen=max_out_len, padding='post')
print("decoder_input_sequences.shape:", decoder_input_sequences.shape)
# Spot-check one row; the label names the row that is actually printed.
print("decoder_input_sequences[72]:", decoder_input_sequences[72])

decoder_input_sequences.shape: (10000, 2434)
decoder_input_sequences[172]: [ 457   37 4819 ...    0    0    0]


In [None]:
# Same value as the MAX_NUM_WORDS assigned in the constants cell
# (redundant re-assignment).
MAX_NUM_WORDS = 20000
# Length cap referenced by the commented-out tokenization_padding calls below.
MAX_LEN = 3000

In [None]:
# x_train_pad = tokenization_padding(X_train, 'answer', ['question'], MAX_NUM_WORDS, MAX_LEN)
# x_train_pad = tokenization_padding(X_train, 'answer', [], MAX_NUM_WORDS, MAX_LEN)

In [None]:
# x_val_pad = tokenization_padding(X_val, 'answer', ['question'], MAX_NUM_WORDS, MAX_LEN)
#x_val_pad = tokenization_padding(X_val, 'answer', [], MAX_NUM_WORDS, MAX_LEN)

## Word Embedding

In [None]:
from numpy import array
from numpy import asarray
from numpy import zeros

# Lookup table built from the pre-trained GloVe file: word -> float32 vector.
embeddings_dictionary = dict()

# Each line of the file is "<word> <v1> <v2> ...". The context manager
# guarantees the file is closed even if parsing a line raises.
with open(r'../../data/glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Blank line: nothing to parse.
            continue
        # First token is the word itself...
        word = records[0]
        # ...and the remaining tokens are the vector components.
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [None]:
# Number of entries in the encoder-side word index.
len(word2idx_inputs)

30369

In [None]:
# Check which container type the word index is.
type(word2idx_inputs)

dict

In [None]:
# Rows of the embedding matrix: vocabulary size plus one, capped at
# MAX_NUM_WORDS.
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
# Start from an all-zero matrix; words without a GloVe vector keep a zero row.
embedding_matrix = zeros((num_words, EMBEDDING_SIZE))

for word, index in word2idx_inputs.items():
    # Filter on the id itself rather than slicing the first N items: Keras
    # word_index ids start at 1, so the N-th item carries id N, which is one
    # past the last valid row of an N-row matrix.
    if index >= num_words:
        continue
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

## Model

In [None]:
# Allocate the all-zero one-hot target tensor with np.zeros.
# Its axes must line up with what the decoder emits:
#   axis 0 - one entry per decoder sequence,
#   axis 1 - max_out_len time steps (the decoder padding length),
#   axis 2 - num_words_output classes (the width of the softmax layer).
# NOTE(review): this dense array is enormous. With the sizes printed earlier
# in this notebook (10000 texts, 2434 steps, a 20000+ word vocabulary) it is
# on the order of terabytes in float32. Consider integer targets with
# 'sparse_categorical_crossentropy', or truncating the sequences.
decoder_targets_one_hot = np.zeros(
    (len(output_integer_seq), max_out_len, num_words_output),
    dtype='float32',
)

In [None]:
# Padded id sequences used as decoder targets.
# NOTE(review): this repeats the pad_sequences call that builds
# decoder_input_sequences, so the targets equal the decoder inputs position
# by position — confirm whether a one-step shift was intended.
decoder_output_sequences = pad_sequences(output_integer_seq, maxlen=max_out_len, padding='post')

# Set targets[i, t, word_id] = 1 for every time step t of every sequence i.
# One indexed assignment per row replaces the per-element Python loop, and no
# value is printed from inside the loop (that flooded the cell output).
for i, d in enumerate(decoder_output_sequences):
    decoder_targets_one_hot[i, np.arange(len(d)), d] = 1

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


KernelInterrupted: Execution interrupted by the Jupyter kernel.

NameError: name 'pad_sequences' is not defined

In [None]:
# Inspect the one-hot target matrix of the first sample.
decoder_targets_one_hot[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

## Encoder

In [None]:
# Show the configured LSTM width.
print(LSTM_NODES)

256


In [None]:
# Encoder input: one padded sequence of word ids per sample.
encoder_inputs_placeholder = Input(shape=(max_input_len,))
# Embedding initialised from the GloVe matrix. Both dimensions are read from
# the matrix itself so the layer always matches the weights it is given, even
# when the vocabulary is smaller than MAX_NUM_WORDS.
x = Embedding(
    embedding_matrix.shape[0],
    embedding_matrix.shape[1],
    weights=[embedding_matrix],
    input_length=max_input_len,
)(encoder_inputs_placeholder)
# return_state=True makes the LSTM also return its final hidden and cell states.
encoder = LSTM(LSTM_NODES, return_state=True)

encoder_outputs, h, c = encoder(x)
# The final states are passed to the decoder as its initial state.
encoder_states = [h, c]

## Decoder

In [None]:
# Show the decoder sequence length used for the decoder input below.
max_out_len

2434

In [None]:
# An Embedding layer is used once more, this time on the decoder side.
decoder_inputs_placeholder = Input(shape=(max_out_len,))

decoder_embedding = Embedding(num_words_output, LSTM_NODES)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

# The decoder LSTM starts from the encoder's final states and returns an
# output at every time step; its own final states are discarded.
decoder_lstm = LSTM(LSTM_NODES, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_x, initial_state=encoder_states)

## Fully connected final pour générer la prédiction
On rajoute une couche dense après le décodeur

In [None]:
# Softmax layer with num_words_output units, applied to the decoder outputs.
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Reconstruction

In [None]:
# The model takes two inputs, the encoder sequence and the decoder input
# sequence, and produces the decoder's per-step softmax output.
model = Model(
    inputs=[encoder_inputs_placeholder, decoder_inputs_placeholder],
    outputs=decoder_outputs,
)

In [None]:
# RMSprop optimiser, categorical cross-entropy against the one-hot targets,
# and accuracy as the reported metric.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

### Affichage du modèle 

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1688)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 2434)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1688, 100)    2000000     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 2434, 256)    7673088     input_2[0][0]                    
_______________________________________________________________________________________

# Entraînement

In [None]:
# Display the one-hot target array (very large; `.shape` is usually enough).
decoder_targets_one_hot

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [None]:
# The two model inputs, in the order the Model was declared with:
# encoder sequences first, decoder input sequences second.
training_inputs = [encoder_input_sequences, decoder_input_sequences]

# Train against the one-hot targets, holding out 10% of the samples for
# validation; the value returned by fit is kept in `r`.
r = model.fit(
    training_inputs,
    decoder_targets_one_hot,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
)

NameError: name 'encoder_input_sequences' is not defined

KernelInterrupted: Execution interrupted by the Jupyter kernel.

KernelInterrupted: Execution interrupted by the Jupyter kernel.

In [None]:
# © Mouna DAHAMANI 2021

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e80043e2-6875-4b65-a196-a0ffb97a1282' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>