In [None]:
import pandas as pd
import pickle
import tensorflow as tf
import numpy as np

from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import EarlyStopping

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, LSTM, Dense, Activation, Conv1D, Flatten, Dropout, MaxPooling1D

import sys
sys.path.append('../../..')
from smartFAQ.src.tokenPad import tokenization_padding

import os

from keras.models import Sequential
print('Setup is ready')

# Dense model on Doc2Vec document vectors
---
## Load Data

In [None]:
def _read_pickle(path):
    """Return the object deserialized from the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only open trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Inputs (X_*) and targets (y_*) for the training and validation splits.
X_train = _read_pickle('../../data/x_train.pickle')
y_train = _read_pickle('../../data/y_train.pickle')
X_val = _read_pickle('../../data/x_val.pickle')
y_val = _read_pickle('../../data/y_val.pickle')

## Tokenization & Padding

In [None]:
# Settings handed to the project's tokenization_padding helper.
MAX_NUM_WORDS = 20000
MAX_LEN = 3000

# Identical call for both splits: the 'answer' column, an empty list as the
# third argument, then the two limits above.
# NOTE(review): x_train_pad / x_val_pad are not referenced again in the cells
# visible below -- confirm they are still needed.
x_train_pad, x_val_pad = [
    tokenization_padding(split, 'answer', [], MAX_NUM_WORDS, MAX_LEN)
    for split in (X_train, X_val)
]

## Keras Model

In [None]:
import gensim
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from gensim.models.phrases import Phrases, Phraser

In [None]:
# Unpickle the stored Doc2Vec object. Despite the "weights" name, later cells
# use it as a model (word vectors through `.wv`, document vectors through
# `.infer_vector`).
# NOTE: unpickling can execute arbitrary code; only open trusted files.
with open('../../data/d2v_pretrained_weights.pickle', 'rb') as d2v_file:
    d2v_pretrained_weights = pickle.load(d2v_file)

In [None]:
# Word-embedding matrix of the loaded model.
# `wv.syn0` is the legacy attribute name: newer gensim releases expose the same
# array as `wv.vectors` (and gensim 4 drops `syn0`), so prefer `vectors` and
# only fall back to `syn0` on old installations.
word_vectors = d2v_pretrained_weights.wv
pretrained_weights = (
    word_vectors.vectors if hasattr(word_vectors, 'vectors') else word_vectors.syn0
)
vocab_size, embedding_size = pretrained_weights.shape

In [7]:
# Query the word vectors directly: the model-level `most_similar` shortcut is
# deprecated in gensim 3.x and removed in gensim 4.
d2v_pretrained_weights.wv.most_similar(['import'])

  """Entry point for launching an IPython kernel.


[('import_sys', 0.32976454496383667),
 ('os', 0.2879202663898468),
 ('logging_basicConfig', 0.25194600224494934),
 ('ext_import', 0.24858161807060242),
 ('Popen_PIPE', 0.24675637483596802),
 ('pyc_byte', 0.2391386777162552),
 ('numpy_np', 0.23757658898830414),
 ('Have_look', 0.2326200157403946),
 ('env_python', 0.22709797322750092),
 ('lazy_dog', 0.21828103065490723)]

In [8]:
# Infer one 300-dimensional document vector per training answer.
# The row count comes from the data (as the validation cell does) instead of a
# hard-coded 30000, so the cell keeps working if the training split changes.
train_answers = X_train['answer'].values
matrix = np.zeros((len(train_answers), 300))
for i, answer in enumerate(train_answers):
    # NOTE(review): each answer is wrapped in a one-element list, so it is
    # passed to infer_vector as a single token -- confirm this is intended
    # rather than a list of word tokens.
    matrix[i] = d2v_pretrained_weights.infer_vector([answer])

In [9]:
# Insert a length-1 middle axis: (n_samples, dim) -> (n_samples, 1, dim).
n_train, train_dim = matrix.shape
x_train_vec = matrix.reshape(n_train, 1, train_dim)

In [10]:
# Sanity check: display the 3-D shape produced by the reshape above.
x_train_vec.shape

(30000, 1, 300)

In [11]:
# Infer one 300-dimensional document vector per validation answer.
val_answers = X_val['answer'].values
matrix_val = np.zeros((X_val.shape[0], 300))
for row, answer in enumerate(val_answers):
    # Each answer is handed to infer_vector wrapped in a one-element list.
    matrix_val[row] = d2v_pretrained_weights.infer_vector([answer])

In [12]:
# Insert a length-1 middle axis: (n_samples, dim) -> (n_samples, 1, dim).
n_val, val_dim = matrix_val.shape
x_val_vec = matrix_val.reshape(n_val, 1, val_dim)

In [13]:
# Shape of the word-vector matrix taken from the model's `wv`
# (unpacked earlier as vocab_size, embedding_size).
pretrained_weights.shape

(6569, 300)

In [78]:
def build_model(x_train):
    """Assemble the (uncompiled) dense network fed with document vectors.

    Args:
        x_train: Array-like whose ``shape[1:]`` is used as the per-sample
            input shape; only its shape is read.

    Returns:
        A ``tf.keras.Sequential`` model with the stack
        Dense(32, sigmoid) -> MaxPooling1D(1) -> Dropout(0.2) -> Flatten
        -> Dense(1, sigmoid).
    """
    per_sample_shape = x_train.shape[1:]
    layers = [
        Dense(32, activation="sigmoid", input_shape=per_sample_shape),
        # Pool size 1: the tensor shape is left unchanged by this layer.
        MaxPooling1D(1),
        Dropout(0.2),
        Flatten(),
        # Single sigmoid unit as the output.
        Dense(1, activation='sigmoid'),
    ]
    return tf.keras.Sequential(layers)

In [90]:
def compile_model(model):
    """Compile `model` for regression and return it.

    The loss is mean squared error, optimised with Nadam. Mean squared error
    and mean absolute error (the average absolute distance between predicted
    and target values) are additionally tracked as metrics.

    Args:
        model: Keras model to compile (compiled in place).

    Returns:
        The same `model` instance, now compiled.
    """
    model.compile(
        optimizer='Nadam',
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=['mean_squared_error', 'mean_absolute_error'],
    )
    return model

In [91]:
def fit_model(x_train, y_train, x_val, y_val, model, batch_size, epochs=5,
              log_dir='./logs/Dense_doc2vec_logs_3', monitor='loss', patience=3):
    """Train `model` with TensorBoard logging and early stopping.

    Args:
        x_train: Training inputs passed to ``model.fit``.
        y_train: Training targets passed to ``model.fit``.
        x_val: Validation inputs (used as ``validation_data``).
        y_val: Validation targets (used as ``validation_data``).
        model: Compiled Keras model; it is trained in place.
        batch_size: Mini-batch size for ``model.fit``.
        epochs: Maximum number of epochs.
        log_dir: Directory for the TensorBoard logs; created if missing.
        monitor: Quantity watched by early stopping. The default ``'loss'``
            is the *training* loss; pass ``'val_loss'`` to stop on the
            validation loss instead.
        patience: Number of consecutive epochs without improvement of
            `monitor` after which training stops.

    Returns:
        The same `model` instance after training.
    """
    print('Train...')
    os.makedirs(log_dir, exist_ok=True)

    # profile_batch is set far beyond any batch index reached here --
    # presumably to keep the profiler from ever being triggered.
    tensorboard = TensorBoard(log_dir=log_dir, histogram_freq=0,
                              write_graph=True, write_images=False,
                              profile_batch=100000000)

    # Stop training once `monitor` has not improved for `patience`
    # consecutive epochs (defaults: training loss, 3 epochs).
    early_stopping = EarlyStopping(monitor=monitor, patience=patience)

    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              validation_data=(x_val, y_val),
              callbacks=[early_stopping, tensorboard])
    return model

In [81]:
# Re-check the training input shape right before building the model.
x_train_vec.shape

(30000, 1, 300)

In [82]:
# Re-check the validation input shape right before training.
x_val_vec.shape

(14000, 1, 300)

In [92]:
# Build, summarise, compile and train the model, with all ops placed on the CPU.
with tf.device("/CPU:0"):
    model = build_model(x_train_vec)
    model.summary()
    model = compile_model(model)
    model = fit_model(
        x_train=x_train_vec,
        y_train=y_train,
        x_val=x_val_vec,
        y_val=y_val,
        model=model,
        batch_size=100,
        epochs=10,
    )

Model: "sequential_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_54 (Dense)             (None, 1, 32)             9632      
_________________________________________________________________
max_pooling1d_33 (MaxPooling (None, 1, 32)             0         
_________________________________________________________________
dropout_31 (Dropout)         (None, 1, 32)             0         
_________________________________________________________________
flatten_21 (Flatten)         (None, 32)                0         
_________________________________________________________________
dense_55 (Dense)             (None, 1)                 33        
Total params: 9,665
Trainable params: 9,665
Non-trainable params: 0
_________________________________________________________________
Train...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/

## Save model 

In [93]:
# Save the entire model as a SavedModel.
# The target directory is created from Python rather than through a
# `!mkdir -p` shell escape, so the cell does not depend on a POSIX shell.
os.makedirs('saved_model', exist_ok=True)
model.save('./saved_model/Dense_model_d2v_3')

INFO:tensorflow:Assets written to: ./saved_model/CNN_model_d2v_3/assets


© Laëtitia CONSTANTIN & Axel CHENU 2021

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e80043e2-6875-4b65-a196-a0ffb97a1282' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>