In [1]:
import pickle
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, LSTM, Dense, Activation, Conv1D, Flatten, Dropout, MaxPooling1D

from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import EarlyStopping
import pandas as pd
import os

import sys
sys.path.append('../../..')
from smartFAQ.src.tokenPad import tokenization_padding

## Load w2v model & Data

In [2]:
# with open('../../data/w2v_pretrained_weights.pickle', 'rb') as handle:
#     w2v_model = pickle.load(handle)
def _load_pickle(path):
    """Return the object unpickled from the file at `path`."""
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Training and validation splits, each stored as its own pickle.
X_train = _load_pickle('../../data/x_train.pickle')
y_train = _load_pickle('../../data/y_train.pickle')
X_val = _load_pickle('../../data/x_val.pickle')
y_val = _load_pickle('../../data/y_val.pickle')

In [4]:
# pretrained_weights = w2v_model.wv.syn0
# vocab_size, embedding_size = pretrained_weights.shape

In [5]:
def normalize(arr, t_min, t_max):
    """Min-max rescale a pandas Series into the range [t_min, t_max].

    Each value ``v`` is mapped to
    ``(v - min(arr)) * (t_max - t_min) / (max(arr) - min(arr)) + t_min``.

    Parameters
    ----------
    arr : pandas.Series
        Numeric values to rescale. Its index is preserved in the result.
    t_min, t_max : number
        Lower and upper bound of the target range.

    Returns
    -------
    pandas.Series
        Float Series named ``'norm'`` aligned with ``arr``'s index.

    Notes
    -----
    * A constant input has zero spread; every element maps to ``t_min``
      instead of dividing by zero.
    * An empty input yields an empty Series.
    * NaN entries are ignored when computing min/max and stay NaN.
    """
    if len(arr) == 0:
        return arr.astype(float).rename('norm')

    # Compute the extrema once; the element-wise work is vectorized.
    lowest = arr.min()
    spread = arr.max() - lowest
    if spread == 0:
        # All values identical: no meaningful scale, pin to the lower bound.
        return pd.Series(float(t_min), index=arr.index, name='norm')

    scaled = (arr - lowest) * (t_max - t_min) / spread + t_min
    return scaled.rename('norm')

In [6]:
# Rescale both target vectors to [0, 1]; the network ends in a sigmoid unit,
# whose output lies in that same range.
# NOTE(review): each split is scaled with its own min/max, so the same raw
# target can map to different values in train and validation -- confirm this
# is intended.
y_train, y_val = normalize(y_train, 0, 1), normalize(y_val, 0, 1)

## Tokenization & Padding

In [7]:
# Passed to tokenization_padding below; presumably the vocabulary cap for the
# tokenizer -- confirm against smartFAQ.src.tokenPad.
MAX_NUM_WORDS = 20000
# Passed to tokenization_padding and also used as the Embedding layer's
# input_length in build_model, i.e. the sequence length fed to the network.
MAX_LEN = 3000

In [8]:
# Build the padded training input from X_train with the project helper.
# The active call passes an empty list where the commented-out variant passes
# ['question'] (presumably extra text columns to include -- confirm in
# smartFAQ.src.tokenPad).
# x_train_pad = tokenization_padding(X_train, 'answer', ['question'], MAX_NUM_WORDS, MAX_LEN)
x_train_pad = tokenization_padding(X_train, 'answer', [], MAX_NUM_WORDS, MAX_LEN)

In [9]:
# Build the padded validation input with the same helper and settings as the
# training input.
# NOTE(review): if tokenization_padding fits a fresh tokenizer on every call,
# word indices here will not match the training ones -- confirm the helper
# reuses the tokenizer fitted on the training data.
# x_val_pad = tokenization_padding(X_val, 'answer', ['question'], MAX_NUM_WORDS, MAX_LEN)
x_val_pad = tokenization_padding(X_val, 'answer', [], MAX_NUM_WORDS, MAX_LEN)

## Keras Model

In [15]:
# from gensim.models import Word2Vec
# from gensim.models.phrases import Phrases, Phraser

In [16]:
# sent = []
# for col in ['question', 'answer']:
#     words = [row.split() for row in X_train[col]]
#     sent = sent + words
# sentences = [[word for word in document.lower().split()] for document in X_train['answer']]

In [17]:
# phrases = Phrases(sent)
# bigram = Phraser(phrases)
# sentences = bigram[sentences]

In [18]:
# word_model = Word2Vec(sentences, size=200, min_count = 1, window = 5)

In [19]:
# with open('../../data/w2v.pickle', 'wb') as handle:
#     pickle.dump(word_model, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# Reload the previously trained word2vec model from disk (the dump that
# produced this file is the commented-out block above).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('../../data/w2v.pickle', 'rb') as handle:
    word_model = pickle.load(handle)

In [20]:
# TOO BIG, KERNEL STOPPED
# from gensim.models import KeyedVectors
# filename = '../../data/GoogleNews-vectors-negative300.bin'
# word_model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [21]:
# Embedding matrix of the word2vec model, one row per vocabulary entry.
# `wv.vectors` replaces the deprecated `wv.syn0` alias (removed in gensim 4).
# NOTE(review): these rows are later used as Embedding weights, so row i must
# correspond to token index i produced by tokenization_padding -- confirm.
pretrained_weights = word_model.wv.vectors
vocab_size, embedding_size = pretrained_weights.shape

  """Entry point for launching an IPython kernel.


In [22]:
# Sanity check of the embeddings: nearest neighbours of "python".
# Query through `.wv`; calling most_similar on the model itself is deprecated
# and removed in gensim 4.
word_model.wv.most_similar(positive=["python"])

  """Entry point for launching an IPython kernel.


[('perl', 0.8226622343063354),
 ('compiled', 0.8225566744804382),
 ('vim', 0.7626760601997375),
 ('versions', 0.7565158605575562),
 ('compiler', 0.7547906637191772),
 ('swig', 0.7527791857719421),
 ('cliche', 0.7517421245574951),
 ('standard', 0.7496249675750732),
 ('native', 0.7473969459533691),
 ('python_interpreter', 0.7460223436355591)]

In [23]:
def build_model(vocab_size,embedding_size,pretrained_weights):
    """Assemble the (uncompiled) 1-D CNN regressor.

    Layer stack, in order:
      1. Embedding initialised from `pretrained_weights`, reading sequences
         of length MAX_LEN.
      2. Conv1D with 128 filters, kernel size 5, ReLU activation.
      3. MaxPooling1D with pool size 2.
      4. Flatten.
      5. Dense with a single sigmoid unit.

    Args:
        vocab_size: number of rows of the embedding matrix (input_dim).
        embedding_size: width of each embedding vector (output_dim).
        pretrained_weights: initial embedding matrix handed to the layer.

    Returns:
        The tf.keras.Sequential model, not yet compiled.
    """
    embedding = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_size,
        weights=[pretrained_weights],
        input_length=MAX_LEN,
    )
    stack = [
        embedding,
        Conv1D(128, 5, activation='relu'),
        MaxPooling1D(pool_size=2),
        # Collapse (steps, filters) into one vector for the output unit.
        Flatten(),
        Dense(1, activation='sigmoid'),
    ]

    model = tf.keras.Sequential()
    for layer in stack:
        model.add(layer)
    return model

In [24]:
# MAE : the average absolute distance between the predicted and target values
def compile_model(model):
    """Compile `model` for regression and return it.

    Uses mean squared error as the loss, the 'adam' optimizer, and reports
    both mean squared error and mean absolute error as metrics.
    """
    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=['mean_squared_error', 'mean_absolute_error'],
    )
    return model

In [25]:
def fit_model(x_train, y_train, x_val, y_val, model, batch_size, epochs=5,
              log_dir='./logs/CNN_logs_6'):
    """Train `model` with TensorBoard logging and early stopping.

    Args:
        x_train, y_train: training inputs and targets passed to model.fit.
        x_val, y_val: validation inputs and targets (validation_data).
        model: a compiled Keras model.
        batch_size: mini-batch size.
        epochs: maximum number of epochs (default 5).
        log_dir: directory for TensorBoard event files; created if missing.
            Defaults to the location that was previously hard-coded.

    Returns:
        The same `model` object after training.
    """
    print('Train...')
    os.makedirs(log_dir, exist_ok=True)
    # profile_batch is set to a batch index far beyond any run length,
    # presumably so the profiler never triggers -- TODO confirm.
    tensorboard = TensorBoard(log_dir=log_dir, histogram_freq=0,
                              write_graph=True, write_images=False,
                              profile_batch=100000000)

    # Stop training when the validation loss has not improved for
    # 2 consecutive epochs. The monitor must be 'val_loss' for that;
    # 'loss' would track the training loss instead.
    early_stopping = EarlyStopping(monitor='val_loss', patience=2)

    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              validation_data=(x_val, y_val),
              callbacks=[early_stopping, tensorboard])
    return model

In [26]:
# Build, summarise, compile and train the network with ops placed on the CPU.
with tf.device("/CPU:0"):
    model = build_model(vocab_size, embedding_size, pretrained_weights)
    model.summary()
    model = compile_model(model)
    model = fit_model(
        x_train=x_train_pad,
        y_train=y_train,
        x_val=x_val_pad,
        y_val=y_val,
        model=model,
        batch_size=100,
        epochs=3,
    )

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 3000, 200)         12093600  
_________________________________________________________________
conv1d (Conv1D)              (None, 2996, 128)         128128    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 1498, 128)         0         
_________________________________________________________________
flatten (Flatten)            (None, 191744)            0         
_________________________________________________________________
dense (Dense)                (None, 1)                 191745    
Total params: 12,413,473
Trainable params: 12,413,473
Non-trainable params: 0
_________________________________________________________________
Train...
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [27]:
# Save the entire model as a SavedModel.
# Create the target directory from Python rather than shelling out to
# `mkdir -p`, so the cell also works where that shell command is unavailable.
os.makedirs('saved_model', exist_ok=True)
model.save('./saved_model/CNN_model_6')

INFO:tensorflow:Assets written to: ./saved_model/CNN_model_6/assets


In [28]:
# Score the trained model on the validation split. evaluate() returns the
# loss followed by the metrics given to compile(): here the MSE loss, then
# mean_squared_error, then mean_absolute_error.
loss, mean_squared_error, mean_ab = model.evaluate(x_val_pad, y_val)



In [30]:
# © Laëtitia CONSTANTIN 2021

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e80043e2-6875-4b65-a196-a0ffb97a1282' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>