In [22]:
import pickle
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, LSTM, Dense, Activation, Conv1D, Flatten, Dropout

from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import EarlyStopping

import os

## Load w2v model & Data

In [4]:
# with open('../../data/w2v_pretrained_weights.pickle', 'rb') as handle:
#     w2v_model = pickle.load(handle)
with open('../../data/x_train.pickle', 'rb') as handle:
    X_train = pickle.load(handle)
with open('../../data/y_train.pickle', 'rb') as handle:
    y_train = pickle.load(handle)
with open('../../data/x_val.pickle', 'rb') as handle:
    X_val = pickle.load(handle)
with open('../../data/y_val.pickle', 'rb') as handle:
    y_val = pickle.load(handle)

In [3]:
# pretrained_weights = w2v_model.wv.syn0
# vocab_size, embedding_size = pretrained_weights.shape

## Tokenization & Padding

In [2]:
MAX_NUM_WORDS = 20000

In [5]:
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(X_train['answer'])
input_integer_seq = input_tokenizer.texts_to_sequences(X_train['answer'])

word2idx_inputs = input_tokenizer.word_index
print('Total unique words in the input: %s' % len(word2idx_inputs))

max_input_len = max(len(sen) for sen in input_integer_seq)
print("Length of longest sentence in input: %g" % max_input_len)

Total unique words in the input: 30369
Length of longest sentence in input: 1688


In [6]:
x_train_pad = pad_sequences(input_integer_seq, maxlen=max_input_len)
# x_train_pad = pad_sequences(input_integer_seq, maxlen=MAX_SEQ_LEN)
print("encoder_input_sequences.shape:", x_train_pad.shape)
print("encoder_input_sequences[172]:", x_train_pad[72])

encoder_input_sequences.shape: (10000, 1688)
encoder_input_sequences[172]: [  0   0   0 ... 135 183  86]


In [7]:
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(X_val['answer'])
input_integer_seq = input_tokenizer.texts_to_sequences(X_val['answer'])
x_val_pad = pad_sequences(input_integer_seq, maxlen=max_input_len)
print("encoder_input_sequences.shape:", x_val_pad.shape)
print("encoder_input_sequences[172]:", x_val_pad[72])

encoder_input_sequences.shape: (10000, 1688)
encoder_input_sequences[172]: [   0    0    0 ... 5837 5419 5838]


## Keras Model

In [12]:
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

In [18]:
sentences = [[word for word in document.lower().split()] for document in X_train['answer']]

In [17]:
sentences

<gensim.models.phrases.Phrases at 0x7fe792a07898>

In [19]:
phrases = Phrases(sentences)
bigram = Phraser(phrases)
sentences = bigram[sentences]

In [20]:
word_model = Word2Vec(sentences, size=200, min_count = 1, window = 5)

In [9]:
# sentences

In [21]:
pretrained_weights = word_model.wv.syn0
vocab_size, embedding_size = pretrained_weights.shape

  """Entry point for launching an IPython kernel.


In [27]:
def build_model(vocab_size,embedding_size,pretrained_weights):
    
           
    model = tf.keras.Sequential()
    
    model.add(Embedding(input_dim=vocab_size, 
                        output_dim=embedding_size, 
                        weights=[pretrained_weights],
                        input_length=max_input_len
                       ))

    model.add(Conv1D(128,5, activation='relu'))
    model.add(Conv1D(128,5, activation='relu'))
    
    model.add(Flatten())
    model.add(Dropout(.2, input_shape=(2,)))
    
    model.add(Dense(10, activation='relu'))
    model.add(Activation('relu'))
    model.add(Dense(1, activation='sigmoid'))

    return model

In [37]:
# MAE : the average absolute distance between the predicted and target values
def compile_model(model):
    mae = tf.keras.losses.MeanAbsolutePercentageError()
    model.compile(loss=mae,
                  optimizer='adam',
                  metrics=['mean_squared_error', 'mean_absolute_error']
                  )
    return  model

In [38]:
def fit_model( x_train, y_train, x_val, y_val, model, batch_size,  epochs = 5):
    
    print('Train...')
    os.makedirs("./logs/CNN_logs",exist_ok=True)
    tensorboard = TensorBoard(log_dir=os.path.join('./logs/CNN_logs'), histogram_freq=0,
                                  write_graph=True, write_images=False,profile_batch = 100000000)

    # This callback will stop the training when there is no improvement in
    # the validation loss for 2 consecutive epochs.
    Es= EarlyStopping(monitor='loss', patience=2)
    
    callbacks = [Es, tensorboard]

    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_val, y_val),
              callbacks= callbacks)
    return model

In [39]:
with tf.device("/CPU:0"):
    model = build_model(vocab_size, embedding_size, pretrained_weights)
    model.summary()
    model = compile_model(model)
    model = fit_model(x_train_pad, y_train, x_val_pad, y_val, model, batch_size=200, epochs=3)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 1688, 200)         6710200   
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 1684, 128)         128128    
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 1680, 128)         82048     
_________________________________________________________________
flatten_4 (Flatten)          (None, 215040)            0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 215040)            0         
_________________________________________________________________
dense_8 (Dense)              (None, 10)                2150410   
_________________________________________________________________
activation_4 (Activation)    (None, 10)               

In [47]:
# Save the entire model as a SavedModel.
!mkdir -p saved_model
model.save('./saved_model/CNN_model')

INFO:tensorflow:Assets written to: ./saved_model/CNN_model/assets


In [43]:
loss, mean_squared_error, mean_ab = model.evaluate(x_val_pad, y_val)



In [44]:
# prediction = model.predict(x_test_pad)

In [None]:
# © Laëtitia CONSTANTIN 2021

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e80043e2-6875-4b65-a196-a0ffb97a1282' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>