In [1]:
import pickle
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, LSTM, Dense, Activation, Conv1D, Flatten

from tensorflow.keras.callbacks import TensorBoard

import os

## Load w2v model & Data

In [2]:
# with open('../../data/w2v_pretrained_weights.pickle', 'rb') as handle:
#     w2v_model = pickle.load(handle)
with open('../../data/x_train.pickle', 'rb') as handle:
    X_train = pickle.load(handle)
with open('../../data/y_train.pickle', 'rb') as handle:
    y_train = pickle.load(handle)
with open('../../data/x_val.pickle', 'rb') as handle:
    X_val = pickle.load(handle)
with open('../../data/y_val.pickle', 'rb') as handle:
    y_val = pickle.load(handle)

In [3]:
# pretrained_weights = w2v_model.wv.syn0
# vocab_size, embedding_size = pretrained_weights.shape

## Tokenization & Padding

In [4]:
MAX_NUM_WORDS = 20000

In [5]:
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(X_train['answer'])
input_integer_seq = input_tokenizer.texts_to_sequences(X_train['answer'])

word2idx_inputs = input_tokenizer.word_index
print('Total unique words in the input: %s' % len(word2idx_inputs))

max_input_len = max(len(sen) for sen in input_integer_seq)
print("Length of longest sentence in input: %g" % max_input_len)

Total unique words in the input: 30369
Length of longest sentence in input: 1688


In [6]:
x_train_pad = pad_sequences(input_integer_seq, maxlen=max_input_len)
# x_train_pad = pad_sequences(input_integer_seq, maxlen=MAX_SEQ_LEN)
print("encoder_input_sequences.shape:", x_train_pad.shape)
print("encoder_input_sequences[172]:", x_train_pad[72])

encoder_input_sequences.shape: (10000, 1688)
encoder_input_sequences[172]: [  0   0   0 ... 135 183  86]


In [7]:
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(X_val['answer'])
input_integer_seq = input_tokenizer.texts_to_sequences(X_val['answer'])
x_val_pad = pad_sequences(input_integer_seq, maxlen=max_input_len)
print("encoder_input_sequences.shape:", x_val_pad.shape)
print("encoder_input_sequences[172]:", x_val_pad[72])

encoder_input_sequences.shape: (10000, 1688)
encoder_input_sequences[172]: [   0    0    0 ... 5837 5419 5838]


## Keras Model

In [8]:
from gensim.models import Word2Vec
sentences = [[word for word in document.lower().split()] for document in X_train['answer']]

word_model = Word2Vec(sentences, size=200, min_count = 1, window = 5)

In [9]:
# sentences

In [10]:
pretrained_weights = word_model.wv.syn0
vocab_size, embedding_size = pretrained_weights.shape

  """Entry point for launching an IPython kernel.


In [11]:
def build_model(vocab_size,embedding_size,pretrained_weights):
    
           
    model = tf.keras.Sequential()
    
    model.add(Embedding(input_dim=vocab_size, 
                        output_dim=embedding_size, 
                        weights=[pretrained_weights],
                        input_length=max_input_len
                       ))

    model.add(Conv1D(128,5, activation='relu'))
    model.add(Conv1D(128,5, activation='relu'))
    
    model.add(Flatten())
    
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
#     model.add(LSTM(units=embedding_size))
#     model.add(Dense(units=vocab_size))
#     model.add(Activation('softmax'))
#     model.add(Dense(1))

    return model

In [12]:
def compile_model(model):
    mse = tf.keras.losses.MeanSquaredError()
    model.compile(loss=mse,
                  optimizer='adam',
                  metrics=['MeanSquaredError']
                  )
    return  model

In [13]:
def fit_model( x_train, y_train, x_val, y_val, model, batch_size,  epochs = 5):
    
    print('Train...')
    os.makedirs("./logs",exist_ok=True)
    tensorboard = TensorBoard(log_dir=os.path.join('./logs'), histogram_freq=0,
                                  write_graph=True, write_images=False,profile_batch = 100000000)

    callbacks = [ tensorboard]

    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_val, y_val),
              callbacks= callbacks)
    return model

In [14]:
with tf.device("/CPU:0"):
    model = build_model(vocab_size, embedding_size, pretrained_weights)
    model.summary()
    model = compile_model(model)
    model = fit_model(x_train_pad, y_train, x_val_pad, y_val, model, batch_size=200, epochs=10)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1688, 200)         6074000   
_________________________________________________________________
conv1d (Conv1D)              (None, 1684, 128)         128128    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1680, 128)         82048     
_________________________________________________________________
flatten (Flatten)            (None, 215040)            0         
_________________________________________________________________
dense (Dense)                (None, 10)                2150410   
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 8,434,597
Trainable params: 8,434,597
Non-trainable params: 0
______________________________________________

In [18]:
# Save the entire model as a SavedModel.
!mkdir -p saved_model
model.save('./saved_model/CNN_model')

INFO:tensorflow:Assets written to: ./saved_model/CNN_model/assets


In [16]:
(loss, acc) = model.evaluate(x_val_pad, y_val)

