# Textual entailment task

In [1]:
import os
import numpy as np
import csv
import matplotlib.pyplot as plt
import pandas as pd
import json
import string
import collections
import itertools
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from keras_preprocessing import sequence, text
from tensorboard.plugins.hparams import api as hp
import tensorflow.keras.backend as K

In [2]:
# Report how many GPU devices TensorFlow can see in this environment.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


In [3]:
datafile_train = 'data/train.csv'

In [4]:
# Load the training set, indexed by the `id` column and ordered by that index.
df_train = pd.read_csv(datafile_train, index_col='id').sort_index()
# Cell output: (rows, columns) of the loaded frame.
df_train.shape

(320552, 7)

In [5]:
# Debug cap on the number of test rows that get scored. The submission written
# at the end of the notebook only contains these rows, so set this to None to
# score the complete test set.
test_row_limit = 1000

# Load the test set, indexed by the `id` column and ordered by that index.
df_test = pd.read_csv('data/test.csv', index_col='id').sort_index()
if test_row_limit is not None:
    # .copy() so later column assignments write to an independent frame instead
    # of a slice of the full test frame.
    df_test = df_test.head(test_row_limit).copy()
df_test.shape

(1000, 6)

# Data exploration

In [6]:
# null_counts = df_train.isnull().sum()
# null_counts[null_counts > 0].sort_values(ascending=False)

In [7]:
# No id 247 !
#df_train['tid1'][247]

#### The labels are imbalanced (about 68% are `unrelated`), so the LSTM may predict this label more often than it should. A BERT model should therefore also be implemented so that the two models can be compared.

In [8]:
# from collections import Counter
# Counter(df_train.label)

In [9]:
# 219313/len(df_train)

In [10]:
# print("Min nb words title 1  :",df_train['title1_en'].apply(lambda x: len(x.split(" "))).min())
# print("Min nb words title 2  :",df_train['title2_en'].apply(lambda x: len(x.split(" "))).min())
# print("Max nb words title 1  :",df_train['title1_en'].apply(lambda x: len(x.split(" "))).max())
# print("Max nb words title 2  :",df_train['title2_en'].apply(lambda x: len(x.split(" "))).max())
# print("Mean nb words title 1 :",df_train['title1_en'].apply(lambda x: len(x.split(" "))).mean())
# Word counts per title (split on single spaces), computed once and reused.
title1_word_counts = df_train['title1_en'].apply(lambda x: len(x.split(" ")))
title2_word_counts = df_train['title2_en'].apply(lambda x: len(x.split(" ")))

print("Mean nb words title 2 :", title2_word_counts.mean())
# These two values are 99th percentiles, not means; the labels now say so.
print("99th percentile nb words title 1 :", title1_word_counts.quantile(0.99))
print("99th percentile nb words title 2 :", title2_word_counts.quantile(0.99))

Mean nb words title 2 : 16.572528014175546
Mean nb words title 1 : 30.0
Mean nb words title 2 : 30.0


# Preprocessing
- Cleaning data
- Lower case
- Deal with N/A and NaN

In [11]:
# Lower-case both title columns and delete every character listed in
# string.punctuation, for the training frame first and then the test frame.
translator = str.maketrans('', '', string.punctuation)
for frame in (df_train, df_test):
    for column in ('title1_en', 'title2_en'):
        frame[column] = frame[column].str.lower().str.translate(translator)

In [12]:
#df_train.head()

# LSTM 

In [13]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


### Hyper parameters definition

In [14]:
# Tokenizer `num_words` limit and input dimension of the from-scratch Embedding layer.
vocab_size = 10000
# Number of distinct labels (unrelated / agreed / disagreed).
nb_labels = 3
# Embedding width; matches the 100-d GloVe file loaded further down.
embedding_size = 100
# lstm_size = 100
# Titles are padded/truncated to this many tokens (the 99th percentile of title length printed above is 30).
max_len = 30
# Truncate and pad at the end of the sequence.
trunc_type = 'post'
padding_type = 'post'
# Placeholder token the Tokenizer substitutes for out-of-vocabulary words.
oov_tok = '<OOV>'
# Fraction of df_train rows used for training; the remainder is the validation set.
training_portion = .8

# Random dropout rates drawn from [0.15, 0.40). No seed is set, so they differ on
# every run. NOTE(review): neither value is referenced elsewhere in the visible notebook.
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25

In [15]:
# Discrete search domains for the TensorBoard HParams grid search run below.
#HP_VOCAB_SIZE = hp.HParam('vocab_size',hp.Discrete([10000,15000,20000]))
# LSTM width.
HP_NUM_UNITS = hp.HParam('num_units', hp.Discrete([50, 100, 125]))
HP_BATCH_SIZE = hp.HParam('batch_size', hp.Discrete([64, 128, 256]))
HP_DROPOUT = hp.HParam('dropout', hp.Discrete([0.1, 0.2, 0.3]))
HP_LEARNING_RATE = hp.HParam('learning_rate', hp.Discrete([0.001, 0.003, 0.01, 0.03, 0.1]))
# Optimizer name, resolved to an optimizer object by get_opt_algo.
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['adam', 'rmsprop']))
# 'pretrained' = GloVe-initialised embedding, 'from_scratch' = default-initialised (see getEmbeddingLayer).
HP_EMBEDDING_LAYER = hp.HParam('embedding', hp.Discrete(['pretrained', 'from_scratch']))
#HP_ATTENTION = hp.HParam('attention', hp.Discrete(['yes', 'no']))

In [16]:
# Tag names under which metrics are written to the TensorBoard summaries.
METRIC_ACCURACY = 'accuracy'
METRIC_F1_MAC = 'f1_macro' # F1 computed per class, then averaged
METRIC_F1_MIC = 'f1_micro' # global average over all individual instances
METRIC_PRECISION = 'precision'
METRIC_RECALL = 'recall'
METRIC_LOSS = 'loss'

# Register the hyperparameters and metrics of the experiment with the HParams plugin.
# NOTE(review): METRIC_F1_MAC is defined above but not registered here, and run()
# below only writes the accuracy and loss scalars.
with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
    hp.hparams_config(
        hparams=[HP_OPTIMIZER, HP_NUM_UNITS, HP_BATCH_SIZE, HP_DROPOUT, HP_EMBEDDING_LAYER,HP_LEARNING_RATE], #,HP_ATTENTION
        metrics=[hp.Metric(METRIC_ACCURACY, display_name='Accuracy'),
                 hp.Metric(METRIC_F1_MIC, display_name='F1 Micro'),
                 hp.Metric(METRIC_LOSS, display_name='Loss'),
                 hp.Metric(METRIC_PRECISION, display_name='Precision'),
                 hp.Metric(METRIC_RECALL, display_name='Recall')],
      )

In [17]:
#df_train = df_train.head(1000)

### Word embedding

In [18]:
# Positional hold-out split: the first `training_portion` of the rows (in
# sorted-id order) are used for training, the remaining rows for validation.
title_columns = ['title1_en', 'title2_en']
train_size = int(len(df_train['title1_en']) * training_portion)

x_train = df_train[title_columns].iloc[:train_size]
y_train = df_train['label'].iloc[:train_size]
x_validation = df_train[title_columns].iloc[train_size:]
y_validation = df_train['label'].iloc[train_size:]
x_test = df_test[title_columns]

In [19]:
#y_validation

In [20]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
# Fit on every title as its own text. Adding the two columns element-wise
# ('title1' + 'title2') would join the strings with no separator, fusing the
# last word of title 1 with the first word of title 2 into a bogus token.
tokenizer.fit_on_texts(pd.concat([df_train['title1_en'], df_train['title2_en']]))
# NOTE(review): the vocabulary is fitted on all of df_train, validation rows included.
# Later we still have to check the number of unknown words in the test data.
word_index = tokenizer.word_index
# Cell output: the first ten (word, index) pairs of the vocabulary.
dict(list(word_index.items())[0:10])

{'<OOV>': 1,
 'the': 2,
 'of': 3,
 'to': 4,
 'a': 5,
 'and': 6,
 'is': 7,
 'in': 8,
 'be': 9,
 'will': 10}

In [21]:
# Turn both training title columns into fixed-length arrays of token ids.
x_train_seq = {'title1': x_train['title1_en'], 'title2': x_train['title2_en']}
X = x_train_seq  # same dict object under its original name

for side in ('title1', 'title2'):
    token_ids = tokenizer.texts_to_sequences(x_train_seq[side])
    x_train_seq[side] = pad_sequences(token_ids, padding=padding_type,
                                      truncating=trunc_type, maxlen=max_len)

In [22]:
# Turn both validation title columns into fixed-length arrays of token ids.
x_validation_seq = {'title1': x_validation['title1_en'], 'title2': x_validation['title2_en']}
X_val = x_validation_seq  # same dict object under its original name

for side in ('title1', 'title2'):
    token_ids = tokenizer.texts_to_sequences(x_validation_seq[side])
    x_validation_seq[side] = pad_sequences(token_ids, padding=padding_type,
                                           truncating=trunc_type, maxlen=max_len)

In [23]:
# Turn both test title columns into fixed-length arrays of token ids.
# Note that this rebinds X, which previously referred to the training dict.
x_test_seq = {'title1': x_test['title1_en'], 'title2': x_test['title2_en']}
X = x_test_seq

for side in ('title1', 'title2'):
    token_ids = tokenizer.texts_to_sequences(x_test_seq[side])
    x_test_seq[side] = pad_sequences(token_ids, padding=padding_type,
                                     truncating=trunc_type, maxlen=max_len)

In [24]:
# Encode the string labels as integers with a second Tokenizer.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(df_train['label'])

# One sequence per label string. The ids start at 1 (see the mapping printed
# below), which is why the models further down use a 4-unit softmax output.
y_train_seq = label_tokenizer.texts_to_sequences(y_train)
y_validation_seq = label_tokenizer.texts_to_sequences(y_validation)
word_index_label = label_tokenizer.word_index
# Cell output: the label -> id mapping.
dict(list(word_index_label.items())[0:10])

{'unrelated': 1, 'agreed': 2, 'disagreed': 3}

In [25]:
# Inverse vocabulary: token id -> word.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_title(text):
    """Map a sequence of token ids back to a space-separated string.

    Ids with no vocabulary entry (the padding id 0 in particular) are
    rendered as '?'.
    """
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

# Sanity check: the decoded sequence should match the cleaned source title.
print(decode_title(x_train_seq['title2'][59]))
print('---')
# x_train_seq holds plain arrays, so [59] above is positional. Use .iloc here so
# the same row is shown even when the `id` index has gaps before position 59.
print(x_train['title2_en'].iloc[59])

the 315s the rumour spinach is a greased vegetable that can be made iron by eating it ? ? ? ? ? ? ? ? ? ? ? ? ?
---
the 315s the rumour spinach is a greased vegetable that can be made iron by eating it


In [26]:
# word -> 100-d GloVe vector, parsed from the whitespace-separated text file.
embeddings_index = {}
GLOVE_DIR='data/'
# The context manager closes the file even if a line fails to parse.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding="utf8") as f:
    for line in f:
        values = line.split()
        # First field is the word, the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [27]:
# One row per vocabulary index (plus row 0). Every row starts as uniform random
# values in [0, 1); rows whose word has a GloVe vector are then overwritten, so
# words missing from GloVe keep their random initialisation.
embedding_matrix = np.random.random((len(word_index) + 1, embedding_size))
words_not_fount = 0
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: count it and leave the random row.
        words_not_fount += 1
        continue
    embedding_matrix[i] = embedding_vector


In [None]:
words_not_fount

### Model creation without hyperparameter tuning (working baseline)

In [None]:
# Encoder shared by both titles: embedding -> bidirectional LSTM (full sequence)
# -> LSTM that reduces the sequence to a single vector.
embedding = tf.keras.layers.Embedding(vocab_size, embedding_size,
                                      input_length=max_len, trainable=True)
sequence_encoder = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(100, return_sequences=True))
sequence_summary = tf.keras.layers.LSTM(100)

shared_model = tf.keras.models.Sequential([embedding, sequence_encoder, sequence_summary])

shared_model.summary()

In [None]:
# One integer input per title, each holding max_len token ids.
title1_input = tf.keras.layers.Input(shape=(max_len,), dtype='int32')
title2_input = tf.keras.layers.Input(shape=(max_len,), dtype='int32')

In [None]:
# Both titles go through the same encoder instance, so its weights are shared.
lstm1 = shared_model(title1_input)
lstm2 = shared_model(title2_input)

In [None]:
lstm1

In [None]:
# Join the two title encodings into a single feature vector.
merged = tf.keras.layers.concatenate([lstm1,lstm2])

In [None]:
# 4-unit ReLU layer on top of the concatenation. This rebinds `merged`, so
# re-running the cell on its own stacks a further Dense layer on the previous one.
merged = tf.keras.layers.Dense(4, activation='relu')(merged)

In [None]:
# 4 softmax outputs although there are only 3 labels: the label ids produced by
# label_tokenizer are 1..3, so output slot 0 never corresponds to a real label.
output = tf.keras.layers.Dense(4, activation='softmax')(merged)

In [None]:
# Two-input model: (title1 ids, title2 ids) -> label probabilities.
model = tf.keras.models.Model(inputs=[title1_input, title2_input], outputs=[output])

In [None]:
model.summary()

In [None]:
# Sparse categorical cross-entropy because the targets are integer label ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the baseline model and report validation metrics after every epoch.
num_epochs = 4
train_inputs = [x_train_seq['title1'], x_train_seq['title2']]
validation_inputs = [x_validation_seq['title1'], x_validation_seq['title2']]

trained_model = model.fit(
    train_inputs,
    np.array(y_train_seq),
    epochs=num_epochs,
    validation_data=(validation_inputs, np.array(y_validation_seq)),
    verbose=2,
)

In [None]:
# Final loss and accuracy of the baseline model on the validation split.
loss, accuracy = model.evaluate([x_validation_seq['title1'], x_validation_seq['title2']], np.array(y_validation_seq))

### Model creation with hyperparameter tuning

In [28]:
def get_opt_algo(algo, learning_rate):
    """Return a Keras optimizer instance for the given algorithm name.

    Args:
        algo: 'rmsprop' or 'adam'; any other value selects SGD.
        learning_rate: learning rate passed to the optimizer constructor.

    Returns:
        The constructed tf.keras optimizer object.
    """
    known_optimizers = {
        'rmsprop': tf.keras.optimizers.RMSprop,
        'adam': tf.keras.optimizers.Adam,
    }
    # Unrecognised names fall back to plain SGD for now.
    optimizer_class = known_optimizers.get(algo, tf.keras.optimizers.SGD)
    return optimizer_class(learning_rate)

In [29]:
def getEmbeddingLayer(embedding):
    """Create the trainable Embedding layer for one trial.

    Args:
        embedding: 'pretrained' to start from `embedding_matrix`; any other
            value gives a default-initialised layer of `vocab_size` rows.

    Returns:
        A tf.keras Embedding layer for inputs of length `max_len`.
    """
    if embedding != 'pretrained':
        return tf.keras.layers.Embedding(vocab_size, embedding_size,
                                         input_length=max_len, trainable=True)

    # Pretrained variant: one row per word_index entry (plus row 0), with
    # embedding_matrix as the initial weights.
    return tf.keras.layers.Embedding(len(word_index) + 1,
                                     embedding_size,
                                     weights=[embedding_matrix],
                                     input_length=max_len,
                                     trainable=True)

In [30]:
def get_f1(y_true, y_pred): #taken from old keras source code
    """F1 score computed from clipped-and-rounded tensors.

    Both tensors are clipped to [0, 1] and rounded before counting, and
    K.epsilon() is added to each denominator to avoid division by zero.
    NOTE(review): counting positives this way presumably expects 0/1 targets —
    confirm before using it with the integer label ids of this notebook.
    """
    def _rounded_total(tensor):
        # Count of entries that round to 1 after clipping to [0, 1].
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    eps = K.epsilon()
    hits = _rounded_total(y_true * y_pred)
    actual = _rounded_total(y_true)
    predicted = _rounded_total(y_pred)

    precision = hits / (predicted + eps)
    recall = hits / (actual + eps)
    return 2*(precision*recall)/(precision+recall+eps)

In [33]:
def train_and_test_model(hparams):
    """Build, train and evaluate the siamese LSTM for one hyperparameter set.

    Both titles go through one shared encoder (embedding -> bidirectional LSTM
    -> LSTM); the two encodings are concatenated and classified by a small
    dense head. Training and validation data come from the notebook globals
    x_train_seq / y_train_seq and x_validation_seq / y_validation_seq.

    Args:
        hparams: dict keyed by the HP_* HParam objects. HP_EMBEDDING_LAYER,
            HP_NUM_UNITS, HP_OPTIMIZER and HP_LEARNING_RATE are required.
            HP_DROPOUT (default 0.0) and HP_BATCH_SIZE (default: Keras'
            own default) are optional.

    Returns:
        (loss, accuracy) on the validation split.
    """
    num_units = hparams[HP_NUM_UNITS]
    # These two were swept by the grid search but never applied, so trials that
    # differed only in dropout or batch size trained the same configuration.
    # .get() keeps callers that omit them working as before.
    dropout_rate = hparams.get(HP_DROPOUT, 0.0)
    batch_size = hparams.get(HP_BATCH_SIZE)

    shared_model = tf.keras.models.Sequential([
        getEmbeddingLayer(hparams[HP_EMBEDDING_LAYER]),
        # Dropout goes on the first recurrent layer only, as in the earlier
        # commented-out draft of this function.
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(num_units, dropout=dropout_rate, return_sequences=True)),
        tf.keras.layers.LSTM(num_units),
    ])
    title1_input = tf.keras.layers.Input(shape=(max_len,), dtype='int32')
    title2_input = tf.keras.layers.Input(shape=(max_len,), dtype='int32')
    # Same encoder instance for both inputs, so the weights are shared.
    lstm1 = shared_model(title1_input)
    lstm2 = shared_model(title2_input)
    merged = tf.keras.layers.concatenate([lstm1,lstm2])
    dense = tf.keras.layers.Dense(4, activation='relu')(merged)
    # nb_labels + 1 outputs: the label ids are 1..nb_labels, so slot 0 is unused.
    output = tf.keras.layers.Dense(nb_labels + 1, activation='softmax')(dense)
    model = tf.keras.models.Model(inputs=[title1_input, title2_input], outputs=[output])
    opt_algo = get_opt_algo(hparams[HP_OPTIMIZER], hparams[HP_LEARNING_RATE])

    # Only accuracy is tracked; precision / recall / F1 are not wired in yet.
    model.compile(loss='sparse_categorical_crossentropy', optimizer=opt_algo, metrics=['accuracy'])

    num_epochs = 2
    train_inputs = [x_train_seq['title1'], x_train_seq['title2']]
    validation_inputs = [x_validation_seq['title1'], x_validation_seq['title2']]
    validation_labels = np.array(y_validation_seq)

    model.fit(train_inputs, np.array(y_train_seq),
              batch_size=batch_size,
              epochs=num_epochs,
              validation_data=(validation_inputs, validation_labels),
              verbose=2)

    loss, accuracy = model.evaluate(validation_inputs, validation_labels)
    return loss, accuracy

In [32]:
# def train_and_test_model(hparams):
    #sequence_input = tf.keras.layers.Input(shape=(max_len,), dtype='int32')
#     title1_input = tf.keras.layers.Input(shape=(max_len,), dtype='int32')
#     title2_input = tf.keras.layers.Input(shape=(max_len,), dtype='int32')
    
#     shared_model = tf.keras.models.Sequential([
#         getEmbeddingLayer(hparams[HP_EMBEDDING_LAYER]),#(title1_input),
#         #tf.keras.layers.Embedding(vocab_size, embedding_size, input_length=max_len, trainable=True),
#         tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hparams[HP_NUM_UNITS],dropout=hparams[HP_DROPOUT], return_sequences=True)),
#        # tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_size, return_sequences=True)),
#        # tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_size, return_sequences=True)),
#         tf.keras.layers.LSTM(hparams[HP_NUM_UNITS])
#         #tf.keras.layers.Dense(nb_labels, activation='softmax')
#     ])
#     lstm1 = shared_model(title1_input)
#     lstm2 = shared_model(title2_input)
# #     embedding_layer1 = getEmbeddingLayer(hparams[HP_EMBEDDING_LAYER])(title1_input)
# #     embedding_layer2 = getEmbeddingLayer(hparams[HP_EMBEDDING_LAYER])(title2_input)

# #     bi_lstm1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=hparams[HP_NUM_UNITS], 
# #                                             return_sequences = True))(embedding_layer1)
# #     bi_lstm2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=hparams[HP_NUM_UNITS], 
# #                                             return_sequences = True))(embedding_layer2)
#     #lstm1 = tf.keras.layers.LSTM(units=hparams[HP_NUM_UNITS])(bi_lstm1)
#     #lstm2 = tf.keras.layers.LSTM(units=hparams[HP_NUM_UNITS])(bi_lstm2)
#     merged = tf.keras.layers.concatenate([lstm1,lstm2])
#     dense = tf.keras.layers.Dense(4, activation='relu')(merged)
#     output = tf.keras.layers.Dense(4, activation='softmax')(dense)
#     model = tf.keras.models.Model(inputs=[title1_input, title2_input], outputs=[output])
#     opt_algo = get_opt_algo(hparams[HP_OPTIMIZER], hparams[HP_LEARNING_RATE])

#     model.compile(loss='sparse_categorical_crossentropy', optimizer=opt_algo, metrics=['accuracy'])#,
# #                                                                                       tf.keras.metrics.Precision(),
# #                                                                                       tf.keras.metrics.Recall(),
# #                                                                                       get_f1])
#     num_epochs = 4
#     trained_model = model.fit([x_train_seq['title1'], x_train_seq['title2']], np.array(y_train_seq),
#                                epochs=num_epochs,
#                                validation_data=([x_validation_seq['title1'], x_validation_seq['title2']], np.array(y_validation_seq))
#                                , verbose = 2)#,batch_size = hparams[HP_BATCH_SIZE])   
# #     ret_sequences = False
# #     if hparams[HP_ATTENTION] == 'yes':
# #         ret_sequences = True
# #     lstm, forward_h, forward_c, backward_h, backward_c = tf.keras.layers.Bidirectional \
# #                                                             (tf.keras.layers.LSTM
# #                                                              (units=hparams[HP_NUM_UNITS],
# #                                                               dropout=hparams[HP_DROPOUT],
# #                                                               activation='tanh',
# #                                                               return_sequences=ret_sequences,
# #                                                               return_state=True))(lstm)
# #     state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
# #     if hparams[HP_ATTENTION] == 'yes':
# #         context_vector, attention_weights = Attention(10)(lstm, state_h)
# #         output = tf.keras.layers.Dense(units=1, activation='sigmoid')(context_vector)
# #     else:
# #         output = tf.keras.layers.Dense(units=1, activation='sigmoid')(lstm)
#     #opt_algo = get_opt_algo(hparams[HP_OPTIMIZER], hparams[HP_LEARNING_RATE])
#     #model = tf.keras.Model(inputs=sequence_input, outputs=output)
    
# #     model.compile(optimizer=opt_algo, loss='binary_crossentropy',
# #                   metrics=['accuracy', 
# #                            tf.keras.metrics.Precision(),
# #                            tf.keras.metrics.Recall(), 
# #                            get_f1])
#     #model.fit(x_train[:100], y_train[:100], epochs=3, batch_size = hparams[HP_BATCH_SIZE])

#     loss, accuracy, prec, recall, f1 = model.evaluate([x_validation_seq['title1'], x_validation_seq['title2']], np.array(y_validation_seq))
#     return loss, accuracy, prec, recall, f1

In [34]:
def run(run_dir, hparams):
    """Run one trial and log its hyperparameters and metrics to `run_dir`.

    Args:
        run_dir: directory for this trial's TensorBoard summary files.
        hparams: dict keyed by the HP_* HParam objects, passed on unchanged
            to train_and_test_model.
    """
    summary_writer = tf.summary.create_file_writer(run_dir)
    with summary_writer.as_default():
        hp.hparams(hparams)  # record the values used in this trial
        loss, accuracy = train_and_test_model(hparams)
        # Only accuracy and loss are written; F1, precision and recall are not
        # returned by train_and_test_model and therefore not logged.
        for tag, value in ((METRIC_ACCURACY, accuracy), (METRIC_LOSS, loss)):
            tf.summary.scalar(tag, value, step=1)

In [35]:
# Exhaustive grid search: one trial per combination of the hyperparameter
# domains, each logged to its own run directory.
session_num = 0

# Argument order mirrors the nesting of the original loops (optimizer
# outermost, embedding innermost), so trials run in the same sequence.
search_grid = itertools.product(
    HP_OPTIMIZER.domain.values,
    HP_NUM_UNITS.domain.values,
    HP_BATCH_SIZE.domain.values,
    HP_DROPOUT.domain.values,
    HP_LEARNING_RATE.domain.values,
    HP_EMBEDDING_LAYER.domain.values,
)

for optimizer, num_units, batch_size, dropout_rate, learning_rate, embedding_layer in search_grid:
    hparams = {
        HP_NUM_UNITS: num_units,
        HP_DROPOUT: dropout_rate,
        HP_OPTIMIZER: optimizer,
        HP_EMBEDDING_LAYER: embedding_layer,
        HP_LEARNING_RATE: learning_rate,
        HP_BATCH_SIZE: batch_size,
    }
    run_name = "run-%d" % session_num
    print('--- Starting trial: %s' % run_name)
    print({h.name: hparams[h] for h in hparams})
    run('logs/hparam_tuning/' + run_name, hparams)
    session_num += 1

--- Starting trial: run-0
{'num_units': 50, 'dropout': 0.1, 'optimizer': 'adam', 'embedding': 'from_scratch', 'learning_rate': 0.001, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 17s - loss: 1.0829 - accuracy: 0.6162 - val_loss: 1.1488 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 3s - loss: 0.8191 - accuracy: 0.6162 - val_loss: 1.1308 - val_accuracy: 0.5150
--- Starting trial: run-1
{'num_units': 50, 'dropout': 0.1, 'optimizer': 'adam', 'embedding': 'pretrained', 'learning_rate': 0.001, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 20s - loss: 1.1614 - accuracy: 0.5975 - val_loss: 1.2301 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 2s - loss: 1.0928 - accuracy: 0.6162 - val_loss: 1.2220 - val_accuracy: 0.5150


--- Starting trial: run-2
{'num_units': 50, 'dropout': 0.1, 'optimizer': 'adam', 'embedding': 'from_scratch', 'learning_rate': 0.003, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 16s - loss: 0.9905 - accuracy: 0.6162 - val_loss: 1.0410 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 3s - loss: 0.7295 - accuracy: 0.6850 - val_loss: 1.1881 - val_accuracy: 0.4650


--- Starting trial: run-3
{'num_units': 50, 'dropout': 0.1, 'optimizer': 'adam', 'embedding': 'pretrained', 'learning_rate': 0.003, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 16s - loss: 1.0217 - accuracy: 0.4112 - val_loss: 1.3361 - val_accuracy: 0.5050
Epoch 2/2
800/800 - 2s - loss: 0.8381 - accuracy: 0.6212 - val_loss: 1.1547 - val_accuracy: 0.4500


--- Starting trial: run-4
{'num_units': 50, 'dropout': 0.1, 'optimizer': 'adam', 'embedding': 'from_scratch', 'learning_rate': 0.01, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 16s - loss: 0.9221 - accuracy: 0.5250 - val_loss: 1.1707 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 3s - loss: 0.7143 - accuracy: 0.6625 - val_loss: 1.4557 - val_accuracy: 0.5150


--- Starting trial: run-5
{'num_units': 50, 'dropout': 0.1, 'optimizer': 'adam', 'embedding': 'pretrained', 'learning_rate': 0.01, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 16s - loss: 0.8447 - accuracy: 0.5975 - val_loss: 1.1198 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 3s - loss: 0.8014 - accuracy: 0.6162 - val_loss: 1.1504 - val_accuracy: 0.5150


--- Starting trial: run-6
{'num_units': 50, 'dropout': 0.1, 'optimizer': 'adam', 'embedding': 'from_scratch', 'learning_rate': 0.03, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 15s - loss: 1.1530 - accuracy: 0.5263 - val_loss: 1.1314 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 3s - loss: 0.8687 - accuracy: 0.6200 - val_loss: 1.1043 - val_accuracy: 0.4950


--- Starting trial: run-7
{'num_units': 50, 'dropout': 0.1, 'optimizer': 'adam', 'embedding': 'pretrained', 'learning_rate': 0.03, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 14s - loss: 1.1560 - accuracy: 0.5875 - val_loss: 1.1209 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 3s - loss: 0.9009 - accuracy: 0.6162 - val_loss: 1.1008 - val_accuracy: 0.5150


--- Starting trial: run-8
{'num_units': 50, 'dropout': 0.1, 'optimizer': 'adam', 'embedding': 'from_scratch', 'learning_rate': 0.1, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 17s - loss: 1.0145 - accuracy: 0.5775 - val_loss: 1.2074 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 3s - loss: 0.8049 - accuracy: 0.5725 - val_loss: 1.2320 - val_accuracy: 0.5150


--- Starting trial: run-9
{'num_units': 50, 'dropout': 0.1, 'optimizer': 'adam', 'embedding': 'pretrained', 'learning_rate': 0.1, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 20s - loss: 1.0113 - accuracy: 0.5612 - val_loss: 1.0598 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 3s - loss: 0.8022 - accuracy: 0.6187 - val_loss: 1.1201 - val_accuracy: 0.5150


--- Starting trial: run-10
{'num_units': 50, 'dropout': 0.2, 'optimizer': 'adam', 'embedding': 'from_scratch', 'learning_rate': 0.001, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 25s - loss: 0.9993 - accuracy: 0.6162 - val_loss: 1.1054 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 5s - loss: 0.8004 - accuracy: 0.6162 - val_loss: 1.1320 - val_accuracy: 0.5150


--- Starting trial: run-11
{'num_units': 50, 'dropout': 0.2, 'optimizer': 'adam', 'embedding': 'pretrained', 'learning_rate': 0.001, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 26s - loss: 1.1741 - accuracy: 0.4137 - val_loss: 1.2707 - val_accuracy: 0.3300
Epoch 2/2
800/800 - 5s - loss: 0.9644 - accuracy: 0.3450 - val_loss: 1.1724 - val_accuracy: 0.3300


--- Starting trial: run-12
{'num_units': 50, 'dropout': 0.2, 'optimizer': 'adam', 'embedding': 'from_scratch', 'learning_rate': 0.003, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 38s - loss: 1.1554 - accuracy: 0.5738 - val_loss: 1.3108 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 6s - loss: 0.9019 - accuracy: 0.6975 - val_loss: 1.4519 - val_accuracy: 0.5400


--- Starting trial: run-13
{'num_units': 50, 'dropout': 0.2, 'optimizer': 'adam', 'embedding': 'pretrained', 'learning_rate': 0.003, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 19s - loss: 1.3603 - accuracy: 0.5788 - val_loss: 1.3404 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 4s - loss: 1.2957 - accuracy: 0.6162 - val_loss: 1.2995 - val_accuracy: 0.5150


--- Starting trial: run-14
{'num_units': 50, 'dropout': 0.2, 'optimizer': 'adam', 'embedding': 'from_scratch', 'learning_rate': 0.01, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 19s - loss: 0.9736 - accuracy: 0.6162 - val_loss: 1.1302 - val_accuracy: 0.5550
Epoch 2/2
800/800 - 5s - loss: 0.6879 - accuracy: 0.6787 - val_loss: 1.1480 - val_accuracy: 0.5150


--- Starting trial: run-15
{'num_units': 50, 'dropout': 0.2, 'optimizer': 'adam', 'embedding': 'pretrained', 'learning_rate': 0.01, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 22s - loss: 1.1343 - accuracy: 0.5750 - val_loss: 1.4423 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 5s - loss: 0.8903 - accuracy: 0.6162 - val_loss: 1.1517 - val_accuracy: 0.5100


--- Starting trial: run-16
{'num_units': 50, 'dropout': 0.2, 'optimizer': 'adam', 'embedding': 'from_scratch', 'learning_rate': 0.03, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 20s - loss: 0.9091 - accuracy: 0.5975 - val_loss: 1.1388 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 6s - loss: 0.8135 - accuracy: 0.5713 - val_loss: 1.0375 - val_accuracy: 0.5150


--- Starting trial: run-17
{'num_units': 50, 'dropout': 0.2, 'optimizer': 'adam', 'embedding': 'pretrained', 'learning_rate': 0.03, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 105s - loss: 0.9345 - accuracy: 0.5938 - val_loss: 1.0831 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 5s - loss: 0.7855 - accuracy: 0.6250 - val_loss: 1.1030 - val_accuracy: 0.4900


--- Starting trial: run-18
{'num_units': 50, 'dropout': 0.2, 'optimizer': 'adam', 'embedding': 'from_scratch', 'learning_rate': 0.1, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 21s - loss: 0.9636 - accuracy: 0.5838 - val_loss: 1.0533 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 5s - loss: 0.8167 - accuracy: 0.6162 - val_loss: 1.0372 - val_accuracy: 0.5150


--- Starting trial: run-19
{'num_units': 50, 'dropout': 0.2, 'optimizer': 'adam', 'embedding': 'pretrained', 'learning_rate': 0.1, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 19s - loss: 1.0032 - accuracy: 0.5238 - val_loss: 1.1742 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 5s - loss: 0.8103 - accuracy: 0.6162 - val_loss: 1.1534 - val_accuracy: 0.5150


--- Starting trial: run-20
{'num_units': 50, 'dropout': 0.3, 'optimizer': 'adam', 'embedding': 'from_scratch', 'learning_rate': 0.001, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 216s - loss: 1.0387 - accuracy: 0.5888 - val_loss: 1.0942 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 5s - loss: 0.7981 - accuracy: 0.6162 - val_loss: 1.2051 - val_accuracy: 0.5150


--- Starting trial: run-21
{'num_units': 50, 'dropout': 0.3, 'optimizer': 'adam', 'embedding': 'pretrained', 'learning_rate': 0.001, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 32s - loss: 1.1597 - accuracy: 0.4800 - val_loss: 1.5113 - val_accuracy: 0.5000
Epoch 2/2
800/800 - 5s - loss: 1.0140 - accuracy: 0.5400 - val_loss: 1.4930 - val_accuracy: 0.4550


--- Starting trial: run-22
{'num_units': 50, 'dropout': 0.3, 'optimizer': 'adam', 'embedding': 'from_scratch', 'learning_rate': 0.003, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 25s - loss: 1.3556 - accuracy: 0.5238 - val_loss: 1.3390 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 5s - loss: 1.2940 - accuracy: 0.6162 - val_loss: 1.2978 - val_accuracy: 0.5150


--- Starting trial: run-23
{'num_units': 50, 'dropout': 0.3, 'optimizer': 'adam', 'embedding': 'pretrained', 'learning_rate': 0.003, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 359s - loss: 0.8834 - accuracy: 0.5900 - val_loss: 1.2206 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 4s - loss: 0.7924 - accuracy: 0.6162 - val_loss: 1.2351 - val_accuracy: 0.5050


--- Starting trial: run-24
{'num_units': 50, 'dropout': 0.3, 'optimizer': 'adam', 'embedding': 'from_scratch', 'learning_rate': 0.01, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 17s - loss: 1.1306 - accuracy: 0.6075 - val_loss: 1.1416 - val_accuracy: 0.5900
Epoch 2/2
800/800 - 4s - loss: 0.7785 - accuracy: 0.7387 - val_loss: 1.2562 - val_accuracy: 0.5150


--- Starting trial: run-25
{'num_units': 50, 'dropout': 0.3, 'optimizer': 'adam', 'embedding': 'pretrained', 'learning_rate': 0.01, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 17s - loss: 0.9251 - accuracy: 0.5663 - val_loss: 1.1486 - val_accuracy: 0.5050
Epoch 2/2
800/800 - 4s - loss: 0.7885 - accuracy: 0.6263 - val_loss: 1.1250 - val_accuracy: 0.5000


--- Starting trial: run-26
{'num_units': 50, 'dropout': 0.3, 'optimizer': 'adam', 'embedding': 'from_scratch', 'learning_rate': 0.03, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 18s - loss: 0.9480 - accuracy: 0.6162 - val_loss: 1.1502 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 4s - loss: 0.7971 - accuracy: 0.6187 - val_loss: 1.1071 - val_accuracy: 0.5150


--- Starting trial: run-27
{'num_units': 50, 'dropout': 0.3, 'optimizer': 'adam', 'embedding': 'pretrained', 'learning_rate': 0.03, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 22s - loss: 0.9553 - accuracy: 0.5775 - val_loss: 1.0825 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 4s - loss: 0.8005 - accuracy: 0.6162 - val_loss: 1.1093 - val_accuracy: 0.5150


--- Starting trial: run-28
{'num_units': 50, 'dropout': 0.3, 'optimizer': 'adam', 'embedding': 'from_scratch', 'learning_rate': 0.1, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 17s - loss: 1.0706 - accuracy: 0.5650 - val_loss: 1.2304 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 5s - loss: 0.8139 - accuracy: 0.5663 - val_loss: 1.0889 - val_accuracy: 0.5150


--- Starting trial: run-29
{'num_units': 50, 'dropout': 0.3, 'optimizer': 'adam', 'embedding': 'pretrained', 'learning_rate': 0.1, 'batch_size': 64}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 18s - loss: 0.9518 - accuracy: 0.5487 - val_loss: 1.1733 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 4s - loss: 0.8130 - accuracy: 0.6162 - val_loss: 1.1599 - val_accuracy: 0.5150


--- Starting trial: run-30
{'num_units': 50, 'dropout': 0.1, 'optimizer': 'adam', 'embedding': 'from_scratch', 'learning_rate': 0.001, 'batch_size': 128}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 17s - loss: 1.1253 - accuracy: 0.6150 - val_loss: 1.4495 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 5s - loss: 0.9393 - accuracy: 0.6162 - val_loss: 1.4285 - val_accuracy: 0.5150


--- Starting trial: run-31
{'num_units': 50, 'dropout': 0.1, 'optimizer': 'adam', 'embedding': 'pretrained', 'learning_rate': 0.001, 'batch_size': 128}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 521s - loss: 1.0336 - accuracy: 0.3350 - val_loss: 1.1474 - val_accuracy: 0.3300
Epoch 2/2
800/800 - 5s - loss: 0.9276 - accuracy: 0.3450 - val_loss: 1.1361 - val_accuracy: 0.3300


--- Starting trial: run-32
{'num_units': 50, 'dropout': 0.1, 'optimizer': 'adam', 'embedding': 'from_scratch', 'learning_rate': 0.003, 'batch_size': 128}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 18s - loss: 1.2356 - accuracy: 0.4550 - val_loss: 1.4329 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 5s - loss: 1.0175 - accuracy: 0.6162 - val_loss: 1.3437 - val_accuracy: 0.5150


--- Starting trial: run-33
{'num_units': 50, 'dropout': 0.1, 'optimizer': 'adam', 'embedding': 'pretrained', 'learning_rate': 0.003, 'batch_size': 128}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 17s - loss: 1.1632 - accuracy: 0.3350 - val_loss: 1.3542 - val_accuracy: 0.3300
Epoch 2/2
800/800 - 5s - loss: 0.9334 - accuracy: 0.5163 - val_loss: 1.2805 - val_accuracy: 0.5100


--- Starting trial: run-34
{'num_units': 50, 'dropout': 0.1, 'optimizer': 'adam', 'embedding': 'from_scratch', 'learning_rate': 0.01, 'batch_size': 128}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 19s - loss: 0.9353 - accuracy: 0.5938 - val_loss: 1.1622 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 6s - loss: 0.8203 - accuracy: 0.6162 - val_loss: 1.1347 - val_accuracy: 0.5150


--- Starting trial: run-35
{'num_units': 50, 'dropout': 0.1, 'optimizer': 'adam', 'embedding': 'pretrained', 'learning_rate': 0.01, 'batch_size': 128}
Train on 800 samples, validate on 200 samples
Epoch 1/2
800/800 - 17s - loss: 1.0398 - accuracy: 0.6087 - val_loss: 1.2910 - val_accuracy: 0.5150
Epoch 2/2
800/800 - 5s - loss: 0.8237 - accuracy: 0.6162 - val_loss: 1.2080 - val_accuracy: 0.5150


--- Starting trial: run-36
{'num_units': 50, 'dropout': 0.1, 'optimizer': 'adam', 'embedding': 'from_scratch', 'learning_rate': 0.03, 'batch_size': 128}
Train on 800 samples, validate on 200 samples
Epoch 1/2


KeyboardInterrupt: 

### Predictions on test data (baseline model, without hyperparameter tuning)

In [None]:
# Per-class probabilities for every test pair.
# NOTE(review): `model` is the global from the "without hyperparameter tuning"
# section; the models built inside train_and_test_model are local to that
# function, so those baseline cells must have been run first.
preds = model.predict([x_test_seq['title1'], x_test_seq['title2']], verbose=1)
# Disabled idea: average with the prediction for the swapped title order.
#preds += model.predict([x_test_seq['title2'], x_test_seq['title1']], verbose=1)
#preds /= 2

In [None]:
preds

In [None]:
# Index of the most probable class for every test row. np.argmax returns the
# first maximum, matching the strict '>' of the hand-written loop it replaces,
# and no longer hard-codes the number of output units.
results = np.argmax(preds, axis=1).tolist()

In [None]:
# Invert the fitted label vocabulary rather than hard-coding it: the ids depend
# on label frequency in the training data, so a literal mapping could silently
# go stale.
index_to_label = {index: label for label, index in word_index_label.items()}

# Class index 0 has no label (ids start at 1). Such predictions keep the
# previous fallback of "disagreed".
pred_labels = [index_to_label.get(a, "disagreed") for a in results]
#pred_labels

In [None]:
# Write the submission file: one (test id, predicted label) row per prediction.
with open('sample_submission.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['Id','Category'])
    for position, label in enumerate(pred_labels):
        # Predictions are in df_test row order, so position selects the matching id.
        writer.writerow([df_test.index[position], label])