# Textual entailment task

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import string
import collections
from tqdm import tqdm
from tensorflow.keras import Model, layers
from keras.layers.merge import concatenate
from keras_preprocessing import sequence, text
from keras.models import Sequential
from tensorflow.keras.layers import Concatenate, Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Flatten

Using TensorFlow backend.


In [2]:
#If the file is downloaded to the format .json1 you have to rename the file and remove the "1" at the end.
#Otherwise it's impossible to read the file
datafile_fever = 'data/fever2-fixers-dev.json'
datafile_train = 'data/train.csv'

In [3]:
df_train = pd.read_csv(datafile_train, index_col='id').sort_index()
df_train.shape

(320552, 7)

In [4]:
df_fever = pd.read_json(datafile_fever, lines=True)#,orient='table')
df_fever.shape

(1174, 8)

In [5]:
df_train.head()

Unnamed: 0_level_0,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,1,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,There are two new old-age insurance benefits f...,"Police disprove ""bird's nest congress each per...",unrelated
1,2,4,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,"""If you do not come to Shenzhen, sooner or lat...",The GDP overtopped Hong Kong? Shenzhen clarifi...,unrelated
2,2,5,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",去年深圳GDP首超香港？深圳统计局辟谣：还差611亿,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP topped Hong Kong last year? She...,unrelated
3,2,3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP outstrips Hong Kong? Shenzhen S...,unrelated
4,2,8,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？统计局辟谣：未超但差距再度缩小,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP overtakes Hong Kong? Bureau of ...,unrelated


In [None]:
df_fever.head()

In [None]:
df_fever[5:10]

# Data exploration

In [None]:
null_counts = df_fever.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

In [None]:
null_counts = df_train.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

In [None]:
# No id 247 !
#df_train['tid1'][247]

In [None]:
print("Min nb words title 1  :",df_train['title1_en'].apply(lambda x: len(x.split(" "))).min())
print("Min nb words title 2  :",df_train['title2_en'].apply(lambda x: len(x.split(" "))).min())
print("Max nb words title 1  :",df_train['title1_en'].apply(lambda x: len(x.split(" "))).max())
print("Max nb words title 2  :",df_train['title2_en'].apply(lambda x: len(x.split(" "))).max())
print("Mean nb words title 1 :",df_train['title1_en'].apply(lambda x: len(x.split(" "))).mean())
print("Mean nb words title 2 :",df_train['title2_en'].apply(lambda x: len(x.split(" "))).mean())

# Preprocessing
- Cleaning data
- Lower case
- Deal with N/A and NaN

In [6]:
translator = str.maketrans('','', string.punctuation)
df_train['title1_en'] = df_train['title1_en'].str.lower().str.translate(translator)
df_train['title2_en'] = df_train['title2_en'].str.lower().str.translate(translator)
df_fever['claim']     = df_fever['claim'].str.lower().str.translate(translator)

In [None]:
df_train.head()

# LSTM 

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [8]:
vocab_size = 20000
nb_labels = 3+1
embedding_size = 300
lstm_size = 200
max_len = 35
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_portion = .8

rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25

In [None]:
# Hyperparameter kaggle qui me plait mucho
#MAX_SEQUENCE_LENGTH = 30
#MAX_NB_WORDS = 200000
#EMBEDDING_DIM = 300
#VALIDATION_SPLIT = 0.1

#num_lstm = 200
# #num_dense = 125
# rate_drop_lstm = 0.15 + np.random.rand() * 0.25
# rate_drop_dense = 0.15 + np.random.rand() * 0.25

# act = 'relu'
# re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

# STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \
#         rate_drop_dense)

In [9]:
df_train = df_train.head(10000)

In [10]:
train_size = int(len(df_train['title1_en']) * training_portion)

train_title1 = df_train['title1_en'][0: train_size]
train_title2 = df_train['title2_en'][0: train_size]
train_labels = df_train['label'][0: train_size]

validation_titles1 = df_train['title1_en'][train_size:]
validation_titles2 = df_train['title2_en'][train_size:]

validation_labels = df_train['label'][train_size:]

In [None]:
validation_titles1

In [11]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(df_train['title1_en']+df_train['title2_en'])
#later we'll have to check the number of unknown words in the test data
word_index = tokenizer.word_index
dict(list(word_index.items())[0:10])

{'<OOV>': 1,
 'the': 2,
 'of': 3,
 'to': 4,
 'in': 5,
 'a': 6,
 '2018': 7,
 'and': 8,
 'will': 9,
 'is': 10}

In [12]:
train_sequences1 = tokenizer.texts_to_sequences(train_title1)
print(train_sequences1[0])

[112, 23, 58, 12, 240, 85, 233, 13, 73, 28, 5, 16, 69, 22, 14, 474, 222]


In [13]:
train_sequences2 = tokenizer.texts_to_sequences(train_title2)
print(train_sequences2[10])

[259, 6, 2188, 4, 5766, 220, 48, 220, 284, 1905]


In [14]:
train_padded1 = pad_sequences(train_sequences1, maxlen=max_len, padding=padding_type, truncating=trunc_type)
train_padded2 = pad_sequences(train_sequences2, maxlen=max_len, padding=padding_type, truncating=trunc_type)

In [15]:
validation_sequences1 = tokenizer.texts_to_sequences(validation_titles1)
validation_sequences2 = tokenizer.texts_to_sequences(validation_titles2)

validation_padded1 = pad_sequences(validation_sequences1, maxlen=max_len, padding=padding_type, truncating=trunc_type)
validation_padded2 = pad_sequences(validation_sequences2, maxlen=max_len, padding=padding_type, truncating=trunc_type)

In [16]:
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(df_train['label'])

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

In [None]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_title(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
print(decode_title(train_padded2[59]))
print('---')
print(train_title2[59])

In [25]:
#embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_size, 
#                                            input_length=max_len, trainable=True)
shared_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_size, input_length=max_len, trainable=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_size, return_sequences=True)),
   # tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_size, return_sequences=True)),
   # tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_size, return_sequences=True)),
    tf.keras.layers.LSTM(lstm_size),
    tf.keras.layers.Dense(nb_labels, activation='softmax')
])

shared_model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 35, 300)           6000000   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 35, 400)           801600    
_________________________________________________________________
lstm_13 (LSTM)               (None, 200)               480800    
_________________________________________________________________
dense_5 (Dense)              (None, 4)                 804       
Total params: 7,283,204
Trainable params: 7,283,204
Non-trainable params: 0
_________________________________________________________________


In [18]:
model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history_lstm = model_lstm.fit(train_padded1, training_label_seq, 
                              epochs=10,validation_data=(validation_padded1, validation_label_seq),verbose=2)

Train on 8000 samples, validate on 2000 samples
Epoch 1/10
8000/8000 - 100s - loss: 0.6743 - accuracy: 0.7054 - val_loss: 0.7173 - val_accuracy: 0.7095
Epoch 2/10
8000/8000 - 101s - loss: 0.5055 - accuracy: 0.8156 - val_loss: 0.7026 - val_accuracy: 0.6940
Epoch 3/10
8000/8000 - 100s - loss: 0.4418 - accuracy: 0.8382 - val_loss: 0.7720 - val_accuracy: 0.6755
Epoch 4/10
8000/8000 - 100s - loss: 0.4028 - accuracy: 0.8446 - val_loss: 0.7842 - val_accuracy: 0.6885
Epoch 5/10
8000/8000 - 100s - loss: 0.3725 - accuracy: 0.8528 - val_loss: 0.7789 - val_accuracy: 0.7065
Epoch 6/10
8000/8000 - 102s - loss: 0.3473 - accuracy: 0.8574 - val_loss: 1.0138 - val_accuracy: 0.4785
Epoch 7/10
8000/8000 - 101s - loss: 0.3291 - accuracy: 0.8594 - val_loss: 0.8305 - val_accuracy: 0.5880
Epoch 8/10
8000/8000 - 102s - loss: 0.3189 - accuracy: 0.8605 - val_loss: 1.0291 - val_accuracy: 0.5870
Epoch 9/10
8000/8000 - 102s - loss: 0.3056 - accuracy: 0.8673 - val_loss: 1.0325 - val_accuracy: 0.5860
Epoch 10/10
8000

In [None]:
embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_size, 
                                            input_length=max_len, trainable=True)
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

merged = Concatenate([x1, y1])
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

preds = Dense(1, activation='sigmoid')(merged)

In [None]:
model = tf.keras.Sequential([
    # Add an Embedding layer expecting input vocab of size 5000, and output embedding dimension of size 64 we set at the top
    tf.keras.layers.Embedding(vocab_size, embedding_size),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_size)),
#    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    # use ReLU in place of tanh function since they are very good alternatives of each other.
    tf.keras.layers.Dense(embedding_size, activation='relu'),
    # Add a Dense layer with 6 units and softmax activation.
    # When we have multiple outputs, softmax convert outputs layers into a probability distribution.
    tf.keras.layers.Dense(6, activation='softmax')
])
model.summary()

In [None]:
lstm1 = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_size),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1024,activation='tanh')),
    tf.keras.layers.Dense(128, activation='linear'),
])  
lstm1.summary()

In [None]:
lstm2 = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_size),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1024,activation='tanh')),
    tf.keras.layers.Dense(128, activation='linear'),
]) 
lstm2.summary()

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Input([lstm1,lstm2]),
    tf.keras.layers.Dense()
])
model.summary()

In [None]:
validation_padded1

In [None]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
num_epochs = 10
history = model.fit(x=[train_padded1,train_padded2], y=training_label_seq, epochs=num_epochs, validation_data=((validation_padded1,validation_padded2), validation_label_seq), verbose=2)

# BERT 