# Textual entailment task

In [1]:
import numpy as np
import csv
import matplotlib.pyplot as plt
import pandas as pd
import json
import string
import collections
import itertools
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from keras_preprocessing import sequence, text

In [2]:
# Report how many GPU devices TensorFlow lists on this machine.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  0


In [3]:
# The FEVER file is read as JSON Lines (one JSON record per line) further down.
# If it was downloaded with a `.jsonl` extension (the last character is presumably
# the letter "l", not the digit "1"), rename it to `.json` so that it matches the
# path below.
datafile_fever = 'data/fever2-fixers-dev.json'
datafile_train = 'data/train.csv'

In [4]:
# Load the training pairs, use the `id` column as the index and order rows by id.
df_train = pd.read_csv(datafile_train, index_col='id')
df_train = df_train.sort_index()
df_train.shape

(320552, 7)

In [5]:
# Load the test pairs, use the `id` column as the index and order rows by id.
df_test = pd.read_csv('data/test.csv', index_col='id')
df_test = df_test.sort_index()
#df_test = df_test.head(1000)
df_test.shape

(80126, 6)

In [6]:
# Read the FEVER file as JSON Lines: every line of the file is one JSON record.
df_fever = pd.read_json(path_or_buf=datafile_fever, lines=True)  #,orient='table')
df_fever.shape

(1174, 8)

In [7]:
#df_train.head()

In [8]:
#df_fever.head()

In [9]:
#df_fever[5:10]

# Data exploration

In [10]:
# null_counts = df_fever.isnull().sum()
# null_counts[null_counts > 0].sort_values(ascending=False)

In [11]:
# null_counts = df_train.isnull().sum()
# null_counts[null_counts > 0].sort_values(ascending=False)

In [12]:
# No id 247 !
#df_train['tid1'][247]

#### The label distribution is heavily imbalanced (68% of the pairs are labelled "unrelated"), so the LSTM may predict this label more often than it should. This is why a BERT model also has to be implemented, so that the two models can be compared.

In [13]:
# from collections import Counter
# Counter(df_train.label)

In [14]:
# 219313/len(df_train)

In [15]:
# print("Min nb words title 1  :",df_train['title1_en'].apply(lambda x: len(x.split(" "))).min())
# print("Min nb words title 2  :",df_train['title2_en'].apply(lambda x: len(x.split(" "))).min())
# print("Max nb words title 1  :",df_train['title1_en'].apply(lambda x: len(x.split(" "))).max())
# print("Max nb words title 2  :",df_train['title2_en'].apply(lambda x: len(x.split(" "))).max())
# print("Mean nb words title 1 :",df_train['title1_en'].apply(lambda x: len(x.split(" "))).mean())
# print("Mean nb words title 2 :",df_train['title2_en'].apply(lambda x: len(x.split(" "))).mean())

# Preprocessing
- Cleaning data
- Lower case
- Deal with N/A and NaN

In [16]:
# Normalise every text column the same way:
#   1. replace missing values with an empty string, so that later steps only
#      ever receive `str` objects (NaN would otherwise pass through the `.str`
#      methods unchanged and reach the tokenizer as a float),
#   2. lower-case the text,
#   3. strip ASCII punctuation.
translator = str.maketrans('', '', string.punctuation)
for frame, columns in (
    (df_train, ['title1_en', 'title2_en']),
    (df_test, ['title1_en', 'title2_en']),
    (df_fever, ['claim']),
):
    for column in columns:
        frame[column] = frame[column].fillna('').str.lower().str.translate(translator)

In [17]:
#df_train.head()

# LSTM 

In [18]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [19]:
# --- Vocabulary / sequence settings ---
vocab_size = 10000          # vocabulary cap for the tokenizer and input size of the embedding
nb_labels = 3               # number of distinct labels
embedding_size = 100        # width of each word embedding
lstm_size = 100             # units per LSTM layer
max_len = 35                # every title is padded / truncated to this many tokens
trunc_type = 'post'         # cut over-long titles at the end
padding_type = 'post'       # pad short titles at the end
oov_tok = '<OOV>'           # placeholder token for out-of-vocabulary words
training_portion = .8       # share of the rows used for training, the rest is validation

# --- Dropout rates ---
# Both rates are drawn uniformly from [0.15, 0.40). They come from a private,
# seeded generator so that a fresh "Restart & Run All" yields the same values
# and the global NumPy random state is left untouched.
dropout_seed = 42
_rng = np.random.RandomState(dropout_seed)
rate_drop_lstm = 0.15 + _rng.rand() * 0.25
rate_drop_dense = 0.15 + _rng.rand() * 0.25

In [20]:
# Keep only the first 10000 rows of the training frame for the LSTM experiment.
df_train = df_train.iloc[:10000]

In [21]:
# Positional split: the first `training_portion` of the rows is used for
# training, the remaining rows for validation. The test frame is used whole.
title_columns = ['title1_en', 'title2_en']
train_size = int(len(df_train) * training_portion)

x_train = df_train[title_columns].iloc[:train_size]
y_train = df_train['label'].iloc[:train_size]
x_validation = df_train[title_columns].iloc[train_size:]
y_validation = df_train['label'].iloc[train_size:]
x_test = df_test[title_columns]

In [22]:
#y_validation

In [23]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
# Fit on the two title columns stacked one after the other, so that every title
# is its own document. Adding the two Series with `+` would join the strings of
# each row without a separator and fuse the last word of title 1 with the first
# word of title 2 into a single bogus token.
tokenizer.fit_on_texts(pd.concat([df_train['title1_en'], df_train['title2_en']]))
#later we'll have to check the number of unknown words in the test data
word_index = tokenizer.word_index
# Show the first ten vocabulary entries.
dict(list(word_index.items())[0:10])

{'<OOV>': 1,
 'the': 2,
 'of': 3,
 'to': 4,
 'in': 5,
 'a': 6,
 '2018': 7,
 'and': 8,
 'will': 9,
 'is': 10}

In [24]:
# Turn both training title columns into fixed-length integer sequences.
# `X` and `x_train_seq` are two names for the same dictionary.
X = x_train_seq = {'title1': x_train['title1_en'], 'title2': x_train['title2_en']}

for side in ('title1', 'title2'):
    token_ids = tokenizer.texts_to_sequences(x_train_seq[side])
    x_train_seq[side] = pad_sequences(token_ids, maxlen=max_len, padding=padding_type, truncating=trunc_type)

In [25]:
# Turn both validation title columns into fixed-length integer sequences.
# `X_val` and `x_validation_seq` are two names for the same dictionary.
X_val = x_validation_seq = {'title1': x_validation['title1_en'], 'title2': x_validation['title2_en']}

for side in ('title1', 'title2'):
    token_ids = tokenizer.texts_to_sequences(x_validation_seq[side])
    x_validation_seq[side] = pad_sequences(token_ids, maxlen=max_len, padding=padding_type, truncating=trunc_type)

In [26]:
# Turn both test title columns into fixed-length integer sequences.
# Note that `X` is rebound here to the test dictionary (same object as `x_test_seq`).
X = x_test_seq = {'title1': x_test['title1_en'], 'title2': x_test['title2_en']}

for side in ('title1', 'title2'):
    token_ids = tokenizer.texts_to_sequences(x_test_seq[side])
    x_test_seq[side] = pad_sequences(token_ids, maxlen=max_len, padding=padding_type, truncating=trunc_type)

In [27]:
# Encode the label strings as integer ids with a dedicated tokenizer.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(df_train['label'])
word_index_label = label_tokenizer.word_index

# Each label becomes a one-element list holding its id.
y_train_seq, y_validation_seq = (
    label_tokenizer.texts_to_sequences(labels) for labels in (y_train, y_validation)
)
# Show the label -> id mapping.
dict(list(word_index_label.items())[:10])

{'unrelated': 1, 'agreed': 2, 'disagreed': 3}

In [28]:
# Invert the tokenizer vocabulary: integer id -> word.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_title(text):
    """Return the space-separated words for a sequence of token ids.

    Every id is looked up in the module-level ``reverse_word_index``; ids that
    are not present there are rendered as ``'?'``.
    """
    words = []
    for token_id in text:
        words.append(reverse_word_index.get(token_id, '?'))
    return ' '.join(words)
# Sanity check: decode one padded training sequence and print the cleaned title
# it was built from. `x_train_seq` is indexed by position, so the pandas lookup
# uses `.iloc` too; a plain `[59]` would instead fetch the row whose `id` label
# is 59, which is a different row (or a KeyError) whenever ids have gaps.
sample_position = 59
print(decode_title(x_train_seq['title2'][sample_position]))
print('---')
print(x_train['title2_en'].iloc[sample_position])

the 315s the rumour spinach is a greased vegetable that can be made iron by eating it ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
---
the 315s the rumour spinach is a greased vegetable that can be made iron by eating it


In [29]:
# Encoder that is applied to both titles: embedding -> bidirectional LSTM -> LSTM.
# NOTE(review): `pad_sequences` pads with 0 and the Embedding is not created with
# `mask_zero`, so padded positions are fed to the LSTMs like real tokens —
# confirm this is intended.
shared_model = tf.keras.models.Sequential()
shared_model.add(tf.keras.layers.Embedding(vocab_size, embedding_size, input_length=max_len, trainable=True))
shared_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_size, return_sequences=True)))
# shared_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_size, return_sequences=True)))
# shared_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_size, return_sequences=True)))
shared_model.add(tf.keras.layers.LSTM(lstm_size))
# shared_model.add(tf.keras.layers.Dense(nb_labels, activation='softmax'))

shared_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 35, 100)           1000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 35, 200)           160800    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               120400    
Total params: 1,281,200
Trainable params: 1,281,200
Non-trainable params: 0
_________________________________________________________________


In [30]:
# One integer-id input per title, each holding `max_len` token ids.
title1_input = tf.keras.Input(shape=(max_len,), dtype='int32')
title2_input = tf.keras.Input(shape=(max_len,), dtype='int32')

In [31]:
# Run both titles through the same encoder instance so that they share weights.
lstm1, lstm2 = (shared_model(title_input) for title_input in (title1_input, title2_input))

In [32]:
# Display the symbolic output tensor of the shared encoder for the first title.
lstm1

<tf.Tensor 'sequential/Identity:0' shape=(None, 100) dtype=float32>

In [33]:
# Join the two title encodings along the last axis.
merged = tf.keras.layers.Concatenate()([lstm1, lstm2])

In [34]:
# Small fully connected ReLU layer on top of the concatenated encodings.
# Re-running this cell on its own stacks another layer, because `merged` is rebound.
hidden_units = 4
merged = tf.keras.layers.Dense(units=hidden_units, activation='relu')(merged)

In [35]:
# Softmax over the label ids. The ids produced by the label tokenizer start at 1,
# so one extra unit is needed for the unused id 0: nb_labels + 1 outputs in total
# (derived from the config instead of a hard-coded 4).
output = tf.keras.layers.Dense(nb_labels + 1, activation='softmax')(merged)

In [36]:
# Two-input model: (title 1 ids, title 2 ids) -> class probabilities.
model = tf.keras.Model(
    inputs=[title1_input, title2_input],
    outputs=[output],
)

In [37]:
# Print the layer-by-layer overview of the two-input model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 35)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 35)]         0                                            
__________________________________________________________________________________________________
sequential (Sequential)         (None, 100)          1281200     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 200)          0           sequential[1][0]             

In [38]:
# Targets are integer class ids, hence the sparse categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [39]:
# Train on the padded title pairs and evaluate on the validation pairs after each epoch.
num_epochs = 10
train_inputs = [x_train_seq['title1'], x_train_seq['title2']]
validation_inputs = [x_validation_seq['title1'], x_validation_seq['title2']]
trained_model = model.fit(
    train_inputs,
    np.array(y_train_seq),
    epochs=num_epochs,
    validation_data=(validation_inputs, np.array(y_validation_seq)),
    verbose=2,
)

Train on 8000 samples, validate on 2000 samples
Epoch 1/10




Epoch 2/10




Epoch 3/10




Epoch 4/10




Epoch 5/10




Epoch 6/10




Epoch 7/10




Epoch 8/10




Epoch 9/10




Epoch 10/10






### Predictions on test data

In [40]:
# Class probabilities for every test pair.
test_inputs = [x_test_seq['title1'], x_test_seq['title2']]
preds = model.predict(test_inputs, verbose=1)
#preds += model.predict([x_test_seq['title2'], x_test_seq['title1']], verbose=1)
#preds /= 2



 - 246s 3ms/sample


In [45]:
# Display the predicted probabilities: one row per test pair, one column per class id.
preds

array([[1.4370457e-04, 6.0671502e-01, 3.8316748e-01, 9.9737570e-03],
       [1.5589511e-03, 4.4741649e-02, 9.5165676e-01, 2.0425969e-03],
       [1.5216429e-05, 9.5796877e-01, 2.2034671e-02, 1.9981317e-02],
       ...,
       [3.9497036e-03, 7.0949930e-01, 7.5444028e-02, 2.1110700e-01],
       [1.0813887e-02, 6.0856962e-01, 1.0800331e-01, 2.7261311e-01],
       [6.5109832e-04, 8.3938020e-01, 4.0137284e-02, 1.1983143e-01]],
      dtype=float32)

In [42]:
# Pick the most probable label id for every test pair.
# Column 0 of `preds` corresponds to id 0, which the label tokenizer never
# assigns (label ids start at 1), so it is left out of the arg-max and the
# result is shifted back by one. The single vectorised call also replaces the
# hand-written double loop and its hard-coded class count.
results = (np.argmax(preds[:, 1:], axis=1) + 1).tolist()

In [43]:
# Map each predicted class id back to its label text using the vocabulary the
# label tokenizer actually learned. Ids are assigned by label frequency, so a
# hard-coded id -> label table would silently go wrong if that order changed.
index_to_label = {index: label for label, index in label_tokenizer.word_index.items()}
# Ids without a label (the unused id 0) keep falling back to "disagreed".
pred_labels = [index_to_label.get(a, "disagreed") for a in results]
#pred_labels

In [44]:
# Write the submission file: a header row, then one (id, label) row per prediction.
with open('sample_submission.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['Id','Category'])
    for i, label in enumerate(pred_labels):
        writer.writerow([df_test.index[i], label])