# Textual entailment task

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import string
import collections
import itertools
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from keras_preprocessing import sequence, text

In [5]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


In [6]:
#If the file is downloaded to the format .json1 you have to rename the file and remove the "1" at the end.
#Otherwise it's impossible to read the file
datafile_fever = 'data/fever2-fixers-dev.json'
datafile_train = 'data/train.csv'

In [7]:
df_train = pd.read_csv(datafile_train, index_col='id').sort_index()
df_train.shape

(320552, 7)

In [8]:
df_fever = pd.read_json(datafile_fever, lines=True)#,orient='table')
df_fever.shape

(1174, 8)

In [9]:
df_train.head()

Unnamed: 0_level_0,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,1,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,There are two new old-age insurance benefits f...,"Police disprove ""bird's nest congress each per...",unrelated
1,2,4,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,"""If you do not come to Shenzhen, sooner or lat...",The GDP overtopped Hong Kong? Shenzhen clarifi...,unrelated
2,2,5,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",去年深圳GDP首超香港？深圳统计局辟谣：还差611亿,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP topped Hong Kong last year? She...,unrelated
3,2,3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP outstrips Hong Kong? Shenzhen S...,unrelated
4,2,8,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？统计局辟谣：未超但差距再度缩小,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP overtakes Hong Kong? Bureau of ...,unrelated


In [10]:
df_fever.head()

Unnamed: 0,id,label,claim,evidence,original_id,transformation,attack,annotation
0,500000,NOT ENOUGH INFO,There is a convicted statutory rapist called C...,"[[[269158, None, None, None]]]",225798.0,label_preserving,there.is.a.called,
1,500001,SUPPORTS,There exists a producer and an actor called Si...,"[[[141141, 156349, Simon_Pegg, 0]]]",120126.0,label_preserving,there.exists.a.called,
2,500002,REFUTES,Exotic Birds rejected to be an opening band fo...,"[[[25977, 31918, Exotic_Birds, 2], [25977, 319...",,,word replacement,OK - Claim is grammatical and label supported ...
3,500003,REFUTES,The Nice Guys is a 2016 American neo-noir acti...,"[[[None, None, The_Nice_Guys, 0], [None, None,...",,,Multihop,OK - Claim is grammatical and label supported ...
4,500004,REFUTES,Rupert Murdoch's father was not connected to a...,"[[[None, None, Rupert_Murdoch, 1], [None, None...",,,Multihop,OK - Claim is grammatical and label supported ...


In [11]:
df_fever[5:10]

Unnamed: 0,id,label,claim,evidence,original_id,transformation,attack,annotation
5,500005,NOT ENOUGH INFO,"There exists an award-winning TV series, it go...","[[[22421, None, None, None]]]",5743.0,label_preserving,there.exists.a.that.goes.by.name.of.prn,
6,500006,NOT ENOUGH INFO,Omar Khadr was declared guilty and was detaine...,"[[[None, None, None, None]]]",,,conjunction,UN - Claim is grammatical but label is incorre...
7,500007,SUPPORTS,Robert Kardashian is an ex-husband of a mother...,"[[[None, None, Robert_Kardashian, 2], [None, N...",,,long chain of relations,OK - Claim is grammatical and label supported ...
8,500008,Not Enough Info,Antoine Berjon have studied medicine in his ea...,"[[[58, 97, None, None]]]",,,NotClear,UN - Claim is grammatical but label is incorre...
9,500009,SUPPORTS,There is not a natural element that goes by th...,"[[[130895, 145673, Moscovium, 0]], [[130895, 1...",111503.0,complex_negate,there.is.not.by.name,


# Data exploration

In [12]:
null_counts = df_fever.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

transformation    676
original_id       676
dtype: int64

In [13]:
null_counts = df_train.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

title2_zh    7
dtype: int64

In [14]:
# No id 247 !
#df_train['tid1'][247]

In [15]:
print("Min nb words title 1  :",df_train['title1_en'].apply(lambda x: len(x.split(" "))).min())
print("Min nb words title 2  :",df_train['title2_en'].apply(lambda x: len(x.split(" "))).min())
print("Max nb words title 1  :",df_train['title1_en'].apply(lambda x: len(x.split(" "))).max())
print("Max nb words title 2  :",df_train['title2_en'].apply(lambda x: len(x.split(" "))).max())
print("Mean nb words title 1 :",df_train['title1_en'].apply(lambda x: len(x.split(" "))).mean())
print("Mean nb words title 2 :",df_train['title2_en'].apply(lambda x: len(x.split(" "))).mean())

Min nb words title 1  : 1
Min nb words title 2  : 1
Max nb words title 1  : 500
Max nb words title 2  : 539
Mean nb words title 1 : 16.383588310164967
Mean nb words title 2 : 16.572528014175546


# Preprocessing
- Cleaning data
- Lower case
- Deal with N/A and NaN

In [16]:
translator = str.maketrans('','', string.punctuation)
df_train['title1_en'] = df_train['title1_en'].str.lower().str.translate(translator)
df_train['title2_en'] = df_train['title2_en'].str.lower().str.translate(translator)
df_fever['claim']     = df_fever['claim'].str.lower().str.translate(translator)

In [17]:
df_train.head()

Unnamed: 0_level_0,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,1,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,there are two new oldage insurance benefits fo...,police disprove birds nest congress each perso...,unrelated
1,2,4,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,if you do not come to shenzhen sooner or later...,the gdp overtopped hong kong shenzhen clarifie...,unrelated
2,2,5,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",去年深圳GDP首超香港？深圳统计局辟谣：还差611亿,if you do not come to shenzhen sooner or later...,shenzhens gdp topped hong kong last year shenz...,unrelated
3,2,3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,if you do not come to shenzhen sooner or later...,shenzhens gdp outstrips hong kong shenzhen sta...,unrelated
4,2,8,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？统计局辟谣：未超但差距再度缩小,if you do not come to shenzhen sooner or later...,shenzhens gdp overtakes hong kong bureau of st...,unrelated


# LSTM 

In [18]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [19]:
vocab_size = 15000
nb_labels = 3+1
embedding_size = 200
lstm_size = 200
max_len = 35
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_portion = .8

rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25

In [20]:
# Hyperparameter kaggle qui me plait mucho
#MAX_SEQUENCE_LENGTH = 30
#MAX_NB_WORDS = 200000
#EMBEDDING_DIM = 300
#VALIDATION_SPLIT = 0.1

#num_lstm = 200
# #num_dense = 125
# rate_drop_lstm = 0.15 + np.random.rand() * 0.25
# rate_drop_dense = 0.15 + np.random.rand() * 0.25

# act = 'relu'
# re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

# STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \
#         rate_drop_dense)

In [21]:
df_train = df_train.head(1000)

In [22]:
train_size = int(len(df_train['title1_en']) * training_portion)

x_train = df_train[['title1_en','title2_en']][0:train_size]
y_train = df_train['label'][0:train_size]
x_validation = df_train[['title1_en','title2_en']][train_size:]
y_validation = df_train['label'][train_size:]

In [23]:
y_validation

id
801     unrelated
802     unrelated
803     unrelated
804     unrelated
805     unrelated
          ...    
996     unrelated
997     unrelated
998        agreed
999        agreed
1000       agreed
Name: label, Length: 200, dtype: object

In [24]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(df_train['title1_en']+df_train['title2_en'])
#later we'll have to check the number of unknown words in the test data
word_index = tokenizer.word_index
dict(list(word_index.items())[0:10])

{'<OOV>': 1,
 'the': 2,
 'of': 3,
 'to': 4,
 'a': 5,
 'and': 6,
 'is': 7,
 'in': 8,
 'be': 9,
 'will': 10}

In [25]:
X = {'title1': x_train['title1_en'], 'title2': x_train['title2_en']}

for x_train_seq, side in itertools.product([X], ['title1', 'title2']):
    x_train_seq[side] = tokenizer.texts_to_sequences(x_train_seq[side])
    x_train_seq[side] = pad_sequences(x_train_seq[side], padding=padding_type, truncating=trunc_type, maxlen=max_len)

In [26]:
X_val = {'title1': x_validation['title1_en'], 'title2': x_validation['title2_en']}

for x_validation_seq, side in itertools.product([X_val], ['title1', 'title2']):
    x_validation_seq[side] = tokenizer.texts_to_sequences(x_validation_seq[side])
    x_validation_seq[side] = pad_sequences(x_validation_seq[side], padding=padding_type, truncating=trunc_type, maxlen=max_len)

In [27]:
x_validation_seq['title1'][0]

array([ 74, 130, 782, 201, 292,   6, 292, 510, 645, 783, 626,  73, 784,
       335,   2, 640,   3, 203, 426, 179,   9,  27,   4,   1,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0])

In [28]:
# train_sequences1 = tokenizer.texts_to_sequences(train_title1)
# print(train_sequences1[0])

In [29]:
# train_sequences2 = tokenizer.texts_to_sequences(train_title2)
# print(train_sequences2[10])

In [30]:
# train_padded1 = pad_sequences(train_sequences1, maxlen=max_len, padding=padding_type, truncating=trunc_type)
# train_padded2 = pad_sequences(train_sequences2, maxlen=max_len, padding=padding_type, truncating=trunc_type)

In [31]:
# validation_sequences1 = tokenizer.texts_to_sequences(validation_titles1)
# validation_sequences2 = tokenizer.texts_to_sequences(validation_titles2)

# validation_padded1 = pad_sequences(validation_sequences1, maxlen=max_len, padding=padding_type, truncating=trunc_type)
# validation_padded2 = pad_sequences(validation_sequences2, maxlen=max_len, padding=padding_type, truncating=trunc_type)

In [32]:
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(df_train['label'])

y_train_seq = label_tokenizer.texts_to_sequences(y_train)
y_validation_seq = label_tokenizer.texts_to_sequences(y_validation)

In [33]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_title(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
print(decode_title(x_train_seq['title2'][59]))
print('---')
print(x_train['title2_en'][59])

the 315s the rumour spinach is a greased vegetable that can be made iron by eating it ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
---
the 315s the rumour spinach is a greased vegetable that can be made iron by eating it


In [34]:
#embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_size, 
#                                            input_length=max_len, trainable=True)
shared_model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_size, input_length=max_len, trainable=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_size, return_sequences=True)),
   # tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_size, return_sequences=True)),
   # tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_size, return_sequences=True)),
    tf.keras.layers.LSTM(lstm_size),
    #tf.keras.layers.Dense(nb_labels, activation='softmax')
])

shared_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 35, 200)           3000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 35, 400)           641600    
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               480800    
Total params: 4,122,400
Trainable params: 4,122,400
Non-trainable params: 0
_________________________________________________________________


In [35]:
title1_input = tf.keras.layers.Input(shape=(max_len,), dtype='int32')
title2_input = tf.keras.layers.Input(shape=(max_len,), dtype='int32')

In [36]:
lstm1 = shared_model(title1_input)
lstm2 = shared_model(title2_input)

In [37]:
lstm1

<tf.Tensor 'sequential/Identity:0' shape=(None, 200) dtype=float32>

In [38]:
merged = tf.keras.layers.concatenate([lstm1,lstm2])

In [39]:
merged = tf.keras.layers.Dense(4, activation='relu')(merged)

In [40]:
output = tf.keras.layers.Dense(4, activation='softmax')(merged)

In [41]:
model = tf.keras.models.Model(inputs=[title1_input, title2_input], outputs=[output])

In [42]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 35)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 35)]         0                                            
__________________________________________________________________________________________________
sequential (Sequential)         (None, 200)          4122400     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 400)          0           sequential[1][0]             

In [43]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [44]:
num_epochs = 10
trained_model = model.fit([x_train_seq['title1'], x_train_seq['title2']], np.array(y_train_seq),
                           epochs=num_epochs,
                           validation_data=([x_validation_seq['title1'], x_validation_seq['title2']], np.array(y_validation_seq))
                         , verbose=2)

Train on 800 samples, validate on 200 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


Epoch 8/10
Epoch 9/10
Epoch 10/10


In [45]:
# embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_size, 
#                                             input_length=max_len, trainable=True)
# lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

# sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# embedded_sequences_1 = embedding_layer(sequence_1_input)
# x1 = lstm_layer(embedded_sequences_1)

# sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# embedded_sequences_2 = embedding_layer(sequence_2_input)
# y1 = lstm_layer(embedded_sequences_2)

# merged = Concatenate([x1, y1])
# merged = Dropout(rate_drop_dense)(merged)
# merged = BatchNormalization()(merged)

# merged = Dense(num_dense, activation=act)(merged)
# merged = Dropout(rate_drop_dense)(merged)
# merged = BatchNormalization()(merged)

# preds = Dense(1, activation='sigmoid')(merged)

# BERT 