# Textual entailment task

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import string
import collections
from tqdm import tqdm
from tensorflow.keras import Model, layers
from keras_preprocessing import sequence, text

In [2]:
#If the file is downloaded to the format .json1 you have to rename the file and remove the "1" at the end.
#Otherwise it's impossible to read the file
datafile_fever = 'data/fever2-fixers-dev.json'
datafile_train = 'data/train.csv'

In [3]:
df_train = pd.read_csv(datafile_train, index_col='id').sort_index()
df_train.shape

(320552, 7)

In [4]:
df_fever = pd.read_json(datafile_fever, lines=True)#,orient='table')
df_fever.shape

(1174, 8)

In [5]:
df_train.head()

Unnamed: 0_level_0,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,1,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,There are two new old-age insurance benefits f...,"Police disprove ""bird's nest congress each per...",unrelated
1,2,4,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,"""If you do not come to Shenzhen, sooner or lat...",The GDP overtopped Hong Kong? Shenzhen clarifi...,unrelated
2,2,5,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",去年深圳GDP首超香港？深圳统计局辟谣：还差611亿,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP topped Hong Kong last year? She...,unrelated
3,2,3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP outstrips Hong Kong? Shenzhen S...,unrelated
4,2,8,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？统计局辟谣：未超但差距再度缩小,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP overtakes Hong Kong? Bureau of ...,unrelated


In [6]:
df_fever.head()

Unnamed: 0,id,label,claim,evidence,original_id,transformation,attack,annotation
0,500000,NOT ENOUGH INFO,There is a convicted statutory rapist called C...,"[[[269158, None, None, None]]]",225798.0,label_preserving,there.is.a.called,
1,500001,SUPPORTS,There exists a producer and an actor called Si...,"[[[141141, 156349, Simon_Pegg, 0]]]",120126.0,label_preserving,there.exists.a.called,
2,500002,REFUTES,Exotic Birds rejected to be an opening band fo...,"[[[25977, 31918, Exotic_Birds, 2], [25977, 319...",,,word replacement,OK - Claim is grammatical and label supported ...
3,500003,REFUTES,The Nice Guys is a 2016 American neo-noir acti...,"[[[None, None, The_Nice_Guys, 0], [None, None,...",,,Multihop,OK - Claim is grammatical and label supported ...
4,500004,REFUTES,Rupert Murdoch's father was not connected to a...,"[[[None, None, Rupert_Murdoch, 1], [None, None...",,,Multihop,OK - Claim is grammatical and label supported ...


In [7]:
df_fever[5:10]

Unnamed: 0,id,label,claim,evidence,original_id,transformation,attack,annotation
5,500005,NOT ENOUGH INFO,"There exists an award-winning TV series, it go...","[[[22421, None, None, None]]]",5743.0,label_preserving,there.exists.a.that.goes.by.name.of.prn,
6,500006,NOT ENOUGH INFO,Omar Khadr was declared guilty and was detaine...,"[[[None, None, None, None]]]",,,conjunction,UN - Claim is grammatical but label is incorre...
7,500007,SUPPORTS,Robert Kardashian is an ex-husband of a mother...,"[[[None, None, Robert_Kardashian, 2], [None, N...",,,long chain of relations,OK - Claim is grammatical and label supported ...
8,500008,Not Enough Info,Antoine Berjon have studied medicine in his ea...,"[[[58, 97, None, None]]]",,,NotClear,UN - Claim is grammatical but label is incorre...
9,500009,SUPPORTS,There is not a natural element that goes by th...,"[[[130895, 145673, Moscovium, 0]], [[130895, 1...",111503.0,complex_negate,there.is.not.by.name,


# Data exploration

In [8]:
null_counts = df_fever.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

transformation    676
original_id       676
dtype: int64

In [9]:
null_counts = df_train.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

title2_zh    7
dtype: int64

In [10]:
df_train['tid1'][247]

KeyError: 247

# Preprocessing
- Cleaning data
- Lower case
- Deal with N/A and NaN

In [11]:
translator = str.maketrans('','', string.punctuation)
df_train['title1_en'] = df_train['title1_en'].str.lower().str.translate(translator)
df_train['title2_en'] = df_train['title2_en'].str.lower().str.translate(translator)
df_fever['claim']     = df_fever['claim'].str.lower().str.translate(translator)

In [12]:
df_train.head()

Unnamed: 0_level_0,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,1,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,there are two new oldage insurance benefits fo...,police disprove birds nest congress each perso...,unrelated
1,2,4,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,if you do not come to shenzhen sooner or later...,the gdp overtopped hong kong shenzhen clarifie...,unrelated
2,2,5,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",去年深圳GDP首超香港？深圳统计局辟谣：还差611亿,if you do not come to shenzhen sooner or later...,shenzhens gdp topped hong kong last year shenz...,unrelated
3,2,3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,if you do not come to shenzhen sooner or later...,shenzhens gdp outstrips hong kong shenzhen sta...,unrelated
4,2,8,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？统计局辟谣：未超但差距再度缩小,if you do not come to shenzhen sooner or later...,shenzhens gdp overtakes hong kong bureau of st...,unrelated


# LSTM ?

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
vocab_size = 5000
embedding_dim = 64
max_length = 30
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_portion = .8

In [15]:
df_train = df_train.head(10000)

In [16]:
train_size = int(len(df_train['title1_en']) * training_portion)

train_title1 = df_train['title1_en'][0: train_size]
train_title2 = df_train['title2_en'][0: train_size]
train_labels = df_train['label'][0: train_size]

validation_titles1 = df_train['title1_en'][train_size:]
validation_labels = df_train['label'][train_size:]

In [17]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_title1)
word_index = tokenizer.word_index
dict(list(word_index.items())[0:10])

{'<OOV>': 1,
 'the': 2,
 'of': 3,
 'in': 4,
 'to': 5,
 '2018': 6,
 'a': 7,
 'will': 8,
 'and': 9,
 'be': 10}

In [18]:
train_sequences = tokenizer.texts_to_sequences(train_title1)
print(train_sequences[10])

[25, 5, 1915, 146, 69, 1916, 146, 29, 1313, 3, 1622]


In [19]:
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [20]:
validation_sequences = tokenizer.texts_to_sequences(validation_titles1)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [21]:
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(df_train['label'])

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

In [22]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_article(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
print(decode_article(train_padded[56]))
print('---')
print(train_title1[56])

i love selfies a sellout i i and i say how do you get smarter i ? ? ? ? ? ? ? ? ? ? ? ? ? ?
---
 i  love selfies a sellout   i   i  and i say how do you get smarter   i 


In [23]:
model = tf.keras.Sequential([
    # Add an Embedding layer expecting input vocab of size 5000, and output embedding dimension of size 64 we set at the top
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
#    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    # use ReLU in place of tanh function since they are very good alternatives of each other.
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    # Add a Dense layer with 6 units and softmax activation.
    # When we have multiple outputs, softmax convert outputs layers into a probability distribution.
    tf.keras.layers.Dense(6, activation='softmax')
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          320000    
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               66048     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 390       
Total params: 394,694
Trainable params: 394,694
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
num_epochs = 10
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)

Train on 8000 samples, validate on 2000 samples
Epoch 1/10
8000/8000 - 14s - loss: 0.6758 - accuracy: 0.7196 - val_loss: 0.6785 - val_accuracy: 0.6995
Epoch 2/10
8000/8000 - 10s - loss: 0.4276 - accuracy: 0.8227 - val_loss: 0.7235 - val_accuracy: 0.7005
Epoch 3/10
8000/8000 - 10s - loss: 0.3713 - accuracy: 0.8396 - val_loss: 0.7702 - val_accuracy: 0.5740
Epoch 4/10
8000/8000 - 10s - loss: 0.3507 - accuracy: 0.8495 - val_loss: 0.8215 - val_accuracy: 0.7185
Epoch 5/10
8000/8000 - 10s - loss: 0.3325 - accuracy: 0.8508 - val_loss: 0.8126 - val_accuracy: 0.7055
Epoch 6/10
8000/8000 - 10s - loss: 0.3207 - accuracy: 0.8546 - val_loss: 0.7778 - val_accuracy: 0.7120
Epoch 7/10
8000/8000 - 10s - loss: 0.3103 - accuracy: 0.8572 - val_loss: 0.8497 - val_accuracy: 0.6140
Epoch 8/10
8000/8000 - 10s - loss: 0.3006 - accuracy: 0.8601 - val_loss: 0.8991 - val_accuracy: 0.6250
Epoch 9/10


# BERT ?

# Deep Learning ?