# Assignment 2

## Preliminary Steps

Let's import all the needed packages

In [2]:
# Handle files and unzip
import os
import requests
import zipfile

import re

import pandas as pd
import numpy as np

# Neural Networks
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Word tokenization
from tensorflow.keras.preprocessing.text import Tokenizer


import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
!pip install tqdm
from tqdm import tqdm as tq 
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import tensorflow as tf 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Let's download the dataset

In [3]:
def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    
    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)

def download_data(data_path):
    toy_data_path = os.path.join(data_path, 'fever_data.zip')
    toy_data_url_id = "1wArZhF9_SHW17WKNGeLmX-QTYw9Zscl1"
    toy_url = "https://docs.google.com/uc?export=download"

    if not os.path.exists(data_path):
        os.makedirs(data_path)

    if not os.path.exists(toy_data_path):
        print("Downloading FEVER data splits...")
        with requests.Session() as current_session:
            response = current_session.get(toy_url,
                                   params={'id': toy_data_url_id},
                                   stream=True)
        save_response_content(response, toy_data_path)
        print("Download completed!")

        print("Extracting dataset...")
        with zipfile.ZipFile(toy_data_path) as loaded_zip:
            loaded_zip.extractall(data_path)
        print("Extraction completed!")

download_data('dataset')

Look inside our dataset creating a first dataframe reading the `test_pairs.csv` file

In [4]:
test_path = os.path.join(os.getcwd(), 'dataset', 'train_pairs.csv')

df0 = pd.read_csv(test_path)
print(df0.head())

   Unnamed: 0                                              Claim  ...  ID     Label
0           0     Chris Hemsworth appeared in A Perfect Getaway.  ...   3  SUPPORTS
1           1                            Roald Dahl is a writer.  ...   7  SUPPORTS
2           2                          Roald Dahl is a governor.  ...   8   REFUTES
3           3        Ireland has relatively low-lying mountains.  ...   9  SUPPORTS
4           4  Ireland does not have relatively low-lying mou...  ...  10   REFUTES

[5 rows x 5 columns]


## Data preparation

As stated in [specifications.ipynb](specifications.ipynb), each dataset sample is comprised of:

*     A claim to verify
*     A set of semantically related statements (evidence set)
*     Fact checking label: either evidences support or refute the claim.

Handling the evidence set from the point of view of neural models may imply some additional complexity: if the evidence set is comprised of several sentences we might incur in memory problems.

To this end, we further simplify the problem by building (claim, evidence) pairs. The fact checking label is propagated as well.

Example:

     Claim: c1 
     Evidence set: [e1, e2, e3]
     Label: S (support)

--->

    (c1, e1, S),
    (c1, e2, S),
    (c1, e3, S)

So now we construct a new dataframe where each sample is in the form (Claim, Evidence, Lable).

In [5]:
# the evidences seems to be separated by a tab (\t) 
# so it may be used to split the Evidence string

# this dataset is a mess
dataset_path = os.path.join(os.getcwd(), 'dataset', 'train_pairs.csv')

def sentence_cleaning(sentence: str) -> str:
    """
    cleans up a sentence in the dataset using regular expressions
    :param sentence: the sentence to clean-up
    :return
        - string cleaned
    """
    
    # removes "-LRB-" and "-RRB-" strings and commas
    sentence = re.sub("-LRB-","", sentence)
    sentence = re.sub("-RRB-", "", sentence)
    sentence = re.sub(",", "", sentence)
    
    return sentence
    

def format_dataset(dataset: str, debug: bool = True) -> pd.DataFrame:
    """
    Reads out the csv file and returns a dataframe with 
    {Claim, Evidence, Label} row
    
    :param dataset: dataset csv file path
    :param debug: if True prints out data for debugging purposes
    
    :return
        - dataframe with (claim, evidence, label) rows
    """
    
    df = pd.read_csv(dataset)
    dataframe_rows = []
    df_size = df.shape[0]

    for i in range(0, df_size):

        claim = df["Claim"][i]
        label = df["Label"][i]
        ev_list = df["Evidence"][i].split('\t')

        evidence = ev_list[1]
        
        # create single dataframe row
        dataframe_row = {
            "Claim": claim,
            "Evidence": evidence,
            "Label": label
        }

        if debug: 
            print(claim)
            print(evidence)
            print(label)
        dataframe_rows.append(dataframe_row)

    df = pd.DataFrame(dataframe_rows)
    
    return df, dataframe_rows

df, df_rows = format_dataset(dataset_path, False)

Let's see how the resulting dataframe looks like

In [6]:
df.head()

Unnamed: 0,Claim,Evidence,Label
0,Chris Hemsworth appeared in A Perfect Getaway.,Hemsworth has also appeared in the science fic...,SUPPORTS
1,Roald Dahl is a writer.,Roald Dahl -LRB- -LSB- langpronˈroʊ.əld _ ˈdɑː...,SUPPORTS
2,Roald Dahl is a governor.,Roald Dahl -LRB- -LSB- langpronˈroʊ.əld _ ˈdɑː...,REFUTES
3,Ireland has relatively low-lying mountains.,The island 's geography comprises relatively l...,SUPPORTS
4,Ireland does not have relatively low-lying mou...,The island 's geography comprises relatively l...,REFUTES


Turning claims and Evidences into sequences of integers

In [7]:
def make_sequences(texts, lower=True, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'):
    """
    Turn a set of texts into sequences of integers
    
    :param texts: the set of texts to turn into sequences
    :param lower: boolean. Whether to convert the texts to lowercase
    :param filters: a string where each element is a character that will be filtered from the texts
    """
    
    # Create the tokenizer object and train on texts
    tokenizer = Tokenizer(lower=lower, filters=filters)
    tokenizer.fit_on_texts(texts)
    
    # Create look-up dictionaries and reverse look-ups
    word_idx = tokenizer.word_index
    idx_word = tokenizer.index_word
    num_words = len(word_idx) + 1
    word_counts = tokenizer.word_counts
    
    # Convert text to sequences of integers
    sequences = tokenizer.texts_to_sequences(texts)
    
    return word_idx, idx_word, num_words, word_counts, sequences

def encode_dataframe(dataframe):
    """
    Creates sequences of integers for both Claim and Evidence columns
    """
    
    # creates lists of claims and evidences
    claims = list(dataframe['Claim'])
    evidences = list(dataframe['Evidence'])
    
    cl_word_idx, cl_idx_word, cl_num_words, cl_word_counts, seq_claims = make_sequences(claims)
    ev_word_idx, ev_idx_word, ev_num_words, ev_word_counts, seq_evidences = make_sequences(evidences)
    
    return seq_claims, seq_evidences

seq_claims, seq_evidences = encode_dataframe(df)

In [8]:
n = 6

print(seq_claims[n])
print(len(seq_claims))

seq_claims[6][6]

[641, 196, 6, 151, 7337, 13, 2, 135, 5669, 4, 12729]
121740


2

Constructing the input matrices for the embedding step. We construct the matrices starting from `seq_claims` and `seq_evidences`. The number of rows of the matrices is the number of elements in `seq_claims` and `seq_evidences`, while the number of columns is the size of the longest sequences. The rows corresponding to shorter sequences are filled with zero-padding.

In [9]:
# computes the longest token sequence
def longest_seq(seq):
    seq_len = [len(i) for i in seq]
    
    return max(seq_len)

def matrix_from_sequences(sequences):
    """
    Builds a matrix of shape [batch_size, max_tokens]
    """
    
    max_tokens = longest_seq(sequences)
    seq_length = len(sequences)
    
    matrix = np.zeros((len(sequences), max_tokens), dtype=np.int32)
    
    for i in range(0, seq_length):
        for j in range(0, len(sequences[i])):
            matrix[i][j] = sequences[i][j]
    
    return matrix

claim_matrix = matrix_from_sequences(seq_claims)
evidence_matrix = matrix_from_sequences(seq_evidences)

print(claim_matrix.shape)
print(evidence_matrix.shape)

(121740, 65)
(121740, 126)


In [10]:
len(claim_matrix[1])

65

I dunno if this is correct

### Sentence Embedding

Let's reshape the claim and evidence matrixes into [batch_size, sequence_length, embedding_dim]. The embedding is created with keras, involving the similarity of the phrases. Check on https://keras.io/api/layers/core_layers/embedding/

In [11]:
def sentence_embedding(matrix, input_dim, output_dim, input_length):
  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Embedding(input_dim = input_dim, output_dim = output_dim, input_length = input_length))

  model.compile('rmsprop', 'mse')
  reshaped_matrix = model.predict(matrix)
  return reshaped_matrix, model

In [12]:
reshaped_claim_matrix , embed_claim = sentence_embedding(claim_matrix, 121740+1, 50, 65)

In [14]:
reshaped_evidence_matrix , embed_evidence = sentence_embedding(evidence_matrix, 121740+1, 50, 126)

In [17]:
embed_claim.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 65, 50)            6087050   
                                                                 
Total params: 6,087,050
Trainable params: 6,087,050
Non-trainable params: 0
_________________________________________________________________
