### **Initialisation**

In [1]:
# Mount Google Drive at /content/drive so files stored there can be read and written.
# This import is only available inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install the deep-learning dependencies into the runtime.
# NOTE(review): versions are not pinned; the recorded output below resolved transformers 4.29.1.
# Consider pinning so that a fresh run resolves the same packages.
!pip install torch torchvision transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m89.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m121.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.1


In [3]:
import json
import math
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import random
import transformers
from transformers import BertTokenizer
from transformers import BertModel
from collections import Counter, defaultdict
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from transformers import AdamW
import time
import copy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from torch.nn import functional as F
from collections import Counter

Load the data and specify the paths:

In [4]:
# Silence transformers' informational/warning logs for the rest of the notebook.
transformers.logging.set_verbosity_error()
path_prefix = '/content/drive/MyDrive/Colab Notebooks/Assignment3/' # <!IMPORTANT!> Please replace with your own path

train_path = path_prefix + 'project-data/train-claims.json'
dev_path = path_prefix + 'project-data/dev-claims.json'
test_path = path_prefix + 'project-data/test-claims-unlabelled.json'
evidence_path = path_prefix + 'project-data/evidence.json'


def _load_json(path):
    """Parse one JSON file and close its handle as soon as parsing is done.

    The previous `json.load(open(path))` form left every file handle open until
    garbage collection and depended on the platform's default text encoding.
    """
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)


train_claims = _load_json(train_path)
dev_claims = _load_json(dev_path)
test_claims = _load_json(test_path)
evidences = _load_json(evidence_path)

# Optional: merge train and dev claims into a single training set.
#dev_train_claims = {**dict(train_claims), **dict(dev_claims)}

### **Evidence Retrieval - Function Declarations**

In [5]:
# Seeds Python's `random` module, which the sampling helpers below use
# (random.randint for negative sampling, random.shuffle for pair order).
# NOTE(review): NumPy and torch are not seeded in this cell.
random.seed(42)
evidence_key_prefix = 'evidence-'  # prefix used by generate_random_train_evidence_samples to build evidence ids
er_result_filename = path_prefix + "evidence-retrieval-only-results.json"  # default output path of extract_er_result
er_model_params_filename = path_prefix + 'cfeverercls.dat'  # where train_evi_retrieval saves the best state dict
claim_hard_negatives_filename = path_prefix + 'claim-hard-negative-evidences.json'  # not used in the cells shown here; presumably the hard-negative cache - TODO confirm

# ----------Hyperparameters of the entire pipeline----------
# --------------Evidence Retrieval--------------
d_bert_base = 768  # input width of the classifiers' linear head ([CLS] embedding size)
gpu = 0  # device index passed to every .cuda(gpu) call
input_seq_max_len = 384  # default max_length handed to the tokenizer (padding and truncation)
er_pos_neg_sample_ratio = 5  # negatives sampled per positive evidence; also used as the BCE loss weight
train_neg_cand_num = 5000  # size of the top TF-IDF pool that training negatives are drawn from
pre_select_evidence_num = 1000  # default number of TF-IDF candidates per claim in CFEVERERTestDataset
loader_batch_size = 16  # DataLoader batch size
loader_worker_num = 2  # DataLoader worker processes
num_epoch_pre = 1  # epochs for the first training call
num_epoch_hne = 13  # not used in the cells shown here; presumably epochs for the hard-negative phase - TODO confirm
hnm_threshold = 0.5  # not used in the cells shown here - TODO confirm meaning
hnm_batch_size = 12  # not used in the cells shown here - TODO confirm meaning
evidence_selection_threshold = 0.9  # default probability threshold in predict_evi
max_evi = 5  # default maximum number of evidences kept per claim in predict_evi
opti_lr_er_pre = 2e-5  # learning rate of the Adam optimizer opti_er_pre
opti_lr_er_hne = 2e-7  # not used in the cells shown here; presumably the hard-negative phase learning rate - TODO confirm
grad_step_period_pre = 4  # batches accumulated per optimizer step in the first training call
grad_step_period_hne = 4  # not used in the cells shown here - TODO confirm
# ----------------------------------------------

Define Dataset for Evidence Retrieval:

In [6]:
class CFEVERERTrainDataset(Dataset):
    """Training dataset of labelled claim-evidence pairs for the Evidence Retrieval task.

    Each item is one (claim, evidence) pair tokenized for BERT, together with
    its label as produced by `unroll_train_claim_evidences`.
    """

    def __init__(self, claims, evidences_, tokenizer, max_len=input_seq_max_len, sample_ratio=er_pos_neg_sample_ratio, train_neg_cand_num=train_neg_cand_num):
        self.tokenizer = tokenizer
        self.claims = claims
        self.evidences = evidences_
        self.max_len = max_len
        self.sample_ratio = sample_ratio
        self.train_neg_cand_num = train_neg_cand_num
        # List of (claim_id, evidence_id, label) triples.
        self.data_set = unroll_train_claim_evidences(
            claims, evidences_, sample_ratio=sample_ratio, train_neg_cand_num=train_neg_cand_num
        )

    def __len__(self):
        return len(self.data_set)

    def reset_data_hne(self, claim_hard_negative_evidences):
        """Replace the pair list with the one built from the given hard negative evidences."""
        self.data_set = unroll_train_claim_evidences_with_hne(
            self.claims, self.evidences, claim_hard_negative_evidences
        )

    def __getitem__(self, index):
        claim_id, evidence_id, label = self.data_set[index]

        # Encode the claim and the evidence as one sentence pair, padded/truncated to max_len.
        encoded = self.tokenizer.encode_plus(
            self.claims[claim_id]['claim_text'],
            self.evidences[evidence_id],
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=True,
        )

        # Drop the leading batch dimension added by return_tensors='pt'.
        seq = encoded['input_ids'].squeeze(0)
        attn_masks = encoded['attention_mask'].squeeze(0)
        segment_ids = encoded['token_type_ids'].squeeze(0)

        return seq, attn_masks, segment_ids, label

In [7]:
class CFEVERERTestDataset(Dataset):
    """Dev/test dataset of unlabelled claim-evidence candidate pairs for Evidence Retrieval.

    Each item is one (claim, candidate evidence) pair tokenized for BERT, plus
    the two ids so predictions can be mapped back to their claim.
    """

    def __init__(self, claims, evidences_, tokenizer, max_len=input_seq_max_len, max_candidates=pre_select_evidence_num):
        self.tokenizer = tokenizer
        self.claims = claims
        self.evidences = evidences_
        self.max_len = max_len
        # List of (claim_id, evidence_id) pairs.
        self.data_set = unroll_test_claim_evidences(claims, evidences_, max_candidates=max_candidates)

    def __len__(self):
        return len(self.data_set)

    def __getitem__(self, index):
        claim_id, evidence_id = self.data_set[index]

        # Encode the claim and the evidence as one sentence pair, padded/truncated to max_len.
        encoded = self.tokenizer.encode_plus(
            self.claims[claim_id]['claim_text'],
            self.evidences[evidence_id],
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=True,
        )

        # Drop the leading batch dimension added by return_tensors='pt'.
        seq = encoded['input_ids'].squeeze(0)
        attn_masks = encoded['attention_mask'].squeeze(0)
        segment_ids = encoded['token_type_ids'].squeeze(0)

        return seq, attn_masks, segment_ids, claim_id, evidence_id

Generate/pre-select the evidence candidates for train/test that will be forwarded to the BERT-based classifier for processing:

In [8]:
def unroll_train_claim_evidences(claims, evidences_, sample_ratio, train_neg_cand_num):
    """
    Build the shuffled list of training pairs (claim_id, evidence_id, label).

    A TF-IDF vectorizer is fitted on every evidence text plus every claim text.
    For each claim, `generate_train_evidence_samples` then returns its
    positive evidences together with negatives sampled from the claim's top
    TF-IDF cosine-similarity candidates; `sample_ratio` controls how many
    negatives are drawn per positive.
    """
    started_at = time.time()

    claim_texts = [claims[c]["claim_text"] for c in claims]
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf.fit(list(evidences_.values()) + claim_texts)
    evidences_tfidf = tfidf.transform(evidences_.values())

    pairs = []
    for cid in claims:
        claim = claims[cid]
        claim_tfidf = tfidf.transform([claim['claim_text']])
        labelled_samples = generate_train_evidence_samples(
            evidences_, claim['evidences'], claim_tfidf, evidences_tfidf, sample_ratio, train_neg_cand_num
        )
        pairs.extend((cid, evidence_id, label) for evidence_id, label in labelled_samples)

    random.shuffle(pairs)
    print(f"Finished unrolling train claim-evidence pairs in {time.time() - started_at} seconds.")

    return pairs

In [9]:
def unroll_test_claim_evidences(claims, evidences_, max_candidates):
    """
    Build the list of (claim_id, evidence_id) pairs to be scored by the BERT model.

    A TF-IDF vectorizer is fitted on every evidence text plus every claim text.
    For each claim, the top <max_candidates> evidences by TF-IDF cosine
    similarity are kept, as chosen by `generate_test_evidence_candidates`.
    """
    started_at = time.time()

    claim_texts = [claims[c]["claim_text"] for c in claims]
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf.fit(list(evidences_.values()) + claim_texts)
    evidences_tfidf = tfidf.transform(evidences_.values())

    pairs = []
    for cid in claims:
        claim_tfidf = tfidf.transform([claims[cid]['claim_text']])
        candidates = generate_test_evidence_candidates(evidences_, evidences_tfidf, claim_tfidf, max_candidates)
        pairs.extend((cid, evidence_id) for evidence_id in candidates)

    print(f"Finished unrolling test claim-evidence pairs in {time.time() - started_at} seconds.")

    return pairs

In [10]:
def generate_train_evidence_samples(evidences_, claim_evidences, claim_tfidf, evidences_tfidf, sample_ratio, train_neg_cand_num):
    """
    Generate training samples for a given claim for the evidence retrieval task.

    All ground-truth evidences are kept as positives. Negatives are drawn
    uniformly at random, without repetition, from the `train_neg_cand_num`
    evidences most TF-IDF-cosine-similar to the claim.

    The requested total is ceil(len(claim_evidences) * (sample_ratio + 1)). It is
    capped at the number of distinct non-positive candidates actually available,
    so the sampling loop always terminates. Previously it would spin forever, or
    fail on an empty pool, when the pool was too small.

    :param evidences_: the full evidence set.
    :param claim_evidences: the ground truth evidence set for the claim. In the form of a list of evidence ids
    :param claim_tfidf: the tfidf vector for the claim text
    :param evidences_tfidf: the tfidf vectors for the entire evidence set
    :param sample_ratio: the ratio of positive to negative samples: neg/pos
    :param train_neg_cand_num: the top TF-IDF cosine similarity range which the negative samples are chosen from
    :return: a list of evidence samples zipped with the corresponding labels. - (evi id, label)
    """
    similarity = cosine_similarity(claim_tfidf, evidences_tfidf).squeeze()

    df = pd.DataFrame({"evidences": evidences_.keys(), "similarity": similarity}).sort_values(by=['similarity'], ascending=False)
    train_neg_candidates = df.iloc[:train_neg_cand_num]["evidences"].tolist()

    # Get positive samples (copied so the caller's list is never mutated)
    samples = list(claim_evidences)  # evidence ids
    num_positives = len(samples)
    already_chosen = set(samples)

    # Never ask for more negatives than the candidate pool can supply.
    usable_negatives = len(set(train_neg_candidates) - already_chosen)
    target_size = min(math.ceil(num_positives * (sample_ratio + 1)), num_positives + usable_negatives)

    # Get negative samples by rejection sampling; same random draws as before
    # whenever the pool is large enough.
    while len(samples) < target_size:
        neg_sample = train_neg_candidates[random.randint(0, len(train_neg_candidates) - 1)]  # random selection

        if neg_sample not in already_chosen:
            already_chosen.add(neg_sample)
            samples.append(neg_sample)

    labels = [1] * num_positives + [0] * (len(samples) - num_positives)

    return list(zip(samples, labels))

In [11]:
def generate_random_train_evidence_samples(evidences_, claim_evidences, sample_ratio):
    """
    Generate training samples for a claim using uniformly random negatives.

    All ground-truth evidences are kept as positives. Negatives are drawn
    uniformly at random, without repetition, from the actual keys of
    `evidences_`. Ids were previously built as 'evidence-<n>', which only works
    when the keys are numbered contiguously from 0.

    The requested total is ceil(len(claim_evidences) * (sample_ratio + 1)), capped
    at the number of distinct non-positive evidences available, so the sampling
    loop always terminates.

    :param evidences_: the full evidence set.
    :param claim_evidences: the ground truth evidence set for the claim. In the form of a list of evidence ids
    :param sample_ratio: the ratio of positive to negative samples: neg/pos
    :return: a list of evidence samples zipped with the corresponding labels. - (evi id, label)
    """
    # Get positive samples (copied so the caller's list is never mutated)
    samples = list(claim_evidences)  # evidence ids
    num_positives = len(samples)
    already_chosen = set(samples)

    evidence_ids = list(evidences_.keys())
    # Evidences that are not positives for this claim, i.e. valid negatives.
    usable_negatives = len(evidence_ids) - len(evidences_.keys() & already_chosen)
    target_size = min(math.ceil(num_positives * (sample_ratio + 1)), num_positives + usable_negatives)

    # Get negative samples by rejection sampling over real evidence ids.
    while len(samples) < target_size:
        neg_sample = evidence_ids[random.randint(0, len(evidence_ids) - 1)]  # random selection

        if neg_sample not in already_chosen:
            already_chosen.add(neg_sample)
            samples.append(neg_sample)

    labels = [1] * num_positives + [0] * (len(samples) - num_positives)

    return list(zip(samples, labels))

In [12]:
def generate_test_evidence_candidates(evidences_, evidences_tfidf, claim_tfidf, max_candidates):
    """
    Pick the evidences most TF-IDF-similar to a claim.

    :param evidences_: the full evidence set.
    :param evidences_tfidf: The tfidf matrix of the entire evidence set
    :param claim_tfidf: The tfidf vector of the query claim (also a matrix technically).
    :param max_candidates: Number of evidences to be selected for further processing.
    :return: a list of the selected evidence ids, most similar first.
    """
    scores = cosine_similarity(claim_tfidf, evidences_tfidf).squeeze()

    ranked = pd.DataFrame({"evidences": evidences_.keys(), "similarity": scores})
    ranked = ranked.sort_values(by=['similarity'], ascending=False)

    return ranked["evidences"].iloc[:max_candidates].tolist()

Evidence Retrieval Model (Claim-Evidence Pair Classifier):

In [13]:
class CFEVERERClassifier(nn.Module):
    """BERT encoder followed by a single-logit linear head (RELEVANT vs NOT RELEVANT)."""

    def __init__(self):
        super().__init__()

        # Pre-trained BERT encoder.
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Linear head: [CLS] embedding (d_bert_base wide, 768 for bert-base) -> one logit.
        self.cls_layer = nn.Linear(in_features=d_bert_base, out_features=1)

    def forward(self, seq, attn_masks, segment_ids):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contribution of PAD tokens
            -segment_ids : Tensor of shape [B, T] containing token ids of segment embeddings (see BERT paper for more details)
        Returns:
            The output of the linear head applied to the [CLS] representation.
        '''
        encoder_out = self.bert(seq, attention_mask=attn_masks, token_type_ids=segment_ids, return_dict=True)

        # The first token of every sequence is [CLS]; use its contextual embedding.
        cls_rep = encoder_out.last_hidden_state[:, 0]

        return self.cls_layer(cls_rep)

Train the Claim-Evidence Pair Classifier:

In [14]:
def train_evi_retrieval(net, loss_criterion, opti, train_loader, dev_loader, dev_claims, gpu, max_eps, grad_step_period):
    """
    Train the claim-evidence pair classifier with gradient accumulation.

    After every epoch the model is evaluated on the dev set via `evaluate`, and
    its state dict is saved to `er_model_params_filename` whenever the dev F1
    improves on the best value seen so far.

    :param net: the classifier to train (moved to the GPU by the caller).
    :param loss_criterion: loss taking (logits of shape [B], float labels of shape [B]).
    :param opti: optimizer over the parameters of `net`.
    :param train_loader: yields (seq, attn_masks, segment_ids, labels) batches.
    :param dev_loader: loader handed to `evaluate`.
    :param dev_claims: dev claims with their ground-truth evidences, handed to `evaluate`.
    :param gpu: CUDA device index.
    :param max_eps: number of epochs to train for.
    :param grad_step_period: number of batches whose gradients are accumulated per optimizer step.
    :return: list with the mean training loss of each epoch.
    """
    best_f1 = 0
    mean_losses = [0] * max_eps

    for ep in range(max_eps):
        net.train()  # Good practice to set the mode of the model
        st = time.time()
        opti.zero_grad()
        count = 0
        train_acc = 0.0
        has_pending_grads = False

        for i, (seq, attn_masks, segment_ids, labels) in enumerate(train_loader):
            # Extracting the tokens ids, attention masks and token type ids
            seq, attn_masks, segment_ids, labels = seq.cuda(gpu), attn_masks.cuda(gpu), segment_ids.cuda(gpu), labels.cuda(gpu)

            # Obtaining the logits from the model
            logits = net(seq, attn_masks, segment_ids)

            # Computing loss
            loss = loss_criterion(logits.squeeze(-1), labels.float())

            mean_losses[ep] += loss.item()
            count += 1

            # Binary accuracy is computed inline. The notebook later redefines
            # get_accuracy_from_logits with multi-class (softmax/argmax) semantics,
            # which would silently report a wrong number here once that cell has run.
            # .item() keeps the running sum a plain float instead of a device tensor.
            preds = (torch.sigmoid(logits.detach()).reshape(-1) > 0.5).long()
            train_acc += (preds == labels.reshape(-1)).float().mean().item()

            # Normalise the loss so the accumulated gradient matches one larger batch
            # (the full batch does not fit on the GPU), then accumulate gradients.
            (loss / grad_step_period).backward()
            has_pending_grads = True

            if (i + 1) % grad_step_period == 0:
                # Optimization step, apply the gradients
                opti.step()

                # Reset/Clear gradients
                opti.zero_grad()
                has_pending_grads = False

            if i % 100 == 0:
                print("Iteration {} of epoch {} complete. Time taken (s): {}".format(i, ep, (time.time() - st)))
                st = time.time()

        # Apply gradients left over when the number of batches is not a multiple of
        # grad_step_period; they used to be discarded by the next zero_grad().
        if has_pending_grads:
            opti.step()
            opti.zero_grad()

        num_batches = max(count, 1)  # avoid ZeroDivisionError on an empty loader
        mean_losses[ep] /= num_batches
        print(f"Epoch {ep} completed. Loss: {mean_losses[ep]}, Accuracy: {train_acc / num_batches}.\n")

        dev_st = time.time()
        print("Evaluating on the dev set... (This might take a while)")
        f1, recall, precision, dev_loss = evaluate(net, dev_loader, dev_claims, loss_criterion, gpu)
        print("\nEpoch {} completed! Evaluation on dev set took {} seconds.\nDevelopment F1: {}; Development Recall: {}; Development Precision: {}; Dev Loss: {}".format(ep, time.time() - dev_st, f1, recall, precision, dev_loss))

        if f1 > best_f1:
            print("Best development f1 improved from {} to {}, saving model...\n".format(best_f1, f1))
            best_f1 = f1
            torch.save(net.state_dict(), er_model_params_filename)
        else:
            print()

    return mean_losses

In [15]:
def get_accuracy_from_logits(logits, labels):
    """Binary accuracy: fraction of sigmoid(logit) > 0.5 predictions that equal `labels`.

    Note: a multi-class function with this same name is defined further down the
    notebook and replaces this one once that cell has been executed.
    """
    is_positive = torch.sigmoid(logits.unsqueeze(-1)) > 0.5
    hits = is_positive.long().squeeze() == labels
    return hits.float().mean()

def get_probs_from_logits(logits):
    """Turn logits into a flat 1-D tensor of sigmoid probabilities.

    The result always has one entry per logit. The previous bare `.squeeze()`
    collapsed a batch of one into a 0-d tensor, which no longer lines up with
    the per-sample id lists it is later paired with.
    """
    return torch.sigmoid(logits).reshape(-1)

In [16]:
def select_evi_df(df, threshold, max_evidences):
    """
    Keep at most <max_evidences> rows of <df> whose 'probs' value is strictly
    above <threshold>, highest probability first.

    If no row passes the threshold, fall back to the row(s) holding the
    maximum probability.
    """
    probs = df['probs']
    fallback = df[probs == probs.max()]

    confident = df[probs > threshold].nlargest(max_evidences, "probs")

    return fallback if len(confident) == 0 else confident

def predict_evi(net, dataloader, gpu, threshold=evidence_selection_threshold, max_evidences=max_evi, evaluate=False, evaluation_claims=None, loss_criterion=None):
    """
    Score every claim-evidence pair in `dataloader` and select evidences per claim.

    :param net: the claim-evidence pair classifier.
    :param dataloader: yields (seq, attn_masks, segment_ids, claim_ids, evidence_ids) batches.
    :param gpu: CUDA device index.
    :param threshold: minimum probability for an evidence to be selected (see `select_evi_df`).
    :param max_evidences: maximum number of evidences kept per claim.
    :param evaluate: when True, also compute the mean batch loss against the
        ground truth in `evaluation_claims`.
    :param evaluation_claims: claims with ground-truth 'evidences'; required when evaluate=True.
    :param loss_criterion: loss function; required when evaluate=True.
    :return: a defaultdict mapping claim_id -> list of selected evidence ids, or
        the tuple (that dict, mean loss per batch) when evaluate=True.
    :raises ValueError: if evaluate=True but evaluation_claims or loss_criterion is missing.
    """
    if evaluate and (evaluation_claims is None or loss_criterion is None):
        raise ValueError("evaluate=True requires both evaluation_claims and loss_criterion.")

    net.eval()

    claim_evidences = defaultdict(list)
    # Collected per batch and assembled into one DataFrame after the loop;
    # concatenating DataFrames inside the loop is quadratic in the number of batches.
    all_claim_ids, all_evidence_ids, all_probs = [], [], []
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():  # suspend grad track, save time and memory
        for seq, attn_masks, segment_ids, claim_ids, evidence_ids in dataloader:
            seq, attn_masks, segment_ids = seq.cuda(gpu), attn_masks.cuda(gpu), segment_ids.cuda(gpu)
            logits = net(seq, attn_masks, segment_ids)

            if evaluate:
                labels = torch.tensor([
                    1 if evidence_id in evaluation_claims[claim_id]['evidences'] else 0
                    for claim_id, evidence_id in zip(claim_ids, evidence_ids)
                ]).cuda(gpu)
                total_loss += loss_criterion(logits.squeeze(-1), labels.float()).item()

            probs = get_probs_from_logits(logits)

            all_claim_ids.extend(claim_ids)
            all_evidence_ids.extend(evidence_ids)
            # reshape(-1) keeps one probability per pair even for a batch of one.
            all_probs.append(probs.cpu().reshape(-1).numpy())
            num_batches += 1

    if num_batches == 0:
        # Nothing to score: return empty results instead of failing downstream.
        return claim_evidences if not evaluate else (claim_evidences, 0.0)

    df = pd.DataFrame({"claim_ids": all_claim_ids, "evidence_ids": all_evidence_ids, "probs": np.concatenate(all_probs)})

    # groupby gives a df for each claim_ids, then for each df, apply() the selection, finally reset_index to get rid of the multi-index
    filtered_claim_evidences_df = df.groupby('claim_ids').apply(lambda x: select_evi_df(x, threshold, max_evidences)).reset_index(drop=True)

    for claim_id, evidence_id in zip(filtered_claim_evidences_df['claim_ids'], filtered_claim_evidences_df['evidence_ids']):
        claim_evidences[claim_id].append(evidence_id)

    return claim_evidences if not evaluate else (claim_evidences, total_loss / num_batches)

In [17]:
def evaluate(net, dataloader, dev_claims, loss_criterion, gpu):
    """
    Evaluate the evidence retrieval model on the dev set.

    Per-claim precision, recall and F1 of the retrieved evidences are computed
    against the ground truth in `dev_claims`, then averaged over claims.

    :return: (mean F1, mean recall, mean precision, loss reported by predict_evi)
    """
    claim_evidences, loss = predict_evi(net, dataloader, gpu, evaluate=True, evaluation_claims=dev_claims, loss_criterion=loss_criterion)

    fscores, recalls, precisions = [], [], []

    for claim_id, retrieved in claim_evidences.items():
        e_true = dev_claims[claim_id]['evidences']
        num_hits = len([e for e in retrieved if e in e_true])

        recall = num_hits / len(e_true)
        precision = num_hits / len(retrieved)
        if precision + recall != 0:
            fscore = 2 * (precision * recall) / (precision + recall)
        else:
            fscore = 0.0

        fscores.append(fscore)
        precisions.append(precision)
        recalls.append(recall)

    # An empty score list averages to 0.0 instead of NaN.
    mean_f = np.mean(fscores if fscores else [0.0])
    mean_recall = np.mean(recalls if recalls else [0.0])
    mean_precision = np.mean(precisions if precisions else [0.0])

    return mean_f, mean_recall, mean_precision, loss  # F1 Score, recall, precision, loss

In [18]:
def extract_er_result(claim_evidences, claims, filename=er_result_filename):
    """
    Attach the retrieved evidences to a deep copy of `claims` and save it as JSON.

    This step only fills in the 'evidences' of each claim; labels are left as
    they are in `claims`.

    :param claim_evidences: mapping claim_id -> list of retrieved evidence ids.
        Claims missing from it get an empty evidence list.
    :param claims: the claims to annotate (not modified).
    :param filename: path of the JSON file to write.
    :return: the annotated copy of `claims`.
    """
    extracted_claims = copy.deepcopy(claims)

    for c in extracted_claims:
        # .get avoids a KeyError on plain dicts and avoids inserting new keys into a
        # defaultdict; list() stops the result from aliasing the caller's lists.
        extracted_claims[c]["evidences"] = list(claim_evidences.get(c, []))

    with open(filename, 'w') as f:
        json.dump(extracted_claims, f)

    return extracted_claims

Create the evidence retrieval classifier and optimizer:

In [19]:
net_er = CFEVERERClassifier()
net_er.cuda(gpu) #Enable gpu support for the model

# Training pairs hold er_pos_neg_sample_ratio negatives per positive, so positives are
# up-weighted by that factor. `pos_weight` scales only the positive term of the loss;
# the previous `weight=` argument multiplied every element's loss by the same constant
# and therefore did nothing to re-balance the classes.
# NOTE(review): this shifts the predicted probabilities, so evidence_selection_threshold
# may need re-tuning on the dev set.
loss_criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([float(er_pos_neg_sample_ratio)])).cuda(gpu)
opti_er_pre = optim.Adam(net_er.parameters(), lr=opti_lr_er_pre)

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

### **Evidence Retrieval - Training**

This phase trains the BERT model on claim-evidence pairs made of all positive evidences for each claim plus negative evidences sampled from that claim's top TF-IDF candidates (`er_pos_neg_sample_ratio` negatives per positive), for the pre-defined number of epochs. The model is evaluated on the dev set after each epoch, and the best checkpoint is kept for HNM and further fine-tuning later.

Create the respective data loaders:

In [None]:
# Build the training and development datasets.
train_set = CFEVERERTrainDataset(
    claims=train_claims,
    evidences_=evidences,
    tokenizer=bert_tokenizer,
)
dev_set = CFEVERERTestDataset(
    claims=dev_claims,
    evidences_=evidences,
    tokenizer=bert_tokenizer,
    max_candidates=100,
)

# Wrap both datasets in data loaders.
train_loader = DataLoader(
    train_set,
    batch_size=loader_batch_size,
    num_workers=loader_worker_num,
)
dev_loader = DataLoader(
    dev_set,
    batch_size=loader_batch_size,
    num_workers=loader_worker_num,
)

Do training:

In [None]:
# First phase: train the model on the positive and sampled negative claim-evidence pairs held in train_set
train_evi_retrieval(
    net_er,
    loss_criterion,
    opti_er_pre,
    train_loader,
    dev_loader,
    dev_claims,
    gpu,
    max_eps=num_epoch_pre,
    grad_step_period=grad_step_period_pre,
)

### **Evidence Retrieval Baseline Model (TFIDF)**

In [21]:
def tfidf_cos_baseline(claims, evidences, evidence_select_num=5):
    """
    TF-IDF baseline: for every claim, retrieve the <evidence_select_num>
    evidences with the highest TF-IDF cosine similarity, then score the
    retrieval against the claims' ground-truth evidences.

    :return: (mean F1, mean recall, mean precision) averaged over claims.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf.fit(list(evidences.values()) + [claims[c]["claim_text"] for c in claims])
    evidences_tfidf = tfidf.transform(evidences.values())

    # Retrieval: top-k evidence ids per claim.
    claim_evidences = {}
    for c in claims:
        claim_tfidf = tfidf.transform([claims[c]["claim_text"]])
        cos_sims = cosine_similarity(claim_tfidf, evidences_tfidf).squeeze()

        ranked = pd.DataFrame({"evidences": evidences.keys(), "similarity": cos_sims})
        ranked = ranked.sort_values(by=['similarity'], ascending=False)
        claim_evidences[c] = ranked["evidences"].iloc[:evidence_select_num].tolist()

    # Scoring: per-claim precision / recall / F1.
    fscores, recalls, precisions = [], [], []
    for claim_id, retrieved in claim_evidences.items():
        e_true = claims[claim_id]['evidences']
        num_hits = len([e for e in retrieved if e in e_true])

        recall = num_hits / len(e_true)
        precision = num_hits / len(retrieved)
        if precision + recall != 0:
            fscore = 2 * (precision * recall) / (precision + recall)
        else:
            fscore = 0.0

        fscores.append(fscore)
        precisions.append(precision)
        recalls.append(recall)

    # An empty score list averages to 0.0 instead of NaN.
    mean_f = np.mean(fscores if fscores else [0.0])
    mean_recall = np.mean(recalls if recalls else [0.0])
    mean_precision = np.mean(precisions if precisions else [0.0])

    return mean_f, mean_recall, mean_precision  # F1 Score, recall, precision

In [22]:
# Score the TF-IDF baseline on the dev claims and print a small report.
f1, recall, precision = tfidf_cos_baseline(dev_claims, evidences)
print(
    "------Evidence Retrival Baseline Performance------\n"
    f"F1-Score: {f1}\n"
    f"Recall-Score: {recall}\n"
    f"Precision-Score: {precision}\n"
    "--------------------------------------------------"
)

------Evidence Retrival Baseline Performance------
F1-Score: 0.09012059369202229
Recall-Score: 0.14469696969696969
Precision-Score: 0.07272727272727274
--------------------------------------------------


### **Claim Label Classification - Function Declarations**

In [23]:
clc_model_params_filename = path_prefix + 'cfeverlabelcls.dat'  # where train_claim_cls saves the best-scoring state dict

# ----------Hyperparameters of the entire pipeline----------
# --------------Claim Label Classification--------------
# NOTE(review): d_bert_base, gpu, input_seq_max_len, loader_batch_size and
# loader_worker_num are re-assigned here with the same values they were given in the
# evidence-retrieval hyperparameter cell.
d_bert_base = 768
gpu = 0
input_seq_max_len = 384
loader_batch_size = 16
loader_worker_num = 2
num_epoch = 9  # default max_eps of train_claim_cls
# Output width of CFEVERLabelClassifier's linear head.
# NOTE(review): only three labels are mapped below (DISPUTED claims are skipped when
# training pairs are built), so class index 3 has no entry in label_mapper_itol -
# confirm the fourth output is intended.
num_of_classes = 4
opti_lr_clc = 2e-5  # presumably the label classifier's learning rate; its optimizer is not created in the cells shown here - TODO confirm
label_mapper_ltoi = {'SUPPORTS': 0, 'REFUTES': 1, 'NOT_ENOUGH_INFO': 2}  # label name -> class index
label_mapper_itol = {0: 'SUPPORTS', 1: 'REFUTES', 2: 'NOT_ENOUGH_INFO'}  # class index -> label name
# ------------------------------------------------------

In [24]:
class CFEVERLabelTrainDataset(Dataset):
    """Training dataset of labelled claim-evidence pairs for the Claim Label Classification task.

    Each item is one (claim, ground-truth evidence) pair tokenized for BERT,
    with the class index produced by `unroll_train_claim_evidence_pairs`.

    :param claims: claims with 'claim_text', 'claim_label' and 'evidences'.
    :param evidences_: mapping evidence_id -> evidence text.
    :param max_len: length every encoded pair is padded/truncated to.
    :param tokenizer: optional tokenizer to reuse, mirroring the evidence
        retrieval datasets. When omitted, a 'bert-base-uncased' BertTokenizer is
        loaded, as before.
    """

    def __init__(self, claims, evidences_, max_len=input_seq_max_len, tokenizer=None):
        self.data_set = unroll_train_claim_evidence_pairs(claims)
        self.max_len = max_len
        self.claims = claims
        self.evidences = evidences_

        # Reuse the caller's tokenizer when given, so it is not reloaded per dataset.
        self.tokenizer = tokenizer if tokenizer is not None else BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.data_set)

    def __getitem__(self, index):
        claim_id, evidence_id, label = self.data_set[index]

        # Preprocessing the text to be suitable for BERT
        claim_evidence_in_tokens = self.tokenizer.encode_plus(self.claims[claim_id]['claim_text'], self.evidences[evidence_id],
                                                              return_tensors='pt', padding='max_length', truncation=True,
                                                              max_length=self.max_len, return_token_type_ids=True)

        # Drop the leading batch dimension added by return_tensors='pt'.
        seq = claim_evidence_in_tokens['input_ids'].squeeze(0)
        attn_masks = claim_evidence_in_tokens['attention_mask'].squeeze(0)
        segment_ids = claim_evidence_in_tokens['token_type_ids'].squeeze(0)

        return seq, attn_masks, segment_ids, label

In [25]:
def unroll_train_claim_evidence_pairs(claims):
    """
    Flatten the claims into (claim_id, evidence_id, label_index) training triples.

    Rule: every evidence of a claim inherits that claim's label; claims
    labelled DISPUTED are left out entirely.
    """
    return [
        (claim_id, evidence_id, label_mapper_ltoi[claims[claim_id]['claim_label']])
        for claim_id in claims
        if claims[claim_id]['claim_label'] != 'DISPUTED'
        for evidence_id in claims[claim_id]['evidences']
    ]

In [26]:
class CFEVERLabelTestDataset(Dataset):
    """Dev/test dataset of claim-evidence pairs for the Claim Label Classification task.

    Each item is one (claim, evidence) pair tokenized for BERT, plus the
    claim id so per-pair predictions can be grouped by claim.

    :param claims: claims with 'claim_text' and 'evidences'.
    :param evidences_: mapping evidence_id -> evidence text.
    :param max_len: length every encoded pair is padded/truncated to.
    :param tokenizer: optional tokenizer to reuse, mirroring the evidence
        retrieval datasets. When omitted, a 'bert-base-uncased' BertTokenizer is
        loaded, as before.
    """

    def __init__(self, claims, evidences_, max_len=input_seq_max_len, tokenizer=None):
        self.data_set = unroll_test_claim_evidence_pairs(claims)
        self.max_len = max_len
        self.claims = claims
        self.evidences = evidences_

        # Reuse the caller's tokenizer when given, so it is not reloaded per dataset.
        self.tokenizer = tokenizer if tokenizer is not None else BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.data_set)

    def __getitem__(self, index):
        claim_id, evidence_id = self.data_set[index]

        # Preprocessing the text to be suitable for BERT
        claim_evidence_in_tokens = self.tokenizer.encode_plus(self.claims[claim_id]['claim_text'], self.evidences[evidence_id],
                                                              return_tensors='pt', padding='max_length', truncation=True,
                                                              max_length=self.max_len, return_token_type_ids=True)

        # Drop the leading batch dimension added by return_tensors='pt'.
        seq = claim_evidence_in_tokens['input_ids'].squeeze(0)
        attn_masks = claim_evidence_in_tokens['attention_mask'].squeeze(0)
        segment_ids = claim_evidence_in_tokens['token_type_ids'].squeeze(0)

        return seq, attn_masks, segment_ids, claim_id

In [27]:
def unroll_test_claim_evidence_pairs(claims):
    """Flatten the claims into (claim_id, evidence_id) pairs, one per listed evidence."""
    return [
        (claim_id, evidence_id)
        for claim_id in claims
        for evidence_id in claims[claim_id]['evidences']
    ]

In [28]:
class CFEVERLabelClassifier(nn.Module):
    """BERT encoder followed by a linear head producing `num_of_classes` logits per claim-evidence pair."""

    def __init__(self):
        super().__init__()

        # Pre-trained BERT encoder.
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Linear head: [CLS] embedding (d_bert_base wide, 768 for bert-base) -> one logit per class.
        self.cls_layer = nn.Linear(in_features=d_bert_base, out_features=num_of_classes)

    def forward(self, seq, attn_masks, segment_ids):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contribution of PAD tokens
            -segment_ids : Tensor of shape [B, T] containing token ids of segment embeddings (see BERT paper for more details)
        Returns:
            Logits of shape [B, num_of_classes].
        '''
        encoder_out = self.bert(seq, attention_mask=attn_masks, token_type_ids=segment_ids, return_dict=True)

        # The first token of every sequence is [CLS]; use its contextual embedding.
        cls_rep = encoder_out.last_hidden_state[:, 0]

        return self.cls_layer(cls_rep)  # logits shape is [B, num_of_classes]

In [29]:
def train_claim_cls(net, loss_criterion, opti, train_loader, dev_loader, dev_claims, gpu, max_eps=num_epoch):
    """
    Train the claim label classifier, one optimizer step per batch.

    After every epoch the model is scored with `evaluate_dev`, and its state
    dict is saved to `clc_model_params_filename` whenever the dev accuracy
    improves on the best value seen so far.

    Training accuracy uses `get_accuracy_from_logits`, which must be the
    multi-class version here; an earlier cell of the notebook defines a binary
    function with the same name.

    :param net: the label classifier (moved to the GPU by the caller).
    :param loss_criterion: loss taking (logits of shape [B, num_of_classes], integer labels of shape [B]).
    :param opti: optimizer over the parameters of `net`.
    :param train_loader: yields (seq, attn_masks, segment_ids, label) batches.
    :param dev_loader: loader handed to `evaluate_dev`.
    :param dev_claims: dev claims handed to `evaluate_dev`.
    :param gpu: CUDA device index.
    :param max_eps: number of epochs to train for.
    :return: list with the mean training loss of each epoch.
    """
    best_acc = 0
    mean_losses = [0] * max_eps

    for ep in range(max_eps):
        net.train()  # Good practice to set the mode of the model
        st = time.time()
        train_acc = 0.0
        count = 0

        for i, (b_seq, b_attn_masks, b_segment_ids, b_label) in enumerate(train_loader):
            # Reset/Clear gradients
            opti.zero_grad()

            # Extracting the tokens ids, attention masks and token type ids
            b_seq, b_attn_masks, b_segment_ids, b_label = b_seq.cuda(gpu), b_attn_masks.cuda(gpu), b_segment_ids.cuda(gpu), b_label.cuda(gpu)

            # Obtaining the logits from the model
            logits = net(b_seq, b_attn_masks, b_segment_ids)

            # Computing loss
            loss = loss_criterion(logits, b_label)

            mean_losses[ep] += loss.item()
            count += 1
            # float() keeps the running sum a plain Python number, so the epoch summary
            # prints a value rather than a device tensor repr.
            train_acc += float(get_accuracy_from_logits(logits, b_label))

            # Backpropagating the gradients, account for gradients
            loss.backward()

            # Optimization step, apply the gradients
            opti.step()

            if i % 100 == 0:
                print("Iteration {} of epoch {} complete. Time taken (s): {}".format(i, ep, (time.time() - st)))
                st = time.time()

        num_batches = max(count, 1)  # avoid ZeroDivisionError on an empty loader
        mean_losses[ep] /= num_batches
        print(f"Epoch {ep} completed. Loss: {mean_losses[ep]}, Accuracy: {train_acc / num_batches}.\n")

        dev_acc = evaluate_dev(net, dev_loader, dev_claims, gpu)
        print("\nEpoch {} complete! Development Accuracy on dev claim labels: {}.".format(ep, dev_acc))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...\n".format(best_acc, dev_acc))
            best_acc = dev_acc
            torch.save(net.state_dict(), clc_model_params_filename)
        else:
            print()

    return mean_losses

In [30]:
def get_accuracy_from_logits(logits, labels):
    """Return the share of rows whose highest-scoring class equals its label.

    Args:
        logits: score tensor with one row per example and one column per class.
        labels: tensor of gold class indices.

    Returns:
        A 0-d float tensor holding the mean of the per-row hit indicators.
    """
    class_probabilities = F.softmax(logits, dim=-1)
    top_classes = class_probabilities.argmax(dim=1).squeeze()
    hits = top_classes == labels
    return hits.float().mean()

def get_predictions_from_logits(logits):
    """Return the predicted class index of every row of ``logits``.

    Args:
        logits: score tensor of shape [B, num_classes].

    Returns:
        A 1-D tensor of length B with the arg-max class per row. The result is
        deliberately not squeezed: for a batch of one, squeezing would produce a
        0-d tensor that can no longer be paired element-wise with the batch's ids.
    """
    # Softmax is monotonic per row, so the arg-max can be taken on the raw logits.
    return torch.argmax(logits, dim=1)

In [31]:
def predict_pairs(net, dataloader, gpu):
    """Predict a label for every claim-evidence pair and group the labels by claim.

    Args:
        net: model called as ``net(seq, attn_masks, segment_ids)`` returning logits.
        dataloader: iterable of ``(seq, attn_masks, segment_ids, claim_ids)`` batches.
        gpu: CUDA device index the input tensors are moved to.

    Returns:
        defaultdict(list): claim_id -> predicted class indices (plain ints), in
        the order the pairs were produced by ``dataloader``.

    Raises:
        ValueError: if a batch yields a different number of ids and predictions.
    """
    net.eval()

    claim_evidence_labels = defaultdict(list)

    with torch.no_grad():
        for seq, attn_masks, segment_ids, claim_ids in dataloader:
            seq, attn_masks, segment_ids = seq.cuda(gpu), attn_masks.cuda(gpu), segment_ids.cuda(gpu)
            logits = net(seq, attn_masks, segment_ids)

            # reshape(-1) keeps a single-item batch 1-D even if the helper returns a 0-d tensor.
            preds = get_predictions_from_logits(logits).reshape(-1).cpu().tolist()

            # Ids collated into a tensor are converted so they can serve as dict keys.
            if torch.is_tensor(claim_ids):
                claim_ids = claim_ids.tolist()

            if len(claim_ids) != len(preds):
                raise ValueError(f"Batch has {len(claim_ids)} claim ids but {len(preds)} predictions.")

            # Group directly instead of growing a DataFrame batch by batch and
            # walking it row by row afterwards.
            for claim_id, label in zip(claim_ids, preds):
                claim_evidence_labels[claim_id].append(label)

    return claim_evidence_labels

In [32]:
def decide_claim_labels_majority_vote(net, dataloader, gpu):
    """Assign each claim the label predicted most often across its evidence pairs.

    Pair-level predictions come from ``predict_pairs``; the most frequent class
    index per claim is translated to its label name through ``label_mapper_itol``.
    On a tie, ``Counter.most_common`` keeps the class that was encountered first.

    Returns:
        dict: claim_id -> label name.
    """
    votes_by_claim = predict_pairs(net, dataloader, gpu)

    return {
        claim_id: label_mapper_itol[Counter(votes).most_common(1)[0][0]]
        for claim_id, votes in votes_by_claim.items()
    }

In [33]:
def decide_claim_labels_rule_aggregation(net, dataloader, gpu):
    """Derive one label per claim from its pair-level predictions using fixed rules.

    Rules, applied to the set of distinct pair labels of a claim:
      * exactly one distinct label -> that label;
      * two distinct labels, one of them NOT_ENOUGH_INFO -> the other label;
      * anything else -> "DISPUTED".

    Returns:
        dict: claim_id -> label name.
    """
    pair_predictions = predict_pairs(net, dataloader, gpu)
    claim_labels = {}

    for claim_id, pair_labels in pair_predictions.items():
        distinct = set(pair_labels)
        verdict = "DISPUTED"  # fallback for conflicting evidence

        if len(distinct) == 1:
            verdict = label_mapper_itol[pair_labels[0]]
        elif len(distinct) == 2:
            nei = label_mapper_ltoi['NOT_ENOUGH_INFO']
            if nei in distinct:
                # NOT_ENOUGH_INFO is neutral: the remaining label (supports/refutes) decides.
                verdict = label_mapper_itol[(distinct - {nei}).pop()]

        claim_labels[claim_id] = verdict

    return claim_labels

In [34]:
def evaluate_dev(net, dataloader, dev_claims, gpu):
    """Compute claim-label accuracy on the dev set using majority-vote aggregation.

    Args:
        net: classifier forwarded to ``decide_claim_labels_majority_vote``.
        dataloader: loader of dev claim-evidence pairs.
        dev_claims: mapping claim_id -> claim dict holding the gold 'claim_label'.
        gpu: CUDA device index.

    Returns:
        float: fraction of dev claims whose predicted label equals the gold label.
        A claim that received no prediction (for example because it produced no
        pairs) counts as wrong instead of raising KeyError. An empty ``dev_claims``
        yields 0.0.
    """
    if not dev_claims:
        return 0.0

    claim_labels = decide_claim_labels_majority_vote(net, dataloader, gpu)

    correct_labels = sum(
        1 for claim_id, claim in dev_claims.items()
        if claim_labels.get(claim_id) == claim["claim_label"]
    )

    return correct_labels / len(dev_claims)  # claim label accuracy

In [35]:
# Instantiate the claim-label classifier and move it to the configured CUDA device.
net_clc = CFEVERLabelClassifier()
net_clc.cuda(gpu) #Enable gpu support for the model

# Count training evidences per claim label; DISPUTED claims are left out of the counts.
class_counts = defaultdict(int)
for cid in train_claims:
    if train_claims[cid]['claim_label'] != 'DISPUTED':
        class_counts[train_claims[cid]['claim_label']] += len(train_claims[cid]['evidences'])

# Inverse-frequency weight per label, in label_mapper_ltoi key order.
# NOTE(review): a label in label_mapper_ltoi without any counted evidence would divide by zero here -
# confirm the mapper in scope at this point contains only counted labels.
class_weights = torch.tensor([(sum(class_counts.values()) / class_counts[c]) for c in label_mapper_ltoi.keys()])
# The loss is currently unweighted: class_weights is only referenced in the commented-out tail of this line.
loss_criterion = nn.CrossEntropyLoss()#weight=class_weights).cuda(gpu)
opti_clc = optim.Adam(net_clc.parameters(), lr=opti_lr_clc)

### **Claim Label Classification - Training**

Integrate noisy evidence that inherits the biases of the evidence retrieval model from the preceding task:

In [None]:
# Restore the evidence-retrieval model weights from the checkpoint stored at er_model_params_filename.
net_er.load_state_dict(torch.load(er_model_params_filename))

<All keys matched successfully>

Get predictions using the preceding ER model for training.

In [None]:
# Build evidence-retrieval test datasets over the train and dev claims so the ER model can be run on both.
# NOTE(review): max_candidates=100 presumably caps the candidate evidences per claim - confirm in CFEVERERTestDataset.
test_train_set = CFEVERERTestDataset(train_claims, evidences, bert_tokenizer, max_candidates=100)
test_dev_set = CFEVERERTestDataset(dev_claims, evidences, bert_tokenizer, max_candidates=100)

# Batch both datasets with the shared loader settings.
test_train_loader = DataLoader(test_train_set, batch_size=loader_batch_size, num_workers=loader_worker_num)
test_dev_loader = DataLoader(test_dev_set, batch_size=loader_batch_size, num_workers=loader_worker_num)

Finished unrolling test claim-evidence pairs in 1095.279700756073 seconds.
Finished unrolling test claim-evidence pairs in 184.49401569366455 seconds.


In [None]:
# Run the ER model over both loaders; the results are indexed by claim id in the cells that follow.
train_claim_evidences = predict_evi(net_er, test_train_loader, gpu)
dev_claim_evidences = predict_evi(net_er, test_dev_loader, gpu)

In [None]:
# Training claims: gold evidences plus the evidences retrieved by the ER model (label noise on purpose).
clc_train_claims = copy.deepcopy(train_claims)

for cid in clc_train_claims:
    merged_evidences = clc_train_claims[cid]['evidences'] + list(train_claim_evidences[cid])
    # De-duplicate while keeping first-seen order. list(set(...)) would order string ids by
    # their randomised hashes, making the training data differ from run to run.
    clc_train_claims[cid]['evidences'] = list(dict.fromkeys(merged_evidences))

# Dev claims: only the retrieved evidences, mirroring what is available at test time.
clc_dev_claims = copy.deepcopy(dev_claims)

for cid in clc_dev_claims:
    # Copy so later changes to a claim's evidence list cannot leak back into dev_claim_evidences.
    clc_dev_claims[cid]['evidences'] = list(dev_claim_evidences[cid])

Create the respective data loaders:

In [None]:
# Datasets for label classification built from the evidence-augmented claim copies.
train_set = CFEVERLabelTrainDataset(clc_train_claims, evidences)
dev_set = CFEVERLabelTestDataset(clc_dev_claims, evidences)

# NOTE(review): the training loader is created without shuffle=True, so batches follow the dataset's
# own item order every epoch - confirm this is intended.
train_loader = DataLoader(train_set, batch_size=loader_batch_size, num_workers=loader_worker_num)
dev_loader = DataLoader(dev_set, batch_size=loader_batch_size, num_workers=loader_worker_num)

Do Training:

In [None]:
# Train the label classifier; the function's return value (per-epoch mean losses) becomes the cell output.
train_claim_cls(net_clc, loss_criterion, opti_clc, train_loader, dev_loader, clc_dev_claims, gpu)

### **Claim Label Classification - Zero R Baseline**

In [36]:
def zero_r_label_cls_baseline(train_claims, dev_claims):
    """Zero-R baseline: always predict the most frequent training label.

    Args:
        train_claims: mapping claim_id -> claim dict with a "claim_label" entry.
        dev_claims: mapping claim_id -> claim dict with a 'claim_label' entry.

    Returns:
        float: share of dev claims whose gold label is the training majority label.
    """
    label_frequencies = Counter(claim["claim_label"] for claim in train_claims.values())
    majority_label = label_frequencies.most_common(1)[0][0]

    hits = sum(1 for claim in dev_claims.values() if claim['claim_label'] == majority_label)

    return hits / len(dev_claims)

In [37]:
# Score the Zero-R baseline on the dev claims and print it in a small banner.
acc = zero_r_label_cls_baseline(train_claims, dev_claims)
print("------Label Classification Baseline Performance------")
print(f"Accuracy: {acc}")
print("-----------------------------------------------------")

------Label Classification Baseline Performance------
Accuracy: 0.44155844155844154
-----------------------------------------------------


### **Predict evidences and labels for test claims**

In [None]:
# Destination of the final test predictions; read as a global by extract_claim_evi_labels below.
output_filename = path_prefix + 'test-claims-predictions.json'

In [None]:
def extract_claim_evi_labels(test_claims, claim_labels):
    """Attach predicted labels to the test claims and dump them as JSON.

    Args:
        test_claims: mapping claim_id -> claim dict; updated in place.
        claim_labels: mapping claim_id -> predicted label name.

    Returns:
        The same ``test_claims`` object, after it has been written to the
        module-level ``output_filename``.
    """
    for claim_id, predicted_label in claim_labels.items():
        test_claims[claim_id]["claim_label"] = predicted_label

    with open(output_filename, 'w') as out_file:
        json.dump(test_claims, out_file)

    print("Final test claims predictions file ready.")

    return test_claims

In [None]:
# Reload the saved evidence-retrieval weights before predicting on the test claims.
net_er.load_state_dict(torch.load(er_model_params_filename))

In [None]:
# Tokenizer, dataset and loader for running evidence retrieval on the test claims.
# Here CFEVERERTestDataset is built without max_candidates, i.e. with that parameter's default.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
test_set_er = CFEVERERTestDataset(test_claims, evidences, bert_tokenizer)
test_loader_er = DataLoader(test_set_er, batch_size=loader_batch_size, num_workers=loader_worker_num)

In [None]:
# Predict evidences for the test claims and rebind test_claims to what extract_er_result returns.
# NOTE(review): presumably this stores the predicted evidences on each claim - confirm in extract_er_result.
test_claims = extract_er_result(predict_evi(net_er, test_loader_er, gpu), test_claims)

In [None]:
# Load the label-classifier checkpoint written during training (clc_model_params_filename).
net_clc.load_state_dict(torch.load(clc_model_params_filename))

<All keys matched successfully>

In [None]:
# Label-classification dataset and loader over the test claims, using the evidences now attached to them.
test_set_clc = CFEVERLabelTestDataset(test_claims, evidences)
test_loader_clc = DataLoader(test_set_clc, batch_size=loader_batch_size, num_workers=loader_worker_num)

In [None]:
# Aggregate pair-level predictions by majority vote, then attach the labels and write the output file.
# NOTE(review): this two-argument call matches the extract_claim_evi_labels defined just above; a later
# cell defines the same name with an extra output_filename parameter - confirm which one is in scope.
claim_labels = decide_claim_labels_majority_vote(net_clc, test_loader_clc, gpu)
extract_claim_evi_labels(test_claims, claim_labels)

### **[Optional] Evidence Retrieval - Hard Negative Mining**

In [None]:
def unroll_train_claim_evidences_with_hne(claims, evidences_, claim_hard_negative_evidences, hne_sample_ratio=0.5):
    """Flatten the training claims into shuffled (claim_id, evidence_id, label) triples.

    For every claim, the triples produced by ``generate_random_train_evidence_samples``
    come first, followed by one label-0 triple per mined hard negative of that claim.
    The complete list is shuffled in place before it is returned.

    Args:
        claims: mapping claim_id -> claim dict with an 'evidences' list.
        evidences_: evidence collection handed to the random sampler.
        claim_hard_negative_evidences: mapping claim_id -> hard negative evidence ids.
        hne_sample_ratio: ratio forwarded to the random sampler.

    Returns:
        list of (claim_id, evidence_id, label) tuples.
    """
    started_at = time.time()

    pairs = []

    for claim_id, claim in claims.items():
        sampled = generate_random_train_evidence_samples(evidences_, claim['evidences'], hne_sample_ratio)
        pairs.extend((claim_id, evidence_id, label) for evidence_id, label in sampled)

        # Hard negatives are always labelled 0.
        pairs.extend((claim_id, evidence_id, 0) for evidence_id in claim_hard_negative_evidences[claim_id])

    random.shuffle(pairs)
    print(f"Finished unrolling train claim-evidence pairs with hne in {time.time() - started_at} seconds.")

    return pairs

In [None]:
class CFEVERERHNMDataset(Dataset):
    """
    Dataset of all non-positive evidences of ONE claim, used to mine hard negatives
    with a pre-trained ER model.

    Note: unlike the normal train dataset, this dataset takes a single claim,
    because hard negative evidences are selected for one claim at a time.
    """
    def __init__(self, claim, evidences_, tokenizer, max_len=input_seq_max_len):
        """
        Args:
            claim: claim dict with 'claim_text' and 'evidences' (the positive evidence ids).
            evidences_: mapping evidence_id -> evidence text; iterated for its ids.
            tokenizer: tokenizer exposing ``encode_plus``.
            max_len: length every encoded pair is padded/truncated to.
        """
        # Hoist the positives into a set: the membership test below runs once per entry of
        # evidences_, and scanning the list each time is needless repeated work.
        positive_ids = set(claim['evidences'])
        self.data_set = [e for e in evidences_ if e not in positive_ids]  # get all negative samples
        self.max_len = max_len
        self.claim = claim
        self.evidences = evidences_
        self.target_hn_num = len(claim['evidences'])  # number of hard negative evidences to be selected
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of negative evidences available for this claim."""
        return len(self.data_set)

    def __getitem__(self, index):
        """Return (input_ids, attention_mask, token_type_ids, evidence_id) for one claim-evidence pair."""
        evidence_id = self.data_set[index]

        # Preprocessing the text to be suitable for BERT
        claim_evidence_in_tokens = self.tokenizer.encode_plus(self.claim['claim_text'], self.evidences[evidence_id], 
                                                              return_tensors='pt', padding='max_length', truncation=True,
                                                              max_length=self.max_len, return_token_type_ids=True)
        
        # squeeze(0) drops the leading batch dimension added by return_tensors='pt'
        seq, attn_masks, segment_ids = claim_evidence_in_tokens['input_ids'].squeeze(0), claim_evidence_in_tokens[
                'attention_mask'].squeeze(0), claim_evidence_in_tokens['token_type_ids'].squeeze(0)
    
        return seq, attn_masks, segment_ids, evidence_id

In [None]:
def hnm(net, train_claims, evidences_, tokenizer, gpu, hnm_threshold=hnm_threshold, hnm_batch_size=hnm_batch_size):
    """
    Hard negative mining: for each training claim, collect negative evidences that
    the ER model scores above ``hnm_threshold``.

    For one claim, batches of its negative evidences are scored until as many hard
    negatives as the claim has positive evidences were found (or the negatives run
    out). The resulting mapping is also written to ``claim_hard_negatives_filename``.

    Returns:
        defaultdict(list): claim_id -> list of hard negative evidence ids.
    """
    net.eval()
    tick = time.time()

    claim_hard_negative_evidences = defaultdict(list)  # claim_id -> mined hard negatives

    for claim_idx, claim_id in enumerate(train_claims):
        # Dataset holding every negative evidence of this single claim
        negatives_set = CFEVERERHNMDataset(train_claims[claim_id], evidences_, tokenizer)
        negatives_loader = DataLoader(negatives_set, batch_size=hnm_batch_size, num_workers=loader_worker_num)
        quota = negatives_set.target_hn_num

        with torch.no_grad():  # no gradient tracking needed for scoring
            for seq, attn_masks, segment_ids, evidence_ids in negatives_loader:
                seq, attn_masks, segment_ids = seq.cuda(gpu), attn_masks.cuda(gpu), segment_ids.cuda(gpu)
                probs = get_probs_from_logits(net(seq, attn_masks, segment_ids))

                # Batch positions scored above the threshold, i.e. hard negatives
                hard_rows = np.where(probs.cpu().numpy() > hnm_threshold)[0]

                # Take hard negatives in batch order, but never more than the quota still allows.
                collected = claim_hard_negative_evidences[claim_id]
                still_needed = max(quota - len(collected), 0)
                collected.extend(evidence_ids[row] for row in hard_rows[:still_needed])

                if len(collected) == quota:  # enough hard negatives for this claim
                    break

        if claim_idx % 50 == 0:
            print(f"{claim_idx}th claim finished in {time.time() - tick} seconds.")
            tick = time.time()

    with open(claim_hard_negatives_filename, 'w') as f:
        json.dump(claim_hard_negative_evidences, f)
        print("\nClaim hard negative evidences saved to file.")

    return claim_hard_negative_evidences

In [None]:
# Start the hard-negative phase from the best saved ER weights, with a dedicated AdamW optimizer.
net_er.load_state_dict(torch.load(er_model_params_filename))  # load the best model
# NOTE(review): weight_decay=0.15 is a hard-coded value with no named constant - confirm it is intentional.
opti_er_hne = AdamW(net_er.parameters(), lr=opti_lr_er_hne, weight_decay=0.15)



In [None]:
# Mine hard negatives with the restored ER model. To reuse a previous run instead, load the cached file:
#claim_hard_negative_evidences = json.load(open(claim_hard_negatives_filename, 'r'))
# hnm's signature is (net, train_claims, evidences_, tokenizer, gpu): the evidence collection
# must be passed before the tokenizer.
claim_hard_negative_evidences = hnm(net_er, train_claims, evidences, bert_tokenizer, gpu)

In [None]:
# Rebuild the training data with the mined hard negatives and wrap it in a fresh loader.
# NOTE(review): train_set must be a dataset that provides reset_data_hne; the most recent visible assignment
# of train_set is a CFEVERLabelTrainDataset - confirm the intended cell order before running.
train_set.reset_data_hne(claim_hard_negative_evidences)
train_loader = DataLoader(train_set, batch_size=loader_batch_size, num_workers=loader_worker_num)

**Evidence Retrieval - Training (Phase 2)**

In [None]:
# Second training phase of the ER model on the hard-negative-augmented data, using the phase-specific
# optimizer, epoch count and gradient step period.
# NOTE(review): loss_criterion and dev_loader are also rebound by the label-classification cells above -
# confirm the evidence-retrieval versions are the ones in scope here.
train_evi_retrieval(net_er, loss_criterion, opti_er_hne, train_loader, dev_loader, dev_claims, gpu, num_epoch_hne, grad_step_period_hne)

### **[Optional] Claim Label Classification - Function Declarations | Alternative Version: Evidence Concatenation**

In [38]:
# Checkpoint file for the claim-label classifier weights.
clc_model_params_filename = path_prefix + 'cfeverlabelcls.dat'

# ----------Hyperparameters of the entire pipeline----------
# --------------Claim Label Classification--------------
# Input width of the classification layer (nn.Linear(d_bert_base, num_of_classes)).
d_bert_base = 768
# CUDA device index passed to every .cuda(gpu) call.
gpu = 0
# Default max_len of the datasets: encoded sequences are padded/truncated to this length.
input_seq_max_len = 512
# DataLoader settings shared by all loaders.
loader_batch_size = 24
loader_worker_num = 2
# Default number of training epochs (max_eps of train_claim_cls).
num_epoch = 10
# NOTE(review): max_evi_num is not referenced by the code in this section - presumably a cap on
# evidences per claim; confirm whether it is still needed.
max_evi_num = 5
# Output width of the classification layer; matches the four entries of the label mappers below.
num_of_classes = 4
# Learning rate of the Adam optimizer for the classifier.
opti_lr_clc = 2e-5
# Label name <-> class index mappings (inverse of each other).
label_mapper_ltoi = {'SUPPORTS': 0, 'REFUTES': 1, 'NOT_ENOUGH_INFO': 2, 'DISPUTED': 3}
label_mapper_itol = {0: 'SUPPORTS', 1: 'REFUTES', 2: 'NOT_ENOUGH_INFO', 3: 'DISPUTED'}
# ------------------------------------------------------

In [39]:
class CFEVERLabelTrainDataset(Dataset):
    """Training dataset for claim label classification with concatenated evidences.

    Each item pairs one claim with the space-joined text of all its evidences
    (in random order) and the integer id of the claim's gold label.
    """

    def __init__(self, claims, evidences_, max_len=input_seq_max_len):
        """
        Args:
            claims: mapping claim_id -> claim dict with 'claim_text', 'evidences' and 'claim_label'.
            evidences_: mapping evidence_id -> evidence text.
            max_len: length every encoded sequence is padded/truncated to.
        """
        self.data_set = [claims[c] for c in claims]
        self.max_len = max_len
        self.claims = claims
        self.evidences = evidences_

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        """Number of claims in the dataset."""
        return len(self.data_set)

    def __getitem__(self, index):
        """Return (input_ids, attention_mask, token_type_ids, label_index) for one claim."""
        claim = self.data_set[index]

        # Shuffle a copy: shuffling claim['evidences'] itself would reorder the caller's
        # claim dict as a side effect of merely reading an item.
        claim_evidences = list(claim['evidences'])
        random.shuffle(claim_evidences)
        evidences_combined = " ".join([self.evidences[eid] for eid in claim_evidences])

        # Preprocessing the text to be suitable for BERT
        claim_evidence_in_tokens = self.tokenizer.encode_plus(claim['claim_text'], evidences_combined, 
                                                            return_tensors='pt', padding='max_length', truncation=True,
                                                            max_length=self.max_len, return_token_type_ids=True)
        
        # squeeze(0) drops the leading batch dimension added by return_tensors='pt'
        seq, attn_masks, segment_ids = claim_evidence_in_tokens['input_ids'].squeeze(0), claim_evidence_in_tokens[
                'attention_mask'].squeeze(0), claim_evidence_in_tokens['token_type_ids'].squeeze(0)

        return seq, attn_masks, segment_ids, label_mapper_ltoi[claim['claim_label']]

In [40]:
class CFEVERLabelTestDataset(Dataset):
    """Inference dataset for claim label classification with concatenated evidences.

    Each item pairs one claim with the space-joined text of all its evidences
    (in random order) and carries the claim id instead of a label.
    """

    def __init__(self, claims, evidences_, max_len=input_seq_max_len):
        """
        Args:
            claims: mapping claim_id -> claim dict with 'claim_text' and 'evidences'.
            evidences_: mapping evidence_id -> evidence text.
            max_len: length every encoded sequence is padded/truncated to.
        """
        self.data_set = [(c, claims[c]) for c in claims]
        self.max_len = max_len
        self.claims = claims
        self.evidences = evidences_

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        """Number of claims in the dataset."""
        return len(self.data_set)

    def __getitem__(self, index):
        """Return (input_ids, attention_mask, token_type_ids, claim_id) for one claim."""
        claim_id, claim = self.data_set[index]

        # Shuffle a copy: shuffling claim['evidences'] itself would reorder the evidence list
        # of the caller's claims, which are later written out as predictions.
        # NOTE(review): the evidence order is random at inference time too, so predictions can
        # vary between runs - confirm this is intended.
        claim_evidences = list(claim['evidences'])
        random.shuffle(claim_evidences)
        evidences_combined = " ".join([self.evidences[eid] for eid in claim_evidences])

        # Preprocessing the text to be suitable for BERT
        claim_evidence_in_tokens = self.tokenizer.encode_plus(claim['claim_text'], evidences_combined, 
                                                            return_tensors='pt', padding='max_length', truncation=True,
                                                            max_length=self.max_len, return_token_type_ids=True)
        
        # squeeze(0) drops the leading batch dimension added by return_tensors='pt'
        seq, attn_masks, segment_ids = claim_evidence_in_tokens['input_ids'].squeeze(0), claim_evidence_in_tokens[
                'attention_mask'].squeeze(0), claim_evidence_in_tokens['token_type_ids'].squeeze(0)

        return seq, attn_masks, segment_ids, claim_id

In [41]:
class CFEVERLabelClassifier(nn.Module):
    """BERT encoder followed by a single linear layer over the [CLS] representation."""

    def __init__(self):
        super().__init__()

        # Pre-trained BERT encoder
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Classification head: maps the d_bert_base-dimensional [CLS] vector to one
        # logit per claim label (num_of_classes outputs).
        self.cls_layer = nn.Linear(d_bert_base, num_of_classes)

    def forward(self, seq, attn_masks, segment_ids):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks, so PAD tokens do not contribute
            -segment_ids : Tensor of shape [B, T] containing the segment (token type) ids
        Returns:
            Tensor of shape [B, num_of_classes] with the class logits.
        '''
        # Contextualised token representations from BERT
        encoded = self.bert(seq, attention_mask=attn_masks, token_type_ids=segment_ids, return_dict=True)

        # Representation of the first token ([CLS]) of every sequence
        first_token_rep = encoded.last_hidden_state[:, 0]

        # Logits of shape [B, num_of_classes]
        return self.cls_layer(first_token_rep)

In [42]:
def train_claim_cls(net, loss_criterion, opti, train_loader, dev_loader, dev_claims, gpu, max_eps=num_epoch):
    """Fine-tune the claim-label classifier and keep the best checkpoint.

    Each epoch performs one optimisation pass over ``train_loader`` and then
    scores the model with ``evaluate_dev``. Whenever the dev accuracy beats the
    best value seen so far, the weights are saved to ``clc_model_params_filename``.

    Args:
        net: model called as ``net(seq, attn_masks, segment_ids)`` returning logits.
        loss_criterion: callable ``(logits, labels) -> loss``.
        opti: optimizer stepping ``net``'s parameters.
        train_loader: iterable of ``(seq, attn_masks, segment_ids, label)`` tensor batches.
        dev_loader: loader forwarded to ``evaluate_dev``.
        dev_claims: gold dev claims forwarded to ``evaluate_dev``.
        gpu: CUDA device index the batches are moved to.
        max_eps: number of epochs to run.

    Returns:
        list: mean training loss of each epoch.

    Raises:
        ValueError: if ``train_loader`` yields no batches.
    """
    best_acc = 0
    mean_losses = [0] * max_eps

    for ep in range(max_eps):
        net.train()  # Good practice to set the mode of the model
        st = time.time()
        train_acc = 0.0
        count = 0

        for i, (b_seq, b_attn_masks, b_segment_ids, b_label) in enumerate(train_loader):
            # Reset/Clear gradients
            opti.zero_grad()

            # Move the token ids, attention masks, token type ids and labels to the GPU
            b_seq, b_attn_masks, b_segment_ids, b_label = b_seq.cuda(gpu), b_attn_masks.cuda(gpu), b_segment_ids.cuda(gpu), b_label.cuda(gpu)

            # Obtaining the logits from the model
            logits = net(b_seq, b_attn_masks, b_segment_ids)

            # Computing loss
            loss = loss_criterion(logits, b_label)

            mean_losses[ep] += loss.item()
            count += 1
            # Accumulate a plain float: summing the returned tensors would keep the
            # running total on the GPU and make the epoch log print a tensor repr.
            train_acc += float(get_accuracy_from_logits(logits, b_label))

            # Backpropagating the gradients, account for gradients
            loss.backward()

            # Optimization step, apply the gradients
            opti.step()

            if i % 100 == 0:
                print("Iteration {} of epoch {} complete. Time taken (s): {}".format(i, ep, (time.time() - st)))
                st = time.time()

        if count == 0:
            # Fail with a clear message instead of a bare ZeroDivisionError below.
            raise ValueError("train_loader yielded no batches; cannot compute epoch statistics.")

        mean_losses[ep] /= count
        print(f"Epoch {ep} completed. Loss: {mean_losses[ep]}, Accuracy: {train_acc / count}.")

        dev_acc = evaluate_dev(net, dev_loader, dev_claims, gpu)
        print("\nEpoch {} complete! Development Accuracy on dev claim labels: {}.".format(ep, dev_acc))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...\n".format(best_acc, dev_acc))
            best_acc = dev_acc
            torch.save(net.state_dict(), clc_model_params_filename)
        else:
            print()

    return mean_losses

In [43]:
def get_accuracy_from_logits(logits, labels):
    """Return the share of rows whose highest-scoring class equals its label.

    Args:
        logits: score tensor with one row per example and one column per class.
        labels: tensor of gold class indices.

    Returns:
        A 0-d float tensor holding the mean of the per-row hit indicators.
    """
    class_probabilities = F.softmax(logits, dim=-1)
    top_classes = class_probabilities.argmax(dim=1).squeeze()
    hits = top_classes == labels
    return hits.float().mean()


def get_predictions_from_logits(logits):
    """Return the predicted class index of every row of ``logits``.

    Args:
        logits: score tensor of shape [B, num_classes].

    Returns:
        A 1-D tensor of length B with the arg-max class per row. The result is
        deliberately not squeezed: for a batch of one, squeezing would produce a
        0-d tensor that can no longer be paired element-wise with the batch's ids.
    """
    # Softmax is monotonic per row, so the arg-max can be taken on the raw logits.
    return torch.argmax(logits, dim=1)

In [44]:
def predict(net, dataloader, gpu):
    """Predict one label name per claim.

    Args:
        net: model called as ``net(seq, attn_masks, segment_ids)`` returning logits.
        dataloader: iterable of ``(seq, attn_masks, segment_ids, claim_ids)`` batches.
        gpu: CUDA device index the input tensors are moved to.

    Returns:
        dict: claim_id -> label name looked up in ``label_mapper_itol``. If a claim id
        occurs more than once, the last prediction wins.

    Raises:
        ValueError: if a batch yields a different number of ids and predictions.
    """
    net.eval()

    claim_labels = {}

    with torch.no_grad():
        for b_seq, b_attn_masks, b_segment_ids, b_claim_id in dataloader:
            b_seq, b_attn_masks, b_segment_ids = b_seq.cuda(gpu), b_attn_masks.cuda(gpu), b_segment_ids.cuda(gpu)
            logits = net(b_seq, b_attn_masks, b_segment_ids)

            # reshape(-1) keeps a single-item batch 1-D even if the helper returns a 0-d tensor.
            preds = get_predictions_from_logits(logits).reshape(-1).cpu().tolist()

            # Ids collated into a tensor are converted so they can serve as dict keys.
            if torch.is_tensor(b_claim_id):
                b_claim_id = b_claim_id.tolist()

            if len(b_claim_id) != len(preds):
                raise ValueError(f"Batch has {len(b_claim_id)} claim ids but {len(preds)} predictions.")

            # Map directly instead of growing a DataFrame batch by batch and
            # walking it row by row afterwards.
            for claim_id, label in zip(b_claim_id, preds):
                claim_labels[claim_id] = label_mapper_itol[label]

    return claim_labels

In [45]:
def evaluate_dev(net, dataloader, dev_claims, gpu):
    """Compute claim-label accuracy on the dev set.

    Labels come from ``predict``; a claim is a hit when its predicted label
    equals the gold "claim_label" stored in ``dev_claims``.

    Returns:
        float: number of hits divided by the number of dev claims.
    """
    predicted = predict(net, dataloader, gpu)

    hits = sum(
        1 for claim_id in dev_claims
        if predicted[claim_id] == dev_claims[claim_id]["claim_label"]
    )

    return hits / len(dev_claims)  # claim label accuracy

In [46]:
def extract_claim_evi_labels(test_claims, claim_labels, output_filename):
    """Attach predicted labels to the test claims and dump them as JSON.

    Args:
        test_claims: mapping claim_id -> claim dict; updated in place.
        claim_labels: mapping claim_id -> predicted label name.
        output_filename: path of the JSON file to write.

    Returns:
        The same ``test_claims`` object, after it has been written to disk.
    """
    for claim_id, predicted_label in claim_labels.items():
        test_claims[claim_id]["claim_label"] = predicted_label

    with open(output_filename, 'w') as out_file:
        json.dump(test_claims, out_file)

    print("Final test claims predictions file ready.")

    return test_claims

In [47]:
# Instantiate the evidence-concatenation classifier and move it to the configured CUDA device.
net_clc = CFEVERLabelClassifier()
net_clc.cuda(gpu) # Enable gpu support for the model

# Inverse-frequency class weights from the claim-level label counts, in label_mapper_ltoi key order.
# NOTE(review): a label in label_mapper_ltoi that never occurs in train_claims would divide by zero here.
class_counts = Counter([train_claims[claim]["claim_label"] for claim in train_claims])
class_weights = torch.tensor([(sum(class_counts.values()) / class_counts[c]) for c in label_mapper_ltoi.keys()])
# Weighted cross-entropy moved to the GPU, and an Adam optimizer over all classifier parameters.
loss_criterion = nn.CrossEntropyLoss(weight=class_weights).cuda(gpu)
opti_clc = optim.Adam(net_clc.parameters(), lr=opti_lr_clc)