# Task 2 - Claim Verification

## 1. Prepare Data for Claim Verification Task

### 1.1 Load Data From Task 0 & Task 1

In [1]:
import random
import json
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel, DistilBertModel, DistilBertTokenizer
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# Resolve the project root. On Google Colab the project lives on the mounted
# Drive; anywhere else `google.colab` is not importable, so fall back to the
# relative location that was previously only available as a manual override.
try:
    from google.colab import drive
except ImportError:
    # Local run: data is addressed relative to the notebook's parent directory.
    PATH_PREFIX = ".."
else:
    drive.mount('/content/drive')
    PATH_PREFIX = "/content/drive/MyDrive/Colab Notebooks/Unimelb_COMP90042_Automated_Fact_Checking"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def _read_frame(relative_path, orient):
    """Read one JSON table stored below PATH_PREFIX into a DataFrame."""
    return pd.read_json(PATH_PREFIX + relative_path, orient=orient)


# Labeled claims and the evidence corpus are read from the task-0 folder;
# the test claims come from the task-1 results file.
data_tran_task1 = _read_frame("/data/data_task0/df_tran_matched.json", 'index')
data_vald_task1 = _read_frame("/data/data_task0/df_vald_matched.json", 'index')
data_test_task1 = _read_frame("/data/data_task1/data_task1_results.json", 'records')
data_evdn_task1 = _read_frame("/data/data_task0/df_evdn.json", 'index')

In [4]:
# Print a caption and show the first three rows of every loaded table.
for _caption, _frame in (
    ("Original training data overview:", data_tran_task1),
    ("Original validation data overview:", data_vald_task1),
    ("Original test data overview:", data_test_task1),
    ("Original evidence data overview:", data_evdn_task1),
):
    print(_caption)
    display(_frame.head(3))

Original training data overview:


Unnamed: 0,id,claim,label,evidences,evidences_matched,evidences_mismatched,evidences_missed
0,1937,Not only is there no scientific evidence that ...,DISPUTED,"[442946, 1194317, 12171]","[215, 315, 441, 526, 783, 968, 1018, 1135, 119...","[491521, 917507, 393220, 32773, 819207, 475144...",[]
1,126,El Niño drove record highs in global temperatu...,REFUTES,"[338219, 1127398]","[301, 441, 1018, 1135, 1236, 1579, 1758, 1823,...","[491521, 917507, 32773, 327686, 1130503, 47514...",[]
2,2510,"In 1946, PDO switched to a cool phase.",SUPPORTS,"[530063, 984887]","[74, 215, 226, 301, 357, 411, 552, 680, 694, 8...","[491521, 1146885, 655369, 688143, 1114128, 229...",[]


Original validation data overview:


Unnamed: 0,id,claim,label,evidences,evidences_matched,evidences_mismatched,evidences_missed
0,752,[South Australia] has the most expensive elect...,SUPPORTS,"[67732, 572512]","[16, 37, 89, 514, 608, 1001, 1291, 1457, 1577,...","[1114115, 933896, 786443, 917516, 16, 262162, ...",[]
1,375,when 3 per cent of total annual global emissio...,NOT_ENOUGH_INFO,"[996421, 1080858, 208053, 699212, 832334]","[16, 37, 410, 414, 441, 465, 514, 608, 783, 80...","[524291, 917507, 720899, 1114115, 786443, 16, ...",[]
2,1266,This means that the world is now 1C warmer tha...,SUPPORTS,"[889933, 694262]","[975, 1061, 1830, 2487, 4035, 4321, 5354, 6025...","[856065, 520194, 1036290, 1114114, 692229, 565...",[]


Original test data overview:


Unnamed: 0,id,claim,evidences
0,21,"Sea level rise has been slow and a constant, p...","[57975, 624644, 400437, 1046718, 6590, 232879]"
1,28,"Volcanoes Melting West Antarctic Glaciers, Not...","[29077, 777151, 1196519, 806019, 106742, 7473]"
2,30,the bushfires [in Australia] were caused by ar...,"[780092, 968437, 958374, 811517, 975948, 351418]"


Original evidence data overview:


Unnamed: 0,evidence_id,evidence
0,0,"John Bennet Lawes, English entrepreneur and ag..."
1,1,Lindberg began his professional career at the ...
2,2,``Boston (Ladies of Cambridge)'' by Vampire We...


### 1.2 Prepare Data for Task 2

In [5]:
def prepare_data_for_verification(data_origin, data_evdn, is_test=False):
    """Build one claim-verification sample per claim row.

    Args:
        data_origin: DataFrame with `id`, `claim` and `evidences` columns
            (plus `label` when `is_test` is False).
        data_evdn: DataFrame whose index is looked up with the evidence ids
            and whose `evidence` column holds the passage text.
        is_test: When True no label is read and both label fields are None.

    Returns:
        DataFrame with columns `claim_id`, `text`, `original_label`, `label`
        and `evidences`. `text` is the claim followed by every evidence
        passage found in `data_evdn`, all joined with " [SEP] ". `label` is a
        two-element list: the first value is 1 / -1 / 0 for SUPPORTS /
        REFUTES / anything else, the second is 1 for SUPPORTS, REFUTES and
        DISPUTED and 0 otherwise (unknown labels map to [0, 0]).
    """
    # NOTE(review): the separator is the literal text "[SEP]"; confirm the
    # tokenizer applied downstream treats it as a special token.
    target_by_label = {
        'SUPPORTS': (1, 1),
        'REFUTES': (-1, 1),
        'DISPUTED': (0, 1),
        'NOT_ENOUGH_INFO': (0, 0),
    }
    fallback_target = (0, 0)

    records = []
    for _, claim_row in data_origin.iterrows():
        evidence_ids = claim_row['evidences']
        # Evidence ids missing from the evidence table are silently skipped.
        passages = [
            data_evdn.loc[evidence_id, 'evidence']
            for evidence_id in evidence_ids
            if evidence_id in data_evdn.index
        ]
        model_input = claim_row['claim'] + " [SEP] " + " [SEP] ".join(passages)

        if is_test:
            gold_label, target = None, None
        else:
            gold_label = claim_row['label']
            # Fresh list per row so samples never share a target object.
            target = list(target_by_label.get(gold_label, fallback_target))

        records.append({
            'claim_id': claim_row["id"],
            'text': model_input,
            'original_label': gold_label,
            'label': target,
            'evidences': evidence_ids,
        })
    return pd.DataFrame(records)

In [6]:
# Labeled splits use the default (is_test=False); the test split carries no labels.
df_tran_task2 = prepare_data_for_verification(data_origin=data_tran_task1, data_evdn=data_evdn_task1)
df_vald_task2 = prepare_data_for_verification(data_origin=data_vald_task1, data_evdn=data_evdn_task1)
df_test_task2 = prepare_data_for_verification(data_origin=data_test_task1, data_evdn=data_evdn_task1, is_test=True)

In [7]:
# Report the size and the first three rows of every prepared split.
for _length_caption, _overview_caption, _split_frame in (
    ("Training data length:", "Training data overview:", df_tran_task2),
    ("Validation data length:", "Validation data overview:", df_vald_task2),
    ("Test data length:", "Test data overview:", df_test_task2),
):
    print(_length_caption, len(_split_frame))
    print(_overview_caption)
    display(_split_frame.head(3))

Training data length: 1228
Training data overview:


Unnamed: 0,claim_id,text,original_label,label,evidences
0,1937,Not only is there no scientific evidence that ...,DISPUTED,"[0, 1]","[442946, 1194317, 12171]"
1,126,El Niño drove record highs in global temperatu...,REFUTES,"[-1, 1]","[338219, 1127398]"
2,2510,"In 1946, PDO switched to a cool phase. [SEP] T...",SUPPORTS,"[1, 1]","[530063, 984887]"


Validation data length: 154
Validation data overview:


Unnamed: 0,claim_id,text,original_label,label,evidences
0,752,[South Australia] has the most expensive elect...,SUPPORTS,"[1, 1]","[67732, 572512]"
1,375,when 3 per cent of total annual global emissio...,NOT_ENOUGH_INFO,"[0, 0]","[996421, 1080858, 208053, 699212, 832334]"
2,1266,This means that the world is now 1C warmer tha...,SUPPORTS,"[1, 1]","[889933, 694262]"


Test data length: 151
Test data overview:


Unnamed: 0,claim_id,text,original_label,label,evidences
0,21,"Sea level rise has been slow and a constant, p...",,,"[57975, 624644, 400437, 1046718, 6590, 232879]"
1,28,"Volcanoes Melting West Antarctic Glaciers, Not...",,,"[29077, 777151, 1196519, 806019, 106742, 7473]"
2,30,the bushfires [in Australia] were caused by ar...,,,"[780092, 968437, 958374, 811517, 975948, 351418]"


### 1.3 Create Torch Dataset and DataLoader

In [8]:
class Task2Dataset(Dataset):
    """Torch dataset over the rows of a prepared task-2 DataFrame.

    Each item is a dict built from one row. `text` and `evidences` are
    dropped, as is every field whose value is None. When a tokenizer is
    supplied, `input_ids` and `attention_mask` are added. A non-None `label`
    is returned as a float tensor.

    Args:
        df: DataFrame whose rows become the samples (`df.to_dict('records')`).
        tokenizer: Optional callable applied to the row's `text`; it is called
            with `truncation`, `padding`, `max_length` and `return_tensors='pt'`
            and must return a mapping with `input_ids` and `attention_mask`.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, df, tokenizer=None, max_length=512):
        self.samples = df.to_dict('records')
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        # Work on a shallow copy: the stored record must stay untouched, otherwise
        # every accessed sample would permanently keep its encoded tensors and its
        # label would change type after the first access.
        item = dict(self.samples[idx])

        if self.tokenizer is not None:
            encoded = self.tokenizer(
                item['text'],
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors='pt',
            )
            # Drop only the leading batch dimension added by return_tensors='pt'.
            item['input_ids'] = encoded['input_ids'].squeeze(0)
            item['attention_mask'] = encoded['attention_mask'].squeeze(0)

        label = item.get('label')
        if label is not None and not isinstance(label, torch.Tensor):
            item['label'] = torch.tensor(label, dtype=torch.float)

        return {
            key: value
            for key, value in item.items()
            if key not in ('text', 'evidences') and value is not None
        }

In [9]:
# Tokenizer shared by all three splits
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Wrap each prepared split in a dataset (same order: train, validation, test)
dataset_tran, dataset_vald, dataset_test = (
    Task2Dataset(split_frame, tokenizer)
    for split_frame in (df_tran_task2, df_vald_task2, df_test_task2)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
# Create dataloaders; only the training split is shuffled
# (DataLoader's default is shuffle=False).
_loader_batch_size = 16
dataloader_tran = DataLoader(dataset_tran, batch_size=_loader_batch_size, shuffle=True)
dataloader_vald = DataLoader(dataset_vald, batch_size=_loader_batch_size)
dataloader_test = DataLoader(dataset_test, batch_size=_loader_batch_size)

## 2. Model Design

In [11]:
class DistilRoBERTaDualOutput(nn.Module):
    """RoBERTa encoder followed by two independent single-value heads.

    The first head is squashed with tanh (range -1..1), the second with
    sigmoid (range 0..1). Both heads read the encoder's `pooler_output`
    after dropout.

    Args:
        roberta_model_name: Checkpoint name passed to
            `RobertaModel.from_pretrained`.
    """

    def __init__(self, roberta_model_name='roberta-base'):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(roberta_model_name)
        self.dropout = nn.Dropout(0.1)

        hidden_size = self.roberta.config.hidden_size
        self.output1 = nn.Linear(hidden_size, 1)
        self.output2 = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return `(tanh_head, sigmoid_head)`, each with a trailing size-1 dimension."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # NOTE(review): the load report for 'roberta-base' lists the pooler
        # weights as newly initialized, so this pooled vector starts untrained.
        features = self.dropout(encoded.pooler_output)

        tanh_head = torch.tanh(self.output1(features))
        sigmoid_head = torch.sigmoid(self.output2(features))
        return tanh_head, sigmoid_head

## 3. Model Training & Evaluation

### 3.1 Model Training & Evaluation Functions

In [12]:
def predict_label(pred1, pred2):
    """Map the two model scores to a label string.

    Args:
        pred1: First score; compared against -0.5 and 0.5.
        pred2: Second score; a value below 0.5 always yields 'NOT_ENOUGH_INFO'.

    Returns:
        'NOT_ENOUGH_INFO' when `pred2 < 0.5`; otherwise 'REFUTES' for
        `pred1 <= -0.5`, 'DISPUTED' for `pred1 <= 0.5`, else 'SUPPORTS'.
    """
    if pred2 < 0.5:
        return 'NOT_ENOUGH_INFO'
    if pred1 <= -0.5:
        return 'REFUTES'
    return 'DISPUTED' if pred1 <= 0.5 else 'SUPPORTS'

In [13]:
def train_model(model, dataloader_tran, dataloader_vald, num_epochs=5, lr=1e-5, device=None, patience=5):
    """Train a two-output model with early stopping on the validation loss.

    Every epoch runs one pass over `dataloader_tran` (Adam, summed MSE of both
    outputs), prints training loss/accuracy, then evaluates on
    `dataloader_vald` via `evaluate_model`. The weights of the epoch with the
    lowest validation loss are restored before returning.

    Args:
        model: Module called as `model(input_ids, attention_mask)` that returns
            two tensors holding one value per sample.
        dataloader_tran: Batches with `input_ids`, `attention_mask`, `label`
            (indexed as `label[:, 0]` and `label[:, 1]`) and `original_label`.
        dataloader_vald: Validation batches with the same fields.
        num_epochs: Maximum number of epochs.
        lr: Adam learning rate.
        device: Target device; CUDA when available, else CPU, if None.
        patience: Number of consecutive epochs without validation-loss
            improvement after which training stops.

    Returns:
        The same `model`, with the best validation-loss weights loaded.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    criterion1 = nn.MSELoss()
    criterion2 = nn.MSELoss()

    optimizer = optim.Adam(model.parameters(), lr=lr)

    best_eval_loss = float('inf')
    patience_counter = 0
    best_model_state = None

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        all_train_preds1 = []
        all_train_preds2 = []
        all_train_original_labels = []

        for batch in dataloader_tran:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            original_labels = batch['original_label']

            optimizer.zero_grad()

            out1, out2 = model(input_ids, attention_mask)

            # Flatten to one value per sample. A bare squeeze() would also remove
            # the batch dimension when a batch holds a single sample, which
            # breaks the per-sample bookkeeping below.
            pred1 = out1.reshape(-1)
            pred2 = out2.reshape(-1)

            loss1 = criterion1(pred1, labels[:, 0])
            loss2 = criterion2(pred2, labels[:, 1])
            loss = loss1 + loss2

            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

            all_train_preds1.extend(pred1.detach().cpu().numpy())
            all_train_preds2.extend(pred2.detach().cpu().numpy())
            all_train_original_labels.extend(original_labels)

        avg_train_loss = total_train_loss / len(dataloader_tran)

        correct = sum(
            1
            for p1, p2, true_label in zip(all_train_preds1, all_train_preds2, all_train_original_labels)
            if predict_label(p1, p2) == true_label
        )
        train_accuracy = correct / len(all_train_preds1)

        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'Training Loss: {avg_train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}')
        eval_loss, _, _, _, _, _, _, eval_acc = evaluate_model(model, dataloader_vald, device, criterion1, criterion2)
        print(f'Evaluation Loss: {eval_loss:.4f}, Evaluation Accuracy: {eval_acc:.4f}')

        if eval_loss < best_eval_loss:
            best_eval_loss = eval_loss
            # state_dict() hands out tensors that share storage with the live
            # parameters, and dict.copy() is shallow, so a real snapshot needs
            # cloned tensors. Kept on the CPU to avoid holding a second copy of
            # the weights on the accelerator.
            best_model_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

        print('-' * 50)

    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return model

In [14]:
def evaluate_model(model, dataloader, device, criterion1, criterion2):
    """Evaluate a two-output model on a labeled dataloader.

    Args:
        model: Module called as `model(input_ids, attention_mask)` that returns
            two tensors holding one value per sample.
        dataloader: Batches with `input_ids`, `attention_mask`, `label`
            (indexed as `label[:, 0]` and `label[:, 1]`) and `original_label`.
        device: Device the inputs and labels are moved to.
        criterion1: Loss applied to the first output and `label[:, 0]`.
        criterion2: Loss applied to the second output and `label[:, 1]`.

    Returns:
        Tuple `(avg_loss, mae1, mse1, 0, mae2, mse2, 0, overall_acc)`:
        mean summed loss per batch, MAE/MSE of each output against its target
        column, two constant-zero placeholder slots, and the share of samples
        whose `predict_label` result equals `original_label`.
    """
    model.eval()
    total_loss = 0
    all_preds1 = []
    all_preds2 = []
    all_labels1 = []
    all_labels2 = []
    all_original_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            original_labels = batch['original_label']

            out1, out2 = model(input_ids, attention_mask)

            # Flatten to one value per sample. A bare squeeze() turns a
            # single-sample batch into a 0-d tensor, which cannot be iterated
            # by the extend() calls below.
            pred1 = out1.reshape(-1)
            pred2 = out2.reshape(-1)

            loss1 = criterion1(pred1, labels[:, 0])
            loss2 = criterion2(pred2, labels[:, 1])
            loss = loss1 + loss2
            total_loss += loss.item()

            all_preds1.extend(pred1.cpu().numpy())
            all_preds2.extend(pred2.cpu().numpy())
            all_labels1.extend(labels[:, 0].cpu().numpy())
            all_labels2.extend(labels[:, 1].cpu().numpy())
            all_original_labels.extend(original_labels)

    avg_loss = total_loss / len(dataloader)

    mae1 = mean_absolute_error(all_labels1, all_preds1)
    mse1 = mean_squared_error(all_labels1, all_preds1)
    mae2 = mean_absolute_error(all_labels2, all_preds2)
    mse2 = mean_squared_error(all_labels2, all_preds2)

    correct = sum(
        1
        for p1, p2, true_label in zip(all_preds1, all_preds2, all_original_labels)
        if predict_label(p1, p2) == true_label
    )
    overall_acc = correct / len(all_preds1)

    # The two zero slots keep the 8-tuple layout that callers unpack.
    return avg_loss, mae1, mse1, 0, mae2, mse2, 0, overall_acc

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Train the model; `device` is the same value train_model would pick by default.
model = train_model(
    DistilRoBERTaDualOutput(),
    dataloader_tran,
    dataloader_vald,
    num_epochs=50,
    lr=1e-5,
    device=device,
)

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

RobertaModel LOAD REPORT from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
pooler.dense.weight             | MISSING    | 
pooler.dense.bias               | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch 1/50
Training Loss: 0.7640, Training Accuracy: 0.1132
Evaluation Loss: 0.7486, Evaluation Accuracy: 0.1169
--------------------------------------------------
Epoch 2/50
Training Loss: 0.7316, Training Accuracy: 0.1262
Evaluation Loss: 0.7161, Evaluation Accuracy: 0.1169
--------------------------------------------------
Epoch 3/50
Training Loss: 0.6759, Training Accuracy: 0.1612
Evaluation Loss: 0.6366, Evaluation Accuracy: 0.1234
--------------------------------------------------
Epoch 4/50
Training Loss: 0.5060, Training Accuracy: 0.4992
Evaluation Loss: 0.5133, Evaluation Accuracy: 0.7013
--------------------------------------------------
Epoch 5/50
Training Loss: 0.3107, Training Accuracy: 0.7362
Evaluation Loss: 0.4624, Evaluation Accuracy: 0.6753
--------------------------------------------------
Epoch 6/50
Training Loss: 0.1881, Training Accuracy: 0.8274
Evaluation Loss: 0.4457, Evaluation Accuracy: 0.7208
--------------------------------------------------
Epoch 7/50
Train

### 3.3 Model Inference on Test Set

In [16]:
def predict(model, dataloader, device):
    """Predict a label string for every sample of an unlabeled dataloader.

    Args:
        model: Module called as `model(input_ids, attention_mask)` that returns
            two tensors holding one value per sample.
        dataloader: Batches with `claim_id`, `input_ids` and `attention_mask`.
        device: Device the inputs are moved to.

    Returns:
        `(claim_ids_list, predictions)`: the claim ids exactly as yielded by
        iterating each batch's `claim_id`, and the matching label strings
        produced by `predict_label`, in dataloader order.
    """
    model.eval()
    predictions = []
    claim_ids_list = []

    with torch.no_grad():
        for batch in dataloader:
            claim_id = batch['claim_id']
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            out1, out2 = model(input_ids, attention_mask)

            # Flatten to one value per sample. A bare squeeze() turns a
            # single-sample batch into a 0-d array that zip() cannot iterate.
            pred1 = out1.reshape(-1).cpu().numpy()
            pred2 = out2.reshape(-1).cpu().numpy()

            predictions.extend(predict_label(p1, p2) for p1, p2 in zip(pred1, pred2))
            claim_ids_list.extend(claim_id)

    return claim_ids_list, predictions

In [17]:
# Run inference on the test split; ids and labels come back in dataloader order.
test_claim_ids, test_predictions = predict(model=model, dataloader=dataloader_test, device=device)

In [18]:
# Index the test rows by claim id once (the first row wins for a repeated id)
# so each prediction is matched by a dictionary lookup rather than a scan of
# the whole frame.
test_rows_by_id = {}
for test_row in df_test_task2.to_dict('records'):
    test_rows_by_id.setdefault(int(test_row['claim_id']), test_row)

result = {}
test_claim_ids = [int(cid) for cid in test_claim_ids]
for claim_id, pred_label in zip(test_claim_ids, test_predictions):
    row = test_rows_by_id[claim_id]
    result[claim_id] = {
        # NOTE(review): `text` is the claim joined with its evidence passages,
        # not the bare claim; confirm this is what "claim_text" should hold.
        "claim_text": row['text'],
        "claim_label": pred_label,
        "evidences": row['evidences']
    }

with open(PATH_PREFIX + '/data/data_task2/data_task2_results.json', 'w') as f:
    json.dump(result, f, indent=4)