# Task 1 - Retrieve Evidence

## 1. Prepare Data for Retrieval Task

## 1.1 Read Data from Task 0

In [2]:
import random
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel, DistilBertModel

In [3]:
data_tran_task0 = pd.read_json("../data/data_task0/df_tran_matched.json", orient='index')
data_vald_task0 = pd.read_json("../data/data_task0/df_vald_matched.json", orient='index')
data_test_task0 = pd.read_json("../data/data_task0/df_test_matched.json", orient='index')
data_evdn_task0 = pd.read_json("../data/data_task0/df_evdn.json", orient='index')

In [4]:
print("Original training data overview:")
display(data_tran_task0.head(3))
print("Original validation data overview:")
display(data_vald_task0.head(3))
print("Original test data overview:")
display(data_test_task0.head(3))
print("Original evidence data overview:")
display(data_evdn_task0.head(3))

Original training data overview:


Unnamed: 0,id,claim,label,evidences,evidences_matched,evidences_mismatched,evidences_missed
0,1937,Not only is there no scientific evidence that ...,DISPUTED,"[442946, 1194317, 12171]","[215, 315, 441, 526, 783, 968, 1018, 1135, 119...","[491521, 917507, 393220, 32773, 819207, 475144...",[]
1,126,El Niño drove record highs in global temperatu...,REFUTES,"[338219, 1127398]","[301, 441, 1018, 1135, 1236, 1579, 1758, 1823,...","[491521, 917507, 32773, 327686, 1130503, 47514...",[]
2,2510,"In 1946, PDO switched to a cool phase.",SUPPORTS,"[530063, 984887]","[74, 215, 226, 301, 357, 411, 552, 680, 694, 8...","[491521, 1146885, 655369, 688143, 1114128, 229...",[]


Original validation data overview:


Unnamed: 0,id,claim,label,evidences,evidences_matched,evidences_mismatched,evidences_missed
0,752,[South Australia] has the most expensive elect...,SUPPORTS,"[67732, 572512]","[16, 37, 89, 514, 608, 1001, 1291, 1457, 1577,...","[1114115, 933896, 786443, 917516, 16, 262162, ...",[]
1,375,when 3 per cent of total annual global emissio...,NOT_ENOUGH_INFO,"[996421, 1080858, 208053, 699212, 832334]","[16, 37, 410, 414, 441, 465, 514, 608, 783, 80...","[524291, 917507, 720899, 1114115, 786443, 16, ...",[]
2,1266,This means that the world is now 1C warmer tha...,SUPPORTS,"[889933, 694262]","[975, 1061, 1830, 2487, 4035, 4321, 5354, 6025...","[856065, 520194, 1036290, 1114114, 692229, 565...",[]


Original test data overview:


Unnamed: 0,id,claim,evidences_matched
0,2967,The contribution of waste heat to the global c...,"[1460, 8304, 11688, 13436, 19067, 22107, 26773..."
1,979,“Warm weather worsened the most recent five-ye...,"[10368, 13434, 16837, 19622, 21546, 27162, 289..."
2,1609,Greenland has only lost a tiny fraction of its...,"[5928, 10210, 19306, 29169, 32006, 44485, 4475..."


Original evidence data overview:


Unnamed: 0,evidence_id,evidence
0,0,"John Bennet Lawes, English entrepreneur and ag..."
1,1,Lindberg began his professional career at the ...
2,2,``Boston (Ladies of Cambridge)'' by Vampire We...


## 1.2 Positive Sampling and Negative Sampling

In [5]:
def sampling_pos_neg(data_origin, data_evdn, is_test=False):
    samples = []
    for idx, row in data_origin.iterrows():
        claim_id = row["id"]
        text = row['claim']
        # Positive samples
        evidences = row['evidences'] if not is_test else row['evidences_matched']
        for ev_id in evidences:
            if ev_id in data_evdn.index:
                ev_text = data_evdn.loc[ev_id, 'evidence']
                if is_test:
                    samples.append({'claim_id': claim_id, 'text': text, 'evidence_id': ev_id, 'evidence': ev_text})
                else:
                    samples.append({'claim_id': claim_id, 'text': text, 'evidence_id': ev_id, 'evidence': ev_text, 'label': 1})
        # Negative samples
        if not is_test:
            evidences_mismatched = row['evidences_mismatched']
            # For train/val, sample 5-10 negatives
            valid_ev_ids = [ev_id for ev_id in evidences_mismatched if ev_id in data_evdn.index]
            num_to_sample = min(len(valid_ev_ids), random.randint(5, 10))
            sampled_ev_ids = random.sample(valid_ev_ids, num_to_sample)
            for ev_id in sampled_ev_ids:
                ev_text = data_evdn.loc[ev_id, 'evidence']
                samples.append({'claim_id': claim_id, 'text': text, 'evidence_id': ev_id, 'evidence': ev_text, 'label': 0})
    return pd.DataFrame(samples)

In [6]:
# Create samples
df_tran = sampling_pos_neg(data_tran_task0, data_evdn_task0)
df_tran_pos_count = df_tran[df_tran['label'] == 1].shape[0]
df_tran_neg_count = df_tran[df_tran['label'] == 0].shape[0]

df_vald = sampling_pos_neg(data_vald_task0, data_evdn_task0)
df_vald_pos_count = df_vald[df_vald['label'] == 1].shape[0]
df_vald_neg_count = df_vald[df_vald['label'] == 0].shape[0]

df_test = sampling_pos_neg(data_test_task0, data_evdn_task0, is_test=True)

In [7]:
print("Training positive samples count:", df_tran_pos_count)
print("Training negative samples count:", df_tran_neg_count)
print("Training samples overview:")
display(df_tran[df_tran['claim_id'] == df_tran['claim_id'].iloc[0]])

print("Validation positive samples count:", df_vald_pos_count)
print("Validation negative samples count:", df_vald_neg_count)
print("Validation samples overview:")
display(df_vald[df_vald['claim_id'] == df_vald['claim_id'].iloc[0]])

print("Test samples count:", df_test.shape[0])
print("Test samples overview:")
display(df_test[df_test['claim_id'] == df_test['claim_id'].iloc[0]] )

Training positive samples count: 4122
Training negative samples count: 9177
Training samples overview:


Unnamed: 0,claim_id,text,evidence_id,evidence,label
0,1937,Not only is there no scientific evidence that ...,442946,At very high concentrations (100 times atmosph...,1
1,1937,Not only is there no scientific evidence that ...,1194317,Plants can grow as much as 50 percent faster i...,1
2,1937,Not only is there no scientific evidence that ...,12171,Higher carbon dioxide concentrations will favo...,1
3,1937,Not only is there no scientific evidence that ...,887285,While the principal greenhouse gas emission fr...,0
4,1937,Not only is there no scientific evidence that ...,683138,"(BBC) 4 April A new, detailed record of past c...",0
5,1937,Not only is there no scientific evidence that ...,272774,During times of intense precipitation (such as...,0
6,1937,Not only is there no scientific evidence that ...,900415,McKibben began his freelance writing career at...,0
7,1937,Not only is there no scientific evidence that ...,339682,Hamilton's general view about climate change i...,0
8,1937,Not only is there no scientific evidence that ...,318323,Certain agricultural demands may increase more...,0
9,1937,Not only is there no scientific evidence that ...,438019,The process involves reacting carbon dioxide w...,0


Validation positive samples count: 491
Validation negative samples count: 1163
Validation samples overview:


Unnamed: 0,claim_id,text,evidence_id,evidence,label
0,752,[South Australia] has the most expensive elect...,67732,[citation needed] South Australia has the high...,1
1,752,[South Australia] has the most expensive elect...,572512,"""South Australia has the highest power prices ...",1
2,752,[South Australia] has the most expensive elect...,966583,"Through the program, the organizations created...",0
3,752,[South Australia] has the most expensive elect...,235036,This power is normally generated at power plan...,0
4,752,[South Australia] has the most expensive elect...,482095,Though the country 's supply of electricity ne...,0
5,752,[South Australia] has the most expensive elect...,747324,Although Mew Mew Power has not been released t...,0
6,752,[South Australia] has the most expensive elect...,1090285,"Visitors from the USA, Brazil, Japan, China, A...",0
7,752,[South Australia] has the most expensive elect...,230559,"The first checkpoint was at Berri, South Austr...",0
8,752,[South Australia] has the most expensive elect...,783374,"A study done by Greenpeace International, the ...",0
9,752,[South Australia] has the most expensive elect...,710661,Due to Lesotho's economic and geographical rel...,0


Test samples count: 197278
Test samples overview:


Unnamed: 0,claim_id,text,evidence_id,evidence
0,2967,The contribution of waste heat to the global c...,1460,"In the case of a four-stroke Otto cycle, techn..."
1,2967,The contribution of waste heat to the global c...,8304,"As the Earth's climate warms, we are seeing ma..."
2,2967,The contribution of waste heat to the global c...,11688,In late November 2016 surveys of 62 reefs show...
3,2967,The contribution of waste heat to the global c...,13436,Gruen made contributions in a broad range of t...
4,2967,The contribution of waste heat to the global c...,19067,"However, during last two decades there has bee..."
...,...,...,...,...
173,2967,The contribution of waste heat to the global c...,1183635,The study noted the influence of Michael Crich...
174,2967,The contribution of waste heat to the global c...,1185839,"It could prove to be the most inexorable, howe..."
175,2967,The contribution of waste heat to the global c...,1198065,A report released in March 2012 by the Intergo...
176,2967,The contribution of waste heat to the global c...,1202564,In warmer climates no additional heat would be...


## 1.3 Torch Dataset and DataLoader

In [8]:
# Dataset class
class Task1Dataset(Dataset):
    def __init__(self, df, tokenizer=None):
        self.samples = df.to_dict('records')
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        if self.tokenizer:
            input_text = sample['text'] + " [SEP] " + sample['evidence']
            encoded = self.tokenizer(input_text, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
            # Flatten the tensors since return_tensors='pt' gives 1-dim tensors
            sample['input_ids'] = encoded['input_ids'].squeeze()
            sample['attention_mask'] = encoded['attention_mask'].squeeze()
        return sample

In [9]:
# Tokenizer
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')

# Create Datasets
dataset_tran = Task1Dataset(df_tran, tokenizer=tokenizer)
dataset_vald = Task1Dataset(df_vald, tokenizer=tokenizer)
dataset_test = Task1Dataset(df_test, tokenizer=tokenizer)



In [10]:
# Test the Dataset
sample_temp = dataset_tran[0]
print("Sample keys:", list(sample_temp.keys()))
print("Input text length:", len(sample_temp['text']))
print("Input IDs shape:", sample_temp['input_ids'].shape)
print("Attention mask shape:", sample_temp['attention_mask'].shape)
print("Label:", sample_temp['label'])

Sample keys: ['claim_id', 'text', 'evidence_id', 'evidence', 'label', 'input_ids', 'attention_mask']
Input text length: 152
Input IDs shape: torch.Size([512])
Attention mask shape: torch.Size([512])
Label: 1


In [11]:
# Dataloader
dataloader_tran = DataLoader(dataset_tran, batch_size=16, shuffle=True)
dataloader_vald = DataLoader(dataset_vald, batch_size=16, shuffle=False)
dataloader_test = DataLoader(dataset_test, batch_size=16, shuffle=False)

In [12]:
# Test the Dataloader
batch_temp = next(iter(dataloader_tran))
print("Batch keys:", list(batch_temp.keys()))
print("Batch input IDs shape:", batch_temp['input_ids'].shape)
print("Batch attention mask shape:", batch_temp['attention_mask'].shape)
print("Batch labels shape:", batch_temp['label'].shape)

Batch keys: ['claim_id', 'text', 'evidence_id', 'evidence', 'label', 'input_ids', 'attention_mask']
Batch input IDs shape: torch.Size([16, 512])
Batch attention mask shape: torch.Size([16, 512])
Batch labels shape: torch.Size([16])


## 2. Model Design

In [13]:
# DistilRoBERTa Classifier Model
class DistilRoBERTaClassifier(nn.Module):
    def __init__(self, num_labels=2):
        super(DistilRoBERTaClassifier, self).__init__()
        self.distilroberta = DistilBertModel.from_pretrained('distilroberta-base')
        self.classifier = nn.Linear(self.distilroberta.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.distilroberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # Use the first token's representation
        logits = self.classifier(pooled_output)
        return logits

In [14]:
# Test the Model
model = DistilRoBERTaClassifier(num_labels=2)
outputs_temp = model(batch_temp['input_ids'], batch_temp['attention_mask'])
print("Model output shape:", outputs_temp.shape)

You are using a model of type roberta to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.
Some weights of DistilBertModel were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.word_embeddings.weight', 'transformer.layer.0.attention.k_lin.bias', 'transformer.layer.0.attention.k_lin.weight', 'transformer.layer.0.attention.out_lin.bias', 'transformer.layer.0.attention.out_lin.weight', 'transformer.layer.0.attention.q_lin.bias', 'transformer.layer.0.attention.q_lin.weight', 'transformer.layer.0.attention.v_lin.bias', 'transformer.layer.0.attention.v_lin.weight', 'transformer.layer.0.ffn.lin1.bias', 'transformer.layer.0.ffn.lin1.weight', 'transformer.layer.0.ffn.lin2.bias', 'transformer.layer.0.ffn.lin2.weight', 'transformer.layer.0.output_layer_norm.bias', 'transformer.

Model output shape: torch.Size([16, 2])


## 3. Model Training & Evaluation

In [15]:
def train_model(model, train_loader, val_loader, num_epochs=5, learning_rate=2e-5, device='cpu'):

    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # Training loop
    for epoch in range(num_epochs):
        
        model.train()
        total_loss = 0

        # Iterate over training batches
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}")

        # Evaluate on validation set
        val_accuracy = evaluate_model(model, val_loader, device)
        print(f"Epoch {epoch+1}/{num_epochs}, Validation Accuracy: {val_accuracy:.4f}")

    return model

In [16]:
def evaluate_model(model, data_loader, device='cpu'):
    model.to(device)
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    return accuracy

In [17]:
# 设置设备
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


In [19]:
# 训练模型
model = DistilRoBERTaClassifier(num_labels=2)
model = train_model(model, dataloader_tran, dataloader_vald, num_epochs=3, device=device)

You are using a model of type roberta to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.
Some weights of DistilBertModel were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.word_embeddings.weight', 'transformer.layer.0.attention.k_lin.bias', 'transformer.layer.0.attention.k_lin.weight', 'transformer.layer.0.attention.out_lin.bias', 'transformer.layer.0.attention.out_lin.weight', 'transformer.layer.0.attention.q_lin.bias', 'transformer.layer.0.attention.q_lin.weight', 'transformer.layer.0.attention.v_lin.bias', 'transformer.layer.0.attention.v_lin.weight', 'transformer.layer.0.ffn.lin1.bias', 'transformer.layer.0.ffn.lin1.weight', 'transformer.layer.0.ffn.lin2.bias', 'transformer.layer.0.ffn.lin2.weight', 'transformer.layer.0.output_layer_norm.bias', 'transformer.

KeyboardInterrupt: 