In [1]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
import pandas as pd

tqdm.pandas()
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli", num_labels=3)
tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
model.to(device)

cuda


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [2]:
import json
from transformers import pipeline
from datasets import load_dataset


#inspired by the oracle code
#https://colab.research.google.com/drive/1gZJCakmY28cKGMj8B7wd1GUM3r72pdbi?usp=sharing#scrollTo=8eKFjiC3i8Yx
# Function to get labels and claims from the dataset

train_df = pd.read_json('train_claims_quantemp.json')
val_df = pd.read_json('val_claims_quantemp.json')
test_df = pd.read_json('test_claims_quantemp.json')

train_df = pd.DataFrame([{'label': item['label'], 'claim': item['claim']} for item in train_df.to_dict(orient='records')])
val_df = pd.DataFrame([{'label': item['label'], 'claim': item['claim']} for item in val_df.to_dict(orient='records')])
test_df = pd.DataFrame([{'label': item['label'], 'claim': item['claim']} for item in test_df.to_dict(orient='records')])

#old evidence file for bm25
#evidence_df = pd.read_json('bm25_top_100_claimdecomp.json')
#evidence_df = pd.DataFrame([{'claim': item['claim'], 'docs': item['docs'], 'scores': item['scores']} for item in evidence_df.to_dict(orient='records')])

#load csv evidence files for training and validation
train_evidence_df = pd.read_csv('NLP_Group16/evidences_train.csv')
val_evidence_df = pd.read_csv('NLP_Group16/evidences_val.csv')
train_evidence_df = train_evidence_df[['claim', 'evidences', 'scores']]
val_evidence_df = val_evidence_df[['claim', 'evidences', 'scores']]


# Get top_k relevant evidence for each claim
def combine_top_k_evidence(row, top_k):
    evidences = row['evidences'].strip('[]').split(', ')
    scores = list(map(float, row['scores'].strip('[]').split(', ')))
    ranked_documents = sorted(zip(evidences, scores), key=lambda x: x[1], reverse=True)
    top_k_documents = [doc.strip('"') for doc, score in ranked_documents[:top_k]]
    return ' '.join(top_k_documents)

top_k = 5
## Apply the function to get top_k evidence for each claim
train_evidence_df['top_k_docs'] = train_evidence_df.apply(lambda row: combine_top_k_evidence(row, top_k), axis=1)
val_evidence_df['top_k_docs'] = val_evidence_df.apply(lambda row: combine_top_k_evidence(row, top_k), axis=1)

# Drop docs and scores columns as they are no longer needed
train_evidence_df = train_evidence_df.drop(columns=['evidences', 'scores'])
val_evidence_df = val_evidence_df.drop(columns=['evidences', 'scores'])

# Merge the combined evidence with the train, validation, and test DataFrames
train_df = train_df.merge(train_evidence_df[['claim', 'top_k_docs']], on='claim', how='left')
val_df = val_df.merge(val_evidence_df[['claim', 'top_k_docs']], on='claim', how='left')
#test_df = test_df.merge(evidence_df[['claim', 'top_k_docs']], on='claim', how='left')

In [3]:
def tokenize_data(df, max_length = 256):
    tokenized_inputs = tokenizer(
        df['claim'].tolist(),
        df['top_k_docs'].fillna('').tolist(),  # Fill NaN with empty strings
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    labels = df['label'].apply(lambda x: {'True': 0, 'False': 1, 'Conflicting': 2}[x]).tolist()
    return tokenized_inputs, labels

train_tokenized, train_labels = tokenize_data(train_df)
val_tokenized, val_labels = tokenize_data(val_df)
test_tokenized, test_labels = tokenize_data(test_df)

print(train_tokenized['input_ids'][0])

KeyError: 'top_k_docs'

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class ClaimsDataset(Dataset):
    def __init__(self, tokenized_inputs, labels):
        self.input_ids = tokenized_inputs['input_ids']
        self.attention_mask = tokenized_inputs['attention_mask']
        self.labels = torch.tensor(labels)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

train_dataset = ClaimsDataset(train_tokenized, train_labels)
val_dataset = ClaimsDataset(val_tokenized, val_labels)
#test_dataset = ClaimsDataset(test_tokenized, test_labels)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, pin_memory=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False, pin_memory=True)
#test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False, pin_memory=True)

In [None]:
from transformers import Trainer, TrainingArguments, AdamW
import os
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np

optimizer = AdamW(model.parameters(), lr=2e-5)

output_dir = './results'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
print(f"Output directory is set to: {output_dir}")

def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=1)
    
    metrics = {}
    
    for label_name, label_id in {'True': 0, 'False': 1, 'Conflicting': 2}.items():
        precision, recall, f1, support = precision_recall_fscore_support(labels, preds, labels=[label_id], average='weighted')
        metrics[f'{label_name}_precision'] = precision[0]
        metrics[f'{label_name}_recall'] = recall[0]
        metrics[f'{label_name}_f1'] = f1[0]
        metrics[f'{label_name}_support'] = support[0]

    #overall accuracy
    metrics['accuracy'] = accuracy_score(labels, preds)
    return metrics

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps = 5000,
    use_cpu=False,
    dataloader_pin_memory=False
    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(optimizer, None),
    compute_metrics=compute_metrics
    )

trainer.train()


Output directory is set to: ./results


dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


  0%|          | 0/2484 [00:00<?, ?it/s]

{'loss': 1.2931, 'learning_rate': 1.4000000000000001e-06, 'epoch': 0.0}
{'loss': 1.1886, 'learning_rate': 3.2000000000000003e-06, 'epoch': 0.01}
{'loss': 0.9793, 'learning_rate': 5.2e-06, 'epoch': 0.01}
{'loss': 1.2977, 'learning_rate': 7.2000000000000005e-06, 'epoch': 0.02}
{'loss': 1.0203, 'learning_rate': 9e-06, 'epoch': 0.02}
{'loss': 1.0908, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.02}
{'loss': 0.9977, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.03}
{'loss': 0.9946, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.03}
{'loss': 1.2494, 'learning_rate': 1.7e-05, 'epoch': 0.04}
{'loss': 0.9635, 'learning_rate': 1.9e-05, 'epoch': 0.04}
{'loss': 0.8767, 'learning_rate': 1.995805369127517e-05, 'epoch': 0.04}
{'loss': 0.9054, 'learning_rate': 1.9874161073825505e-05, 'epoch': 0.05}
{'loss': 0.9843, 'learning_rate': 1.979026845637584e-05, 'epoch': 0.05}
{'loss': 1.109, 'learning_rate': 1.9706375838926174e-05, 'epoch': 0.06}
{'loss': 0.9737, 'learning_rate': 1.96224832

TrainOutput(global_step=2484, training_loss=0.8732805134977504, metrics={'train_runtime': 5747.3418, 'train_samples_per_second': 1.729, 'train_steps_per_second': 0.432, 'train_loss': 0.8732805134977504, 'epoch': 1.0})

In [None]:
results = trainer.evaluate(test_dataset)

results_file = os.path.join(training_args.output_dir, "evaluation_results.txt")
with open(results_file, "w") as writer:
    for key, value in results.items():
        writer.write(f"{key}: {value}\n")

print(results)


  0%|          | 0/624 [00:00<?, ?it/s]

{'eval_loss': 0.9082304239273071, 'eval_accuracy': 0.5627254509018036, 'eval_precision': 0.5666230665846471, 'eval_recall': 0.5627254509018036, 'eval_f1': 0.5640043334371488, 'eval_support': None, 'eval_runtime': 73.8159, 'eval_samples_per_second': 33.8, 'eval_steps_per_second': 8.453, 'epoch': 1.0}
