# 0. Configuration

In [1]:
MODEL_NAME = "nielsr/nt5-small-rc1"

#BASE_LOCATION = "/content/drive/My Drive/"
TRAIN_LOCATION = "evidences_train.csv"
VAL_LOCATION =  "evidences_val.csv"

LEARNING_RATE = 3e-5
BATCH_SIZE = 4
NUM_EPOCHS = 1

# 1. Install dependencies

# 2. Load model

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# 3. Load dataset

In [3]:
#from google.colab import drive
#drive.mount('/content/drive')

In [4]:
import pandas as pd
from datasets import Dataset
from ast import literal_eval

train_dataset = pd.read_csv(TRAIN_LOCATION, index_col=0)
train_dataset['evidences'] = train_dataset['evidences'].apply(literal_eval)
train_dataset['scores'] = train_dataset['scores'].apply(literal_eval)
train_dataset = Dataset.from_pandas(train_dataset)

val_dataset = pd.read_csv(VAL_LOCATION, index_col=0)
val_dataset['evidences'] = val_dataset['evidences'].apply(literal_eval)
val_dataset['scores'] = val_dataset['scores'].apply(literal_eval)
val_dataset = Dataset.from_pandas(val_dataset)

# 4. Preprocess data

In [5]:
def generate_prompt(claim, evidences):
    prompt_begin = (
        f"You are a fact-checker. Your task is to classify the claim based on the following evidence.\n\n"
        f"Claim: {claim}\n\n"
        f"Evidences:\n"
    )
    prompt_evidences = f""
    prompt_end = f"\n\nLabel (TRUE, FALSE, CONFLICTING):"

    for k in range(0, min(5, len(evidences))):
        new_evidence = f"\n{k + 1}. {evidences[k][0]} (score: {round(evidences[k][1], 2)})"
        tokens = tokenizer(prompt_begin + prompt_evidences + new_evidence + prompt_end, max_length=512, truncation=True, return_tensors="pt")
        if tokens['input_ids'].shape[1] < tokenizer.model_max_length:
            prompt_evidences += new_evidence
        else:
            break
    return prompt_begin + prompt_evidences + prompt_end

def preprocess_function(examples):
    inputs = [generate_prompt(examples['claim'][i], list(zip(examples['evidences'][i], examples['scores'][i]))) for i in range(len(examples["claim"]))]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(text_target=examples["label"], max_length=16, padding='max_length')
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [6]:
tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_validation = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)

Map: 100%|█████████████████████████████████████████████████████████████████| 9935/9935 [01:18<00:00, 126.32 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████| 3084/3084 [00:23<00:00, 129.03 examples/s]


# 4. Prepare training

In [7]:
from datasets import load_metric
import numpy as np

metric = load_metric("f1")

def to_number(label):
    if label == "TRUE":
        return 0
    elif label == "FALSE":
        return 1
    elif label == "CONFLICTING":
        return 2
    else:
        return 3

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = [to_number(pred) for pred in decoded_preds]
    decoded_labels = [to_number(label) for label in decoded_labels]

    f1_weighted = round(metric.compute(predictions=decoded_preds, references=decoded_labels, average="weighted")['f1'], 4)
    f1_macro = round(metric.compute(predictions=decoded_preds, references=decoded_labels, average="macro")['f1'], 4)
    return {"f1_weighted": f1_weighted, "f1_macro": f1_macro}

  metric = load_metric("f1")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [9]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer
import torch

# Step 4: Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,
    weight_decay=0.01,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Step 5: Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    compute_metrics=compute_metrics
)



# 5. Train

In [10]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1 Weighted,F1 Macro
1,0.0634,0.054499,0.5726,0.4514




TrainOutput(global_step=2484, training_loss=0.08701461371208351, metrics={'train_runtime': 327.0099, 'train_samples_per_second': 30.381, 'train_steps_per_second': 7.596, 'total_flos': 1344620797624320.0, 'train_loss': 0.08701461371208351, 'epoch': 1.0})

In [21]:
trainer.save_model("/Models/finetuned_NT5")
tokenizer.save_pretrained('./Models/finetuned_NT5')

('./Models/finetuned_NT5\\tokenizer_config.json',
 './Models/finetuned_NT5\\special_tokens_map.json',
 './Models/finetuned_NT5\\vocab.json',
 './Models/finetuned_NT5\\merges.txt',
 './Models/finetuned_NT5\\added_tokens.json',
 './Models/finetuned_NT5\\tokenizer.json')

# 6. Eval

In [None]:
model = T5ForConditionalGeneration.from_pretrained('/content/drive/My Drive/big_boi')

In [10]:
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")



Evaluation results: {'eval_loss': 0.05449896305799484, 'eval_f1_weighted': 0.5726, 'eval_f1_macro': 0.4514, 'eval_runtime': 74.766, 'eval_samples_per_second': 41.249, 'eval_steps_per_second': 10.312, 'epoch': 1.0}


# 7. Predictions

In [19]:
val_dataset = pd.read_csv(VAL_LOCATION, index_col=0)
val_dataset['evidences'] = val_dataset['evidences'].apply(literal_eval)
val_dataset['scores'] = val_dataset['scores'].apply(literal_eval)

In [20]:
import re
import pandas as pd

# Function to read and parse the file
def parse_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.read()
    claims = re.findall(r'Claim: (.*)', data)
    return claims

# Parse the provided file
file_path = './qualitative_analysis.txt'
claims_to_filter = parse_file(file_path)

# Ensure claims are stripped of any leading/trailing whitespace
claims_to_filter = [claim.strip() for claim in claims_to_filter]

In [21]:
# Load val_dataset from a CSV or other source if not already loaded
# val_dataset = pd.read_csv('path_to_val_dataset.csv')

# Filter the dataset to include only the rows with claims in the extracted list
filtered_val_dataset = val_dataset[val_dataset['claim'].isin(claims_to_filter)]

In [23]:
# Create a dataset from the parsed claims and evidences
dataset = Dataset.from_pandas(filtered_val_dataset)

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)

Map: 100%|█████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 125.75 examples/s]


In [24]:
predictions = trainer.predict(tokenized_dataset)



In [26]:
# Decode the predictions
decoded_preds = [tokenizer.decode(pred.argmax(axis=-1), skip_special_tokens=True) if len(pred.shape) > 2 else tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions.predictions]

# Print claims, evidences, and their corresponding predictions
for i, (claim, evidence, pred) in enumerate(zip(claims, evidences, decoded_preds)):
    print(f"Claim {i+1}: {claim}")
    print(f"Evidence {i+1}: {evidence}")
    print(f"Prediction {i+1}: {pred}")
    print("-" * 50)

Claim 1: “[Sugar-sweetened drinks] account for just 3% of daily calorie intake in South Africa.”
Evidence 1: 'one litre of sugar-sweetened drinks was assumed to contain 430 calories. a daily intake of 184 ml would account for approximately 79 calories. this works out to 3.2% of an adult south african mans recommended diet and 3.9% of an adult south african womans recommended diet. this percentage is not representative of the average south african as claimed by the beverages association. people surveyed for the south african national health and nutrition examination survey were 15 years and older. the study did not include any data on how much sugar-sweetened drinks children under the age of 15 consume.' 'while the sugar tax is aimed at reducing the overall calorie intake by south africans it might not be very effective. according to beverage association of south africa ~97% of south africas obesity problems had nothing to do with sugary drinks as they accounted for only 3% of the avera