# 0. Configuration

In [1]:
MODEL_NAME = "google/flan-t5-large"

BASE_LOCATION = "/content/drive/My Drive/"
DATA_LOCATION = BASE_LOCATION + "dataset-decomposed.jsonl"

LEARNING_RATE = 1e-5
BATCH_SIZE = 4
NUM_EPOCHS = 10

# 1. Install dependencies

In [None]:
!pip install -q --upgrade transformers datasets
!pip install accelerate -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 16.1.0 wh

# 2. Load model

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# 3. Load dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
from datasets import Dataset, load_dataset
from ast import literal_eval

# train_dataset = pd.read_csv(TRAIN_LOCATION, index_col=0)
# train_dataset['evidences'] = train_dataset['evidences'].apply(literal_eval)
# train_dataset['scores'] = train_dataset['scores'].apply(literal_eval)
# train_dataset = Dataset.from_pandas(train_dataset)

dataset = load_dataset('json', data_files=[DATA_LOCATION])
dataset = dataset['train'].train_test_split(test_size=0.1)
train_dataset = dataset['train']
val_dataset = dataset['test']

# train()

# val_dataset = pd.read_csv(VAL_LOCATION, index_col=0)
# val_dataset['evidences'] = val_dataset['evidences'].apply(literal_eval)
# val_dataset['scores'] = val_dataset['scores'].apply(literal_eval)
# val_dataset = Dataset.from_pandas(val_dataset)

In [26]:
val_dataset

Dataset({
    features: ['label', 'claim', 'subquestions', 'subquestions_retrieve', 'claim_evidences', 'subquestions_evidence', 'combined_evidence'],
    num_rows: 100
})

# 4. Preprocess data

In [42]:
def generate_prompt(claim, evidences, subquestions):
    prompt_begin = (
        f"You are a fact-checker. Your task is to classify the claim based on the following evidence.\n\n"
        f"Claim: {claim}\n\n"
        f"{subquestions}\n\n"
        f"Evidences:\n"
    )
    prompt_evidences = f""
    prompt_end = f"\n\nLabel (TRUE, FALSE, CONFLICTING):"

    for k in range(0, min(5, len(evidences))):
        new_evidence = f"\n{k + 1}. {evidences[k]}"
        tokens = tokenizer(prompt_begin + prompt_evidences + new_evidence + prompt_end, max_length=512, truncation=True, return_tensors="pt")
        if tokens['input_ids'].shape[1] < tokenizer.model_max_length:
            prompt_evidences += new_evidence
        else:
            break
    return prompt_begin + prompt_evidences + prompt_end

def preprocess_function(examples):
    inputs = [generate_prompt(examples['claim'][i], examples['combined_evidence'][i], '\n'.join(examples['subquestions'][i])) for i in range(len(examples["claim"]))]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(text_target=examples["label"], max_length=16, padding='max_length')
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [43]:
tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_validation = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)

In [44]:
# train_dataset['combined_evidence'][0][0]

print(generate_prompt(train_dataset['claim'][0], train_dataset['combined_evidence'][0], '\n'.join(train_dataset['subquestions'][0])))

You are a fact-checker. Your task is to classify the claim based on the following evidence.

Claim: A 29-year-old San Diego Zoo intern was arrested and charged with attempting sexually assault a gorilla.

Question 0: Where was the gorilla being held?
Question 1: Where was the San Diego Zoo intern charged with?
Question 2: Was #2 located within the San Diego Zoo?

Evidences:

1. a gorilla in the kano state zoological gardens has been accused of swallowing n6.8 million. according to bbc pidgin, a radio station in kano, freedom radio claimed that one of the finance officers said that a gorilla swallowed the missing money.
2. ... intern in the zoo of san diego. he has been formally accused of repeated sexual assault on a 15 years old male gorilla named big george which is one of the ...
3. dec 13, 2020  san diego, california | a san diego zoo employee has been formally charged with repeated sexual assault on a 15-year-old male gorilla.
4. ... san diego zoo employee has been formally accuse

# 4. Prepare training

In [45]:
from datasets import load_metric
import numpy as np

metric = load_metric("f1")

def to_number(label):
  match label:
    case "TRUE":
      return 0
    case "FALSE":
      return 1
    case "CONFLICTING":
      return 2
    case _:
      return 3

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = [to_number(pred) for pred in decoded_preds]
    decoded_labels = [to_number(label) for label in decoded_labels]

    f1_weighted = round(metric.compute(predictions=decoded_preds, references=decoded_labels, average="weighted")['f1'], 4)
    f1_macro = round(metric.compute(predictions=decoded_preds, references=decoded_labels, average="macro")['f1'], 4)
    return {"f1_weighted": f1_weighted, "f1_macro": f1_macro}

  metric = load_metric("f1")


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

The repository for f1 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/f1.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


In [46]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer
import torch

# Step 4: Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,
    weight_decay=0.01,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Step 5: Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    compute_metrics=compute_metrics
)



# 5. Train

In [47]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1 Weighted,F1 Macro
1,No log,0.182474,0.5634,0.2722
2,No log,0.066878,0.5634,0.2722
3,3.988100,0.0578,0.5634,0.2722




TrainOutput(global_step=675, training_loss=2.9841685683638963, metrics={'train_runtime': 915.1933, 'train_samples_per_second': 2.95, 'train_steps_per_second': 0.738, 'total_flos': 6222873467289600.0, 'train_loss': 2.9841685683638963, 'epoch': 3.0})

In [None]:
trainer.save_model("/content/drive/My Drive/t5_large_finetuned_4_epochs")

# 6. Eval

In [None]:
model = T5ForConditionalGeneration.from_pretrained('/content/drive/My Drive/big_boi')

In [None]:
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")



Evaluation results: {'eval_loss': 38.46904754638672, 'eval_f1_weighted': 0.3205, 'eval_f1_macro': 0.2641, 'eval_runtime': 288.144, 'eval_samples_per_second': 10.703, 'eval_steps_per_second': 2.676}
