In [None]:
import json
import random
import numpy as np
import evaluate
import Levenshtein
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer, 
    T5ForConditionalGeneration, 
    DataCollatorForSeq2Seq, 
    TrainingArguments, 
    Trainer
)


In [None]:
all_data = []
with open("../data/t5_tagged_training_data.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            example = json.loads(line)
            if example.get("target", "").strip():
                all_data.append(example)

random.seed(42)
random.shuffle(all_data)

n = len(all_data)
train_data = all_data[:int(0.8 * n)]
val_data = all_data[int(0.8 * n):int(0.9 * n)]
test_data = all_data[int(0.9 * n):]

dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "validation": Dataset.from_list(val_data),
    "test": Dataset.from_list(test_data)
})

print(dataset)

In [None]:
dataset["train"].to_json("..data/train.jsonl")
dataset["validation"].to_json("../data/val.jsonl")
dataset["test"].to_json("../data/test.jsonl")


In [94]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")

In [95]:
def preprocess(example):
    model_input = tokenizer(
        example["input"],
        max_length=512,
        truncation=True,
        padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example["target"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )
    model_input["labels"] = [
        (label if label != tokenizer.pad_token_id else -100)
        for label in labels["input_ids"]
    ]
    return model_input


In [96]:
tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=["input", "target"]
)


Map:   0%|          | 0/29887 [00:00<?, ? examples/s]



Map:   0%|          | 0/3736 [00:00<?, ? examples/s]



Map:   0%|          | 0/3736 [00:00<?, ? examples/s]



In [97]:

model = T5ForConditionalGeneration.from_pretrained("t5-base")


In [98]:


data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [101]:


training_args = TrainingArguments(
    output_dir="./t5_citation_normalizer_v2",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=2,  # ✅ small to avoid memory issues
    num_train_epochs=3,
    learning_rate=5e-5,
    logging_steps=500,
    save_steps=999999,  # ✅ save only at end
    save_total_limit=1,
    prediction_loss_only=False,    # ✅ enables metrics
    fp16=torch.cuda.is_available()
)


In [102]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    exact_match = np.mean([pred == label for pred, label in zip(decoded_preds, decoded_labels)])
    lev_similarity = np.mean([
        1 - Levenshtein.distance(pred, label) / max(len(pred), len(label), 1)
        for pred, label in zip(decoded_preds, decoded_labels)
    ])
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return {
        "exact_match": exact_match,
        "levenshtein_similarity": lev_similarity,
        "rougeL": rouge_result["rougeL"]
    }


In [103]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()


  trainer = Trainer(


Step,Training Loss
500,0.4131
1000,0.0357
1500,0.0245
2000,0.0209
2500,0.0169
3000,0.0164
3500,0.014
4000,0.0133
4500,0.0127
5000,0.0117


TrainOutput(global_step=11208, training_loss=0.03217392336633017, metrics={'train_runtime': 2055.4765, 'train_samples_per_second': 43.621, 'train_steps_per_second': 5.453, 'total_flos': 5.459977342550016e+16, 'train_loss': 0.03217392336633017, 'epoch': 3.0})

In [105]:
output_dir = "./t5_base_citation_normalisation"

trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)


('./t5_base_citation_normalisation/tokenizer_config.json',
 './t5_base_citation_normalisation/special_tokens_map.json',
 './t5_base_citation_normalisation/spiece.model',
 './t5_base_citation_normalisation/added_tokens.json')

In [107]:
import shutil
shutil.make_archive("t5_base_citation_normalisation", 'zip', output_dir)


'/content/t5_base_citation_normalisation.zip'