In [None]:
import json
import random
import numpy as np
import torch
import evaluate
import Levenshtein
from datasets import Dataset, DatasetDict
from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [1]:

# Load the T5 checkpoint and its tokenizer from `model_path`, move the model to
# the GPU when CUDA is available (CPU otherwise) and switch it to evaluation mode.
# NOTE(review): a later cell loads from "./t5_base_citation_normalisation" (one
# directory level lower) -- confirm which relative path is the intended one.
model_path = "../t5_base_citation_normalisation"

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)
model = model.to(device)

# eval() returns the module itself, so keeping it as the last expression is what
# makes the cell display the model architecture.
model.eval()


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [2]:
# Read the JSONL corpus: skip blank lines and drop any record whose "target"
# field is missing or contains only whitespace.
with open("t5_tagged_training_data.jsonl", "r", encoding="utf-8") as f:
    parsed_records = (json.loads(raw_line) for raw_line in f if raw_line.strip())
    all_data = [
        record for record in parsed_records
        if record.get("target", "").strip()
    ]

# Seed the global RNG before shuffling so the split below is the same on every run.
random.seed(42)
random.shuffle(all_data)

# 80% train / 10% validation / remainder test, cut from the shuffled list.
n = len(all_data)
train_end = int(0.8 * n)
validation_end = int(0.9 * n)

dataset = DatasetDict({
    "train": Dataset.from_list(all_data[:train_end]),
    "validation": Dataset.from_list(all_data[train_end:validation_end]),
    "test": Dataset.from_list(all_data[validation_end:]),
})


In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Reload the tokenizer and model, this time from a path relative to the current
# directory, and prepare the model for inference on the available device.
# NOTE(review): this repeats an earlier cell that used
# "../t5_base_citation_normalisation"; only one of the two paths is likely
# correct for a given working directory -- confirm and drop the duplicate cell.
model_path = "./t5_base_citation_normalisation"

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

tokenizer = T5Tokenizer.from_pretrained(model_path)
# .to() hands back the same module, so loading and device placement can be chained.
model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)

# Last expression: switches to evaluation mode and displays the architecture.
model.eval()


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [12]:
# Run beam-search generation over the validation split in batches and collect
# one (prediction, target) pair of stripped strings per example in val_results.
val_dataset = dataset["validation"]
val_results = []
batch_size = 64

for start in tqdm(range(0, len(val_dataset), batch_size)):
    # Slicing a Dataset yields a dict of column lists; a slice end beyond the
    # last row is clamped, so the final (shorter) batch needs no special case.
    rows = val_dataset[start:start + batch_size]

    # Pad to the longest input in the batch; inputs over 512 tokens are truncated.
    encoded = tokenizer(
        rows["input"],
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    ).to(device)

    # Gradients are not needed for generation.
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=128,
            num_beams=4,
            early_stopping=True,
        )

    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    val_results.extend(
        (prediction.strip(), target.strip())
        for prediction, target in zip(decoded, rows["target"])
    )


100%|██████████| 59/59 [02:24<00:00,  2.45s/it]


In [13]:
# Score the validation predictions: exact string match, normalised Levenshtein
# similarity and ROUGE (stemmed), then print the three headline numbers.
rouge = evaluate.load("rouge")
preds, labels = zip(*val_results)

match_flags = []
similarities = []
for predicted, expected in zip(preds, labels):
    match_flags.append(predicted == expected)
    # Normalise the edit distance by the longer string; the floor of 1 avoids a
    # division by zero when both strings are empty.
    longest = max(len(predicted), len(expected), 1)
    similarities.append(1 - Levenshtein.distance(predicted, expected) / longest)

exact_match = np.mean(match_flags)
lev_sim = np.mean(similarities)
rouge_result = rouge.compute(predictions=preds, references=labels, use_stemmer=True)

print("Validation Results")
print(f"Exact Match: {exact_match:.4f}")
print(f"Levenshtein Similarity: {lev_sim:.4f}")
print(f"ROUGE-L: {rouge_result['rougeL']:.4f}")


Validation Results
Exact Match: 0.6175
Levenshtein Similarity: 0.9657
ROUGE-L: 0.9698


In [7]:
# Run beam-search generation over the test split in batches and collect one
# (prediction, target) pair of stripped strings per example in test_results.
test_dataset = dataset["test"]
test_results = []
batch_size = 64

for start in tqdm(range(0, len(test_dataset), batch_size)):
    # Slicing a Dataset yields a dict of column lists; a slice end beyond the
    # last row is clamped, so the final (shorter) batch needs no special case.
    rows = test_dataset[start:start + batch_size]

    # Pad to the longest input in the batch; inputs over 512 tokens are truncated.
    encoded = tokenizer(
        rows["input"],
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    ).to(device)

    # Gradients are not needed for generation.
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=128,
            num_beams=4,
            early_stopping=True,
        )

    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    test_results.extend(
        (prediction.strip(), target.strip())
        for prediction, target in zip(decoded, rows["target"])
    )


100%|██████████| 59/59 [02:21<00:00,  2.39s/it]


In [1]:
# Score the test predictions: exact string match, normalised Levenshtein
# similarity and ROUGE (stemmed), then print the three headline numbers.
# Requires the notebook's import cell to have run in the current kernel
# (evaluate, Levenshtein and numpy are imported there).
rouge = evaluate.load("rouge")
preds, labels = zip(*test_results)

match_flags = []
similarities = []
for predicted, expected in zip(preds, labels):
    match_flags.append(predicted == expected)
    # Normalise the edit distance by the longer string; the floor of 1 avoids a
    # division by zero when both strings are empty.
    longest = max(len(predicted), len(expected), 1)
    similarities.append(1 - Levenshtein.distance(predicted, expected) / longest)

exact_match = np.mean(match_flags)
lev_sim = np.mean(similarities)
rouge_result = rouge.compute(predictions=preds, references=labels, use_stemmer=True)

print("Test Results")
print(f"Exact Match: {exact_match:.4f}")
print(f"Levenshtein Similarity: {lev_sim:.4f}")
print(f"ROUGE-L: {rouge_result['rougeL']:.4f}")


NameError: name 'evaluate' is not defined

In [14]:
# Export one JSON Lines file per evaluated split. Each line holds the example's
# "input", the model "prediction" and the reference "target".
#
# Results are matched to inputs purely by position, so every results list must
# have exactly one (prediction, target) pair per example of its split.
prediction_exports = [
    ("val_predictions.jsonl", val_results, dataset["validation"]),
    ("test_predictions.jsonl", test_results, dataset["test"]),
]

# zip() stops at the shorter iterable, so an interrupted or stale inference run
# would otherwise produce a silently truncated file. Validate every export
# before opening anything in "w" mode so a bad state never clobbers a good file.
for out_path, results, split in prediction_exports:
    if len(results) != len(split):
        raise ValueError(
            f"{out_path}: have {len(results)} predictions for {len(split)} examples; "
            "re-run the inference cell for this split before exporting."
        )

for out_path, results, split in prediction_exports:
    with open(out_path, "w", encoding="utf-8") as f:
        for (pred, target), example in zip(results, split):
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            json.dump({
                "input": example["input"],
                "prediction": pred,
                "target": target
            }, f, ensure_ascii=False)
            f.write("\n")
