In [1]:
# Convert duplets
import os

from service.util.root_dir_util import get_project_root
from storage.duplets_dictionary import duplets, new_duplets

BASE_DIR = get_project_root()
T5_PATH = os.path.join(BASE_DIR, "storage", "t5_duplets.txt")


def export_duplets_for_t5(p_duplets, output_path=T5_PATH):
    with open(output_path, "w", encoding="utf-8") as f:
        for inflected, base in p_duplets:
            p_input_text = f"normalize: {inflected}"
            p_target_text = base
            f.write(f"{p_input_text}\t{p_target_text}\n")
    print(f"Exported {len(p_duplets)} duplets to {output_path}")


export_duplets_for_t5(duplets + new_duplets)

Exported 210 duplets to D:\WORKSPACE\Python\scraper-news\storage\t5_duplets.txt


In [2]:
# Load the File
examples = []
with open(T5_PATH, encoding="utf-8") as f:
    for line in f:
        input_text, target_text = line.strip().split("\t")
        examples.append((input_text, target_text))

In [3]:
# Tokenize for T5
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")

inputs = [tokenizer(x, truncation=True, padding="max_length", max_length=32, return_tensors="pt") for x, _ in examples]
labels = [tokenizer(y, truncation=True, padding="max_length", max_length=32, return_tensors="pt").input_ids for _, y in
          examples]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
# Train with T5ForConditionalGeneration
import torch
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("t5-small")
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
model.train()

num_epochs = 25
for epoch in range(num_epochs):
    total_loss = 0
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    for input_batch, label_batch in zip(inputs, labels):
        output = model(**input_batch, labels=label_batch)
        loss = output.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    print(f"Total loss: {total_loss:.4f}")


Epoch 1/25
Total loss: 344.5781

Epoch 2/25
Total loss: 93.4381

Epoch 3/25
Total loss: 35.2987

Epoch 4/25
Total loss: 22.7287

Epoch 5/25
Total loss: 16.1046

Epoch 6/25
Total loss: 13.6744

Epoch 7/25
Total loss: 10.8044

Epoch 8/25
Total loss: 8.1999

Epoch 9/25
Total loss: 7.3118

Epoch 10/25
Total loss: 5.7504

Epoch 11/25
Total loss: 4.8015

Epoch 12/25
Total loss: 3.2231

Epoch 13/25
Total loss: 2.6592

Epoch 14/25
Total loss: 2.9448

Epoch 15/25
Total loss: 2.8496

Epoch 16/25
Total loss: 2.3320

Epoch 17/25
Total loss: 1.5473

Epoch 18/25
Total loss: 1.1272

Epoch 19/25
Total loss: 0.6384

Epoch 20/25
Total loss: 1.2233

Epoch 21/25
Total loss: 1.5442

Epoch 22/25
Total loss: 2.0216

Epoch 23/25
Total loss: 1.4437

Epoch 24/25
Total loss: 1.0740

Epoch 25/25
Total loss: 0.8682


In [5]:
# Save and tokenize
model.save_pretrained("t5_decorator_model")
tokenizer.save_pretrained("t5_decorator_model")

('t5_decorator_model\\tokenizer_config.json',
 't5_decorator_model\\special_tokens_map.json',
 't5_decorator_model\\spiece.model',
 't5_decorator_model\\added_tokens.json')

In [1]:
# Test
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("t5_decorator_model")
model = T5ForConditionalGeneration.from_pretrained("t5_decorator_model")
model.eval()

from storage.duplets_dictionary import normalize

# Test cases
print(normalize("Sebeșului", tokenizer, model))
print(normalize("inspectorului", tokenizer, model))
print(normalize("inspectoarei", tokenizer, model))
print(normalize("inspectorilor", tokenizer, model))
print(normalize("inspectoarelor", tokenizer, model))
print(normalize("culoarelor", tokenizer, model))
print(normalize("sectorului", tokenizer, model))
print(normalize("Brașovului", tokenizer, model))
print(normalize("Turdei", tokenizer, model))

Sebeș
inspector
inspectoare
inspectori
inspector
culoare
sector
Brașov
Turda


In [2]:
from storage.duplets import test_cases

for case in test_cases:
    print(normalize(case, tokenizer, model))

copil
mama
mame
frate
sportivi
echipe locale
manea
acade
Steaua București
Steaua
Unirea
Sibiu
jucătoare
fotbaliste tinere
rugbiști francezi
cetățean
comisii
prefecți
