In [1]:
# Convert duplets
import os

from service.util.path_util import PROJECT_ROOT
from storage.duplets_dictionary import duplets, new_duplets

BASE_DIR = PROJECT_ROOT
T5_PATH = os.path.join(BASE_DIR, "storage", "t5_duplets.txt")


def export_duplets_for_t5(p_duplets, output_path=T5_PATH):
    with open(output_path, "w", encoding="utf-8") as f:
        for inflected, base in p_duplets:
            p_input_text = f"normalize: {inflected}"
            p_target_text = base
            f.write(f"{p_input_text}\t{p_target_text}\n")
    print(f"Exported {len(p_duplets)} duplets to {output_path}")


export_duplets_for_t5(duplets + new_duplets)

Exported 269 duplets to D:\WORKSPACE\Python\scraper-news\storage\t5_duplets.txt


In [2]:
# Load the File
examples = []
with open(T5_PATH, encoding="utf-8") as f:
    for line in f:
        input_text, target_text = line.strip().split("\t")
        examples.append((input_text, target_text))

In [3]:
# Tokenize for T5
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")

inputs = [tokenizer(x, truncation=True, padding="max_length", max_length=32, return_tensors="pt") for x, _ in examples]
labels = [tokenizer(y, truncation=True, padding="max_length", max_length=32, return_tensors="pt").input_ids for _, y in
          examples]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
# Train with T5ForConditionalGeneration
import torch
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("t5-small")
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
model.train()

num_epochs = 25
for epoch in range(num_epochs):
    total_loss = 0
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    for input_batch, label_batch in zip(inputs, labels):
        output = model(**input_batch, labels=label_batch)
        loss = output.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    print(f"Total loss: {total_loss:.4f}")


Epoch 1/25
Total loss: 384.9751

Epoch 2/25
Total loss: 85.0910

Epoch 3/25
Total loss: 35.7481

Epoch 4/25
Total loss: 24.2992

Epoch 5/25
Total loss: 18.6546

Epoch 6/25
Total loss: 13.2448

Epoch 7/25
Total loss: 11.4376

Epoch 8/25
Total loss: 9.2274

Epoch 9/25
Total loss: 7.7718

Epoch 10/25
Total loss: 6.3294

Epoch 11/25
Total loss: 5.9553

Epoch 12/25
Total loss: 4.4615

Epoch 13/25
Total loss: 3.9242

Epoch 14/25
Total loss: 3.9316

Epoch 15/25
Total loss: 2.8685

Epoch 16/25
Total loss: 3.0799

Epoch 17/25
Total loss: 2.2513

Epoch 18/25
Total loss: 1.6723

Epoch 19/25
Total loss: 1.9172

Epoch 20/25
Total loss: 2.0043

Epoch 21/25
Total loss: 1.9896

Epoch 22/25
Total loss: 1.8842

Epoch 23/25
Total loss: 1.5445

Epoch 24/25
Total loss: 1.4924

Epoch 25/25
Total loss: 1.5767


In [5]:
# Save and tokenize
model.save_pretrained("t5_decorator_model")
tokenizer.save_pretrained("t5_decorator_model")

('t5_decorator_model\\tokenizer_config.json',
 't5_decorator_model\\special_tokens_map.json',
 't5_decorator_model\\spiece.model',
 't5_decorator_model\\added_tokens.json')

In [1]:
# Test
from storage.duplets_dictionary import test_cases
from service.util.declension_util import DeclensionUtil

from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("t5_decorator_model")
model = T5ForConditionalGeneration.from_pretrained("t5_decorator_model")
model.eval()

# Test cases
for input_text, expected_output in test_cases:
    scripted_model_output = DeclensionUtil.normalize(input_text, (tokenizer, model))
    if scripted_model_output != expected_output:
        print(f"❌ {input_text} → {scripted_model_output} (expected: {expected_output})")
    else:
        print(f"✅ {input_text} → {scripted_model_output}")

✅ copilului → copil
✅ copiii → copii
✅ mamei → mamă
✅ mamelor → mame
✅ fratelui → frate
✅ sportivilor → sportivi
✅ echipelor locale → echipe locale
✅ manelei → manea
✅ acadelei → acadea
✅ Stelei București → Steaua București
✅ Stelei → Steaua
✅ Unirii → Unirea
✅ Sibiului → Sibiu
✅ jucătoarei → jucătoare
✅ fotbalistelor tinere → fotbaliste tinere
✅ rugbiștilor francezi → rugbiști francezi
✅ cetățeanului → cetățean
✅ comisiilor → comisii
✅ prefecților → prefecți
✅ Sebeșului → Sebeș
✅ Soranei → Sorana
✅ Sorana Cîrstea → Sorana Cîrstea
✅ Gigi Becali → Gigi Becali
✅ Soranei Cîrstea → Sorana Cîrstea
✅ Iuliu Mureșan → Iuliu Mureșan
✅ prahovean → prahovean
✅ prahoveancă → prahoveană
✅ piteștean → piteștean
