In [3]:
# Convert duplets
import os

from service.util.path_util import PROJECT_ROOT
from storage.duplets_dictionary import duplets, new_duplets

BASE_DIR = PROJECT_ROOT
T5_PATH = os.path.join(BASE_DIR, "storage", "t5_duplets.txt")


def export_duplets_for_t5(p_duplets, output_path=T5_PATH):
    with open(output_path, "w", encoding="utf-8") as f:
        for inflected, base in p_duplets:
            p_input_text = f"normalize: {inflected}"
            p_target_text = base
            f.write(f"{p_input_text}\t{p_target_text}\n")
    print(f"Exported {len(p_duplets)} duplets to {output_path}")


export_duplets_for_t5(duplets + new_duplets)

Exported 358 duplets to D:\WORKSPACE\Python\scraper-news\storage\t5_duplets.txt


In [4]:
# Load the File
examples = []
with open(T5_PATH, encoding="utf-8") as f:
    for line in f:
        input_text, target_text = line.strip().split("\t")
        examples.append((input_text, target_text))

In [5]:
# Tokenize for T5

# t5_model = "google/mt5-small"
t5_model = "t5-small"

from transformers import T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained(t5_model, legacy=False)

inputs = [tokenizer(x, truncation=True, padding="max_length", max_length=32, return_tensors="pt") for x, _ in examples]
labels = [tokenizer(y, truncation=True, padding="max_length", max_length=32, return_tensors="pt").input_ids for _, y in
          examples]

In [6]:
# Train with T5ForConditionalGeneration
import torch
import time

if "mt5" in t5_model:
    from transformers import MT5ForConditionalGeneration
    model = MT5ForConditionalGeneration.from_pretrained(t5_model)
else:
    from transformers import T5ForConditionalGeneration
    model = T5ForConditionalGeneration.from_pretrained(t5_model)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
model.train()

num_epochs = 50
limit = num_epochs / 5
min_loss = float("inf")
best_model_path = "t5_decorator_model_best"

for epoch in range(num_epochs):
    start = time.time()
    total_loss = 0
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    for input_batch, label_batch in zip(inputs, labels):
        output = model(**input_batch, labels=label_batch)
        loss = output.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    duration = time.time() - start
    print("Tokenizer test:")
    for word in ["Dan Șucu","Șucu", "Țiriac", "Șimon", "Țăranu", "Sorana Cîrstea", "Soranei Cîrstea", "Iuliu Mureșan"]:
        print(f"{word} → {tokenizer.tokenize(word)}")
    print(f"Total loss: {total_loss:.4f} - {duration:.3f}s")

    # Save best model only in the last 5 epochs
    if epoch >= num_epochs - limit and total_loss < min_loss:
        min_loss = total_loss
        model.save_pretrained(best_model_path)
        tokenizer.save_pretrained(best_model_path)
        print(f"✅ Saved best model at epoch {epoch + 1} with loss {total_loss:.4f}")


Epoch 1/50
Tokenizer test:
Dan Șucu → ['▁Dan', '▁', 'Ș', 'u', 'cu']
Șucu → ['▁', 'Ș', 'u', 'cu']
Țiriac → ['▁', 'Ț', 'i', 'r', 'i', 'a', 'c']
Șimon → ['▁', 'Ș', 'i', 'mon']
Țăranu → ['▁', 'Ț', 'ă', 'ran', 'u']
Sorana Cîrstea → ['▁So', 'ran', 'a', '▁C', 'î', 'r', 'ste', 'a']
Soranei Cîrstea → ['▁So', 'ran', 'e', 'i', '▁C', 'î', 'r', 'ste', 'a']
Iuliu Mureșan → ['▁I', 'ul', 'i', 'u', '▁Mur', 'e', 'ș', 'an']
Total loss: 391.6720 - 223.516s

Epoch 2/50
Tokenizer test:
Dan Șucu → ['▁Dan', '▁', 'Ș', 'u', 'cu']
Șucu → ['▁', 'Ș', 'u', 'cu']
Țiriac → ['▁', 'Ț', 'i', 'r', 'i', 'a', 'c']
Șimon → ['▁', 'Ș', 'i', 'mon']
Țăranu → ['▁', 'Ț', 'ă', 'ran', 'u']
Sorana Cîrstea → ['▁So', 'ran', 'a', '▁C', 'î', 'r', 'ste', 'a']
Soranei Cîrstea → ['▁So', 'ran', 'e', 'i', '▁C', 'î', 'r', 'ste', 'a']
Iuliu Mureșan → ['▁I', 'ul', 'i', 'u', '▁Mur', 'e', 'ș', 'an']
Total loss: 75.9253 - 210.408s

Epoch 3/50
Tokenizer test:
Dan Șucu → ['▁Dan', '▁', 'Ș', 'u', 'cu']
Șucu → ['▁', 'Ș', 'u', 'cu']
Țiriac → ['▁', 'Ț',

In [7]:
# Save and tokenize
model_path = "t5_decorator_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('t5_decorator_model\\tokenizer_config.json',
 't5_decorator_model\\special_tokens_map.json',
 't5_decorator_model\\spiece.model',
 't5_decorator_model\\added_tokens.json')

In [10]:
# Load model
from transformers import T5Tokenizer, T5ForConditionalGeneration
best_model_path = "t5_decorator_model_best"
model_path = "t5_decorator_model"

tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

best_tokenizer = T5Tokenizer.from_pretrained(best_model_path)
best_model = T5ForConditionalGeneration.from_pretrained(best_model_path)

In [11]:
# Test cases
from storage.duplets_dictionary import test_cases
from service.util.declension_util import DeclensionUtil

model.eval()
for input_text, expected_output in test_cases:
    scripted_model_output = DeclensionUtil.normalize(input_text, (tokenizer, model))
    best_scripted_model_output = DeclensionUtil.normalize(input_text, (best_tokenizer, best_model))
    if scripted_model_output != expected_output:
        print(f"❌ {input_text} → {best_scripted_model_output} - {scripted_model_output} (expected: {expected_output})")
    else:
        print(f"✅ {input_text} →  {best_scripted_model_output} - {scripted_model_output}")

✅ copilului →  copil - copil
✅ copiii →  copii - copii
✅ mamei →  mamă - mamă
✅ mamelor →  mame - mame
✅ fratelui →  frate - frate
✅ sportivilor →  sportivi - sportivi
✅ echipelor locale →  echipe locale - echipe locale
✅ manelei →  manea - manea
✅ acadelei →  acadea - acadea
✅ Stelei București →  Steaua București - Steaua București
✅ Stelei →  Steaua - Steaua
✅ Unirii →  Unirea - Unirea
✅ Sibiului →  Sibiu - Sibiu
✅ jucătoarei →  jucătoare - jucătoare
✅ fotbalistelor tinere →  fotbaliste tinere - fotbaliste tinere
✅ rugbiștilor francezi →  rugbiști francezi - rugbiști francezi
✅ cetățeanului →  cetățean - cetățean
✅ comisiilor →  comisii - comisii
✅ prefecților →  prefecți - prefecți
✅ Sebeșului →  Sebeș - Sebeș
✅ Soranei →  Sorana - Sorana
✅ Sorana Cîrstea →  Sorana Cîrstea - Sorana Cîrstea
✅ Gigi Becali →  Gigi Becali - Gigi Becali
✅ Soranei Cîrstea →  Sorana Cîrstea - Sorana Cîrstea
✅ prahovean →  prahovean - prahovean
✅ prahoveancă →  prahoveană - prahoveană
✅ piteștean →  pitește