In [1]:
# Convert duplets
import os

from service.util.path_util import PROJECT_ROOT
from storage.duplets_dictionary import duplets, new_duplets

BASE_DIR = PROJECT_ROOT
SP_PATH = os.path.join(BASE_DIR, "storage", "sp_duplets.txt")


def export_duplets_for_sp(p_duplets, output_path=SP_PATH):
    with open(output_path, "w", encoding="utf-8") as f:
        for inflected, base in p_duplets:
            p_input_text = f"normalize: {inflected}"
            p_target_text = base
            f.write(f"{p_input_text}\t{p_target_text}\n")
    print(f"Exported {len(p_duplets)} duplets to {output_path}")


export_duplets_for_sp(duplets + new_duplets)

Exported 356 duplets to D:\WORKSPACE\Python\scraper-news\storage\sp_duplets.txt


In [2]:
# Load the File
examples = []
with open(SP_PATH, encoding="utf-8") as f:
    for line in f:
        input_text, target_text = line.strip().split("\t")
        examples.append((input_text, target_text))

In [3]:
# Tokenize for T5
from transformers import T5Tokenizer

pre_declension_model = "pre-declension.model"
tokenizer = T5Tokenizer.from_pretrained(pre_declension_model, legacy=False)

print(tokenizer.tokenize("Șucu"))       # Expect: ['▁Șucu']
print(tokenizer.tokenize("Țiriac"))     # Expect: ['▁Țiriac']
print(tokenizer.tokenize("Ștefănescu")) # Expect: ['▁Ștefăneșcu']
print(tokenizer.tokenize("Ștefănești")) # Expect: ['▁Ștefănești']
print(tokenizer.tokenize("Țepelin")) # Expect: ['▁Țepelin']

inputs = [tokenizer(x, truncation=True, padding="max_length", max_length=32, return_tensors="pt") for x, _ in examples]
labels = [tokenizer(y, truncation=True, padding="max_length", max_length=32, return_tensors="pt").input_ids for _, y in
          examples]



['▁Șucu']
['▁Țiriac']
['▁Ștefănescu']
['▁Ștefănești']
['▁Țepelin']


In [6]:
# Train with T5ForConditionalGeneration
import torch
import time

# t5_model = "t5-small"
t5_model = "google/mt5-small"
if "mt5" in t5_model:
    from transformers import MT5ForConditionalGeneration
    model = MT5ForConditionalGeneration.from_pretrained(t5_model)
else:
    from transformers import T5ForConditionalGeneration
    model = T5ForConditionalGeneration.from_pretrained(t5_model)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
model.train()

num_epochs = 20
limit = num_epochs / 5
min_loss = float("inf")
best_sp_model = "sp_model_best"

for epoch in range(num_epochs):
    start = time.time()
    total_loss = 0
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    for input_batch, label_batch in zip(inputs, labels):
        output = model(**input_batch, labels=label_batch)
        loss = output.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    duration = time.time() - start
    print("Tokenizer test:")
    for word in ["Dan Șucu","Șucu", "Țiriac", "Șimon", "Țăranu", "Sorana Cîrstea", "Soranei Cîrstea", "Iuliu Mureșan"]:
        print(f"{word} → {tokenizer.tokenize(word)}")
    print(f"Total loss: {total_loss:.4f} - {duration:.3f}s")

    # Save best model only in the last 5 epochs
    if epoch >= num_epochs - limit and total_loss < min_loss:
        min_loss = total_loss
        model.save_pretrained(best_sp_model)
        tokenizer.save_pretrained(best_sp_model)
        print(f"✅ Saved best model at epoch {epoch + 1} with loss {total_loss:.4f}")


Epoch 1/20
Tokenizer test:
Dan Șucu → ['▁D', 'an', '▁Șucu']
Șucu → ['▁Șucu']
Țiriac → ['▁Țiriac']
Șimon → ['▁Șimon']
Țăranu → ['▁Țăranu']
Sorana Cîrstea → ['▁Sorana', '▁C', 'îr', 'stea']
Soranei Cîrstea → ['▁Soranei', '▁C', 'îr', 'stea']
Iuliu Mureșan → ['▁Iuliu', '▁Mureș', 'an']
Total loss: 3902.6521 - 726.396s

Epoch 2/20
Tokenizer test:
Dan Șucu → ['▁D', 'an', '▁Șucu']
Șucu → ['▁Șucu']
Țiriac → ['▁Țiriac']
Șimon → ['▁Șimon']
Țăranu → ['▁Țăranu']
Sorana Cîrstea → ['▁Sorana', '▁C', 'îr', 'stea']
Soranei Cîrstea → ['▁Soranei', '▁C', 'îr', 'stea']
Iuliu Mureșan → ['▁Iuliu', '▁Mureș', 'an']
Total loss: 1481.1039 - 715.062s

Epoch 3/20
Tokenizer test:
Dan Șucu → ['▁D', 'an', '▁Șucu']
Șucu → ['▁Șucu']
Țiriac → ['▁Țiriac']
Șimon → ['▁Șimon']
Țăranu → ['▁Țăranu']
Sorana Cîrstea → ['▁Sorana', '▁C', 'îr', 'stea']
Soranei Cîrstea → ['▁Soranei', '▁C', 'îr', 'stea']
Iuliu Mureșan → ['▁Iuliu', '▁Mureș', 'an']
Total loss: 1072.1330 - 735.726s

Epoch 4/20
Tokenizer test:
Dan Șucu → ['▁D', 'an', '▁Ș

KeyboardInterrupt: 

In [None]:
# Save and tokenize
sp_model = "sp_model"
model.save_pretrained(sp_model)
tokenizer.save_pretrained(sp_model)

In [None]:
# Load model
from transformers import T5Tokenizer, T5ForConditionalGeneration

sp_model = "sp_model"
best_sp_model = "sp_model_best"

tokenizer = T5Tokenizer.from_pretrained(sp_model)
model = T5ForConditionalGeneration.from_pretrained(sp_model)

best_tokenizer = T5Tokenizer.from_pretrained(best_sp_model)
best_model = T5ForConditionalGeneration.from_pretrained(best_sp_model)

In [None]:
# Test cases
from storage.duplets_dictionary import test_cases
from service.util.declension_util import DeclensionUtil

model.eval()
for input_text, expected_output in test_cases:
    scripted_model_output = DeclensionUtil.normalize(input_text, (tokenizer, model))
    best_scripted_model_output = DeclensionUtil.normalize(input_text, (best_tokenizer, best_model))
    if scripted_model_output != expected_output:
        print(f"❌ {input_text} → {best_scripted_model_output} - {scripted_model_output} (expected: {expected_output})")
    else:
        print(f"✅ {input_text} →  {best_scripted_model_output} - {scripted_model_output}")