In [1]:
# Convert duplets
import os

from service.util.path_util import PROJECT_ROOT
from storage.duplets_dictionary import duplets, new_duplets

BASE_DIR = PROJECT_ROOT
HYBRID_PATH = os.path.join(BASE_DIR, "storage", "hybrid_duplets.txt")

def export_duplets_for_t5(p_duplets, output_path=HYBRID_PATH):
    with open(output_path, "w", encoding="utf-8") as f:
        for inflected, base in p_duplets:
            p_input_text = f"normalize: {inflected}"
            p_target_text = base
            f.write(f"{p_input_text}\t{p_target_text}\n")
    print(f"Exported {len(p_duplets)} duplets to {output_path}")


export_duplets_for_t5(duplets + new_duplets)

Exported 358 duplets to D:\WORKSPACE\Python\scraper-news\storage\hybrid_duplets.txt


In [2]:
# Load the File
examples = []
with open(HYBRID_PATH, encoding="utf-8") as f:
    for line in f:
        input_text, target_text = line.strip().split("\t")
        examples.append((input_text, target_text))

In [3]:
# Tokenize for T5

# t5_model = "google/mt5-small"
t5_model = "t5-small"

from transformers import T5Tokenizer
# Load Hugging Face tokenizer
hf_tokenizer = T5Tokenizer.from_pretrained(t5_model, legacy=False)

# Load SP tokenizer
import sentencepiece as spm

sp_tokenizer = spm.SentencePieceProcessor()
sp_tokenizer.load("pre-declension.model")

# Test SP tokenization
for word in ["Șucu", "Țiriac", "Ștefănești", "Cîrstea", "Mureșan"]:
    print(f"{word} → {sp_tokenizer.tokenize(word)}")

def hybrid_tokenize(text, p_sp_tokenizer, p_hf_tokenizer):
    sp_tokens = p_sp_tokenizer.encode(text, out_type=str)
    merged = " ".join(sp_tokens)
    return p_hf_tokenizer(merged, truncation=True, padding="max_length", max_length=32, return_tensors="pt")

Șucu → [37]
Țiriac → [52]
Ștefănești → [120]
Cîrstea → [58, 90, 75]
Mureșan → [24, 16]


In [4]:
# Train with T5ForConditionalGeneration
import torch
import time

# Add known entities to tokenizer vocab
seen = set()
for _, second in duplets:
    seen.add(second)

# hf_tokenizer.add_tokens(list(seen))

if "mt5" in t5_model:
    from transformers import MT5ForConditionalGeneration
    model = MT5ForConditionalGeneration.from_pretrained(t5_model)
else:
    from transformers import T5ForConditionalGeneration
    model = T5ForConditionalGeneration.from_pretrained(t5_model)

model.resize_token_embeddings(len(hf_tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(32191, 512)

In [5]:
# Tokenize inputs and labels
inputs = [hybrid_tokenize(x, sp_tokenizer, hf_tokenizer) for x, _ in examples]
labels = [hybrid_tokenize(y, sp_tokenizer, hf_tokenizer)["input_ids"] for _, y in examples]

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
model.train()

num_epochs = 20
limit = num_epochs / 5
min_loss = float("inf")
best_hybrid_model_path = "hybrid_declension_model_best"

for epoch in range(num_epochs):
    start = time.time()
    total_loss = 0
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    for input_batch, label_batch in zip(inputs, labels):
        output = model(**input_batch, labels=label_batch)
        loss = output.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    duration = time.time() - start
    print("Tokenizer test:")
    for word in ["Dan Șucu","Șucu", "Țiriac", "Șimon", "Țăranu", "Sorana Cîrstea", "Soranei Cîrstea", "Iuliu Mureșan"]:
        print(f"{word} → {hf_tokenizer.tokenize(word)}")
    print(f"Total loss: {total_loss:.4f} - {duration:.3f}s")

    # Save best model only in the last 5 epochs
    if epoch >= num_epochs - limit and total_loss < min_loss:
        min_loss = total_loss
        model.save_pretrained(best_hybrid_model_path)
        hf_tokenizer.save_pretrained(best_hybrid_model_path)
        print(f"✅ Saved best model at epoch {epoch + 1} with loss {total_loss:.4f}")


Epoch 1/20
Tokenizer test:
Dan Șucu → ['▁Dan', '▁', 'Ș', 'u', 'cu']
Șucu → ['▁', 'Ș', 'u', 'cu']
Țiriac → ['▁', 'Ț', 'i', 'r', 'i', 'a', 'c']
Șimon → ['▁', 'Ș', 'i', 'mon']
Țăranu → ['▁', 'Ț', 'ă', 'ran', 'u']
Sorana Cîrstea → ['▁So', 'ran', 'a', '▁C', 'î', 'r', 'stea']
Soranei Cîrstea → ['▁So', 'ran', 'e', 'i', '▁C', 'î', 'r', 'stea']
Iuliu Mureșan → ['▁I', 'ul', 'i', 'u', '▁Mur', 'e', 'ș', 'an']
Total loss: 428.5199 - 224.499s

Epoch 2/20
Tokenizer test:
Dan Șucu → ['▁Dan', '▁', 'Ș', 'u', 'cu']
Șucu → ['▁', 'Ș', 'u', 'cu']
Țiriac → ['▁', 'Ț', 'i', 'r', 'i', 'a', 'c']
Șimon → ['▁', 'Ș', 'i', 'mon']
Țăranu → ['▁', 'Ț', 'ă', 'ran', 'u']
Sorana Cîrstea → ['▁So', 'ran', 'a', '▁C', 'î', 'r', 'stea']
Soranei Cîrstea → ['▁So', 'ran', 'e', 'i', '▁C', 'î', 'r', 'stea']
Iuliu Mureșan → ['▁I', 'ul', 'i', 'u', '▁Mur', 'e', 'ș', 'an']
Total loss: 159.4229 - 214.568s

Epoch 3/20
Tokenizer test:
Dan Șucu → ['▁Dan', '▁', 'Ș', 'u', 'cu']
Șucu → ['▁', 'Ș', 'u', 'cu']
Țiriac → ['▁', 'Ț', 'i', 'r', 'i',

KeyboardInterrupt: 

In [None]:
# Save and tokenize
hybrid_model_path = "hybrid_declension_model"
model.save_pretrained(hybrid_model_path)
hf_tokenizer.save_pretrained(hybrid_model_path)

In [None]:
# Load model
from transformers import T5Tokenizer, T5ForConditionalGeneration
best_hybrid_model_path = "hybrid_declension_model_best"
hybrid_model_path = "hybrid_declension_model"
hf_tokenizer = T5Tokenizer.from_pretrained(hybrid_model_path)
model = T5ForConditionalGeneration.from_pretrained(hybrid_model_path)

best_tokenizer = T5Tokenizer.from_pretrained(best_hybrid_model_path)
best_model = T5ForConditionalGeneration.from_pretrained(best_hybrid_model_path)

In [None]:
# Test cases
from storage.duplets_dictionary import test_cases
from service.util.declension_util import DeclensionUtil

model.eval()
for input_text, expected_output in test_cases:
    scripted_model_output = DeclensionUtil.normalize(input_text, (hf_tokenizer, model))
    best_scripted_model_output = DeclensionUtil.normalize(input_text, (best_tokenizer, best_model))
    if scripted_model_output != expected_output:
        print(f"❌ {input_text} → {best_scripted_model_output} - {scripted_model_output} (expected: {expected_output})")
    else:
        print(f"✅ {input_text} →  {best_scripted_model_output} - {scripted_model_output}")