In [1]:
# Setup
import os

from service.util.path_util import PROJECT_ROOT
from storage.duplets_dictionary import duplets, new_duplets

# All storage paths in this notebook are built from the project root.
BASE_DIR = PROJECT_ROOT
# Plain-text corpus file: written by export_duplets_to_sp() below and passed as
# the `input` of the SentencePiece trainer.
CORPUS_PATH = os.path.join(BASE_DIR, "storage", "declension_corpus.txt")


def export_duplets_to_sp(p_duplets, output_path=CORPUS_PATH):
    """Write the duplets as a one-form-per-line text corpus.

    Every ``(inflected, base)`` pair contributes two consecutive lines: the
    inflected form first, then the base form.

    Args:
        p_duplets: Iterable of ``(inflected, base)`` pairs. Any iterable is
            accepted, including generators, because the line count is tallied
            while writing instead of being derived from ``len()``.
        output_path: Destination file. It is opened in ``"w"`` mode, so an
            existing file is overwritten.
    """
    lines_written = 0
    with open(output_path, "w", encoding="utf-8") as f:
        for inflected, base in p_duplets:
            f.write(f"{inflected}\n")
            f.write(f"{base}\n")
            lines_written += 2
    print(f"Exported {lines_written} lines to {output_path}")


# Combine the two imported pair collections into one. The merged `duplets` is
# what gets written to the corpus file here and iterated again when the
# training tensors are built.
duplets = duplets + new_duplets
export_duplets_to_sp(duplets)

Exported 482 lines to D:\WORKSPACE\Python\scraper-news\storage\declension_corpus.txt


In [2]:
# Train SentencePiece tokenizer

import sentencepiece as spm

# Trainer settings collected in one mapping so they can be read (and tuned)
# at a glance. `model_prefix` names the output files; the tokenizer is later
# loaded from "declension.model".
trainer_options = dict(
    input=CORPUS_PATH,
    model_prefix="declension",
    vocab_size=800,
    character_coverage=1.0,
    model_type="bpe",
)

# noinspection PyUnresolvedReferences
spm.SentencePieceTrainer.train(**trainer_options)

In [3]:
# Load tokenizer

import sentencepiece as spm

# Create an empty processor and load the trained model file into it.
# load() is the last expression of the cell, so its return value (True in the
# recorded run) is what the cell displays.
sp = spm.SentencePieceProcessor()
sp.load("declension.model")

True

In [4]:
# Prepare training data: encode each pair and pad to a fixed length

import numpy as np
import torch
from torch.nn.functional import pad

# Start from the tokenizer's own vocabulary size. When the tokenizer reports
# no pad piece (negative pad_id), reserve one extra id just past the
# vocabulary for padding and grow vocab_size to include it.
vocab_size = sp.get_piece_size()
pad_id = sp.pad_id()
if pad_id < 0:
    pad_id = vocab_size
    vocab_size += 1

input_tensors = []
label_tensors = []

for x, y in duplets:
    # Pin the dtype: torch.tensor([]) would otherwise default to float32.
    x_ids = torch.tensor(sp.encode(x, out_type=int), dtype=torch.long)
    y_ids = torch.tensor(sp.encode(y, out_type=int), dtype=torch.long)
    # .max() on an empty tensor fails with an opaque error, so report the
    # offending pair explicitly instead.
    if x_ids.numel() == 0 or y_ids.numel() == 0:
        raise ValueError(f"Empty token sequence for pair ({x!r}, {y!r})")
    assert x_ids.max().item() < vocab_size
    assert y_ids.max().item() < vocab_size
    input_tensors.append(x_ids)
    label_tensors.append(y_ids)

# Per-pair length = the longer of the input and label token sequences.
lengths = [max(len(x), len(y)) for x, y in zip(input_tensors, label_tensors)]
max_len = max(lengths)

# Print only if it exceeds thresholds
if max_len > 24:
    print(f"🚨 Max sequence length is {max_len} — consider using 32 or higher")
elif max_len > 16:
    print(f"⚠️ Max sequence length is {max_len} — 24 might be sufficient")

print(f"ℹ️ Max: {max_len}, 95th percentile: {np.percentile(lengths, 95)}")

# ✅ Static max length
max_seq_len = 32

# Anything longer than max_seq_len is cut off by the padding step below;
# make that visible instead of truncating silently.
truncated_pairs = sum(length > max_seq_len for length in lengths)
if truncated_pairs:
    print(f"⚠️ {truncated_pairs} pair(s) exceed max_seq_len={max_seq_len} and will be truncated")

# Pad to fixed length (right-padding with pad_id, then clip to max_seq_len)
inputs = [pad(x, (0, max_seq_len - len(x)), value=pad_id)[:max_seq_len] for x in input_tensors]
labels = [pad(y, (0, max_seq_len - len(y)), value=pad_id)[:max_seq_len] for y in label_tensors]

ℹ️ Max: 4, 95th percentile: 3.0


In [5]:
# Train the model

from torch.optim import AdamW
from model.declension_model import DeclensionModel
import torch.nn as nn

SEED = 42
hidden_dim = 256
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the RNG before the model is built so parameter initialisation (and any
# other torch randomness during training) is repeatable across runs.
torch.manual_seed(SEED)

model = DeclensionModel(vocab_size, hidden_dim, max_seq_len=max_seq_len).to(device)
optimizer = AdamW(model.parameters(), lr=1e-4)
# Positions whose label is pad_id do not contribute to the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=pad_id)

# Inputs decoded at each logging step to eyeball training progress.
sample_inputs = ["copilului", "mamei",  "Sorana Cîrstea", "Gigi Becali", "Soranei Cîrstea", "Iuliu Mureșan"]

model.train()
for epoch in range(50):
    # Sum (not mean) of the per-sample losses over the epoch.
    total_loss = 0
    # One pair per optimisation step (batch size 1).
    for input_ids, label_ids in zip(inputs, labels):
        input_ids = input_ids.unsqueeze(0).to(device)  # shape: [1, seq_len]
        label_ids = label_ids.unsqueeze(0).to(device)  # shape: [1, seq_len]

        logits = model(input_ids)  # shape: [1, seq_len, vocab_size]
        loss = loss_fn(logits.view(-1, vocab_size), label_ids.view(-1))

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    # Fires on epochs 1, 10, 11, 20, 21, ... (1-based), as in the recorded output.
    if epoch % 10 == 0 or (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}: Loss = {total_loss:.4f}")
        # Switch to inference mode for the sample predictions so train-only
        # layers (e.g. dropout, if the model has any) do not distort them,
        # then restore training mode for the next epoch.
        model.eval()
        with torch.no_grad():
            for x in sample_inputs:
                x_ids = torch.tensor(sp.encode(x, out_type=int)).unsqueeze(0).to(device)
                output = model(x_ids)
                pred_ids = torch.argmax(output, dim=-1)
                print(f"{x} → {sp.decode(pred_ids[0].tolist())}")
        model.train()

Epoch 1: Loss = 1569.2928
copilului → copil
mamei → mamă
Sorana Cîrstea → mamă mamă
Gigi Becali → mamă mamă engleze echipaorului femei patinatoarelor comisief
Soranei Cîrstea → mamă engleze
Iuliu Mureșan → mamă atenți finlandezi engleze femei
Epoch 10: Loss = 391.8344
copilului → copil
mamei → mamă
Sorana Cîrstea → Sorana Cîrstea
Gigi Becali → acadea tineri-africanafricanafrican- străin engleze
Soranei Cîrstea → Sorana Cîrstea
Iuliu Mureșan → Iași atenți-africanafrican
Epoch 11: Loss = 335.1819
copilului → copil
mamei → mamă
Sorana Cîrstea → Sorana Cîrstea
Gigi Becali → acadea tineri-africanafricanafrican- străin copii
Soranei Cîrstea → Sorana Cîrstea
Iuliu Mureșan → Iași atenți-africanafrican
Epoch 20: Loss = 68.0228
copilului → copil
mamei → mamă
Sorana Cîrstea → Sorana Cîrstea
Gigi Becali → acadea tineri-africanafricanafrican patinatoare prieten copii
Soranei Cîrstea → Sorana Cîrstea
Iuliu Mureșan → Iași Jiu-africanafrican
Epoch 21: Loss = 57.6393
copilului → copil
mamei → mamă
Sora

In [6]:
# Torch script export

# Put the model in inference mode before scripting. The training cell leaves
# it in train() mode, and exporting it that way would make correct predictions
# depend on every consumer remembering to call .eval() after loading.
model.eval()
scripted_model = torch.jit.script(model)
scripted_model.save("declension_sentencepiece.pt")

In [7]:
# Test prediction
from service.util.declension_util import DeclensionUtil
from storage.duplets_dictionary import test_cases

# Both the eager and the scripted model are evaluated in inference mode.
for candidate in (model, scripted_model):
    candidate.eval()

# A case passes only when the eager and the scripted prediction both equal the
# expected text; either way both predictions are printed side by side.
for input_text, expected_output in test_cases:
    eager_prediction = DeclensionUtil.predict(model, sp, input_text)
    scripted_prediction = DeclensionUtil.predict(scripted_model, sp, input_text)
    both_match = eager_prediction == expected_output and scripted_prediction == expected_output
    if both_match:
        print(f"✅ {input_text} → {eager_prediction} - {scripted_prediction}")
    else:
        print(f"❌ {input_text} → {eager_prediction} - {scripted_prediction} (expected: {expected_output})")


✅ copilului → copil - copil
✅ copiii → copii - copii
✅ mamei → mamă - mamă
✅ mamelor → mame - mame
✅ fratelui → frate - frate
✅ sportivilor → sportivi - sportivi
✅ echipelor locale → echipe locale - echipe locale
✅ manelei → manea - manea
✅ acadelei → acadea - acadea
✅ Stelei București → Steaua București - Steaua București
✅ Stelei → Steaua - Steaua
✅ Unirii → Unirea - Unirea
✅ Sibiului → Sibiu - Sibiu
✅ jucătoarei → jucătoare - jucătoare
✅ fotbalistelor tinere → fotbaliste tinere - fotbaliste tinere
✅ rugbiștilor francezi → rugbiști francezi - rugbiști francezi
✅ cetățeanului → cetățean - cetățean
✅ comisiilor → comisii - comisii
✅ prefecților → prefecți - prefecți
✅ Sebeșului → Sebeș - Sebeș
✅ Soranei → Sorana - Sorana
✅ Sorana Cîrstea → Sorana Cîrstea - Sorana Cîrstea
❌ Gigi Becali → acadea sud-africanafricanafrican patinatoare fanion copii - acadea sud-africanafricanafrican patinatoare fanion copii (expected: Gigi Becali)
✅ Soranei Cîrstea → Sorana Cîrstea - Sorana Cîrstea
❌ Iuliu 

In [8]:
import torch
from service.util.declension_util import DeclensionUtil
from storage.duplets_dictionary import test_cases

import sentencepiece as spm

# Reload the tokenizer from disk so this cell does not rely on objects created
# by the training cells.
sp = spm.SentencePieceProcessor()
sp.load("declension.model")  # Replace with your actual model filename

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the scripted model.
# map_location remaps the stored tensors onto `device` at load time; without
# it, an artifact saved from a CUDA run cannot be loaded on a CPU-only machine.
scripted_model = torch.jit.load("declension_sentencepiece.pt", map_location=device)
scripted_model.eval()

# Report each test case with a pass/fail marker.
for input_text, expected_output in test_cases:
    scripted_model_output = DeclensionUtil.predict(scripted_model, sp, input_text)
    if scripted_model_output != expected_output:
        print(f"❌ {input_text} → {scripted_model_output} (expected: {expected_output})")
    else:
        print(f"✅ {input_text} → {scripted_model_output}")

✅ copilului → copil
✅ copiii → copii
✅ mamei → mamă
✅ mamelor → mame
✅ fratelui → frate
✅ sportivilor → sportivi
✅ echipelor locale → echipe locale
✅ manelei → manea
✅ acadelei → acadea
✅ Stelei București → Steaua București
✅ Stelei → Steaua
✅ Unirii → Unirea
✅ Sibiului → Sibiu
✅ jucătoarei → jucătoare
✅ fotbalistelor tinere → fotbaliste tinere
✅ rugbiștilor francezi → rugbiști francezi
✅ cetățeanului → cetățean
✅ comisiilor → comisii
✅ prefecților → prefecți
✅ Sebeșului → Sebeș
✅ Soranei → Sorana
✅ Sorana Cîrstea → Sorana Cîrstea
❌ Gigi Becali → acadea sud-africanafricanafrican patinatoare fanion copii (expected: Gigi Becali)
✅ Soranei Cîrstea → Sorana Cîrstea
❌ Iuliu Mureșan → Iași Jiu statafricanafrican (expected: Iuliu Mureșan)
