In [99]:
# Load Base Model + LoRA

from transformers import AutoModel, AutoTokenizer
from peft import get_peft_model, LoraConfig

# Local checkpoint directory; both the encoder weights and the tokenizer
# files are read from it.
model_name = "dumitrescustefan_token_output/checkpoint-200"

# LoRA adapter settings: rank-8 updates injected into the attention
# "query" and "value" projections, no bias terms trained.
lora_config = LoraConfig(
    task_type="FEATURE_EXTRACTION",
    target_modules=["query", "value"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Loading may warn that the pooler weights are newly initialised; this
# notebook only reads last_hidden_state, so the pooler is never used.
base_model = AutoModel.from_pretrained(model_name)

# Wrap the base encoder so that only the LoRA adapter weights are trainable.
model = get_peft_model(base_model, lora_config)

Some weights of BertModel were not initialized from the model checkpoint at dumitrescustefan_token_output/checkpoint-200 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [101]:
# Tokenize and Embed
import torch

def embed(text, max_length=32):
    """Embed ``text`` with the LoRA-wrapped encoder and return its first-token state.

    Uses the notebook-level ``tokenizer`` and ``model``. Gradients are tracked
    unless the caller wraps the call in ``torch.no_grad()``, so the result can
    be used directly inside a training step.

    Args:
        text: Input handed to the tokenizer.
        max_length: Truncation limit in tokens. Defaults to 32, the value
            that used to be hard-coded.

    Returns:
        ``last_hidden_state[:, 0, :]`` – the hidden state at position 0
        (the [CLS] token) for every row of the tokenized batch.
    """
    tokens = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    # Keep the inputs on the same device as the model so the function keeps
    # working after the model has been moved off the CPU.
    tokens = tokens.to(next(model.parameters()).device)
    output = model(**tokens)
    return output.last_hidden_state[:, 0, :]  # [CLS] token

def embed_normalized(text):
    """Return the L2-normalised [CLS] embedding of ``text``.

    Delegates tokenization and the forward pass to ``embed`` so the two
    functions cannot drift apart, then scales every row to unit L2 norm.

    Args:
        text: Input handed to ``embed``.

    Returns:
        The tensor produced by ``embed(text)`` with each row divided by its
        L2 norm (``dim=1``).
    """
    cls = embed(text)
    # Use the fully-qualified name: the ``F`` alias is only imported in a
    # later cell, while ``torch`` is imported in this one.
    return torch.nn.functional.normalize(cls, p=2, dim=1)

In [102]:
# Define Contrastive Loss
import torch.nn.functional as F

def contrastive_loss(anchor, positive, negative, margin=0.5):
    """Hinge loss on cosine similarities for (anchor, positive, negative) triplets.

    For every row the penalty is ``max(0, margin - cos(anchor, positive)
    + cos(anchor, negative))``; the mean over all rows is returned.

    Args:
        anchor: Anchor embeddings.
        positive: Embeddings that should be close to the anchors.
        negative: Embeddings that should be far from the anchors.
        margin: Required similarity gap between positive and negative pairs.

    Returns:
        Scalar tensor with the mean hinge penalty.
    """
    pos_similarity = F.cosine_similarity(anchor, positive)
    neg_similarity = F.cosine_similarity(anchor, negative)
    # Only triplets that violate the margin contribute to the loss.
    hinge = (margin - pos_similarity + neg_similarity).clamp(min=0.0)
    return hinge.mean()

def pairwise_loss(variant, canonical):
    """Return ``1 - mean cosine similarity`` between paired embeddings.

    Minimising this value pulls each ``variant`` row towards the matching
    ``canonical`` row.

    Args:
        variant: Embeddings of one side of each pair.
        canonical: Embeddings of the other side, row-aligned with ``variant``.

    Returns:
        Scalar tensor; 0 when every pair points in the same direction.
    """
    mean_similarity = F.cosine_similarity(variant, canonical).mean()
    return 1 - mean_similarity  # maximize similarity

In [103]:
# Training Loop

from storage.duplets_dictionary import duplets

# Full-batch training: every epoch embeds all pairs once and takes a single
# optimizer step on the pairwise similarity loss.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-4)
model.train()
for epoch in range(10):
    total_loss = 0
    # All first elements are embedded before any second element.
    anchors = torch.cat([embed(first) for first, _ in duplets], dim=0)
    positives = torch.cat([embed(second) for _, second in duplets], dim=0)

    loss = pairwise_loss(anchors, positives)
    loss.backward()
    optimizer.step()
    # Clear the gradients so the next epoch starts from zero.
    optimizer.zero_grad()
    total_loss += loss.item()

    print(f"Epoch {epoch+1} - Loss: {total_loss:.4f}")

Epoch 1 - Loss: 0.3718
Epoch 2 - Loss: 0.3784
Epoch 3 - Loss: 0.3587
Epoch 4 - Loss: 0.3659
Epoch 5 - Loss: 0.3656
Epoch 6 - Loss: 0.3513
Epoch 7 - Loss: 0.3416
Epoch 8 - Loss: 0.3448
Epoch 9 - Loss: 0.3251
Epoch 10 - Loss: 0.3301


In [None]:
from transformers import AutoModelForSeq2SeqLM

# Alternative experiment: fine-tune a seq2seq model to generate the second
# element of each pair from the first.
# NOTE: "your-seq2seq-model" is a placeholder and must be replaced with a
# real checkpoint before this cell can run.
#
# Dedicated seq2seq_* names are used on purpose: the encoder cells read the
# globals `model` and `tokenizer` (inside `embed`, and when saving to
# ./declension_lora), so rebinding them here would silently break those cells.
seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained("your-seq2seq-model")
seq2seq_tokenizer = AutoTokenizer.from_pretrained("your-seq2seq-model")

seq2seq_optimizer = torch.optim.AdamW(seq2seq_model.parameters(), lr=1e-4)
seq2seq_model.train()

for epoch in range(10):
    total_loss = 0
    # One optimizer step per (source, target) pair.
    for source, target in duplets:
        inputs = seq2seq_tokenizer(source, return_tensors="pt")
        labels = seq2seq_tokenizer(target, return_tensors="pt").input_ids
        outputs = seq2seq_model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        seq2seq_optimizer.step()
        seq2seq_optimizer.zero_grad()
        total_loss += loss.item()

    # Report the summed per-pair loss, which was previously accumulated but
    # never shown.
    print(f"Epoch {epoch+1} - Loss: {total_loss:.4f}")

In [109]:
# Switch to inference mode so the similarities below are not perturbed by
# dropout.
model.eval()

with torch.no_grad():
    sim = F.cosine_similarity(embed("Rapidului"), embed("Rapid")).item()
    sim_norm = F.cosine_similarity(embed_normalized("Rapidului"), embed_normalized("Rapid")).item()

# Cosine similarity is invariant to the length of its inputs, so the raw and
# the L2-normalised embeddings are expected to give the same value; the two
# lines are labelled so they can be told apart in the output.
print(f"Rapidului vs Rapid (raw [CLS]): {sim:.4f}")
print(f"Rapidului vs Rapid (L2-normalized [CLS]): {sim_norm:.4f}")

Rapidului vs Rapid: 0.4263
Rapidului vs Rapid: 0.4263


In [105]:
# Save the LoRA-wrapped model and its tokenizer into the same directory.
save_dir = "./declension_lora"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./declension_lora\\tokenizer_config.json',
 './declension_lora\\special_tokens_map.json',
 './declension_lora\\vocab.txt',
 './declension_lora\\added_tokens.json',
 './declension_lora\\tokenizer.json')

In [108]:
from service.util.declension_normalizer import DeclensionNormalizer

# Look-ups need deterministic embeddings: put the model in inference mode
# (the training cell leaves it in train() mode, where dropout is active) and
# embed without building autograd graphs.
model.eval()
inference_embed = torch.no_grad()(embed)

normalizer = DeclensionNormalizer(
    canonical_forms=["România", "Rapid", "Minister", "Guvern", "Craiova"],
    embed_fn=inference_embed
)

# The trailing comments give the intended result. The recorded run of this
# cell printed "Guvern" and "Rapidului" instead, so the mapping does not yet
# work as intended.
print(normalizer.normalize("României"))  # intended: "România"
print(normalizer.normalize("Rapidului"))  # intended: "Rapid"

Guvern
Rapidului
