In [1]:
from src.preprocessor.utils.dataset_level import read_pickle, prepare_training_dataset_with_triplet, read_json
from itertools import islice

corpus = read_pickle('/home/thiendc/projects/legal_retrieval/data/processed/corpus.pkl')
corpus = {i: j.replace("\xa0", "") for i, j in corpus.items()}

queries = read_pickle('/home/thiendc/projects/legal_retrieval/data/processed/queries.pkl')
# queries = dict(sorted(queries.items(), key=lambda item: item[1])[:1000])

relevant_docs = read_pickle('/home/thiendc/projects/legal_retrieval/data/processed/relevant_docs.pkl')


In [2]:
selected_queries = dict(sorted(queries.items(), key=lambda item: item[1])[:10000])

# Lọc corpus và relevant_docs dựa trên selected_queries
selected_corpus = {i: corpus[i] for i in selected_queries.keys() if i in corpus}
selected_relevant_docs = {i: relevant_docs[i] for i in selected_queries.keys() if i in relevant_docs}

# Chuẩn bị dataset cho training
train_dataset = prepare_training_dataset_with_triplet(selected_queries, selected_corpus, selected_relevant_docs)
print(len(train_dataset), len(selected_corpus), len(selected_queries), len(selected_relevant_docs))

Processing queries: 100%|██████████| 10000/10000 [00:00<00:00, 36231.57it/s]

532 5909 10000 10000





In [3]:
u_selected_queries = dict(sorted(queries.items(), key=lambda item: item[1])[-1000:])

# Lọc corpus và relevant_docs dựa trên selected_queries
u_selected_corpus = {i: corpus[i] for i in u_selected_queries.keys() if i in corpus}
u_selected_relevant_docs = {i: relevant_docs[i] for i in u_selected_queries.keys() if i in relevant_docs}

# Chuẩn bị dataset cho training
val_dataset = prepare_training_dataset_with_triplet(u_selected_queries, u_selected_corpus, u_selected_relevant_docs)
print(len(val_dataset), len(u_selected_corpus), len(u_selected_queries), len(u_selected_relevant_docs))

Processing queries: 100%|██████████| 1000/1000 [00:00<00:00, 671518.41it/s]

4 596 1000 1000





In [5]:
from sentence_transformers import SentenceTransformer, models
from transformers import AutoTokenizer, AutoModel

def setup_embedding_model(model_name, new_tokens=None):
    """
    Set up a sentence transformer model with proper tokenizer handling and pooling
    
    Args:
        model_name (str): HuggingFace model name/path
        new_tokens (list): Optional list of new tokens to add to vocabulary
    
    Returns:
        SentenceTransformer: Properly configured sentence transformer model
    """
    # Set up word embedding model
    word_embedding_model = models.Transformer(model_name, max_seq_length= 512)
    tokenizer = word_embedding_model.tokenizer
    
    # Add new tokens if provided
    if new_tokens is not None:
        num_added_tokens = tokenizer.add_tokens(new_tokens, special_tokens=False)
        print(f"Added {num_added_tokens} new tokens to the vocabulary")
        # Resize model embeddings to account for new tokens
        word_embedding_model.auto_model.resize_token_embeddings(len(tokenizer))
    
    # Create pooling model
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False
    )
    
    # Create the full SentenceTransformer model
    sentence_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    
    return sentence_model, tokenizer

# Sử dụng hàm:
# 1. Load new tokens
new_tokens = read_json('./src/preprocessor/vocab/data/update_vocab_v2.json')

# 2. Setup model với vocab mới
model, _ = setup_embedding_model('intfloat/multilingual-e5-small', new_tokens= new_tokens)
# model = SentenceTransformer("intfloat/multilingual-e5-small")

Added 13295 new tokens to the vocabulary


In [12]:
from sentence_transformers import SentenceTransformer, models
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("dangvantuan/vietnamese-embedding", trust_remote_code = True)
tokenizer.model_max_length

512

In [7]:
from sentence_transformers.evaluation import (
    InformationRetrievalEvaluator,
    SequentialEvaluator,
)
from sentence_transformers.util import cos_sim as consine


matryoshka_dimensions = [384, 256, 128] # Important: large to small
matryoshka_evaluators = []
# Iterate over the different dimensions
for dim in matryoshka_dimensions:
    ir_evaluator = InformationRetrievalEvaluator(
        queries=u_selected_queries,
        corpus=u_selected_corpus,
        relevant_docs=u_selected_relevant_docs,
        name=f"dim_{dim}",
        truncate_dim=dim,  # Truncate the embeddings to a certain dimension
        score_functions={"cosine": consine},
    )
    matryoshka_evaluators.append(ir_evaluator)

evaluator = SequentialEvaluator(matryoshka_evaluators)
# evaluator = InformationRetrievalEvaluator(
#         queries=queries,
#         corpus=corpus,
#         relevant_docs=relevant_docs,
#         name=f"dim_768",
#         truncate_dim= 768,  # Truncate the embeddings to a certain dimension
#         score_functions={"cosine": consine},
#     )

In [10]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [11]:
import torch
import gc
from contextlib import contextmanager
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments

@contextmanager
def track_memory():
    torch.cuda.reset_peak_memory_stats()
    yield
    print(f"Peak memory: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")

# Custom trainer với memory management
class MemoryEfficientTrainer(SentenceTransformerTrainer):
    def training_step(self, *args, **kwargs):
        loss = super().training_step(*args, **kwargs)
        
        # Dọn memory Python và CUDA cache sau mỗi step
        gc.collect()
        torch.cuda.empty_cache()
        
        return loss
        
    def on_epoch_end(self):
        # Dọn memory sau mỗi epoch
        gc.collect()
        torch.cuda.empty_cache()
        super().on_epoch_end()

# Training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="legal_finetuning_v4",
    num_train_epochs = 8,
    per_device_train_batch_size= 4,  # Giảm batch size             
    gradient_accumulation_steps= 4,  # Tăng gradient accumulation            
    per_device_eval_batch_size= 8,
    gradient_checkpointing=True,
    warmup_ratio = 0.1,
    learning_rate= 3e-5,
    lr_scheduler_type="cosine",
    optim="adamw_torch_fused",
    fp16=True,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    eval_strategy="steps",
    save_steps= 32,
    logging_steps = 8,
    save_total_limit = 5,
    load_best_model_at_end=True,
    max_grad_norm = 0.5,
    metric_for_best_model="eval_dim_768_cosine_ndcg@10",
    # resume_from_checkpoint = "./legal_finetuning_v2/checkpoint-128",
    ddp_find_unused_parameters=False,
    dataloader_num_workers = 40
)

# Khởi tạo trainer với custom class
trainer = MemoryEfficientTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset= val_dataset,
    loss=train_loss,
    evaluator=evaluator,
)

# Dọn cache trước khi training
torch.cuda.empty_cache()

Currently using DataParallel (DP) for multi-gpu training, while DistributedDataParallel (DDP) is recommended for faster training. See https://sbert.net/docs/sentence_transformer/training/distributed.html for more information.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:
# trainer = SentenceTransformerTrainer(
#     model=model,
#     args=args,
#     train_dataset= train_dataset,
#     loss=train_loss,
#     evaluator=evaluator,
# )
# torch.cuda.empty_cache()
# trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [13]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [14]:
# Training với memory tracking
with track_memory():
    trainer.train()

# Dọn memory sau khi training xong
# gc.collect()
# torch.cuda.empty_cache()

Step,Training Loss,Validation Loss


In [14]:
from huggingface_hub import login
login(token="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", add_to_git_credential=True)
trainer.model.push_to_hub("sup_legal_phbert_triplet")


Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/thiendc/.cache/huggingface/token
Login successful


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

model.safetensors:   0%|          | 0.00/560M [00:00<?, ?B/s]

'https://huggingface.co/Tnt3o5/sup_legal_phbert_triplet/commit/9f15d5377c19897d7d359f7dbedf037ab08b14c9'