In [3]:
from src.preprocessor.utils.dataset_level import read_pickle, prepare_training_dataset, read_json

# Directory holding the preprocessed pickles; defined once so the notebook is easy to relocate.
DATA_DIR = '/home/thiendc/projects/legal_retrieval/data/processed'
# Number of corpus entries kept for this run.
CORPUS_LIMIT = 6000

# NOTE(review): pickle can execute arbitrary code on load; only read trusted files.
corpus = read_pickle(f'{DATA_DIR}/corpus.pkl')
# Keep the first CORPUS_LIMIT entries after sorting by value (the text), not by key,
# and strip non-breaking spaces from every kept text.
# NOTE(review): "\xa0" is removed rather than replaced by a regular space, which joins the
# neighbouring words if the NBSP was acting as a separator; confirm this is intended.
corpus = {
    doc_id: text.replace("\xa0", "")
    for doc_id, text in sorted(corpus.items(), key=lambda item: item[1])[:CORPUS_LIMIT]
}

queries = read_pickle(f'{DATA_DIR}/queries.pkl')
relevant_docs = read_pickle(f'{DATA_DIR}/relevant_docs.pkl')
train_dataset = prepare_training_dataset(queries, corpus, relevant_docs)
print(len(train_dataset))

Processing queries: 100%|██████████| 119456/119456 [00:00<00:00, 755357.66it/s]


11308


In [4]:
# from sentence_transformers import SentenceTransformer, models
# from transformers import AutoTokenizer, AutoModel
# import os

# def setup_embedding_model(model_name, new_tokens=None):
#     """
#     Set up a sentence transformer model with proper tokenizer handling
    
#     Args:
#         model_name (str): HuggingFace model name/path
#         new_tokens (list): Optional list of new tokens to add to vocabulary
    
#     Returns:
#         SentenceTransformer: Properly configured sentence transformer model
#     """
    
#     # Set up tokenizer and model in SentenceTransformer
#     word_embedding_model = models.Transformer(model_name)
#     tokenizer = word_embedding_model.tokenizer
    
#     # Add new tokens if provided
#     if new_tokens is not None:
#         tokenizer.add_tokens(new_tokens, special_tokens=False)
#         word_embedding_model.auto_model.resize_token_embeddings(len(tokenizer))
    
#     # Create the SentenceTransformer model with the word embedding model
#     sentence_model = SentenceTransformer(modules=[word_embedding_model])
    
#     return sentence_model

# # Load new tokens and setup model
# new_tokens = read_json('./src/preprocessor/vocab/data/update_vocab_v1.json') 
# tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-base-v2')
# tokenizer.add_tokens(new_tokens, special_tokens=False)

# # 2. Create the base model
# base_model = AutoModel.from_pretrained('intfloat/e5-base-v2')
# base_model.resize_token_embeddings(len(tokenizer))

# # 3. Create word embedding model using sentence-transformers format
# word_embedding_model = models.Transformer(
#     model_name_or_path='intfloat/e5-base-v2',
#     tokenizer_name_or_path='intfloat/e5-base-v2'
# )

# # 4. Create pooling model
# pooling_model = models.Pooling(
#     word_embedding_model.get_word_embedding_dimension(),
#     pooling_mode_mean_tokens=True,
#     pooling_mode_cls_token=False,
#     pooling_mode_max_tokens=False
# )

# # 5. Create the full SentenceTransformer model
# model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# # Now you can use model directly in SentenceTransformerTrainer

from sentence_transformers import SentenceTransformer, models
from transformers import AutoTokenizer, AutoModel

def setup_embedding_model(model_name, new_tokens=None):
    """
    Build a mean-pooled SentenceTransformer on top of a HuggingFace checkpoint.

    The tokenizer vocabulary can optionally be extended before the pooling
    layer is attached; the transformer's embedding matrix is resized to match.

    Args:
        model_name (str): HuggingFace model name/path
        new_tokens (list): Optional extra tokens to register with the tokenizer

    Returns:
        tuple: ``(sentence_model, tokenizer)`` - the assembled SentenceTransformer
        and the tokenizer owned by its transformer module
    """
    transformer_module = models.Transformer(model_name)
    vocab_tokenizer = transformer_module.tokenizer

    extend_vocab = new_tokens is not None
    if extend_vocab:
        added_count = vocab_tokenizer.add_tokens(new_tokens, special_tokens=False)
        print(f"Added {added_count} new tokens to the vocabulary")
        # The embedding matrix has to follow the new vocabulary size.
        transformer_module.auto_model.resize_token_embeddings(len(vocab_tokenizer))

    # Mean pooling only: CLS and max pooling are explicitly switched off.
    embedding_dim = transformer_module.get_word_embedding_dimension()
    pooling_options = {
        "pooling_mode_mean_tokens": True,
        "pooling_mode_cls_token": False,
        "pooling_mode_max_tokens": False,
    }
    mean_pooling = models.Pooling(embedding_dim, **pooling_options)

    # Chain transformer -> pooling into the final sentence encoder.
    pipeline = [transformer_module, mean_pooling]
    return SentenceTransformer(modules=pipeline), vocab_tokenizer

# Using the helper:
# 1. Load the extra vocabulary entries
VOCAB_UPDATE_PATH = './src/preprocessor/vocab/data/update_vocab_v2.json'
new_tokens = read_json(VOCAB_UPDATE_PATH)

# 2. Build the model with the extended vocabulary (the returned tokenizer is discarded here)
BASE_MODEL_NAME = 'VoVanPhuc/sup-SimCSE-VietNamese-phobert-base'
model, _ = setup_embedding_model(BASE_MODEL_NAME, new_tokens=new_tokens)
# Alternative base model, currently disabled:
# model = SentenceTransformer("intfloat/multilingual-e5-small")

Added 6461 new tokens to the vocabulary


In [5]:
from sentence_transformers.evaluation import (
    InformationRetrievalEvaluator,
    SequentialEvaluator,
)
from sentence_transformers.util import cos_sim as consine


# Matryoshka embedding sizes. Important: ordered from large to small.
matryoshka_dimensions = [768, 512, 256]
matryoshka_evaluators = []

# Disabled variant: one IR evaluator per Matryoshka dimension, run in sequence.
# for dim in matryoshka_dimensions:
#     ir_evaluator = InformationRetrievalEvaluator(
#         queries=queries,
#         corpus=corpus,
#         relevant_docs=relevant_docs,
#         name=f"dim_{dim}",
#         truncate_dim=dim,  # Truncate the embeddings to a certain dimension
#         score_functions={"cosine": consine},
#     )
#     matryoshka_evaluators.append(ir_evaluator)
# evaluator = SequentialEvaluator(matryoshka_evaluators)

# Active variant: a single evaluator at 768 dimensions, scored with cosine similarity.
# NOTE(review): these are the same queries/corpus/relevant_docs objects that were passed to
# prepare_training_dataset; confirm that evaluating on them (no held-out split) is intended.
evaluator_config = {
    "queries": queries,
    "corpus": corpus,
    "relevant_docs": relevant_docs,
    "name": "dim_768",
    "truncate_dim": 768,  # Truncate the embeddings to a certain dimension
    "score_functions": {"cosine": consine},
}
evaluator = InformationRetrievalEvaluator(**evaluator_config)

In [6]:

from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

# Base ranking loss, then wrapped so it is applied at every size in matryoshka_dimensions.
inner_train_loss = MultipleNegativesRankingLoss(model=model)
train_loss = MatryoshkaLoss(
    model=model,
    loss=inner_train_loss,
    matryoshka_dims=matryoshka_dimensions,
)

In [7]:
import torch
import gc
from contextlib import contextmanager
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments

@contextmanager
def track_memory():
    """Print the peak CUDA memory allocated while the ``with`` body runs.

    The peak-memory counter is reset on entry and read on exit. The report is
    emitted from a ``finally`` clause, so it is still printed when the body
    raises; the exception then propagates unchanged. When CUDA is not
    available the CUDA calls are skipped and a placeholder line is printed.

    Yields:
        None
    """
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.reset_peak_memory_stats()
    try:
        yield
    finally:
        if cuda_ready:
            print(f"Peak memory: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
        else:
            print("Peak memory: n/a (CUDA not available)")

# Custom trainer with memory management
class MemoryEfficientTrainer(SentenceTransformerTrainer):
    """SentenceTransformerTrainer that frees memory after every training step.

    After each ``training_step`` the Python garbage collector is run and the
    CUDA caching allocator is emptied.
    """

    @staticmethod
    def _free_cached_memory():
        """Run the Python garbage collector and empty the CUDA cache."""
        gc.collect()
        torch.cuda.empty_cache()

    def training_step(self, *args, **kwargs):
        """Run the parent training step, then free memory.

        Args:
            *args: Forwarded unchanged to the parent ``training_step``.
            **kwargs: Forwarded unchanged to the parent ``training_step``.

        Returns:
            Whatever the parent ``training_step`` returns (the step loss).
        """
        loss = super().training_step(*args, **kwargs)

        # Clean up Python memory and the CUDA cache after every step
        self._free_cached_memory()

        return loss

    def on_epoch_end(self):
        """Free memory at the end of an epoch; only runs when called explicitly.

        NOTE(review): ``on_epoch_end`` is a ``TrainerCallback`` event name; the
        Trainer base class is not expected to define or call a method with this
        name, so the training loop does not trigger it. The per-step cleanup in
        ``training_step`` already runs after the last step of each epoch.
        """
        # Clean up memory after each epoch
        self._free_cached_memory()
        # Only chain to a parent hook if one exists; an unconditional
        # super().on_epoch_end() raises AttributeError when the parent has none.
        parent_hook = getattr(super(), "on_epoch_end", None)
        if parent_hook is not None:
            parent_hook()

# Training arguments, grouped by purpose
training_config = {
    # Output and checkpointing
    "output_dir": "legal_finetuning",
    "save_steps": 20,
    "save_total_limit": 5,
    "load_best_model_at_end": True,
    # Must match the metric key reported for the evaluator named "dim_768"
    "metric_for_best_model": "eval_dim_768_cosine_ndcg@10",
    # Schedule and batch sizes
    "num_train_epochs": 1,
    "per_device_train_batch_size": 16,  # reduced batch size
    "gradient_accumulation_steps": 8,  # increased gradient accumulation
    "per_device_eval_batch_size": 16,
    "batch_sampler": BatchSamplers.NO_DUPLICATES,
    # Optimisation
    "learning_rate": 5e-6,
    "warmup_ratio": 0.15,
    "lr_scheduler_type": "cosine",
    "optim": "adamw_torch_fused",
    "max_grad_norm": 0.5,
    # Memory / precision
    "gradient_checkpointing": True,
    "fp16": True,
    # Evaluation and logging
    # NOTE(review): eval_steps is not set here; confirm which interval the library falls back to.
    "eval_strategy": "steps",
    "logging_steps": 5,
    # Runtime
    "ddp_find_unused_parameters": False,
    "dataloader_num_workers": 40,
}
args = SentenceTransformerTrainingArguments(**training_config)

# Build the trainer from the custom memory-managing subclass
trainer = MemoryEfficientTrainer(
    args=args,
    model=model,
    loss=train_loss,
    train_dataset=train_dataset,
    evaluator=evaluator,
)

# Empty the CUDA cache before training starts
torch.cuda.empty_cache()

Currently using DataParallel (DP) for multi-gpu training, while DistributedDataParallel (DDP) is recommended for faster training. See https://sbert.net/docs/sentence_transformer/training/distributed.html for more information.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [6]:
# trainer = SentenceTransformerTrainer(
#     model=model,
#     args=args,
#     train_dataset= train_dataset,
#     loss=train_loss,
#     evaluator=evaluator,
# )
# torch.cuda.empty_cache()
# trainer.train()

In [9]:
import os
# Set TOKENIZERS_PARALLELISM to "false" for this process.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [8]:
# Train with peak-memory tracking
try:
    with track_memory():
        trainer.train()
finally:
    # Free memory once training ends, whether it completed or raised
    gc.collect()
    torch.cuda.empty_cache()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mthiendc3005[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112643134159347, max=1.0…

Step,Training Loss,Validation Loss


In [10]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never hard-code access tokens in a notebook. The write token that was
# previously embedded in this cell must be treated as leaked and revoked.
# Read the token from the HF_TOKEN environment variable, or prompt for it without echoing.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")
login(token=hf_token, add_to_git_credential=True)
trainer.model.push_to_hub("test_embedding_model")


Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/thiendc/.cache/huggingface/token
Login successful


model.safetensors:   0%|          | 0.00/559M [00:00<?, ?B/s]

'https://huggingface.co/Tnt3o5/test_embedding_model/commit/a6177571d2c5535081604dbd0f8326683b8604f0'