In [13]:
from src.preprocessor.utils.dataset_level import read_pickle, prepare_training_dataset, read_json
from itertools import islice

corpus = read_pickle('/home/thiendc/projects/legal_retrieval/data/processed/corpus.pkl')
corpus = {i: j.replace("\xa0", "") for i, j in corpus.items()}

queries = read_pickle('/home/thiendc/projects/legal_retrieval/data/processed/queries.pkl')
relevant_docs = read_pickle('/home/thiendc/projects/legal_retrieval/data/processed/relevant_docs.pkl')
relevant_docs  = dict(islice(relevant_docs.items(), 10000))
train_dataset = prepare_training_dataset(queries, corpus, relevant_docs)
print(len(train_dataset))

Processing queries: 100%|██████████| 10000/10000 [00:00<00:00, 623280.53it/s]

11123





In [2]:
# from sentence_transformers import SentenceTransformer, models
# from transformers import AutoTokenizer, AutoModel
# import os

# def setup_embedding_model(model_name, new_tokens=None):
#     """
#     Set up a sentence transformer model with proper tokenizer handling
    
#     Args:
#         model_name (str): HuggingFace model name/path
#         new_tokens (list): Optional list of new tokens to add to vocabulary
    
#     Returns:
#         SentenceTransformer: Properly configured sentence transformer model
#     """
    
#     # Set up tokenizer and model in SentenceTransformer
#     word_embedding_model = models.Transformer(model_name)
#     tokenizer = word_embedding_model.tokenizer
    
#     # Add new tokens if provided
#     if new_tokens is not None:
#         tokenizer.add_tokens(new_tokens, special_tokens=False)
#         word_embedding_model.auto_model.resize_token_embeddings(len(tokenizer))
    
#     # Create the SentenceTransformer model with the word embedding model
#     sentence_model = SentenceTransformer(modules=[word_embedding_model])
    
#     return sentence_model

# # Load new tokens and setup model
# new_tokens = read_json('./src/preprocessor/vocab/data/update_vocab_v1.json') 
# tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-base-v2')
# tokenizer.add_tokens(new_tokens, special_tokens=False)

# # 2. Create the base model
# base_model = AutoModel.from_pretrained('intfloat/e5-base-v2')
# base_model.resize_token_embeddings(len(tokenizer))

# # 3. Create word embedding model using sentence-transformers format
# word_embedding_model = models.Transformer(
#     model_name_or_path='intfloat/e5-base-v2',
#     tokenizer_name_or_path='intfloat/e5-base-v2'
# )

# # 4. Create pooling model
# pooling_model = models.Pooling(
#     word_embedding_model.get_word_embedding_dimension(),
#     pooling_mode_mean_tokens=True,
#     pooling_mode_cls_token=False,
#     pooling_mode_max_tokens=False
# )

# # 5. Create the full SentenceTransformer model
# model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# # Now you can use model directly in SentenceTransformerTrainer

from sentence_transformers import SentenceTransformer, models
from transformers import AutoTokenizer, AutoModel

def setup_embedding_model(model_name, new_tokens=None):
    """
    Set up a sentence transformer model with proper tokenizer handling and pooling
    
    Args:
        model_name (str): HuggingFace model name/path
        new_tokens (list): Optional list of new tokens to add to vocabulary
    
    Returns:
        SentenceTransformer: Properly configured sentence transformer model
    """
    # Set up word embedding model
    word_embedding_model = models.Transformer(model_name, max_seq_length= 512)
    tokenizer = word_embedding_model.tokenizer
    
    # Add new tokens if provided
    if new_tokens is not None:
        num_added_tokens = tokenizer.add_tokens(new_tokens, special_tokens=False)
        print(f"Added {num_added_tokens} new tokens to the vocabulary")
        # Resize model embeddings to account for new tokens
        word_embedding_model.auto_model.resize_token_embeddings(len(tokenizer))
    
    # Create pooling model
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False
    )
    
    # Create the full SentenceTransformer model
    sentence_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    
    return sentence_model, tokenizer

# Sử dụng hàm:
# 1. Load new tokens
new_tokens = read_json('./src/preprocessor/vocab/data/update_vocab_v2.json')

# 2. Setup model với vocab mới
model, _ = setup_embedding_model('dangvantuan/vietnamese-embedding', new_tokens= new_tokens)
# model = SentenceTransformer("intfloat/multilingual-e5-small")

Added 289 new tokens to the vocabulary


In [3]:
from sentence_transformers.evaluation import (
    InformationRetrievalEvaluator,
    SequentialEvaluator,
)
from sentence_transformers.util import cos_sim as consine


matryoshka_dimensions = [768, 512, 256] # Important: large to small
matryoshka_evaluators = []
# Iterate over the different dimensions
for dim in matryoshka_dimensions:
    ir_evaluator = InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant_docs,
        name=f"dim_{dim}",
        truncate_dim=dim,  # Truncate the embeddings to a certain dimension
        score_functions={"cosine": consine},
    )
    matryoshka_evaluators.append(ir_evaluator)

# Create a sequential evaluator
evaluator = SequentialEvaluator(matryoshka_evaluators)
# evaluator = InformationRetrievalEvaluator(
#         queries=queries,
#         corpus=corpus,
#         relevant_docs=relevant_docs,
#         name=f"dim_768",
#         truncate_dim= 768,  # Truncate the embeddings to a certain dimension
#         score_functions={"cosine": consine},
#     )

In [4]:

from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [5]:
import torch
import gc
from contextlib import contextmanager
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments

@contextmanager
def track_memory():
    torch.cuda.reset_peak_memory_stats()
    yield
    print(f"Peak memory: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")

# Custom trainer với memory management
class MemoryEfficientTrainer(SentenceTransformerTrainer):
    def training_step(self, *args, **kwargs):
        loss = super().training_step(*args, **kwargs)
        
        # Dọn memory Python và CUDA cache sau mỗi step
        gc.collect()
        torch.cuda.empty_cache()
        
        return loss
        
    def on_epoch_end(self):
        # Dọn memory sau mỗi epoch
        gc.collect()
        torch.cuda.empty_cache()
        super().on_epoch_end()

# Training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="legal_finetuning_v2",
    num_train_epochs = 10,
    per_device_train_batch_size= 4,  # Giảm batch size             
    gradient_accumulation_steps= 8,  # Tăng gradient accumulation            
    per_device_eval_batch_size= 8,
    gradient_checkpointing=True,
    warmup_ratio=0.15,
    learning_rate= 2e-5,
    lr_scheduler_type="cosine",
    optim="adamw_torch_fused",
    fp16=True,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    eval_strategy="steps",
    save_steps= 50,
    logging_steps = 5,
    save_total_limit = 5,
    load_best_model_at_end=True,
    max_grad_norm=0.5,
    metric_for_best_model="eval_dim_768_cosine_ndcg@10",
    ddp_find_unused_parameters=False,
    dataloader_num_workers = 40
)

# Khởi tạo trainer với custom class
trainer = MemoryEfficientTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator,
)

# Dọn cache trước khi training
torch.cuda.empty_cache()

Currently using DataParallel (DP) for multi-gpu training, while DistributedDataParallel (DDP) is recommended for faster training. See https://sbert.net/docs/sentence_transformer/training/distributed.html for more information.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [6]:
# trainer = SentenceTransformerTrainer(
#     model=model,
#     args=args,
#     train_dataset= train_dataset,
#     loss=train_loss,
#     evaluator=evaluator,
# )
# torch.cuda.empty_cache()
# trainer.train()

In [7]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [8]:
# Training với memory tracking
with track_memory():
    trainer.train()

# Dọn memory sau khi training xong
gc.collect()
torch.cuda.empty_cache()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mthiendc3005[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112966856712269, max=1.0…

Step,Training Loss,Validation Loss,Dim 768 Cosine Accuracy@1,Dim 768 Cosine Accuracy@3,Dim 768 Cosine Accuracy@5,Dim 768 Cosine Accuracy@10,Dim 768 Cosine Precision@1,Dim 768 Cosine Precision@3,Dim 768 Cosine Precision@5,Dim 768 Cosine Precision@10,Dim 768 Cosine Recall@1,Dim 768 Cosine Recall@3,Dim 768 Cosine Recall@5,Dim 768 Cosine Recall@10,Dim 768 Cosine Ndcg@10,Dim 768 Cosine Mrr@10,Dim 768 Cosine Map@100,Dim 512 Cosine Accuracy@1,Dim 512 Cosine Accuracy@3,Dim 512 Cosine Accuracy@5,Dim 512 Cosine Accuracy@10,Dim 512 Cosine Precision@1,Dim 512 Cosine Precision@3,Dim 512 Cosine Precision@5,Dim 512 Cosine Precision@10,Dim 512 Cosine Recall@1,Dim 512 Cosine Recall@3,Dim 512 Cosine Recall@5,Dim 512 Cosine Recall@10,Dim 512 Cosine Ndcg@10,Dim 512 Cosine Mrr@10,Dim 512 Cosine Map@100,Dim 256 Cosine Accuracy@1,Dim 256 Cosine Accuracy@3,Dim 256 Cosine Accuracy@5,Dim 256 Cosine Accuracy@10,Dim 256 Cosine Precision@1,Dim 256 Cosine Precision@3,Dim 256 Cosine Precision@5,Dim 256 Cosine Precision@10,Dim 256 Cosine Recall@1,Dim 256 Cosine Recall@3,Dim 256 Cosine Recall@5,Dim 256 Cosine Recall@10,Dim 256 Cosine Ndcg@10,Dim 256 Cosine Mrr@10,Dim 256 Cosine Map@100,Sequential Score
5,1.2496,No log,0.006982,0.009141,0.009987,0.010606,0.006982,0.003064,0.002014,0.001071,0.006233,0.008032,0.008759,0.009289,0.008006,0.008227,0.007329,0.006806,0.009058,0.009836,0.010481,0.006806,0.003033,0.001982,0.001059,0.006064,0.007955,0.008617,0.009171,0.007873,0.008075,0.007196,0.006563,0.008882,0.009619,0.010313,0.006563,0.002975,0.001939,0.001042,0.00584,0.007854,0.008471,0.009044,0.007709,0.007876,0.007032,0.007032
10,0.9594,No log,0.007425,0.009702,0.010439,0.010908,0.007425,0.003254,0.002105,0.001101,0.006619,0.008542,0.00918,0.009549,0.008407,0.008701,0.007747,0.007333,0.009627,0.010347,0.010841,0.007333,0.003226,0.002084,0.001096,0.006528,0.008484,0.009098,0.009496,0.008321,0.008598,0.007657,0.007283,0.009334,0.010062,0.010724,0.007283,0.003128,0.002029,0.001083,0.006495,0.00823,0.008883,0.009408,0.008217,0.008475,0.007564,0.007564


KeyboardInterrupt: 

In [10]:
# from huggingface_hub import login
# login(token="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", add_to_git_credential=True)
# trainer.model.push_to_hub("test_embedding_model_v2")


Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/thiendc/.cache/huggingface/token
Login successful


model.safetensors:   0%|          | 0.00/559M [00:00<?, ?B/s]

'https://huggingface.co/Tnt3o5/test_embedding_model/commit/a6177571d2c5535081604dbd0f8326683b8604f0'