In [1]:
import pickle
from src.preprocessor.utils.dataset_level import prepare_training_dataset
def _read_pickle(path):
    """Return the object deserialized from the pickle file at ``path``."""
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that this project produced itself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load the three preprocessed artifacts from disk.
queries = _read_pickle('./data/processed/queries.pkl')
corpus = _read_pickle('./data/processed/corpus.pkl')
relevant_docs = _read_pickle('./data/processed/relevant_docs.pkl')

# Build the training dataset from the loaded artifacts.
# NOTE(review): the saved output of this cell lists many skipped
# (query_id, doc_id) combinations due to KeyError - presumably doc ids in
# relevant_docs that are missing from corpus; confirm upstream.
pairs = prepare_training_dataset(queries, corpus, relevant_docs)

Processing queries: 100%|██████████| 119456/119456 [00:00<00:00, 796526.02it/s]


Lỗi KeyError: '145694' - Bỏ qua query_id: 75151, doc_id: 145694
Lỗi KeyError: '156611' - Bỏ qua query_id: 154595, doc_id: 156611
Lỗi KeyError: '78193' - Bỏ qua query_id: 14988, doc_id: 78193
Lỗi KeyError: '121737' - Bỏ qua query_id: 53808, doc_id: 121737
Lỗi KeyError: '104474' - Bỏ qua query_id: 175694, doc_id: 104474
Lỗi KeyError: '6115' - Bỏ qua query_id: 157856, doc_id: 6115
Lỗi KeyError: '61435' - Bỏ qua query_id: 10, doc_id: 61435
Lỗi KeyError: '19478' - Bỏ qua query_id: 123964, doc_id: 19478
Lỗi KeyError: '215415' - Bỏ qua query_id: 137473, doc_id: 215415
Lỗi KeyError: '111348' - Bỏ qua query_id: 44532, doc_id: 111348
Lỗi KeyError: '120847' - Bỏ qua query_id: 108391, doc_id: 120847
Lỗi KeyError: '193554' - Bỏ qua query_id: 117949, doc_id: 193554
Lỗi KeyError: '85594' - Bỏ qua query_id: 115534, doc_id: 85594
Lỗi KeyError: '78193' - Bỏ qua query_id: 154692, doc_id: 78193
Lỗi KeyError: '19478' - Bỏ qua query_id: 172715, doc_id: 19478
Lỗi KeyError: '77193' - Bỏ qua query_id: 15895, d

In [2]:
import json
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import (
    InformationRetrievalEvaluator,
    SequentialEvaluator,
)
from sentence_transformers.util import cos_sim as consine
from datasets import load_dataset, concatenate_datasets

# Pretrained bi-encoder used by the evaluators below.
model = SentenceTransformer("NghiemAbe/Vi-Legal-Bi-Encoder-v2")

# Embedding sizes to evaluate. Important: ordered from large to small.
matryoshka_dimensions = [768, 512, 256, 128, 64]

# One retrieval evaluator per embedding size: each truncates the embeddings
# to `dim`, scores with cosine similarity and reports as "dim_<dim>".
# NOTE(review): these are the same queries/corpus/relevant_docs objects that
# were passed to prepare_training_dataset - confirm that evaluating on them
# is intended rather than on a held-out split.
matryoshka_evaluators = [
    InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant_docs,
        name=f"dim_{dim}",
        truncate_dim=dim,
        score_functions={"cosine": consine},
    )
    for dim in matryoshka_dimensions
]

# Chain the per-dimension evaluators into a single evaluator object.
evaluator = SequentialEvaluator(matryoshka_evaluators)

In [3]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

# Embedding sizes the Matryoshka loss is applied at. Important: large to small.
matryoshka_dimensions = [768, 512, 256, 128, 64]

# Ranking loss that MatryoshkaLoss wraps.
inner_train_loss = MultipleNegativesRankingLoss(model)

# Final training objective: the inner loss applied at every listed dimension.
train_loss = MatryoshkaLoss(
    model=model,
    loss=inner_train_loss,
    matryoshka_dims=matryoshka_dimensions,
)

In [4]:
import wandb
# Authenticate with Weights & Biases WITHOUT embedding the API key in the
# notebook. With no `key` argument, wandb.login() uses the WANDB_API_KEY
# environment variable or the credentials stored in ~/.netrc, and prompts
# interactively if neither is available.
# SECURITY: an API key was previously hardcoded on this line; treat it as
# leaked and revoke/rotate it in the W&B account settings.
wandb.login()

# Start the run and record the experiment configuration alongside it.
# NOTE(review): these values are duplicated by hand from the training
# arguments cell; keep the two in sync when tuning.
wandb.init(
    project="sentence_sim",  # W&B project name
    name="experiment-v1",
    config={
        # Training hyperparameters
        # Base checkpoint actually loaded by SentenceTransformer(...) in this
        # notebook (replaces the unfilled "base_model_name" placeholder).
        "model_name": "NghiemAbe/Vi-Legal-Bi-Encoder-v2",
        "learning_rate": 2e-5,
        "epochs": 50,
        "per_device_batch_size": 4,
        "effective_batch_size": 4 * 8 * 4,  # batch_size * gradient_accum * num_gpus
        "warmup_ratio": 0.1,
        "optimizer": "adamw_torch_fused",

        # Model architecture
        "embedding_dim": 768,  # largest Matryoshka dimension used in this notebook

        # Dataset info
        "train_dataset_size": None,  # number of training samples (not filled in)
        "eval_dataset_size": None,   # number of validation samples (not filled in)

        # Hardware config
        "num_gpus": 4,
        "gpu_type": "RTX 2080Ti",
        "gradient_checkpointing": True,
        "fp16": True,
    }
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mthiendc3005[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/thiendc/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112952222012812, max=1.0…

In [5]:
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers

# Training arguments for fine-tuning on 11 GB RTX 2080Ti cards.
args = SentenceTransformerTrainingArguments(
    output_dir="output_dir",
    num_train_epochs=50,
    # 2080Ti has 11GB VRAM, so the per-device batch is kept small and
    # gradients are accumulated over several steps instead.
    # NOTE(review): accumulation enlarges the optimizer batch only; the
    # in-batch negatives seen by MultipleNegativesRankingLoss still come from
    # a single per-device batch of 4 - confirm this is acceptable.
    per_device_train_batch_size=4,             # reduced from 8 to 4
    gradient_accumulation_steps=8,             # increased to maintain effective batch size
    per_device_eval_batch_size=4,
    gradient_checkpointing=True,               # enabled to save memory
    warmup_ratio=0.1,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    optim="adamw_torch_fused",                 # fused AdamW, same value as logged in the wandb config
    fp16=True,                                 # keep fp16 for memory efficiency
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    eval_strategy="steps",
    # Without an explicit eval_steps the evaluation interval falls back to
    # logging_steps (10), which would run all five retrieval evaluators every
    # 10 optimizer steps. Evaluate on the checkpoint cadence instead, so every
    # saved checkpoint has a metric for load_best_model_at_end.
    eval_steps=500,
    save_steps=500,
    logging_steps=10,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_dim_768_cosine_ndcg@10",
    # Weights & Biases logging
    report_to=["wandb"],          # enable wandb logging
    run_name=wandb.run.name       # reuse the name of the active wandb run
)

from sentence_transformers import SentenceTransformerTrainer
# Wire together the objects built in the earlier cells: model, training
# arguments, training pairs, Matryoshka loss and the sequential evaluator.
# NOTE(review): the saved output of this cell is a TypeError about
# `early_stopping_patience`, which the code shown here never passes - the
# output looks stale; re-run the cell to refresh it.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    loss=train_loss,
    train_dataset=pairs,
    evaluator=evaluator,
)

# Start fine-tuning.
trainer.train()

TypeError: SentenceTransformerTrainingArguments.__init__() got an unexpected keyword argument 'early_stopping_patience'