In [1]:
%pip install sentence_transformers datasets accelerate

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import random


import datasets
from datasets import load_dataset, Dataset, Features, Value
import torch
import torch.nn.functional as F

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:

# Show which device string was selected above ("cuda" or "cpu").
print(device)

cuda


In [4]:
def _all_string_schema(*column_names):
    """Return a Features schema in which every named column is typed as a string."""
    return Features({column: Value("string") for column in column_names})


# Explicit schemas for the three CSV kinds, so ids are always read as strings.
corpus_features = _all_string_schema('corpus_id', 'content')      # documents
question_features = _all_string_schema('question_id', 'question')  # queries
qnc_features = _all_string_schema('question_id', 'corpus_id')     # (query, relevant document) labels

In [5]:
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainingArguments
from sentence_transformers.evaluation import  SequentialEvaluator, InformationRetrievalEvaluator
from sentence_transformers.util import cos_sim
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers

In [6]:
# Directory holding the training CSVs. File names are appended to this string
# directly, so it must keep its trailing "/".
dataset_dir = "/kaggle/input/vn-law-embedding/data_train/"
# Passed to the training arguments as output_dir and later used to reload the
# trained model.
output_dir = "/kaggle/working/"
# Model identifier handed to SentenceTransformer(...) as the starting point.
basemodel_path = "hiieu/halong_embedding"

In [7]:
def _read_csv_split(file_name, features):
    """Read `file_name` from dataset_dir as CSV with the given schema and return its "train" split."""
    return load_dataset("csv", data_files=dataset_dir + file_name, features=features)["train"]


# Two parallel dumps (corpus / questions / labels) are loaded and then merged.
corpus1 = _read_csv_split("corpus.csv", corpus_features)
queries1 = _read_csv_split("questions.csv", question_features)
relevant_docs_data1 = _read_csv_split("qnc.csv", qnc_features)
corpus2 = _read_csv_split("corpus2.csv", corpus_features)
queries2 = _read_csv_split("questions2.csv", question_features)
relevant_docs_data2 = _read_csv_split("qnc2.csv", qnc_features)

# Fixed seed: the shuffled order of the label rows decides which queries end up
# first in `relevant_docs`, and the eval/train split is taken by position.
corpus = datasets.concatenate_datasets([corpus1, corpus2]).shuffle(seed=7)
queries = datasets.concatenate_datasets([queries1, queries2]).shuffle(seed=7)
relevant_docs_data = datasets.concatenate_datasets(
    [relevant_docs_data1, relevant_docs_data2]
).shuffle(seed=7)

# Turn the datasets into plain lookup tables.
corpus = dict(zip(corpus["corpus_id"], corpus["content"]))  # cid => document text
queries = dict(zip(queries["question_id"], queries["question"]))  # qid => question text
relevant_docs = {}  # qid => set of relevant cids
for raw_qid, raw_cid in zip(relevant_docs_data["question_id"], relevant_docs_data["corpus_id"]):
    relevant_docs.setdefault(str(raw_qid), set()).add(str(raw_cid))

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
# Sizes, in order: documents, questions, and raw label rows
# (relevant_docs_data counts (question, document) rows, not distinct questions).
for collection in (corpus, queries, relevant_docs_data):
    print(len(collection))

7052
16308
27538


In [9]:
# Load the base embedding model; this same object is evaluated as the baseline
# and then handed to the losses and the trainer.
model = SentenceTransformer(basemodel_path)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/13.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [10]:
# Embedding sizes the Matryoshka loss is trained on (largest to smallest).
matryoshka_dimensions = [768, 512, 256, 128]

# Evaluation hold-out: the first `eval_query_count` labelled queries, in
# relevant_docs insertion order. The training-split cell skips the same number
# of leading queries, so the two values must stay equal.
eval_query_count = 1000
# Evaluate only at the smallest Matryoshka size; the evaluator name below feeds
# the metric key used as metric_for_best_model in the training arguments.
eval_dim = 128

eval_set = dict(list(relevant_docs.items())[:eval_query_count])

evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=eval_set,
    name=f"dim_{eval_dim}",
    truncate_dim=eval_dim,
    score_functions={"cosine": cos_sim},
)


In [11]:
# Baseline: score the not-yet-fine-tuned model on the held-out queries.
print(len(eval_set))

results = evaluator(model)
for metric_name, metric_value in results.items():
    print(metric_name, metric_value)

1000
dim_128_cosine_accuracy@1 0.39
dim_128_cosine_accuracy@3 0.6
dim_128_cosine_accuracy@5 0.666
dim_128_cosine_accuracy@10 0.748
dim_128_cosine_precision@1 0.39
dim_128_cosine_precision@3 0.2786666666666666
dim_128_cosine_precision@5 0.21140000000000003
dim_128_cosine_precision@10 0.136
dim_128_cosine_recall@1 0.22244999999999998
dim_128_cosine_recall@3 0.41981666666666667
dim_128_cosine_recall@5 0.5078333333333334
dim_128_cosine_recall@10 0.6213333333333333
dim_128_cosine_ndcg@10 0.49575154970089774
dim_128_cosine_mrr@10 0.5108035714285711
dim_128_cosine_map@100 0.435530044206497


In [12]:
# Training split: every labelled query after the leading ones reserved for
# eval_set. The count must equal the number of queries taken for evaluation.
held_out_query_count = 1000
train_set = dict(list(relevant_docs.items())[held_out_query_count:])

In [13]:


def get_training_dataset(queries, corpus, relevant_docs):
    """Build (anchor, positive) training pairs from query -> document relevance labels.

    Args:
        queries: Mapping of query id -> query text.
        corpus: Mapping of document id -> document text.
        relevant_docs: Mapping of query id -> collection of relevant document ids.

    Returns:
        A Dataset with two aligned columns, "anchors" (query text) and
        "positives" (document text), holding one row per (query, relevant
        document) pair.

    Raises:
        KeyError: If an id in relevant_docs is missing from queries or corpus.
    """
    anchors, positives = [], []

    for query_id, doc_ids in relevant_docs.items():
        # The ids are typically held in a set of strings, whose iteration order
        # can change between interpreter sessions; sorting keeps the row order
        # (and therefore seeded batch sampling) reproducible across kernel restarts.
        for doc_id in sorted(doc_ids):
            anchors.append(queries[query_id])
            positives.append(corpus[doc_id])

    return Dataset.from_dict({"anchors": anchors, "positives": positives})



In [14]:
# Build anchor/positive pairs from the training split only (train_set holds the
# queries that were not placed in eval_set).
training_dataset = get_training_dataset(queries, corpus, train_set)

# Peek at the first pair as a sanity check.
training_dataset[0]

{'anchors': 'Hiện tại tôi có cho 1 khách hàng thuê nhà, thời hạn là 5năm, trả tiền trước 1 năm, những năm còn lại thì trả theo tháng, hiện giờ Khách hàng \xa0đó thuê được 2 năm 8 tháng rồi, nhưng do gia đình kinh tế khó khăn nên bán đất và nhà.và trong hợp đồng cho thuê "KHÔNG" có ghi nếu 2 bên chấm dứt hợp đồng trước thời hạn thì bên chấm dứt trước sẽ bồi thường thì trong hợp đồng không có ghi gì cả. và gia đình tôi cũng đã thông báo cho Khách Hàng đó trước 1 tháng để di dời rồi. thấy Khách hàng cũng khó khăn nên chúng tôi có nói sẽ phụ chi phí di dời giúp cho Khách hàng đó.cả hai đã chịu và thỏa thuận như vậy. nhưng mà bây giờ khách hàng đó không chịu di dời, mà còn bắt gia đình chúng tôi phải bồi thường 100 triệu cho người đó, nếu để lâu sẽ tăng giá lên nữa. hiện gia đình chúng tôi có đưa đơn lên công an, nhưng không thỏa thuận được, Khách Hàng đó bây giờ không chịu di dời nữa, và đòi ở lại đến hết thời gian trong hợp đồng luôn mới chịu đi. mà ngày giao đất và nhà cũng gần đến rồi. 

In [15]:
# Ranking loss applied to the (anchor, positive) pairs.
inner_loss = MultipleNegativesRankingLoss(model=model)

# Wrap it so the same loss is also applied at each size in matryoshka_dimensions.
loss = MatryoshkaLoss(
    model=model,
    loss=inner_loss,
    matryoshka_dims=matryoshka_dimensions,
)

In [16]:

# define training arguments

# Step cadence, named so the relationship is visible: load_best_model_at_end
# can only restore a step that was both evaluated and checkpointed, so the
# checkpoint interval is kept a whole multiple of the evaluation interval.
eval_and_logging_steps = 197 * 2                 # 394
checkpoint_steps = 2 * eval_and_logging_steps    # 788

args = SentenceTransformerTrainingArguments(
    output_dir=output_dir,

    num_train_epochs=5,
    bf16=True,

    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # NOTE(review): accumulation raises the effective optimizer batch, but the
    # in-batch negatives seen by the ranking loss presumably still come from a
    # single per-device batch — confirm before relying on it as a larger batch.
    gradient_accumulation_steps=4,

    warmup_ratio=0.1,
    learning_rate=3e-5,
    lr_scheduler_type="cosine",
    optim="adamw_torch_fused",
    # Avoid repeated texts inside a batch, since the ranking loss treats the
    # other rows of the batch as negatives.
    batch_sampler=BatchSamplers.NO_DUPLICATES,

    eval_strategy="steps",
    # Previously implicit (the trainer falls back to logging_steps when
    # eval_steps is unset); stated here so the cadence does not silently
    # change if logging_steps is ever edited.
    eval_steps=eval_and_logging_steps,
    # Key is "eval_" + evaluator name ("dim_128") + metric; pick the best
    # checkpoint by top-3 accuracy at 128 dimensions.
    metric_for_best_model="eval_dim_128_cosine_accuracy@3",
    load_best_model_at_end=True,

    logging_steps=eval_and_logging_steps,
    save_steps=checkpoint_steps,
    save_total_limit=5,
    # NOTE(review): report_to is left at its default; in the recorded run this
    # triggered an interactive wandb login prompt during trainer.train().
    # Consider report_to="none" for unattended Restart & Run All.
)

In [17]:
from sentence_transformers import SentenceTransformerTrainer
# Only the IR evaluator is supplied (no eval_dataset), so periodic evaluation
# reports retrieval metrics; the recorded training log shows "No log" in the
# Validation Loss column.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,  
    train_dataset=training_dataset,
    loss=loss,
    evaluator=evaluator,
)

In [18]:
# Start training. Checkpoints are written under output_dir according to the
# save settings in `args`; nothing in `args` requests an upload to the Hub.
trainer.train()

# Save the model currently held by the trainer. `args` sets
# load_best_model_at_end=True, so this is expected to be the best-scoring
# checkpoint rather than the last step.
trainer.save_model()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss,Validation Loss,Dim 128 Cosine Accuracy@1,Dim 128 Cosine Accuracy@3,Dim 128 Cosine Accuracy@5,Dim 128 Cosine Accuracy@10,Dim 128 Cosine Precision@1,Dim 128 Cosine Precision@3,Dim 128 Cosine Precision@5,Dim 128 Cosine Precision@10,Dim 128 Cosine Recall@1,Dim 128 Cosine Recall@3,Dim 128 Cosine Recall@5,Dim 128 Cosine Recall@10,Dim 128 Cosine Ndcg@10,Dim 128 Cosine Mrr@10,Dim 128 Cosine Map@100
394,0.603,No log,0.549,0.756,0.811,0.878,0.549,0.386333,0.2876,0.1779,0.310933,0.561833,0.66605,0.788833,0.657004,0.660737,0.592677
788,0.4399,No log,0.56,0.746,0.818,0.9,0.56,0.378333,0.2838,0.1759,0.318217,0.55895,0.666833,0.797267,0.659567,0.670787,0.590958
1182,0.3574,No log,0.568,0.754,0.822,0.88,0.568,0.388333,0.2922,0.1806,0.3188,0.572,0.680433,0.8001,0.669718,0.674764,0.604077
1576,0.188,No log,0.612,0.782,0.839,0.893,0.612,0.405667,0.301,0.1819,0.3456,0.594867,0.701033,0.809333,0.692713,0.70868,0.630174
1970,0.1884,No log,0.595,0.771,0.829,0.89,0.595,0.398333,0.2998,0.181,0.335483,0.588017,0.6963,0.80775,0.684041,0.693585,0.621476
2364,0.0955,No log,0.609,0.785,0.846,0.897,0.609,0.409667,0.3046,0.185,0.342633,0.5998,0.7093,0.822017,0.698457,0.708148,0.634634
2758,0.1135,No log,0.612,0.78,0.835,0.901,0.612,0.409667,0.3026,0.1854,0.344683,0.60355,0.706167,0.828517,0.701935,0.707945,0.637297
3152,0.0483,No log,0.623,0.792,0.851,0.9,0.623,0.412,0.31,0.1841,0.353033,0.60835,0.7222,0.82345,0.705799,0.716647,0.644872
3546,0.0803,No log,0.612,0.787,0.849,0.899,0.612,0.410667,0.308,0.1838,0.348683,0.604833,0.71595,0.823583,0.702773,0.710559,0.640792
3940,0.0322,No log,0.61,0.784,0.85,0.901,0.61,0.411667,0.3088,0.1841,0.348317,0.6036,0.719167,0.825,0.703221,0.709789,0.641178


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


In [21]:
!zip "model.zip" -r "/kaggle/working/checkpoint-3940"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: kaggle/working/checkpoint-3940/ (stored 0%)
  adding: kaggle/working/checkpoint-3940/sentence_bert_config.json (deflated 4%)
  adding: kaggle/working/checkpoint-3940/special_tokens_map.json (deflated 85%)
  adding: kaggle/working/checkpoint-3940/2_Normalize/ (stored 0%)
  adding: kaggle/working/checkpoint-3940/training_args.bin (deflated 52%)
  adding: kaggle/working/checkpoint-3940/config.json (deflated 49%)
  adding: kaggle/working/checkpoint-3940/rng_state.pth (deflated 25%)
  adding: kaggle/working/checkpoint-3940/scheduler.pt (deflated 57%)
  adding: kaggle/working/checkpoint-3940/tokenizer.json (deflated 76%)
  adding: kaggle/working/checkpoint-3940/model.safetensors (deflated 27%)
  adding: kaggle/working/checkpoint-3940/config_sentence_transformers.json (deflated 36%)
  adding: kaggle/working/checkpoint-3940/tokenizer_config.json (deflated 75%)
  adding: kaggle/working/checkpoint-3940/modules.json (deflated 62%)
  adding: kaggle/working/checkpoint-3940/README.md (defl

In [22]:
# Reload the trained model from output_dir (the directory given to the training
# arguments, where trainer.save_model() is expected to have written it).
final_model = SentenceTransformer(output_dir)

In [23]:
# Score the reloaded model with the same evaluator used for the baseline, so
# the two metric listings are directly comparable.
final_results = evaluator(final_model)
for metric_name, metric_value in final_results.items():
    print(metric_name, metric_value)

dim_128_cosine_accuracy@1 0.623
dim_128_cosine_accuracy@3 0.792
dim_128_cosine_accuracy@5 0.851
dim_128_cosine_accuracy@10 0.9
dim_128_cosine_precision@1 0.623
dim_128_cosine_precision@3 0.412
dim_128_cosine_precision@5 0.31
dim_128_cosine_precision@10 0.1841
dim_128_cosine_recall@1 0.3530333333333333
dim_128_cosine_recall@3 0.6083500000000001
dim_128_cosine_recall@5 0.7222
dim_128_cosine_recall@10 0.82345
dim_128_cosine_ndcg@10 0.705799324596211
dim_128_cosine_mrr@10 0.7166472222222228
dim_128_cosine_map@100 0.644872445702585
