In [None]:
!pip install sentence_transformers datasets accelerate



# Baseline evaluation (base model)

In [None]:
import random
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from datasets import load_dataset

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Fetch the corpus, the queries and the relevance labels.
corpus = load_dataset("hiieu/legal_eval", split="corpus")
queries = load_dataset("hiieu/legal_eval", split="queries")
relevant_docs_data = load_dataset("hiieu/legal_eval_label", split="train")

# Re-key both text collections as plain dictionaries.
corpus = {cid: text for cid, text in zip(corpus["id"], corpus["text"])}  # cid => document
queries = {qid: text for qid, text in zip(queries["id"], queries["text"])}  # qid => question

# Group the label rows by query: qid => set of relevant cids.
# Both ids are stringified before being used as key / set member.
relevant_docs = {}
label_rows = zip(relevant_docs_data["question_id"], relevant_docs_data["corpus_id"])
for raw_qid, raw_cid in label_rows:
    relevant_docs.setdefault(str(raw_qid), set()).add(str(raw_cid))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import json
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import (
    InformationRetrievalEvaluator,
    SequentialEvaluator,
)
from sentence_transformers.util import cos_sim
from datasets import load_dataset, concatenate_datasets

# Load the pretrained embedding model.
model = SentenceTransformer("hiieu/halong_embedding")

# Embedding sizes to evaluate. Important: large to small.
matryoshka_dimensions = [768, 512, 256, 128, 64]

# One retrieval evaluator per size; each one truncates the embeddings to
# `dim` and scores with cosine similarity under the name "dim_<dim>".
matryoshka_evaluators = [
    InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant_docs,
        name=f"dim_{dim}",
        truncate_dim=dim,
        score_functions={"cosine": cos_sim},
    )
    for dim in matryoshka_dimensions
]

# Chain the per-dimension evaluators into a single sequential evaluator.
evaluator = SequentialEvaluator(matryoshka_evaluators)



In [None]:
# Run all per-dimension evaluators on the model, then print one
# "<metric> <value>" line per reported metric.
results = evaluator(model)
for metric_name, metric_value in results.items():
    print(metric_name, metric_value)

dim_768_cosine_accuracy@1 0.8294209702660407
dim_768_cosine_accuracy@3 0.9233176838810642
dim_768_cosine_accuracy@5 0.9436619718309859
dim_768_cosine_accuracy@10 0.9687010954616588
dim_768_cosine_precision@1 0.8294209702660407
dim_768_cosine_precision@3 0.3145539906103286
dim_768_cosine_precision@5 0.1931142410015649
dim_768_cosine_precision@10 0.09906103286384975
dim_768_cosine_recall@1 0.8145539906103286
dim_768_cosine_recall@3 0.9178403755868545
dim_768_cosine_recall@5 0.9389671361502347
dim_768_cosine_recall@10 0.9640062597809077
dim_768_cosine_ndcg@10 0.8976041381292648
dim_768_cosine_mrr@10 0.879893558884169
dim_768_cosine_map@100 0.8763179130484675
dim_512_cosine_accuracy@1 0.8137715179968701
dim_512_cosine_accuracy@3 0.9233176838810642
dim_512_cosine_accuracy@5 0.9389671361502347
dim_512_cosine_accuracy@10 0.9702660406885759
dim_512_cosine_precision@1 0.8137715179968701
dim_512_cosine_precision@3 0.3145539906103286
dim_512_cosine_precision@5 0.1921752738654147
dim_512_cosine_pr

# Training

In [None]:
import pandas as pd
from datasets import Dataset

def prepare_training_dataset(queries, corpus, relevant_docs):
    """Build a dataset of (anchor, positive) text pairs from relevance labels.

    One row is produced for every (query, relevant document) combination.

    Args:
        queries: Mapping of query id -> query text.
        corpus: Mapping of document id -> document text.
        relevant_docs: Mapping of query id -> collection of relevant document ids.

    Returns:
        A ``Dataset`` with two columns: ``"anchor"`` (query text) and
        ``"positive"`` (text of a relevant document).

    Raises:
        KeyError: If a query id is missing from ``queries`` or a document id
            is missing from ``corpus``.
    """
    anchors = []
    positives = []
    for query_id, doc_ids in relevant_docs.items():
        # The id collections are sets; iteration order of a set of strings
        # differs between interpreter sessions, so sort to keep the row
        # order reproducible across kernel restarts.
        for doc_id in sorted(doc_ids, key=str):
            anchors.append(queries[query_id])
            positives.append(corpus[doc_id])

    return Dataset.from_dict({"anchor": anchors, "positive": positives})


pairs = prepare_training_dataset(queries, corpus, relevant_docs)
pairs

Dataset({
    features: ['anchor', 'positive'],
    num_rows: 659
})

In [None]:
# Display the first row of `pairs` as a quick sanity check of the
# "anchor" / "positive" columns.
pairs[0]

{'anchor': 'Khi có sự cố, tai nạn giao thông đường sắt đô thị, doanh nghiệp kinh doanh đường sắt báo ngay tin cho ai?',
 'positive': 'Điều 22. Báo tin và xử lý tin báo về sự cố, tai nạn giao thông đường sắt1. Khi có sự cố, tai nạn xảy ra lái tàu hoặc nhân viên hỗ trợ an toàn phải báo ngay cho nhân viên điều độ chạy tàu.\n2. Nhân viên điều độ chạy tàu phải báo ngay cho những tổ chức, cá nhân sau đây:\na) Các ga hai đầu khu gian;\nb) Doanh nghiệp kinh doanh đường sắt đô thị.\n3. Doanh nghiệp kinh doanh đường sắt đô thị phải báo ngay cho những tổ chức, cá nhân sau đây:\na) Cơ quan công an nơi gần nhất;\nb) Ủy ban nhân dân cấp tỉnh (trong trường hợp xảy ra tai nạn chết người và trong các trường hợp cần sự phối hợp của Ủy ban nhân dân các cấp) và các đơn vị có liên quan.\n4. Doanh nghiệp kinh doanh đường sắt đô thị quy định việc báo tin sự cố giao thông đường sắt gây ra không phải dừng tàu, không bế tắc chính tuyến, không ảnh hưởng đến biểu đồ chạy tàu.\n5. Trường hợp quá thời gian chạy tàu

In [None]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

# Embedding sizes the loss is computed at. Important: large to small.
matryoshka_dimensions = [768, 512, 256, 128, 64]

# Base ranking loss, wrapped by MatryoshkaLoss so it is applied for each of
# the listed dimensions.
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model=model,
    loss=inner_train_loss,
    matryoshka_dims=matryoshka_dimensions,
)

In [None]:
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers

# Define training arguments.
#
# The training set is small (a few hundred pairs), so one epoch is only a
# couple of dozen optimizer steps. Evaluation and checkpointing therefore use
# the same short interval: `load_best_model_at_end` can only restore a
# checkpoint that was actually written, and `save_steps` must be a round
# multiple of `eval_steps` when both strategies are "steps".
args = SentenceTransformerTrainingArguments(
    output_dir="sample",                        # local directory for checkpoints and the final model
    num_train_epochs=1,                         # number of epochs
    per_device_train_batch_size=8,              # train batch size per device
    gradient_accumulation_steps=4,              # accumulate 4 batches: 8 * 4 = 32 samples per optimizer step per device
    per_device_eval_batch_size=4,               # evaluation batch size
    #gradient_checkpointing=True,
    warmup_ratio=0.1,                           # warmup ratio
    learning_rate=2e-5,                         # learning rate, 2e-5 is a good value
    lr_scheduler_type="cosine",                 # cosine learning rate schedule
    optim="adamw_torch_fused",                  # use fused adamw optimizer
    #tf32=True,                                  # use tf32 precision
    bf16=True,                                  # use bf16 precision
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    eval_strategy="steps",                      # evaluate every `eval_steps` steps
    eval_steps=10,                              # explicit; previously defaulted to logging_steps
    #save_strategy="epoch",                      # save after each epoch
    save_steps=10,                              # checkpoint at every evaluation so the best model can be restored
    logging_steps=10,                           # log every 10 steps
    save_total_limit=3,                         # limit the number of checkpoints kept on disk
    load_best_model_at_end=True,                # load the best checkpoint when training ends
    metric_for_best_model="eval_dim_768_cosine_ndcg@10",  # pick the best checkpoint by ndcg@10 at the full 768 dimension
)

In [None]:
from sentence_transformers import SentenceTransformerTrainer
# Assemble the trainer from the model, its training arguments, the
# Matryoshka loss, the anchor/positive pairs and the retrieval evaluator.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    loss=train_loss,
    train_dataset=pairs,
    evaluator=evaluator,
)

In [None]:
# Start training. No Hub upload is configured in the visible training
# arguments (the push-to-hub cell at the end of the notebook is commented
# out), so nothing is pushed to the Hub here.
trainer.train()

# Save the model currently held by the trainer — presumably into
# args.output_dir, which is where the re-evaluation cell loads it from.
# NOTE(review): whether this is the *best* checkpoint depends on
# load_best_model_at_end actually finding a saved checkpoint — confirm.
trainer.save_model()

Step,Training Loss,Validation Loss,Dim 768 Cosine Accuracy@1,Dim 768 Cosine Accuracy@3,Dim 768 Cosine Accuracy@5,Dim 768 Cosine Accuracy@10,Dim 768 Cosine Precision@1,Dim 768 Cosine Precision@3,Dim 768 Cosine Precision@5,Dim 768 Cosine Precision@10,Dim 768 Cosine Recall@1,Dim 768 Cosine Recall@3,Dim 768 Cosine Recall@5,Dim 768 Cosine Recall@10,Dim 768 Cosine Ndcg@10,Dim 768 Cosine Mrr@10,Dim 768 Cosine Map@100,Dim 512 Cosine Accuracy@1,Dim 512 Cosine Accuracy@3,Dim 512 Cosine Accuracy@5,Dim 512 Cosine Accuracy@10,Dim 512 Cosine Precision@1,Dim 512 Cosine Precision@3,Dim 512 Cosine Precision@5,Dim 512 Cosine Precision@10,Dim 512 Cosine Recall@1,Dim 512 Cosine Recall@3,Dim 512 Cosine Recall@5,Dim 512 Cosine Recall@10,Dim 512 Cosine Ndcg@10,Dim 512 Cosine Mrr@10,Dim 512 Cosine Map@100,Dim 256 Cosine Accuracy@1,Dim 256 Cosine Accuracy@3,Dim 256 Cosine Accuracy@5,Dim 256 Cosine Accuracy@10,Dim 256 Cosine Precision@1,Dim 256 Cosine Precision@3,Dim 256 Cosine Precision@5,Dim 256 Cosine Precision@10,Dim 256 Cosine Recall@1,Dim 256 Cosine Recall@3,Dim 256 Cosine Recall@5,Dim 256 Cosine Recall@10,Dim 256 Cosine Ndcg@10,Dim 256 Cosine Mrr@10,Dim 256 Cosine Map@100,Dim 128 Cosine Accuracy@1,Dim 128 Cosine Accuracy@3,Dim 128 Cosine Accuracy@5,Dim 128 Cosine Accuracy@10,Dim 128 Cosine Precision@1,Dim 128 Cosine Precision@3,Dim 128 Cosine Precision@5,Dim 128 Cosine Precision@10,Dim 128 Cosine Recall@1,Dim 128 Cosine Recall@3,Dim 128 Cosine Recall@5,Dim 128 Cosine Recall@10,Dim 128 Cosine Ndcg@10,Dim 128 Cosine Mrr@10,Dim 128 Cosine Map@100,Dim 64 Cosine Accuracy@1,Dim 64 Cosine Accuracy@3,Dim 64 Cosine Accuracy@5,Dim 64 Cosine Accuracy@10,Dim 64 Cosine Precision@1,Dim 64 Cosine Precision@3,Dim 64 Cosine Precision@5,Dim 64 Cosine Precision@10,Dim 64 Cosine Recall@1,Dim 64 Cosine Recall@3,Dim 64 Cosine Recall@5,Dim 64 Cosine Recall@10,Dim 64 Cosine Ndcg@10,Dim 64 Cosine Mrr@10,Dim 64 Cosine Map@100,Sequential Score
10,0.4373,No log,0.841941,0.945227,0.965571,0.98748,0.841941,0.3229,0.198122,0.101252,0.827856,0.941315,0.962441,0.984351,0.917433,0.898084,0.895053,0.840376,0.954617,0.970266,0.99061,0.840376,0.32603,0.199061,0.101565,0.824726,0.950704,0.967136,0.98748,0.917933,0.897966,0.89426,0.826291,0.948357,0.964006,0.981221,0.826291,0.323944,0.197496,0.100626,0.811424,0.944444,0.960094,0.978091,0.907975,0.887611,0.884443,0.827856,0.937402,0.954617,0.976526,0.827856,0.320292,0.195931,0.100313,0.813772,0.93349,0.951487,0.974178,0.906272,0.886278,0.883776,0.769953,0.915493,0.945227,0.967136,0.769953,0.311424,0.194053,0.099374,0.756651,0.909233,0.942097,0.964789,0.872329,0.844582,0.841999,0.841999
20,0.5033,No log,0.846635,0.957746,0.971831,0.99374,0.846635,0.327074,0.199374,0.101878,0.831768,0.953834,0.968701,0.99061,0.923449,0.904265,0.900389,0.84507,0.959311,0.973396,0.99061,0.84507,0.327595,0.199687,0.101721,0.829421,0.955399,0.970266,0.988263,0.921542,0.9024,0.89862,0.830986,0.951487,0.968701,0.982786,0.830986,0.324987,0.198748,0.100939,0.816119,0.947574,0.965571,0.980438,0.913303,0.893511,0.890536,0.834116,0.949922,0.962441,0.978091,0.834116,0.324465,0.197496,0.100469,0.819249,0.946009,0.959311,0.975743,0.911605,0.892655,0.890146,0.777778,0.924883,0.954617,0.973396,0.777778,0.315597,0.196244,0.100156,0.762911,0.920188,0.952269,0.971831,0.88084,0.853367,0.850407,0.850407


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

# Re-evaluate

In [None]:
from sentence_transformers import SentenceTransformer
import torch
# Reload the saved model from the training output directory, on the GPU when
# one is available and on the CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
fine_tuned_model = SentenceTransformer(args.output_dir, device=device)

# Evaluate the reloaded model with the same evaluator and print each metric.
results = evaluator(fine_tuned_model)
for metric_name, metric_value in results.items():
    print(metric_name, metric_value)

dim_768_cosine_accuracy@1 0.8497652582159625
dim_768_cosine_accuracy@3 0.9577464788732394
dim_768_cosine_accuracy@5 0.971830985915493
dim_768_cosine_accuracy@10 0.9937402190923318
dim_768_cosine_precision@1 0.8497652582159625
dim_768_cosine_precision@3 0.3270735524256651
dim_768_cosine_precision@5 0.19937402190923315
dim_768_cosine_precision@10 0.10187793427230046
dim_768_cosine_recall@1 0.8348982785602503
dim_768_cosine_recall@3 0.9538341158059468
dim_768_cosine_recall@5 0.9687010954616588
dim_768_cosine_recall@10 0.9906103286384976
dim_768_cosine_ndcg@10 0.9249410200836153
dim_768_cosine_mrr@10 0.90623431949723
dim_768_cosine_map@100 0.9023685432985649
dim_512_cosine_accuracy@1 0.8482003129890454
dim_512_cosine_accuracy@3 0.9577464788732394
dim_512_cosine_accuracy@5 0.97339593114241
dim_512_cosine_accuracy@10 0.9906103286384976
dim_512_cosine_precision@1 0.8482003129890454
dim_512_cosine_precision@3 0.3270735524256651
dim_512_cosine_precision@5 0.19968701095461652
dim_512_cosine_prec

# Push model to hub

In [None]:
# from huggingface_hub import login

# login(token="", add_to_git_credential=True)  # ADD YOUR TOKEN HERE

In [None]:
# push model to hub
#trainer.model.push_to_hub("miai-sample-embedding")