In [None]:
# Mount Google Drive at /content/drive (Colab only). The dataset and model
# paths configured later in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the third-party packages this notebook relies on into the kernel's
# environment. NOTE(review): versions are unpinned, so behaviour may drift
# between runs — consider pinning.
%pip install sentence_transformers datasets accelerate

In [None]:
import random

from sentence_transformers import SentenceTransformer, SentenceTransformerTrainingArguments
from sentence_transformers.evaluation import  SequentialEvaluator, InformationRetrievalEvaluator
from sentence_transformers.util import cos_sim
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
import datasets
from datasets import load_dataset, Dataset, Features, Value
import torch
import torch.nn.functional as F

In [None]:
def _string_features(*column_names):
    """Build a `Features` schema in which every listed column is declared as a string."""
    return Features({column: Value("string") for column in column_names})


# Schemas for the three kinds of CSV input loaded further down.
corpus_features = _string_features('corpus_id', 'content')        # corpus rows
question_features = _string_features('question_id', 'question')   # question rows
qnc_features = _string_features('question_id', 'corpus_id')       # question -> corpus links

In [None]:
# Folder holding the training CSVs. The trailing slash matters: file names are
# appended to this value with plain string concatenation when loading.
dataset_dir = "/content/drive/MyDrive/Law_Assisstant/data_train/"
# Directory handed to the training arguments as `output_dir`.
output_dir = "/content/drive/MyDrive/Law_Assisstant/model"
# Identifier passed to SentenceTransformer(...) to load the base embedding model.
basemodel_path = "hiieu/halong_embedding"

In [None]:
def _read_csv(file_name, features):
    """Load one CSV from `dataset_dir` with the given schema and return its "train" split."""
    return load_dataset("csv", data_files=dataset_dir + file_name, features = features)["train"]


# Two sets of files sharing the same three-table layout (corpus, questions, links).
corpus1 = _read_csv("corpus.csv", corpus_features)
queries1 = _read_csv("questions.csv", question_features)
relevant_docs_data1 = _read_csv("qnc.csv", qnc_features)
corpus2 = _read_csv("corpus2.csv", corpus_features)
queries2 = _read_csv("questions2.csv", question_features)
relevant_docs_data2 = _read_csv("qnc2.csv", qnc_features)

# Merge each pair of tables and shuffle the result with a fixed seed.
corpus = datasets.concatenate_datasets([corpus1, corpus2]).shuffle(seed = 7)
queries = datasets.concatenate_datasets([queries1, queries2]).shuffle(seed = 7)
relevant_docs_data = datasets.concatenate_datasets(
    [relevant_docs_data1, relevant_docs_data2]
).shuffle(seed = 7)

# From here on `corpus` and `queries` are plain dict lookups, not Dataset objects.
corpus = dict(zip(corpus["corpus_id"], corpus["content"]))  # cid => document
queries = dict(zip(queries["question_id"], queries["question"]))  # qid => question

# qid => set of relevant cids, filled in the (shuffled) order of the link table.
relevant_docs = {}
for raw_qid, raw_cid in zip(relevant_docs_data["question_id"], relevant_docs_data["corpus_id"]):
    relevant_docs.setdefault(str(raw_qid), set()).add(str(raw_cid))

In [None]:
# Sanity check: number of corpus entries, number of queries, number of link rows.
for collection in (corpus, queries, relevant_docs_data):
    print(len(collection))

In [None]:
# Load the base model named by `basemodel_path`; later cells evaluate and train this object.
model = SentenceTransformer(basemodel_path)


In [None]:
# Embedding sizes handed to the Matryoshka loss later on, largest first.
matryoshka_dimensions = [768, 512, 256, 128]
matryoshka_evaluators = []

n = len(relevant_docs_data)

# Evaluation split: the first 1000 entries of `relevant_docs`, in insertion order.
eval_set = dict(list(relevant_docs.items())[:1000])

# Retrieval evaluator restricted to the evaluation split's relevance judgements,
# scoring with cosine similarity on embeddings truncated to 128 dimensions.
evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=eval_set,
    name="dim_128",
    truncate_dim=128,
    score_functions={"cosine": cos_sim},
)


In [None]:
# Score the model as loaded (this cell sits before the training cells) on the evaluation split.
print(len(eval_set))

results = evaluator(model)
for metric_name, metric_value in results.items():
    print(metric_name, metric_value)

In [None]:
n = len(relevant_docs_data)

# Training split: every entry of `relevant_docs` after the first 1000
# (the first 1000 are the ones used for the evaluation split).
train_set = dict(list(relevant_docs.items())[1000:])

In [None]:


def get_training_dataset(queries, corpus, relevant_docs):
    """Build an (anchor, positive) pair dataset from relevance judgements.

    Args:
        queries: Mapping of query id -> query text.
        corpus: Mapping of document id -> document text.
        relevant_docs: Mapping of query id -> iterable of relevant document ids.

    Returns:
        The result of `Dataset.from_dict` on two equally long columns:
        "anchors" (query texts) and "positives" (document texts), with one
        row per (query, relevant document) pair.

    Raises:
        KeyError: If a query id is missing from `queries` or a document id is
            missing from `corpus`.
    """
    dataset_dict = {"anchors": [], "positives": []}

    # Iterate the (query_id, docs) pairs directly. Wrapping .items() in
    # enumerate() yields (index, (key, value)) 2-tuples, which cannot be
    # unpacked into three names and raised ValueError on the first row.
    for query_id, docs in relevant_docs.items():
        for doc_id in docs:
            dataset_dict["anchors"].append(queries[query_id])
            dataset_dict["positives"].append(corpus[doc_id])

    return Dataset.from_dict(dataset_dict)



In [None]:
# Build the training pairs from the training split only. Passing the full
# `relevant_docs` here would also include the queries held out in `eval_set`,
# so the evaluator would be scoring queries the model was trained on.
training_dataset = get_training_dataset(queries, corpus, train_set)

training_dataset[0]

In [None]:
# Ranking loss for (anchor, positive) pairs, wrapped by MatryoshkaLoss
# configured with the sizes listed in `matryoshka_dimensions`.
inner_loss = MultipleNegativesRankingLoss(model=model)
loss = MatryoshkaLoss(
    model=model,
    loss=inner_loss,
    matryoshka_dims=matryoshka_dimensions,
)

In [None]:

# Training configuration for the fine-tuning run.
args = SentenceTransformerTrainingArguments(
    # Output location and checkpoint retention.
    output_dir=output_dir,
    save_steps=860,
    save_total_limit=5,

    # Optimisation schedule and numeric precision.
    num_train_epochs=5,
    learning_rate=3e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    optim="adamw_torch_fused",
    bf16=True,

    # Batching: 8 samples per device per step, accumulated over 4 steps.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # author's note: no duplicates suits Multiple Negatives Ranking Loss

    # Step-based evaluation; the best model is picked by the "dim_128" evaluator's cosine accuracy@3.
    eval_strategy="steps",
    metric_for_best_model="eval_dim_128_cosine_accuracy@3",
    load_best_model_at_end=True,

    # Logging cadence, in optimisation steps.
    logging_steps=215,
)

In [None]:
from sentence_transformers import SentenceTransformerTrainer

# Bundle the model, its loss, the training pairs and the retrieval evaluator
# under the training arguments defined above.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    loss=loss,
    train_dataset=training_dataset,
    evaluator=evaluator,
)

In [None]:
# Start training. Checkpoints go to the `output_dir` given in the training
# arguments; no push-to-hub option is set there, so nothing in this notebook
# requests an upload to the Hub.
trainer.train()

# Save the model currently held by the trainer. With load_best_model_at_end=True
# in the training arguments this is expected to be the best checkpoint —
# NOTE(review): confirm against the Trainer documentation.
trainer.save_model()