In [1]:
!git clone https://github.com/phongtintruong/ViNLU.git

Cloning into 'ViNLU'...
remote: Enumerating objects: 1207, done.[K
remote: Counting objects: 100% (192/192), done.[K
remote: Compressing objects: 100% (109/109), done.[K
remote: Total 1207 (delta 109), reused 154 (delta 79), pack-reused 1015 (from 1)[K
Receiving objects: 100% (1207/1207), 4.16 MiB | 10.21 MiB/s, done.
Resolving deltas: 100% (765/765), done.


In [None]:
def read_file(file_path):
    """Reads a file and returns a list of lines"""
    with open(file_path, 'r') as f:
        lines = f.read().splitlines()
    return lines



In [None]:
def create_triplets_for_anchors(sentences, labels, anchor_ids, num_positive=8, num_negative=8):
    """Creates corpus and train_triplets for given anchor_ids.

    - anchor_ids are the indices (starting from 0) of the sentences to use as anchors.
    - Each triplet consists of (anchor, positive sample, negative sample).
    - Each anchor should have 'num_positive' positive samples and 'num_negative' negative samples.
    """
    from collections import defaultdict
    import random

    # Create a dictionary to hold sentences by label
    label_to_sentences = defaultdict(list)
    for idx, (sentence, label) in enumerate(zip(sentences, labels)):
        label_to_sentences[label].append(str(idx))  # Store sentence index (starting from 0)

    # Create corpus
    corpus = {str(idx): sentence for idx, sentence in enumerate(sentences)}

    # Create triplets for specific anchors
    train_triplets = []
    for anchor_id in anchor_ids:
        anchor = str(anchor_id)
        label = labels[anchor_id]

        # Collect positive samples
        positive_samples = [x for x in label_to_sentences[label] if x != anchor]
        if len(positive_samples) >= num_positive:
            positive_samples = random.sample(positive_samples, num_positive)

        # Collect negative samples
        negative_samples = []
        for other_label, other_indices in label_to_sentences.items():
            if other_label != label:
                negative_samples += other_indices
        negative_samples = random.sample(negative_samples, num_negative)

        # Create triplets (anchor, positive, negative)
        for positive, negative in zip(positive_samples, negative_samples):
            train_triplets.append([anchor, positive, negative])

    return corpus, train_triplets


In [None]:
# Reading the input files
sentences = read_file('/content/ViNLU/Hackathon/word-level/train/seq.in')  # File with sentences
labels = read_file('/content/ViNLU/Hackathon/word-level/train/label')  # File with labels

anchor_ids = [1787, 1139,  856, 1057, 1773, 1776, 1047, 1049, 1080, 1780, 1789,
       1038,  321, 1767, 1781, 1765, 1766, 1782, 1115, 1761, 1769, 1763,
       1171, 1760, 1086, 1174, 1759, 1775, 1770,   68,  174, 1121,  204,
        917, 1774, 1784,   45, 1077,  326,  325, 1099,  199, 1114, 1762,
         81,  491,  753,  235, 1053, 1145,  226,  268,  188,  203, 1786,
       1423,  590, 1778,  217, 1783,  234, 1098, 1777,  340,  253, 1434,
       1737,  784, 1764,  369,  466, 1772, 1768,  260, 1525, 1771, 1785,
        186, 1779,  299,  464,  191, 1572, 1152,  248,  231,  277, 1595,
        307,  354, 1714,   55,  357, 1721, 1088, 1657,  227,  267,  410,
       1620,  313,  270,  289,  290, 1234,  193,  258, 1788, 1666, 1448,
        209, 1634, 1364, 1645,  773,  237,  489,  271, 1147,   42,  276,
       1363,  212, 1074, 1693, 1727, 1350,  272,  298,  322,  339,  345,
        365,  216, 1417, 1715, 1066, 1632,  206, 1325,  265,  310, 1675,
        350, 1135,  317,  910, 1638,  233, 1111, 1656,  245, 1215,  372,
       1132,   50, 1305,  283, 1048, 1425,  318,  228,  567, 1122, 1266,
        408,  219, 1030,  255, 1308, 1538,  269, 1288,   33,  116, 1181,
        207, 1150, 1282,  215, 1235,  273,  316, 1754,  189, 1706,  224,
       1042, 1210, 1241,  352, 1209, 1751,   18,  229, 1094, 1162,  256,
        314, 1079, 1299, 1281,  361, 1748,  304, 1198, 1034, 1107,  291,
        275, 1065,   53, 1685, 1225,  278,  241, 1245, 1315, 1324, 1649,
        327, 1615,   58, 1321, 1463, 1224, 1297, 1681,  311, 1274, 1367,
       1580, 1118,  347, 1505, 1635, 1653,  218, 1327, 1510,  344, 1043,
       1261, 1399, 1598, 1734,  712, 1143,  320,   29, 1286, 1170,  985]  # Sample anchor IDs, you can modify this

# Generate the corpus and train triplets
corpus, train_triplets = create_triplets_for_anchors(sentences, labels, anchor_ids, 16, 16)


In [None]:
print(corpus)

{'0': 'tăng bóng 3 26 phần_trăm', '1': 'hãy tăng thêm độ sáng phòng ngủ con trai lên 91 phần_trăm', '2': 'giúp mình tăng đèn âm trần thứ 2 lên 10 phần_trăm ở phòng con_nhỏ 4 tầng 5 nhé', '3': 'bạn có_thể tăng giúp mình bóng chùm thứ 3 lên mức 21 phần_trăm được không', '4': 'tăng bình_nóng_lạnh 5 phần_trăm', '5': 'mình muốn tăng bóng vách', '6': 'tăng đèn cảnh lên 23 phần_trăm ở khách 2 6', '7': 'bạn có_thể giúp mình tăng mức_độ của đèn màu bên phòng giặt ủi 4 lên 8 phần_trăm được không', '8': 'bạn tăng đèn hắt tường thứ 1 ở phòng sách 5 lên 13 phần_trăm giúp mình nhé', '9': 'tăng đèn treo tường lên 7 phần_trăm giúp mình', '10': 'hãy tăng giúp ta cái nóng lạnh 4 ở phòng trẻ_em 5', '11': 'tăng thêm độ sáng bóng trụ cổng 3 lễ_tân 4 lên 82 phần_trăm', '12': 'tăng điện thứ 1 lên 5 phần_trăm giúp mình', '13': 'tăng hắt trần thứ 2 lên 22 phần_trăm', '14': 'tăng bóng kiểng thứ 3 lên 1 phần_trăm', '15': 'tăng thêm ánh_sáng bóng hắt tường 2 lên mức 51 phần_trăm', '16': 'bạn tăng bóng led 3 ở phò

In [None]:
print(train_triplets)

[['1787', '1770', '493'], ['1787', '1764', '921'], ['1787', '1786', '1194'], ['1787', '1784', '669'], ['1787', '1771', '485'], ['1787', '1767', '1700'], ['1787', '1774', '1478'], ['1787', '1766', '457'], ['1787', '1789', '233'], ['1787', '1759', '860'], ['1787', '1762', '603'], ['1787', '1779', '940'], ['1787', '1778', '1025'], ['1787', '1765', '1163'], ['1787', '1763', '248'], ['1787', '1761', '1061'], ['1139', '1140', '928'], ['1139', '1028', '1552'], ['1139', '1046', '1611'], ['1139', '1148', '472'], ['1139', '1184', '1769'], ['1139', '1058', '1278'], ['1139', '1185', '101'], ['1139', '1094', '613'], ['1139', '1173', '661'], ['1139', '1142', '1685'], ['1139', '1164', '378'], ['1139', '1055', '1778'], ['1139', '1146', '1399'], ['1139', '1176', '668'], ['1139', '1099', '1738'], ['1139', '1105', '1420'], ['856', '959', '1427'], ['856', '976', '1548'], ['856', '838', '750'], ['856', '994', '171'], ['856', '1023', '345'], ['856', '889', '1535'], ['856', '946', '741'], ['856', '993', '136

In [None]:
print(len(train_triplets))

4048


In [None]:
from collections import defaultdict

def create_full_dev_queries_and_rel_docs(sentences, labels):
    """Creates full dev_queries and dev_rel_docs from sentences and labels.

    - dev_queries: Uses all sentences as queries.
    - dev_rel_docs: For each query, creates a set of related document IDs that share the same label.
    """
    # Create a dictionary to hold sentences by label
    label_to_sentences = defaultdict(list)
    for idx, (sentence, label) in enumerate(zip(sentences, labels)):
        label_to_sentences[label].append(str(idx))  # Store sentence index (starting from 0)

    # Create dev_queries and dev_rel_docs
    dev_queries = {}
    dev_rel_docs = {}

    for idx, sentence in enumerate(sentences):
        query_id = str(idx)
        dev_queries[query_id] = sentence  # Each sentence is a query

        # Find all related documents with the same label
        query_label = labels[idx]
        related_docs = label_to_sentences[query_label]  # Find all docs with the same label
        dev_rel_docs[query_id] = set(related_docs)

    return dev_queries, dev_rel_docs


In [None]:
# Reading the input files
sentences = read_file('/content/ViNLU/Hackathon/word-level/dev/seq.in')  # File with sentences
labels = read_file('/content/ViNLU/Hackathon/word-level/dev/label')  # File with labels

# Generate dev_queries and dev_rel_docs
dev_queries, dev_rel_docs = create_full_dev_queries_and_rel_docs(sentences, labels)


In [None]:
print(dev_queries)

{'0': 'anh ơi thiết_bị là đầy_đủ rgb 4', '1': 'chào giảm giúp tôi đèn hoa trong phòng con', '2': 'làm_ơn hạ xuống đỡ đèn treo tường thứ 4 1 phần_trăm giúp tôi', '3': 'tôi muốn giảm độ sáng của đèn ngủ 2', '4': 'nút ấn 3 to quá giảm còn 18 phần_trăm giúp tôi', '5': 'tôi muốn giảm độ sáng của bóng hoa 2', '6': 'chào bạn kiểm_soát được điện ở vườn sau 2 trong nhà không', '7': 'tôi muốn giảm độ sáng của kịch_bản 3', '8': 'chào bạn kiểm_soát được bóng compact thứ 4 ở phòng ông_bà 4 trong nhà không', '9': 'bạn cho cái đèn vách 4 xuống 7 phần_trăm hộ tôi với', '10': 'chào bạn kiểm_soát được đèn sân 3 ở phòng ăn 3 trong nhà không', '11': 'giúp tui giảm thiết_bị đèn bàn thứ 3', '12': 'bạn cho cái cảnh xuống 2 phần_trăm hộ tôi với', '13': 'chào giảm giúp tôi đèn trong phòng nữ', '14': 'giúp tui giảm thiết_bị rèm ngang', '15': 'làm_ơn hạ thấp đỡ bóng đứng 2 7 phần_trăm giúp tôi', '16': 'giảm giúp mình màn cửa đôi 1 ở phòng thu âm xuống mức 24 phần_trăm với', '17': 'giảm giúp mình bình_nóng_lạnh t

In [None]:
print(dev_rel_docs)

{'0': {'7', '29', '31', '35', '21', '13', '39', '17', '36', '26', '37', '8', '0', '20', '28', '12', '9', '38', '41', '42', '44', '4', '32', '34', '40', '23', '14', '16', '24', '15', '30', '43', '22', '6', '19', '25', '3', '11', '2', '5', '33', '10', '1', '18', '27'}, '1': {'7', '29', '31', '35', '21', '13', '39', '17', '36', '26', '37', '8', '0', '20', '28', '12', '9', '38', '41', '42', '44', '4', '32', '34', '40', '23', '14', '16', '24', '15', '30', '43', '22', '6', '19', '25', '3', '11', '2', '5', '33', '10', '1', '18', '27'}, '2': {'7', '29', '31', '35', '21', '13', '39', '17', '36', '26', '37', '8', '0', '20', '28', '12', '9', '38', '41', '42', '44', '4', '32', '34', '40', '23', '14', '16', '24', '15', '30', '43', '22', '6', '19', '25', '3', '11', '2', '5', '33', '10', '1', '18', '27'}, '3': {'7', '29', '31', '35', '21', '13', '39', '17', '36', '26', '37', '8', '0', '20', '28', '12', '9', '38', '41', '42', '44', '4', '32', '34', '40', '23', '14', '16', '24', '15', '30', '43', '22',

In [None]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.1.0-py3-none-any.whl.metadata (23 kB)
Downloading sentence_transformers-3.1.0-py3-none-any.whl (249 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.1/249.1 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.1.0


In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Wed Sep 11 16:01:33 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K 

In [None]:
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, LoggingHandler, util, models, evaluation, losses, InputExample
import logging
from datetime import datetime
import gzip
import os
import tarfile
from collections import defaultdict
from torch.utils.data import IterableDataset
import pandas as pd
from datasets import Dataset

  from tqdm.autonotebook import tqdm, trange


In [None]:
class TripletsDataset(IterableDataset):
    def __init__(self, model, queries, corpus, train_triplets):
        self.model = model
        self.queries = queries
        self.corpus = corpus
        self.train_triplets = train_triplets

    def __iter__(self):
        count = 0

        for triplet in self.train_triplets:
            qid, pos_id, neg_id = triplet
            qid = str(qid)
            pos_id = str(pos_id)
            neg_id = str(neg_id)
            query_text = self.corpus[qid]
            pos_text = self.corpus[pos_id]
            neg_text = self.corpus[neg_id]
            yield InputExample(texts=[query_text, pos_text, neg_text])

    def __len__(self):
        return len(train_triplets)

In [None]:
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout
train_batch_size = 256 #Increasing the train batch size improves the model performance, but requires more GPU memory

# The  model we want to fine-tune


model_name = "vinai/phobert-base"
word_embedding_model = models.Transformer(model_name, max_seq_length=256)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

import json
model_save_path = '/content/drive/MyDrive/ViNLU/phobert-base'+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# dev_queries = {'1': "hello everyone",
#                '9': "fuck it",
#                '3': "goodbye",
#                '7': "oh shit"}

# corpus = {'1': "hello everyone",
#           '2': "funny guys",
#           '9': "fuck it",
#           '3': "goodbye",
#           '4': "bye guys",
#           '5': "welcome lady and gentlemen",
#           '6': "god damn",
#           '8': "thank you",
#           '7': "oh shit"}


# train_triplets = [['1', '3', '2'],
#                   ['1', '5', '4'],
#                   ['1', '8', '6'],
#                   ['5', '8', '9'],
#                   ['3', '5', '7']]

# 1 3 5 8
# 2 4 6 7 9

# dev_rel_docs = {'1': set(['1', '3', '5', '8']),
#                 '2': set(['2', '4', '6', '7', '9']),
#                 '3': set(['1', '3', '5', '8']),
#                 '4': set(['2', '4', '6', '7', '9']),
#                 '5': set(['1', '3', '5', '8']),
#                 '6': set(['2', '4', '6', '7', '9']),
#                 '7': set(['2', '4', '6', '7', '9']),
#                 '8': set(['1', '3', '5', '8']),
#                 '9': set(['2', '4', '6', '7', '9'])}

ir_evaluator = evaluation.InformationRetrievalEvaluator(dev_queries, corpus, dev_rel_docs, name='phobert-base_train_eval')

# For training the SentenceTransformer model, we need a dataset, a dataloader, and a loss used for training.
train_dataset = TripletsDataset(model=model, queries=corpus, corpus=corpus, train_triplets=train_triplets)
train_dataloader = DataLoader(train_dataset, shuffle=False, batch_size=train_batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model=model)


# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=ir_evaluator,
          epochs=50,
          warmup_steps=30,
          output_path=model_save_path,
          evaluation_steps=4,
          save_best_model=True,
          use_amp=True)





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss,Phobert-base Train Eval Cosine Accuracy@1,Phobert-base Train Eval Cosine Accuracy@3,Phobert-base Train Eval Cosine Accuracy@5,Phobert-base Train Eval Cosine Accuracy@10,Phobert-base Train Eval Cosine Precision@1,Phobert-base Train Eval Cosine Precision@3,Phobert-base Train Eval Cosine Precision@5,Phobert-base Train Eval Cosine Precision@10,Phobert-base Train Eval Cosine Recall@1,Phobert-base Train Eval Cosine Recall@3,Phobert-base Train Eval Cosine Recall@5,Phobert-base Train Eval Cosine Recall@10,Phobert-base Train Eval Cosine Ndcg@10,Phobert-base Train Eval Cosine Mrr@10,Phobert-base Train Eval Cosine Map@100,Phobert-base Train Eval Dot Accuracy@1,Phobert-base Train Eval Dot Accuracy@3,Phobert-base Train Eval Dot Accuracy@5,Phobert-base Train Eval Dot Accuracy@10,Phobert-base Train Eval Dot Precision@1,Phobert-base Train Eval Dot Precision@3,Phobert-base Train Eval Dot Precision@5,Phobert-base Train Eval Dot Precision@10,Phobert-base Train Eval Dot Recall@1,Phobert-base Train Eval Dot Recall@3,Phobert-base Train Eval Dot Recall@5,Phobert-base Train Eval Dot Recall@10,Phobert-base Train Eval Dot Ndcg@10,Phobert-base Train Eval Dot Mrr@10,Phobert-base Train Eval Dot Map@100
4,No log,No log,0.033163,0.07398,0.109694,0.191327,0.033163,0.027211,0.025,0.023214,0.000809,0.002019,0.003082,0.005597,0.024594,0.067754,0.005198,0.005102,0.043367,0.086735,0.181122,0.005102,0.015306,0.018367,0.020663,0.000128,0.001115,0.002207,0.005311,0.01833,0.042376,0.003645
8,No log,No log,0.033163,0.071429,0.109694,0.186224,0.033163,0.026361,0.025,0.022704,0.000809,0.001963,0.003082,0.00546,0.024413,0.068044,0.00521,0.005102,0.040816,0.086735,0.178571,0.005102,0.014456,0.018367,0.020408,0.000128,0.001051,0.002207,0.005254,0.018125,0.04213,0.003629
12,No log,No log,0.035714,0.071429,0.107143,0.188776,0.035714,0.02551,0.02398,0.023214,0.000865,0.001899,0.002955,0.005604,0.024706,0.068712,0.005187,0.007653,0.043367,0.076531,0.170918,0.007653,0.015306,0.015816,0.019643,0.000184,0.001115,0.001888,0.005129,0.017597,0.041988,0.003564
16,No log,No log,0.033163,0.076531,0.09949,0.183673,0.033163,0.027211,0.022449,0.022449,0.000819,0.002026,0.002778,0.005448,0.023873,0.066763,0.005081,0.005102,0.033163,0.071429,0.147959,0.005102,0.011054,0.014796,0.016071,0.00012,0.000785,0.001791,0.004247,0.014496,0.035731,0.003339
20,No log,No log,0.035714,0.066327,0.09949,0.183673,0.035714,0.02381,0.021429,0.022959,0.000908,0.001803,0.002671,0.005572,0.023941,0.066163,0.005094,0.002551,0.043367,0.076531,0.170918,0.002551,0.014456,0.015306,0.019133,7.3e-05,0.001045,0.001856,0.004701,0.016179,0.036381,0.003547
24,No log,No log,0.038265,0.061224,0.086735,0.163265,0.038265,0.022109,0.018878,0.020918,0.000972,0.001682,0.00237,0.005136,0.022637,0.064303,0.005346,0.010204,0.038265,0.086735,0.19898,0.010204,0.014456,0.019388,0.023469,0.000243,0.001041,0.002366,0.005845,0.020519,0.046355,0.004391
28,No log,No log,0.028061,0.063776,0.079082,0.160714,0.028061,0.02381,0.018878,0.020918,0.000706,0.001803,0.002349,0.005155,0.021736,0.057574,0.005795,0.012755,0.053571,0.117347,0.219388,0.012755,0.018707,0.025,0.02551,0.000307,0.001322,0.003008,0.006149,0.023171,0.055958,0.005299
32,No log,No log,0.020408,0.053571,0.079082,0.158163,0.020408,0.018707,0.018878,0.021429,0.000489,0.001375,0.002321,0.005282,0.020592,0.049252,0.006255,0.012755,0.035714,0.089286,0.22449,0.012755,0.011905,0.017857,0.02449,0.000312,0.00084,0.002131,0.005831,0.021328,0.051509,0.005777
36,No log,No log,0.012755,0.053571,0.079082,0.135204,0.012755,0.020408,0.019388,0.019643,0.000312,0.001502,0.002374,0.004797,0.019102,0.043504,0.007248,0.020408,0.038265,0.066327,0.165816,0.020408,0.013605,0.014286,0.019898,0.00051,0.000999,0.001719,0.0047,0.018812,0.047833,0.007282
40,No log,No log,0.020408,0.038265,0.068878,0.137755,0.020408,0.014456,0.017857,0.021429,0.000503,0.001077,0.002168,0.005251,0.020335,0.044791,0.008286,0.015306,0.035714,0.066327,0.158163,0.015306,0.011905,0.014796,0.020408,0.000383,0.000879,0.001793,0.004866,0.018455,0.042779,0.008769


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [None]:
model.evaluate(evaluator=ir_evaluator)

{'phobert-base_train_eval_cosine_accuracy@1': 0.02040816326530612,
 'phobert-base_train_eval_cosine_accuracy@3': 0.05357142857142857,
 'phobert-base_train_eval_cosine_accuracy@5': 0.07908163265306123,
 'phobert-base_train_eval_cosine_accuracy@10': 0.10459183673469388,
 'phobert-base_train_eval_cosine_precision@1': 0.02040816326530612,
 'phobert-base_train_eval_cosine_precision@3': 0.02040816326530612,
 'phobert-base_train_eval_cosine_precision@5': 0.021938775510204084,
 'phobert-base_train_eval_cosine_precision@10': 0.021683673469387755,
 'phobert-base_train_eval_cosine_recall@1': 0.000510204081632653,
 'phobert-base_train_eval_cosine_recall@3': 0.001530612244897959,
 'phobert-base_train_eval_cosine_recall@5': 0.0027423469387755106,
 'phobert-base_train_eval_cosine_recall@10': 0.005420918367346939,
 'phobert-base_train_eval_cosine_ndcg@10': 0.021764786109351942,
 'phobert-base_train_eval_cosine_mrr@10': 0.044686386459345646,
 'phobert-base_train_eval_cosine_map@100': 0.0132013074388901

In [None]:
sentences = [
    "làm_ơn hạ xuống đỡ đèn treo tường thứ 4 1 phần_trăm giúp tôi",
    "điện 2 to quá giảm còn 4 phần_trăm giúp tôi",
    "à biết r kiểm_tra cho tôi quạt_thông_gió ở phòng ngủ các con nhé v",
]

In [None]:
# Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)

(3, 768)


In [None]:
# Calculate the embedding similarities
similarities = model.similarity(embeddings, embeddings)
print(similarities)

tensor([[1.0000, 0.7701, 0.1864],
        [0.7701, 1.0000, 0.2681],
        [0.1864, 0.2681, 1.0000]])


In [None]:
model.eval()

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [None]:
bert = model[0].auto_model

In [None]:
bert.eval()

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(64001, 768, padding_idx=1)
    (position_embeddings): Embedding(258, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [None]:
from google.colab import userdata
hf_token = userdata.get('clphobert-base')
from huggingface_hub import login
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
bert.push_to_hub("phongtintruong/clphobert-base-16-256-50-30")


model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/phongtintruong/clphobert-base-16-256-50-30/commit/dd3abe0d14dd8344f90fac12da1434452e7769eb', commit_message='Upload model', commit_description='', oid='dd3abe0d14dd8344f90fac12da1434452e7769eb', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
tokenizer.push_to_hub("phongtintruong/clphobert-base-16-256-50-30")



README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/phongtintruong/clphobert-base-16-256-50-30/commit/cd137746cf12d538a8d704e5520b594e2335dc1e', commit_message='Upload tokenizer', commit_description='', oid='cd137746cf12d538a8d704e5520b594e2335dc1e', pr_url=None, pr_revision=None, pr_num=None)

In [2]:
!ls

sample_data  ViNLU


In [3]:
%cd ViNLU

/content/ViNLU


In [4]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [5]:
!git config user.name "phongtintruong"
!git config user.email "trung.nt204857@sis.hust.edu.vn"

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
