In [6]:
from transformers import AutoModelForPreTraining, AutoTokenizer, BertTokenizer
import torch

# Hugging Face Hub checkpoint shared by the tokenizer and the model.
MODEL_NAME = "zjkarina/LaBSE-instructDialogs"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
print(tokenizer.vocab_size)

model = AutoModelForPreTraining.from_pretrained(MODEL_NAME)
# Last expression of the cell: moves the model to the GPU and shows its repr.
model.to('cuda')

55083


BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(55083, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

# Data

In [None]:
from datasets import load_dataset

# Hub identifier of the instruction/response corpus loaded for this notebook.
DATASET_NAME = "MBZUAI/LaMini-instruction"
dataset = load_dataset(DATASET_NAME)

In [None]:
import pandas as pd

# Take the first 4000 training rows and drop the `instruction_source` column
# in a single chained expression (no in-place mutation).
df = pd.DataFrame(dataset["train"][:4000]).drop(columns=["instruction_source"])

In [7]:
from tqdm.auto import tqdm

# Embed every response in `df` once. The resulting tensor holds the retrieval
# candidates that the evaluation cells rank against each question.
all_sentence_embeddings1 = []
batch_size = 500

# Materialise the response column a single time instead of rebuilding the
# whole Python list on every loop iteration.
responses = df['response'].tolist()
num_samples = len(responses)

# Walk over the responses in chunks of `batch_size`.
for i in tqdm(range(0, num_samples, batch_size)):
    sentence_batch = responses[i:i+batch_size]

    # Tokenize the batch: pad to the longest item, truncate at 512 tokens.
    encoded_input1 = tokenizer(sentence_batch, padding=True, truncation=True, max_length=512, return_tensors='pt')

    # Forward pass without gradient tracking.
    with torch.no_grad():
        model_output1 = model.bert(**encoded_input1.to('cuda'))

    # The sentence vector is the BERT pooler output (not mean pooling),
    # L2-normalised per row.
    sentence_embeddings1 = torch.nn.functional.normalize(model_output1.pooler_output)

    # Collect the vectors of the current batch.
    all_sentence_embeddings1.append(sentence_embeddings1)

# Concatenate the per-batch results into one tensor with one row per response.
sentence_embeddings1 = torch.cat(all_sentence_embeddings1, dim=0)
assert sentence_embeddings1.shape[0] == num_samples, "expected one embedding per response"

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:25<00:00,  3.16s/it]


In [8]:
# Number of rows in the embedding tensor.
sentence_embeddings1.shape[0]

4000

In [8]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Hit rate @ k: for every instruction, check whether its own response (the same
# row of `df`) is among the k responses most similar to it.
cum_sum_1 = 0
cum_sum_3 = 0
cum_sum_5 = 0
cum_sum_10 = 0
cum_sum_15 = 0

# The candidate matrix is identical for every question, so move it off the GPU
# once instead of once per loop iteration.
candidate_embeddings = sentence_embeddings1.cpu().numpy()

for index in tqdm(df.index):
    question = df['instruction'][index]
    encoded_question = tokenizer(question, padding=True, truncation=True, max_length=24, return_tensors='pt')
    with torch.no_grad():
        model_out = model.bert(**encoded_question.to('cuda'))
    question_embedding = torch.nn.functional.normalize(model_out.pooler_output)
    cos_similarities = cosine_similarity(question_embedding.cpu().numpy(), candidate_embeddings)[0]

    # Rank the candidates in a standalone Series (labelled like `df`) instead of
    # writing a scratch "rank" column into the shared DataFrame on every pass.
    rank_s = pd.Series(cos_similarities, index=df.index).sort_values(ascending=False)
    ranked_labels = rank_s.index

    # A hit at k means the question's own row label is within the first k labels.
    if index in ranked_labels[:1]:
        cum_sum_1 += 1
    if index in ranked_labels[:3]:
        cum_sum_3 += 1
    if index in ranked_labels[:5]:
        cum_sum_5 += 1
    if index in ranked_labels[:10]:
        cum_sum_10 += 1
    if index in ranked_labels[:15]:
        cum_sum_15 += 1

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4000/4000 [01:14<00:00, 53.45it/s]


In [10]:
import pandas as pd

# Mean reciprocal rank: for every instruction, find the 1-based position of its
# own response in the similarity ranking and accumulate 1 / position.
RR = 0
failed_indices = []

# The candidate matrix is identical for every question, so move it off the GPU
# once instead of once per loop iteration.
candidate_embeddings = sentence_embeddings1.cpu().numpy()

for index in tqdm(df.index):
    try:
        question = df['instruction'][index]
        encoded_question = tokenizer(question, padding=True, truncation=True, max_length=24, return_tensors='pt')
        with torch.no_grad():
            model_out = model.bert(**encoded_question.to('cuda'))
        question_embedding = torch.nn.functional.normalize(model_out.pooler_output)
        cos_similarities = cosine_similarity(question_embedding.cpu().numpy(), candidate_embeddings)[0]

        # Rank in a standalone Series rather than mutating `df` with a scratch column.
        rank_s = pd.Series(cos_similarities, index=df.index).sort_values(ascending=False)
        # get_loc returns an integer position when the labels are unique
        # (assumes `df` keeps a unique index -- TODO confirm if it is re-indexed).
        RR += 1/(rank_s.index.get_loc(index)+1)
    except Exception as exc:
        # Best effort: keep going, but say what failed. Catching `Exception`
        # (not a bare `except`) lets Ctrl-C / kernel interrupt stop the loop.
        print(index, repr(exc))
        failed_indices.append(index)

if failed_indices:
    # Failed questions add nothing to RR but are still counted in the denominator.
    print(f"{len(failed_indices)} question(s) failed and contribute 0 to the score: {failed_indices}")
print(f"Mean reciprocal rank: {round((1/len(df))*RR,2)}")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4000/4000 [01:22<00:00, 48.22it/s]

Mean reciprocal rank: 0.62





In [9]:
# Report each hit counter as an integer percentage of all rows in `df`
# (int() truncates, it does not round).
total_questions = len(df)
for k, hits in ((1, cum_sum_1), (3, cum_sum_3), (5, cum_sum_5), (10, cum_sum_10), (15, cum_sum_15)):
    print(f"Результат работы p@{k}: {int(100*hits/total_questions)} %")

Результат работы p@1: 52 %
Результат работы p@3: 67 %
Результат работы p@5: 72 %
Результат работы p@10: 79 %
Результат работы p@15: 82 %


In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# Ad-hoc check: cosine similarity between two hand-written texts.
# NOTE(review): both texts are truncated to 24 tokens below, so most of the
# multi-sentence `question` text is presumably cut off -- confirm this is intended.
responce = 'Storms are most common in the North Indian basin during the winter months, from December to March.'
encoded_responce = tokenizer(responce, padding=True, truncation=True, max_length=24, return_tensors='pt')
with torch.no_grad():
    model_out_res = model.bert(**encoded_responce.to('cuda'))
# Stored under its own name so the corpus-wide `sentence_embeddings1` tensor is
# not overwritten; otherwise re-running the evaluation cells after this one breaks.
response_embedding = torch.nn.functional.normalize(model_out_res.pooler_output)
question = '''
the North Indian basin, storms are most common from April to December, with peaks in May and November.
It's important to note that this information pertains specifically to the North Indian basin and may not apply to other regions. 
Additionally, thunderstorm seasons in the United States, Canada, and the Southern Hemisphere may vary. 
If you have any further questions or concerns, feel free to ask.
'''
encoded_question = tokenizer(question, padding=True, truncation=True, max_length=24, return_tensors='pt')
with torch.no_grad():
    model_out = model.bert(**encoded_question.to('cuda'))
question_embedding = torch.nn.functional.normalize(model_out.pooler_output)
cos_similarities = cosine_similarity(question_embedding.cpu().numpy(), response_embedding.cpu().numpy())[0]

In [15]:
# Display the cosine similarity array computed in the previous cell.
cos_similarities

array([0.80457914], dtype=float32)