In [7]:
import tensorflow as tf
from datasets import load_dataset
import torch
import re
from torch.utils.data import Dataset, DataLoader
from torchmetrics import F1Score, Precision, Accuracy
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from transformers import XLMRobertaModel, XLMRobertaTokenizer
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from peft import get_peft_model, LoraConfig, TaskType
import pytorch_lightning as pl
import tqdm
import nltk
import pandas as pd
from nltk.corpus import stopwords
from torch.nn import functional as F
from pytorch_lightning.loggers import TensorBoardLogger
from torch.nn.utils.rnn import pad_sequence
from huggingface_hub.hf_api import HfFolder
from pytorch_lightning.callbacks import ModelCheckpoint
from random import shuffle
import numpy as np
HfFolder.save_token('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')

Mounted at /content/drive


In [9]:
class DualEncoderModel(pl.LightningModule):
    def __init__(self, model_name='bert-base-multilingual-cased', learning_rate=1e-5, lora_r=1024, lora_alpha=1024):
        super().__init__()
        self.learning_rate = learning_rate
        self.threshold = 0.35

        # Инициализация энкодеров вопросов и ответов (BERT)
        self.question_encoder = BertModel.from_pretrained(model_name)
        self.answer_encoder = BertModel.from_pretrained(model_name)

        # Настройка LoRA для обоих энкодеров
        lora_config = LoraConfig(
            task_type=TaskType.FEATURE_EXTRACTION,  # Задача на извлечение признаков
            r=lora_r,
            lora_alpha=lora_alpha,
            bias="all"
        )
        self.question_encoder = get_peft_model(self.question_encoder, lora_config)
        self.answer_encoder = get_peft_model(self.answer_encoder, lora_config)

        # Метрики
        self.f1_metric = F1Score(num_classes=2, task='binary')
        self.precision_metric = Precision(num_classes=2, task='binary')
        self.accuracy_metric = Accuracy(num_classes=2, task='binary')

    def forward(self, question_inputs, answer_inputs):
        # Кодирование вопросов и ответов с использованием энкодеров
        question_embeddings = self.question_encoder(
            input_ids=question_inputs['input_ids'],
            attention_mask=question_inputs['attention_mask'],
            return_dict=True
        ).pooler_output  # Используем pooler_output для представления

        answer_embeddings = self.answer_encoder(
            input_ids=answer_inputs['input_ids'],
            attention_mask=answer_inputs['attention_mask'],
            return_dict=True
        ).pooler_output

        # Нормализуем эмбеддинги для вычисления косинусного сходства
        question_embeddings = F.normalize(question_embeddings, p=2, dim=1)
        answer_embeddings = F.normalize(answer_embeddings, p=2, dim=1)

        # Вычисляем косинусное сходство между вопросом и ответом

        return question_embeddings, answer_embeddings

    def training_step(self, batch, batch_idx):
        # Входные данные для вопросов и ответов
        question_inputs = {
            'input_ids': batch['question_input_ids'],
            'attention_mask': batch['question_attention_mask']
        }
        answer_inputs = {
            'input_ids': batch['answer_input_ids'],
            'attention_mask': batch['answer_attention_mask']
        }
        labels = batch['labels']

        question_embeddings, answer_embeddings = self(question_inputs, answer_inputs)
        # Предсказания модели (косинусное сходство)
        similarity_scores = cosine_similarity = torch.matmul(question_embeddings, answer_embeddings.T)

        # Переводим косинусное сходство в логиты для бинарной классификации (0 или 1)
        logits = similarity_scores.diag()  # Берем диагональ, т.к. это правильные пары
        pos_weight = torch.tensor([1.8]).to(logits.device)  # Штрафуем сильнее за неправильные ответы
        loss = F.binary_cross_entropy_with_logits(logits, labels.float(), pos_weight=pos_weight)


        # Логируем метрики
        preds = torch.where(torch.sigmoid(logits) >= self.threshold, 1, 0)

        accuracy = self.accuracy_metric(preds, labels)
        self.log('train_accuracy', accuracy, prog_bar=True, logger=True)

        f1 = self.f1_metric(preds, labels)
        self.log('train_f1', f1, prog_bar=True, logger=True)

        precision = self.accuracy_metric(preds, labels)
        self.log('train_precision', precision, prog_bar=True, logger=True)

        # Логируем лосс
        self.log('train_loss', loss, prog_bar=True, logger=True)

        return loss

    def validation_step(self, batch, batch_idx):
        # Входные данные для вопросов и ответов
        question_inputs = {
            'input_ids': batch['question_input_ids'],
            'attention_mask': batch['question_attention_mask']
        }
        answer_inputs = {
            'input_ids': batch['answer_input_ids'],
            'attention_mask': batch['answer_attention_mask']
        }
        labels = batch['labels']

        question_embeddings, answer_embeddings = self(question_inputs, answer_inputs)
        # Предсказания модели (косинусное сходство)
        similarity_scores = cosine_similarity = torch.matmul(question_embeddings, answer_embeddings.T)

        logits = similarity_scores.diag()
        pos_weight = torch.tensor([1.8]).to(logits.device)  # Штрафуем сильнее за неправильные ответы
        loss = F.binary_cross_entropy_with_logits(logits, labels.float(), pos_weight=pos_weight)

        # Логирование лосса
        self.log('val_loss', loss, prog_bar=True, logger=True)

        # Логирование метрик
        preds = torch.where(torch.sigmoid(logits) >= self.threshold, 1, 0)
        f1 = self.f1_metric(preds, labels)
        precision = self.precision_metric(preds, labels)

        self.log('val_f1', f1, prog_bar=True, logger=True)
        self.log('val_precision', precision, prog_bar=True, logger=True)

        return loss
    def configure_optimizers(self):
        # Оптимизатор AdamW для обучения модели
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        return optimizer

In [18]:
checkpoint_path = "CHK_PATH"

# Загрузка модели из чекпоинта
model = DualEncoderModel.load_from_checkpoint(checkpoint_path)
model.eval()

DualEncoderModel(
  (question_encoder): PeftModelForFeatureExtraction(
    (base_model): LoraModel(
      (model): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(119547, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Identity()
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=76

In [None]:
# from transformers import AutoTokenizer, AutoModel

# # Загрузка токенизатора и модели
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# model = AutoModel.from_pretrained("bert-base-uncased")

In [11]:
#Загрузка датасетов

df_train = pd.read_csv('train_path')
df_test = pd.read_csv('test_apth')

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



In [22]:

#Получение эмбедингов
# def get_embs(questions, answers):
#   question_token = [tokenizer(question, padding='max_length',
#             truncation=True,
#             max_length=128,
#             return_tensors="pt"
#             ) for question in questions]
#   answer_token = [tokenizer(answer, padding='max_length',
#             truncation=True,
#             max_length=128,
#             return_tensors="pt"
#             ) for answer in answers]
#   with torch.no_grad():
#     questions = np.array([np.array(model(**token).last_hidden_state) for token in question_token])[:, 0, 0, :]
#     answers = np.array([np.array(model(**token).last_hidden_state) for token in answer_token])[:, 0, 0, :]
#   for i in range(len(questions)):
#     questions[i]/=np.linalg.norm(questions[i])
#     answers[i]/=np.linalg.norm(answers[i])
#   return (questions, answers)
def get_embs(questions, answers):
  question_token = [tokenizer(question, padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors="pt"
            ) for question in questions]
  answer_token = [tokenizer(answer, padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors="pt"
            ) for answer in answers]
  with torch.no_grad():
    data = [model(question_token[i], answer_token[i]) for i in range(len(question_token))]
  questions = np.array(data)[:, 0]
  answers = np.array(data)[:, 1]
  questions = questions.reshape(-1, len(question_token), 768)[0, :, :]
  answers = answers.reshape(-1, len(question_token), 768)[0, :, :]
  for i in range(len(questions)):
    questions[i]/=np.linalg.norm(questions[i])
    answers[i]/=np.linalg.norm(answers[i])
  return (questions, answers)

In [16]:
def preprocess_text(text, language):
    """
    Удаляет стоп-слова, специальные символы и знаки препинания из текста.

    Args:
        text: Текст для обработки.
        language: Язык текста (например, 'russian', 'english').

    Returns:
        Обработанный текст.
    """
    text = re.sub(r'[^\w\s]', '', text) # Удаляем специальные символы и знаки препинания
    text = text.lower() # Переводим текст в нижний регистр
    #stop_words = set(stopwords.words(language))
    #words = text.split()
    #words = [word for word in words if word not in stop_words]
    #text = " ".join(words)
    return text


In [1]:
num = 1
train_questions, train_answers = get_embs(df_train['question'][0:num], df_train['answer'][0:num])
train_labels = df_train['label'][0:num]

NameError: name 'get_embs' is not defined

In [17]:
unique_questions = np.unique(train_questions,axis = 0)

In [128]:
np.array([[1,1],[1,1]])-np.array([1,1])

array([[0, 0],
       [0, 0]])

In [130]:
goods = []
bads = []
for question in unique_questions:
  mask = (train_questions == question).min(axis = 1)
  cur_answers = train_answers[mask]
  cur_labels = train_labels[mask]
  label_mask = cur_labels == 1
  good_answer = (cur_answers[label_mask] - question)
  bad_answer = (cur_answers[~label_mask] - question)
  for i in range(len(good_answer)):
    good_answer[i]/=np.linalg.norm(good_answer[i])
  for i in range(len(bad_answer)):
    bad_answer[i]/=np.linalg.norm(bad_answer[i])
  good_answer = good_answer.mean(axis = 0)
  bad_answer = bad_answer.mean(axis = 0)

  goods.append(good_answer)
  bads.append(bad_answer)
goods = np.array(goods)
bads = np.array(bads)
unique_questions = unique_questions[~np.isnan(goods).min(axis = 1)]
goods = goods[~np.isnan(goods).min(axis = 1)]
bads = bads[~np.isnan(goods).min(axis = 1)]
unique_questions = unique_questions[~np.isnan(bads).min(axis = 1)]
goods = goods[~np.isnan(bads).min(axis = 1)]
bads = bads[~np.isnan(bads).min(axis = 1)]
goods -= unique_questions
bads -= unique_questions

In [105]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
def cosine_similarity(y_true, y_pred):
    num_samples = len(y_true)
    dot_product = np.sum(np.multiply(y_true, y_pred))
    norm_a = np.linalg.norm(y_true)
    norm_b = np.linalg.norm(y_pred)
    return dot_product / (norm_a * norm_b + 1e-7)

def custom_scorer(estimator, X, y):
    y_pred = estimator.predict(X)
    score = cosine_similarity(y, y_pred)
    # Возвращаем отрицательное значение, потому что GridSearch ищет минимум
    return -score
class model1():
  def __init__(self, estimator, parameters, trashhold = 0.5, bads_and_goods = False):
    self.estimator = estimator
    self.parameters = parameters
    self.trashhold = trashhold
    self.bads_and_goods = bads_and_goods
    self.model = None

  def fit(self, query, goods, bads = None):
    if self.bads_and_goods:
      target = np.concatenate((goods,bads), axis = 1)
    else:
      target = goods
    clf = GridSearchCV(self.estimator, self.parameters, scoring=custom_scorer)
    clf.fit(query, target)
    self.model = clf

  def cosine(self, x, y):

    return x @ y.T/ (np.linalg.norm(x) * np.linalg.norm(y))

  def predict(self, query, answer):
    target = self.model.predict(query)
    if self.bads_and_goods:
      good_target = target[:,0:int(target.shape[1]/2)]
      bad_target = target[:,int(target.shape[1]/2)-1:-1]

      return np.array([(self.cosine(good_target[i], answer[i] - query[i]) + 1) * self.trashhold >   (self.cosine(bad_target[i], answer[i] - query[i]) + 1) for i in range(query.shape[0])])

    return np.array([self.cosine(target[i], answer[i]- query[i]) > self.trashhold for i in range(query.shape[0])])

  def score(self, questions, answers, target):
    prediction = self.predict(questions, answers)
    return (prediction == target).mean()






In [67]:
#Использую PCA
from sklearn.decomposition import PCA #

pca_question = PCA(n_components = 5) # Оставим только 5 признаков
pca_question.fit(unique_questions)
pca_answer = PCA(n_components = 5) # Оставим только 5 признаков
pca_answer.fit(goods)

train_questions_reduced = pca_question.transform(unique_questions) # Преобразование координат
train_answer_reduced = pca_answer.transform(goods) # Преобразование координат
train_bad_answer = pca_answer.transform(bads)

In [78]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNetCV

In [36]:
#Готовлю тестовые данные
test_num = 100
test_questions, test_answers = get_embs(df_test['question'][0:test_num], df_test['answer'][0:test_num])
test_labels = df_train['label'][0:test_num]

In [132]:

#Учу модель
param_grid = {
    'estimator__n_estimators': [50, 100, 150],  # Количество деревьев в лесу
    'estimator__max_depth': [None, 5, 10],      # Максимальная глубина дерева
    'estimator__min_samples_split': [2, 5, 10]  # Минимальное количество образцов для разделения узла
}
estimator = MultiOutputRegressor(RandomForestRegressor())
parameters = {'n_jobs': [-1]}
clf = model1(estimator, param_grid, trashhold=0, bads_and_goods=False)
clf.fit(train_questions_reduced, train_answer_reduced, train_bad_answer)
for trashhold in np.linspace(0.2, 1, 10)[::-1]:
  clf.trashhold = trashhold
  preds = clf.predict(pca_question.transform(test_questions), pca_answer.transform(test_answers))
  print(f'Precision: {precision_score(test_labels, preds)},Recall: {recall_score(test_labels, preds)}, Accuracy: {accuracy_score(test_labels, preds)}, Trashhold: {trashhold}')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision: 0.0,Recall: 0.0, Accuracy: 0.81, Trashhold: 1.0
Precision: 0.375,Recall: 0.3157894736842105, Accuracy: 0.77, Trashhold: 0.9111111111111112
Precision: 0.25,Recall: 0.3157894736842105, Accuracy: 0.69, Trashhold: 0.8222222222222222
Precision: 0.2692307692307692,Recall: 0.3684210526315789, Accuracy: 0.69, Trashhold: 0.7333333333333334
Precision: 0.26666666666666666,Recall: 0.42105263157894735, Accuracy: 0.67, Trashhold: 0.6444444444444445
Precision: 0.2571428571428571,Recall: 0.47368421052631576, Accuracy: 0.64, Trashhold: 0.5555555555555556
Precision: 0.225,Recall: 0.47368421052631576, Accuracy: 0.59, Trashhold: 0.4666666666666667
Precision: 0.24444444444444444,Recall: 0.5789473684210527, Accuracy: 0.58, Trashhold: 0.37777777777777777
Precision: 0.23076923076923078,Recall: 0.631578947368421, Accuracy: 0.53, Trashhold: 0.2888888888888889
Precision: 0.2222222222222222,Recall: 0.631578947368421, Accuracy: 0.51, Trashhold: 0.2
