# Скачаем данные и предвычислим эмбеддинги для BI-енкодера

Датасет с полным набором реплик из сериала доктор Хаус https://www.kaggle.com/datasets/kunalbhar/house-md-transcripts 

Преобразуем в формат "запрос-ответ" и оставим только пары с ответам Хауса

In [1]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd

# Load a DataFrame with a specific version of a CSV
season2_df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "kunalbhar/house-md-transcripts",
    "season2.csv",
)

season3_df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "kunalbhar/house-md-transcripts",
    "season3.csv",
)

season4_df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "kunalbhar/house-md-transcripts",
    "season4.csv",
)

season5_df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "kunalbhar/house-md-transcripts",
    "season5.csv",
)

season6_df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "kunalbhar/house-md-transcripts",
    "season6.csv",
)

season7_df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "kunalbhar/house-md-transcripts",
    "season7.csv",
)

season8_df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "kunalbhar/house-md-transcripts",
    "season8.csv",
)

full_df = pd.concat([season2_df, season3_df, season4_df, season5_df, season6_df, season7_df, season8_df])
# full_df = full_df[full_df['name'] == 'House'].copy()

In [2]:
house_answers = full_df
house_answers['responder'] = house_answers['name'].shift(-1)
house_answers['response'] = house_answers['line'].shift(-1)
house_answers = house_answers[house_answers['responder'] == 'House'].copy()

In [3]:
house_answers.head()

Unnamed: 0,name,line,responder,response
18,James,You can't go in there.,House,"Who are you, and why are you wearing a tie?"
20,James,I'm Dr. Cuddy's new assistant. Can I tell her...,House,Yes. I would like to know why she gets a secr...
22,James,"I'm her assistant, not her secretary. I gradu...",House,Hmm. I didn't know they had a secretarial sch...
24,Cuddy,"Dr. House, we are in the middle of a meeting.",House,What's with hiring a male secretary? JDate no...
26,Stacy,He is cute. Be careful.,House,She's not like you. She can't just walk into ...


In [4]:
house_answers.shape

(17873, 4)

In [5]:
house_answers.to_csv('data/house_answers.csv', index=False)

# Добавим ембеддинги в датафрейм

Предвычисленные эмбеддинги ускорят вычисления

In [1]:
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import datasets

In [2]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained("models/bi_model")

### BI модель

In [3]:
# def mean_pool(token_embeds: torch.tensor, attention_mask: torch.tensor) -> torch.tensor:
#     in_mask = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
#     pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(in_mask.sum(1), min=1e-9)
#     return pool


# def encode(input_texts, tokenizer: AutoTokenizer, model: AutoModel, device: str = "cpu"
# ) -> torch.tensor:

#     model.eval()
#     tokenized_texts = tokenizer(input_texts, max_length=128,
#                                 padding='max_length', truncation=True, return_tensors="pt")
#     token_embeds = model(tokenized_texts["input_ids"].to(device),
#                          tokenized_texts["attention_mask"].to(device)).last_hidden_state
#     pooled_embeds = mean_pool(token_embeds, tokenized_texts["attention_mask"].to(device))
#     return pooled_embeds

def mean_pool(token_embeds: torch.tensor, attention_mask: torch.tensor) -> torch.tensor:
    in_mask = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(in_mask.sum(1), min=1e-9)
    return pool


def encode(input_texts: list[str], tokenizer: AutoTokenizer, model: AutoModel, device: str = "cpu"
) -> torch.tensor:

    model.eval()
    tokenized_texts = tokenizer(input_texts, max_length=128,
                                padding='max_length', truncation=True, return_tensors="pt")
    token_embeds = model(tokenized_texts["input_ids"].to(device),
                         tokenized_texts["attention_mask"].to(device)).last_hidden_state
    pooled_embeds = mean_pool(token_embeds, tokenized_texts["attention_mask"].to(device))
    return pooled_embeds

class BiEncoder(torch.nn.Module):
    def __init__(self, max_length: int = 128):
        super().__init__()
        self.max_length = max_length
        self.bert_model = AutoModel.from_pretrained(model_name)
        # self.bert_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size * 3, 3)

    def forward(self, data: datasets.arrow_dataset.Dataset) -> torch.tensor:
        premise_input_ids = data["premise_input_ids"].to(device)
        premise_attention_mask = data["premise_attention_mask"].to(device)
        hypothesis_input_ids = data["hypothesis_input_ids"].to(device)
        hypothesis_attention_mask = data["hypothesis_attention_mask"].to(device)

        out_premise = self.bert_model(premise_input_ids, premise_attention_mask)
        out_hypothesis = self.bert_model(hypothesis_input_ids, hypothesis_attention_mask)
        premise_embeds = out_premise.last_hidden_state
        hypothesis_embeds = out_hypothesis.last_hidden_state

        pooled_premise_embeds = mean_pool(premise_embeds, premise_attention_mask)
        pooled_hypotheses_embeds = mean_pool(hypothesis_embeds, hypothesis_attention_mask)

        embeds =  torch.cat([pooled_premise_embeds, pooled_hypotheses_embeds,
                             torch.abs(pooled_premise_embeds - pooled_hypotheses_embeds)],
                            dim=-1)
        return self.linear(embeds)


In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [6]:
# Инициализируем модель
bi_model = BiEncoder(model_name)
bi_model.load_state_dict(torch.load('models/BI_model_last/BI_model.pth', weights_only=True, map_location=torch.device('cpu')))
bi_model.to(device)

BiEncoder(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-5): 6 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise

In [15]:
def embed_texts(input_texts, tokenizer: AutoTokenizer, model: AutoModel, device: str = "cpu"
) -> torch.tensor:
    # model.eval()
    # tokenized_texts = tokenizer(input_texts, max_length=128,
    #                             padding='max_length', truncation=True, return_tensors="pt")
    # token_embeds = model(tokenized_texts["input_ids"].to(device),
    #                      tokenized_texts["attention_mask"].to(device)).last_hidden_state
    
    # return token_embeds
    model.eval()
    tokenized_texts = tokenizer(input_texts, max_length=128,
                                padding='max_length', truncation=True, return_tensors="pt")
    token_embeds = model(tokenized_texts["input_ids"].to(device),
                         tokenized_texts["attention_mask"].to(device)).last_hidden_state
    # pooled_embeds = mean_pool(token_embeds, tokenized_texts["attention_mask"].to(device))
    # return pooled_embeds
    return token_embeds

In [8]:
# # если нужно, загружаем датасет
# import pandas as pd 
# house_answers = pd.read_csv('data/house_answers.csv')

Вычисляем эмбеддинги частями, чтобы не переполнилась память

In [19]:
step = 5
embeds = []
character_remark = house_answers['line'].to_list() 

for i in tqdm(range(0, house_answers.shape[0], step)):
    input_texts = character_remark[i:i+step]
    embed = embed_texts(input_texts, tokenizer, bi_model.bert_model, device)
    embed = list(embed.detach().numpy())
    embeds.extend(embed)


100%|██████████| 3575/3575 [04:18<00:00, 13.85it/s]


In [20]:
tokenized_texts = tokenizer(house_answers['line'].to_list() , max_length=128,
                                padding='max_length', truncation=True, return_tensors="pt")

In [21]:
token_embeds = torch.tensor(embeds)

  token_embeds = torch.tensor(embeds)


In [22]:
pooled_embeds = mean_pool(token_embeds, tokenized_texts["attention_mask"].to(device))

In [26]:
pooled_embeds.shape

torch.Size([17873, 384])

In [23]:
house_answers['pooled_embeds'] = pooled_embeds.tolist()

In [24]:
house_answers.head()

Unnamed: 0,name,line,responder,response,token_embeds,pooled_embeds
0,James,You can't go in there.,House,"Who are you, and why are you wearing a tie?",0,"[-0.34322458505630493, 0.44701823592185974, 0...."
1,James,I'm Dr. Cuddy's new assistant. Can I tell her...,House,Yes. I would like to know why she gets a secr...,0,"[0.11418674141168594, 0.38850608468055725, 0.8..."
2,James,"I'm her assistant, not her secretary. I gradu...",House,Hmm. I didn't know they had a secretarial sch...,0,"[-0.06293102353811264, 0.40018171072006226, 0...."
3,Cuddy,"Dr. House, we are in the middle of a meeting.",House,What's with hiring a male secretary? JDate no...,0,"[-0.1304904669523239, 0.42438873648643494, 0.6..."
4,Stacy,He is cute. Be careful.,House,She's not like you. She can't just walk into ...,0,"[-0.46382877230644226, 0.40922287106513977, 0...."


In [25]:
house_answers.to_csv('data/house_answers.csv', index=False)

In [27]:
house_answers_1 = pd.read_csv('data/house_answers.csv')

In [48]:
# import json
# house_answers_1['pooled_embeds'] = house_answers_1['pooled_embeds'].apply(lambda x: json.loads(x))