In [1]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split

In [4]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

In [5]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [6]:
# Load the labelled training sentences and discard the 'Unnamed: 0' column.
train_csv_url = 'https://raw.githubusercontent.com/takimov/MLTinkoff/master/train.csv'
df = pd.read_csv(train_csv_url, encoding='utf-8').drop(columns='Unnamed: 0')

# Recode the sentiment symbols as integers, one symbol at a time.
# Note: the negative marker is a non-ASCII minus character, not the ASCII '-'.
for symbol, label in (('+', 1), ('−', -1), ('?', 0)):
    df.loc[df['sentiment'] == symbol, 'sentiment'] = label

# Last expression of the cell: show the two columns of interest.
df[['sentence', 'sentiment']]

Unnamed: 0,sentence,sentiment
0,При этом всегда получал качественные услуги.,1
1,"Не вижу, за что хотя бы 2 поставить, сервис на 1!",-1
2,"Вот так ""Мой любимый"" банк МКБ меня обманул.",-1
3,Отвратительное отношение к клиентам.,-1
4,"Всегда в любое время дня и ночи помогут, ответ...",1
...,...,...
19356,Никогда и ни в коем случае не открывайте счет ...,-1
19357,ТИ откровенно забили на качество и развивают с...,-1
19358,"Я считаю, это прорыв и лидерство финансовых ус...",1
19359,"Писал мужчина очень доходчиво, не финансовым я...",1


In [7]:
# Tokenizer for the cased multilingual checkpoint; lower-casing is switched
# off so the input text keeps its original case.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-cased',
    do_lower_case=False,
)

100%|███████████████████████████████| 995526/995526 [00:00<00:00, 1279037.66B/s]


In [8]:
# Load the pretrained multilingual encoder and put it in evaluation mode in
# one step (Module.eval() returns the module itself).
model = BertModel.from_pretrained('bert-base-multilingual-cased').eval()

# Echo the module so the cell output shows its architecture.
model

100%|█████████████████████████| 662804195/662804195 [02:52<00:00, 3834599.23B/s]


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (de

In [9]:
# Load the sentences to be embedded and discard the 'Unnamed: 0' column.
test_csv_url = 'https://raw.githubusercontent.com/aachubov/frnmklerafmk-qermfl-/main/test_for_participants.csv'
df_new = pd.read_csv(test_csv_url, encoding='utf-8').drop(columns='Unnamed: 0')

In [10]:
# Convert every sentence into BERT inputs: a list of vocabulary ids and a
# parallel list of segment ids, collected per sentence.

# The loaded model has 512 position embeddings, so a longer token sequence
# cannot be fed through it. Over-long inputs are truncated instead of being
# allowed to crash the forward pass later on.
MAX_SEQ_LEN = 512

segments_ids_ans = list()
indexed_tokens_ans = list()
# Iterate over the column values directly, so the loop does not depend on the
# frame having a default 0..n-1 index.
for text in tqdm(df_new['sentence']):
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    if len(tokenized_text) > MAX_SEQ_LEN:
        # Keep the leading tokens and re-attach the closing marker.
        tokenized_text = tokenized_text[:MAX_SEQ_LEN - 1] + ["[SEP]"]
    # NOTE(review): every token is assigned segment id 1. The usual
    # single-sentence convention is 0; it is left unchanged here because a
    # different id would change the embeddings. Confirm it matches how the
    # training embeddings were produced.
    segments_ids = [1] * len(tokenized_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids_ans.append(segments_ids)
    indexed_tokens_ans.append(indexed_tokens)

100%|█████████████████████████████████████| 2152/2152 [00:00<00:00, 2437.68it/s]


In [11]:
# Number of sentences to embed; later cells loop over this count.
sentences_count = len(df_new['sentence'])

# Wrap each id list in an extra list so every tensor has a leading
# batch dimension of size one.
tokens_tensor_ans = []
for idx in tqdm(range(sentences_count)):
    tokens_tensor_ans.append(torch.tensor([indexed_tokens_ans[idx]]))

segments_tensors_ans = []
for idx in tqdm(range(sentences_count)):
    segments_tensors_ans.append(torch.tensor([segments_ids_ans[idx]]))

100%|████████████████████████████████████| 2152/2152 [00:00<00:00, 53661.88it/s]
100%|████████████████████████████████████| 2152/2152 [00:00<00:00, 60556.19it/s]


In [12]:
# Accumulator for the per-sentence model outputs.
encoded = []

In [13]:
# Run every sentence through the model, one sentence per forward pass, and
# keep the per-layer outputs.
#
# The accumulator is (re)created here so the cell is idempotent: re-running
# it replaces the previous results instead of appending a second full copy
# of every sentence's layer activations.
encoded = []
with torch.no_grad():  # inference only, no autograd bookkeeping
    for i in tqdm(range(sentences_count)):
        # The second return value is not needed and is discarded.
        encoded_layers, _ = model(tokens_tensor_ans[i], segments_tensors_ans[i])
        encoded.append(encoded_layers)

100%|███████████████████████████████████████| 2152/2152 [02:51<00:00, 12.54it/s]


In [14]:
# For each sentence, stack its list of layer outputs into a single tensor
# with the layers along a new leading dimension.
stacked_layers = []
for idx in tqdm(range(sentences_count)):
    stacked_layers.append(torch.stack(encoded[idx], dim=0))
token_embeddings = stacked_layers

100%|██████████████████████████████████████| 2152/2152 [00:05<00:00, 369.01it/s]


In [15]:
# Drop dimension 1 of every stacked tensor when it has size one
# (presumably the batch dimension of the single-sentence forward pass).
squeezed = []
for idx in tqdm(range(sentences_count)):
    squeezed.append(token_embeddings[idx].squeeze(dim=1))
token_embeddings = squeezed

100%|███████████████████████████████████| 2152/2152 [00:00<00:00, 207683.72it/s]


In [16]:
# Swap the first two dimensions of every tensor so that iterating over a
# sentence's tensor yields one slice per entry of the former second dimension
# (presumably one slice per token, holding that token's layers).
permuted = []
for idx in tqdm(range(sentences_count)):
    permuted.append(token_embeddings[idx].permute(1, 0, 2))
token_embeddings = permuted

100%|███████████████████████████████████| 2152/2152 [00:00<00:00, 244544.63it/s]


In [17]:
# One (initially empty) list of vectors per sentence.
token_vecs_sum = [list() for _ in tqdm(range(sentences_count))]

# For every slice along the first dimension of a sentence's tensor, add up
# its last four entries and store the resulting vector.
for sent_idx in tqdm(range(sentences_count)):
    summed_vectors = token_vecs_sum[sent_idx]
    for per_token_layers in token_embeddings[sent_idx]:
        summed_vectors.append(per_token_layers[-4:].sum(dim=0))

100%|██████████████████████████████████| 2152/2152 [00:00<00:00, 1951176.44it/s]
100%|██████████████████████████████████████| 2152/2152 [00:03<00:00, 691.98it/s]


In [19]:
# Sentence vector = average over the first dimension of entry [11][0] of the
# sentence's model output (layer index 11, first item of the batch).
sentence_embedding_ans = [
    encoded[idx][11][0].mean(dim=0)
    for idx in tqdm(range(sentences_count))
]

100%|████████████████████████████████████| 2152/2152 [00:00<00:00, 19872.75it/s]


In [20]:
# Persist the sentence embeddings as a single dense NumPy array.
#
# The per-sentence vectors are torch tensors. Handing a Python list of
# tensors to np.array() converts it element by element, which is slow and,
# depending on library versions, may yield an object array that np.save
# would have to pickle. Stacking in torch first gives one numeric array with
# one row per sentence.
embeddings_matrix = torch.stack(sentence_embedding_ans).detach().cpu().numpy()

with open('embeddings_test.npy', 'wb') as f:
    np.save(f, embeddings_matrix)