In [1]:
import os
import pickle as pkl

import numpy as np
import pandas as pd

import re
from tqdm import tqdm
from tqdm.auto import tqdm  # for notebooks
tqdm.pandas()

In [2]:
import torch
#$from transformers import AutoModelForSequenceClassification
#from transformers import BertTokenizerFast
#from transformers import AutoTokenizer
#from transformers import Trainer, TrainingArguments

from transformers import AutoTokenizer, AutoModel

In [3]:
DIR_DATA  = os.path.join(os.getcwd(), 'data')

In [4]:
#MUSE, sbert_large_mt_nlu_ru и rubert-base-cased-sentence

## Prepare data

In [5]:
df_train = pd.read_csv(os.path.join(DIR_DATA, 'train.csv'))#, index_col= 0)
df_test  = pd.read_csv(os.path.join(DIR_DATA, 'test.csv'))#, index_col= 0)

In [6]:
# sberbank-ai/sbert_large_mt_nlu_ru       1024  1.71Gb
# DeepPavlov/rubert-base-cased-sentence   768   0.7Gb
# DeepPavlov/rubert-base-cased-conversational  768
# DeepPavlov/rubert-base-cased            768
# sberbank-ai/sbert_large_nlu_ru          1024  1.71Gb

In [7]:
# should try and without it
clean_text = lambda x:' '.join(re.sub('\n|\r|\t|[^а-я]', ' ', x.lower()).split())

In [8]:
x = clean_text(df_train.title[0])

In [9]:
x

'европейский банк развития приостановил доступ москвы и минска к финансам'

In [10]:
#dir(model)

## Load model

In [11]:
#PRE_TRAINED_MODEL_NAME = 'blanchefort/rubert-base-cased-sentiment-rurewiews'
#MODEL_FOLDER = 'ru-blanchefort-rurewiews2'


PRE_TRAINED_MODEL_NAME = 'sberbank-ai/sbert_large_mt_nlu_ru'
MODEL_FOLDER = 'sbert_large_mt_nlu_ru'


MAX_LENGTH = 24

In [12]:
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
model = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME)


#tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

#train_tokens = tokenizer(list(train.values), truncation=True, padding=True, max_length=MAX_LENGTH)
#test_tokens = tokenizer(list(test.values), truncation=True, padding=True, max_length=MAX_LENGTH)

#model = AutoModelForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME,) 

In [13]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

In [14]:
def ttl_to_emb(inp_text):
    encoded_input = tokenizer(inp_text, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors='pt')

    #Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    
    return sentence_embeddings[0].cpu().detach().numpy()

## Make embedings for titles. Train

In [15]:
df_train = df_train[['document_id', 'title']]

In [16]:
df_train['ttl_emb'] = df_train.title.progress_apply(lambda x: ttl_to_emb(x))

  0%|          | 0/7000 [00:00<?, ?it/s]

In [17]:
col_names = [f'tt_emb{idx}' for idx in range(df_train.ttl_emb[0].shape[0])]
emb_train = pd.DataFrame(df_train.ttl_emb.to_list(), columns = col_names)

In [20]:
df_train = pd.concat([df_train, emb_train], axis=1)

In [21]:
df_train.drop('ttl_emb', axis = 1, inplace = True)

In [22]:
df_train.head(3)

Unnamed: 0,document_id,title,tt_emb0,tt_emb1,tt_emb2,tt_emb3,tt_emb4,tt_emb5,tt_emb6,tt_emb7,...,tt_emb1014,tt_emb1015,tt_emb1016,tt_emb1017,tt_emb1018,tt_emb1019,tt_emb1020,tt_emb1021,tt_emb1022,tt_emb1023
0,624ac09c9a7947db3d80c98eIDE7mtH4RBqGn-8MXfGffQ,Европейский банк развития приостановил доступ ...,0.75489,-0.286261,-0.958303,-0.425105,0.136802,-0.087918,-0.135857,0.102848,...,-0.610708,0.759865,0.71452,-0.34296,0.337476,-0.20103,-0.142917,-0.062627,0.502388,0.468302
1,620f6b899a7947701cf489e1KtVJsteHStO5oditt3Uvzw,Кремль назвал регулярным процессом учебные зап...,1.015678,-0.501343,-0.074484,-0.855552,-0.294566,-0.280561,0.124312,-0.416376,...,0.672917,0.939049,0.986214,-0.601762,-0.257401,0.459331,0.442573,0.037961,0.64723,0.35625
2,620730cf9a7947ab96a44e27hk7puWJwSziw0m3sfTkKWA,Госсекретарь Швеции заявила о нежелании вступа...,0.54673,-0.178822,-0.223541,-0.447414,0.290502,-0.083292,0.112769,0.373367,...,0.031445,1.087985,0.562001,-0.791266,-0.502563,0.60846,0.041547,-0.169473,0.632156,0.542884


In [23]:
df_train.to_csv(os.path.join(DIR_DATA, f'ttl_emb_train_{MODEL_FOLDER}_{MAX_LENGTH}.csv'), index = False)

## Same with test

In [15]:
df_test = df_test[['document_id', 'title']]

In [16]:
df_test['ttl_emb'] = df_test.title.progress_apply(lambda x: ttl_to_emb(x))

  0%|          | 0/3000 [00:00<?, ?it/s]

In [19]:
col_names = [f'tt_emb{idx}' for idx in range(df_test.ttl_emb[0].shape[0])]
emb_test = pd.DataFrame(df_test.ttl_emb.to_list(), columns = col_names)

In [24]:
df_test = pd.concat([df_test, emb_test], axis=1)

In [25]:
df_test.drop('ttl_emb', axis = 1, inplace = True)

KeyError: "['ttl_emb'] not found in axis"

In [26]:
df_test.shape

(3000, 1026)

In [27]:
df_test.to_csv(os.path.join(DIR_DATA, f'ttl_emb_test_{MODEL_FOLDER}_{MAX_LENGTH}.csv'), index = False)