In [1]:
import os
import pickle as pkl

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

import re
from tqdm import tqdm
from tqdm.auto import tqdm  # for notebooks
tqdm.pandas()

In [2]:
import torch

from transformers import AutoTokenizer, AutoModel

In [3]:
# Data directory (<current working directory>/data): the input CSVs are read
# from here and the generated embedding CSVs are written back to it.
DIR_DATA  = os.path.join(os.getcwd(), 'data')

## Загружаем и подготавливаем данные

In [5]:
# The '_extended' files are produced after scraping RBC and extracting data
# from the scraped pages. Both splits are read with the default integer index.
df_train, df_test = (
    pd.read_csv(os.path.join(DIR_DATA, csv_name))
    for csv_name in ('train_extended.csv', 'test_extended.csv')
)

In [6]:
# sberbank-ai/sbert_large_mt_nlu_ru       1024  1.71Gb
# DeepPavlov/rubert-base-cased-sentence   768   0.7Gb
# DeepPavlov/rubert-base-cased-conversational  768
# DeepPavlov/rubert-base-cased            768
# sberbank-ai/sbert_large_nlu_ru          1024  1.71Gb

In [7]:
# should also try without it
#clean_text = lambda x:' '.join(re.sub('\n|\r|\t|[^а-я]', ' ', x.lower()).split())

In [8]:
#x = clean_text(df_train.title[0])

In [9]:
#x

In [10]:
#dir(model)

## Загружаем модель

In [11]:
#PRE_TRAINED_MODEL_NAME = 'blanchefort/rubert-base-cased-sentiment-rurewiews'
#MODEL_FOLDER = 'ru-blanchefort-rurewiews2'

#'DeepPavlov/rubert-base-cased-sentence'
#'sberbank-ai/sbert_large_mt_nlu_ru'

#PRE_TRAINED_MODEL_NAME = 'DeepPavlov/rubert-base-cased-sentence'
#MODEL_FOLDER = 'rubert-base-cased-sentence'

# Token budget per title: the tokenizer truncates anything longer.
MAX_LENGTH = 24

# Checkpoint name passed to `from_pretrained`, and the short tag that is
# embedded in the names of the generated CSV files.
PRE_TRAINED_MODEL_NAME = 'sberbank-ai/sbert_large_mt_nlu_ru'
MODEL_FOLDER = 'sbert_large_mt_nlu_ru'

In [12]:
# Load the tokenizer and the encoder for the selected checkpoint; both are
# resolved from the same name so they always match each other.
tokenizer, model = (
    loader.from_pretrained(PRE_TRAINED_MODEL_NAME)
    for loader in (AutoTokenizer, AutoModel)
)

In [13]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, weighting each token by its mask value.

    Args:
        model_output: indexable model result whose first element holds the
            token embeddings.
        attention_mask: tensor with one entry per token; it is expanded over
            the embedding axis and used as the per-token weight.

    Returns:
        Tensor of mask-weighted mean embeddings (dimension 1 reduced away).
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * mask).sum(dim=1)
    # Clamp the denominator so an all-zero mask cannot divide by zero.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts

In [14]:
def ttl_to_emb(inp_text):
    """Embed a single title with the module-level tokenizer and model.

    The title is tokenized (truncated to MAX_LENGTH tokens), run through the
    model without gradient tracking, and the token embeddings are mean-pooled
    using the attention mask.

    Args:
        inp_text: title text handed to the tokenizer.

    Returns:
        NumPy array with the pooled embedding of the first (only) sequence.
    """
    # NOTE: a disabled experiment used to strip trailing rubric markers from the
    # title before encoding: the last word for 'Фоторепортаж', 'Фотогалерея',
    # 'Видео', 'Инфографика'; the last two words for 'Прямая трансляция' and
    # 'телеканале РБК'.

    batch = tokenizer(
        inp_text,
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors='pt',
    )

    # Inference only, so skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**batch)

    pooled = mean_pooling(outputs, batch['attention_mask'])
    first_row = pooled[0]
    return first_row.cpu().detach().numpy()

## Делаем эмбеддинги из заголовков. Трейн

In [15]:
# Keep only the id and title columns. `.copy()` makes the result an independent
# frame, so adding the `ttl_emb` column afterwards does not trigger pandas'
# SettingWithCopyWarning.
df_train = df_train[['document_id', 'true_title']].copy()

In [16]:
# Embed every train title (one model call per row) with a progress bar.
df_train['ttl_emb'] = df_train['true_title'].progress_apply(ttl_to_emb)

  0%|          | 0/7000 [00:00<?, ?it/s]

In [17]:
# Number of principal components the title embeddings are reduced to.
PCA_COMPONENTS = 64

In [18]:
%%time
# Fixed seed: with the default `svd_solver='auto'` scikit-learn may pick the
# randomized solver for data of this size, in which case the components (and
# therefore the saved features) would differ between runs without a seed.
PCA_RANDOM_STATE = 42

# Materialise the per-title vectors once instead of rebuilding the list for
# both `fit` and `transform`.
train_vectors = df_train.ttl_emb.to_list()

ttl_pca = PCA(n_components = PCA_COMPONENTS, random_state = PCA_RANDOM_STATE)
ttl_pca.fit(train_vectors)

col_names = [f'tt_emb{idx}' for idx in range(PCA_COMPONENTS)]
# Reuse df_train's index so the column-wise concat that follows aligns the
# reduced embeddings with their source rows.
emb_train = pd.DataFrame(ttl_pca.transform(train_vectors), columns = col_names, index = df_train.index)

Wall time: 495 ms


In [19]:
# Append the PCA feature columns to the id/title frame (aligned on the index).
df_train = pd.concat((df_train, emb_train), axis='columns')

In [20]:
# The raw embedding column is no longer needed once the PCA features exist.
df_train.drop(columns=['ttl_emb'], inplace=True)

In [21]:
# Preview the first three rows of the assembled train frame.
df_train.head(3)

Unnamed: 0,document_id,true_title,tt_emb0,tt_emb1,tt_emb2,tt_emb3,tt_emb4,tt_emb5,tt_emb6,tt_emb7,...,tt_emb54,tt_emb55,tt_emb56,tt_emb57,tt_emb58,tt_emb59,tt_emb60,tt_emb61,tt_emb62,tt_emb63
0,624ac09c9a7947db3d80c98eIDE7mtH4RBqGn-8MXfGffQ,Европейский банк развития приостановил доступ ...,0.222095,3.948613,0.358465,-0.354955,2.274236,-2.602631,-2.567613,-1.838991,...,0.613429,0.348292,-0.012083,-0.969442,0.105668,-0.262009,-0.897566,0.266145,-0.199444,0.57291
1,620f6b899a7947701cf489e1KtVJsteHStO5oditt3Uvzw,Кремль назвал регулярным процессом учебные зап...,-2.425622,0.295295,2.114864,3.053465,-1.034546,2.919111,-3.761234,2.150873,...,-0.441842,0.607779,-0.111463,-0.291918,-0.364558,0.036562,1.238123,-0.883507,-0.773571,0.087342
2,620730cf9a7947ab96a44e27hk7puWJwSziw0m3sfTkKWA,Госсекретарь Швеции заявила о нежелании вступа...,6.348412,0.66958,-3.028338,-1.533043,-1.92028,-1.911492,-1.101375,-0.663394,...,0.411122,0.107282,0.161271,-0.520712,-0.410928,0.058589,0.179516,0.368834,0.507366,0.625234


Сохраняем только эмбеддинги, без остальных признаков

In [22]:
# Write the train frame to the data directory; the file name records the
# model tag, the token limit and the number of PCA components.
df_train.to_csv(
    os.path.join(
        DIR_DATA,
        f'ttl_cln_emb_train_{MODEL_FOLDER}_{MAX_LENGTH}_pca{PCA_COMPONENTS}.csv',
    ),
    index=False,
)

## Выполняем то же с тестом

In [23]:
# Keep only the id and title columns. `.copy()` makes the result an independent
# frame, so adding the `ttl_emb` column afterwards does not trigger pandas'
# SettingWithCopyWarning.
df_test = df_test[['document_id', 'true_title']].copy()

In [24]:
# Embed every test title (one model call per row) with a progress bar.
df_test['ttl_emb'] = df_test['true_title'].progress_apply(ttl_to_emb)

  0%|          | 0/3000 [00:00<?, ?it/s]

Сокращаем размерность

In [25]:
# Project the test embeddings with the already fitted PCA (no refit) and reuse
# the existing column names. A disabled variant kept the raw, un-reduced
# embeddings, with one column per embedding dimension, instead.
emb_test = pd.DataFrame(
    data=ttl_pca.transform(df_test['ttl_emb'].to_list()),
    columns=col_names,
)

In [26]:
# Append the PCA feature columns to the id/title frame (aligned on the index).
df_test = pd.concat((df_test, emb_test), axis='columns')

In [27]:
# The raw embedding column is no longer needed once the PCA features exist.
df_test.drop(columns=['ttl_emb'], inplace=True)

In [28]:
# Sanity check: (rows, columns) of the assembled test frame.
df_test.shape

(3000, 66)

Сохраняем только эмбеддинги, без остальных признаков

In [29]:
# Write the test frame to the data directory; the file name records the
# model tag, the token limit and the number of PCA components.
df_test.to_csv(
    os.path.join(
        DIR_DATA,
        f'ttl_cln_emb_test_{MODEL_FOLDER}_{MAX_LENGTH}_pca{PCA_COMPONENTS}.csv',
    ),
    index=False,
)