In [1]:
%load_ext watermark

In [2]:
# Record the timestamp, Python/IPython versions and host details for this run.
%watermark

Last updated: 2022-09-11T17:01:49.553650+03:00

Python implementation: CPython
Python version       : 3.10.4
IPython version      : 8.4.0

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.15.0-47-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 8
Architecture: 64bit



In [3]:
import time
# Wall-clock start time; the final cell subtracts it to report the total notebook runtime.
notebookstart = time.time()

In [4]:
import os
import pickle as pkl

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

import multiprocessing as mp

import re
from tqdm import tqdm
from tqdm.auto import tqdm
# Hook tqdm into pandas so that Series expose `progress_apply`, which the embedding cells use.
tqdm.pandas()

In [5]:
# os.environ["TOKENIZERS_PARALLELISM"] = 'true'

In [6]:
import torch
from transformers import AutoTokenizer, AutoModel

In [7]:
#from functools import partial
#from embtitile import embtitile as et

In [8]:
#import ray
#ray.init()

Переменные

In [9]:
# Folder with the input CSVs and the generated embedding files: ./data under the working directory.
DIR_DATA = os.path.join(os.path.abspath(os.curdir), 'data')

## Загружаем и подготавливаем данные

In [10]:
# The '_extended' CSVs are produced after scraping RBC and extracting data from the scraped pages.
df_train, df_test = (
    pd.read_csv(os.path.join(DIR_DATA, csv_name))
    for csv_name in ('train_extended.csv', 'test_extended.csv')
)

In [11]:
#имя            размерность выходного вектора   вес модели
# sberbank-ai/sbert_large_mt_nlu_ru       1024  1.71Gb
# DeepPavlov/rubert-base-cased-sentence   768   0.7Gb
# DeepPavlov/rubert-base-cased-conversational  768
# DeepPavlov/rubert-base-cased            768
# sberbank-ai/sbert_large_nlu_ru          1024  1.71Gb

## Загружаем модель

In [12]:
#PRE_TRAINED_MODEL_NAME = 'blanchefort/rubert-base-cased-sentiment-rurewiews'
#MODEL_FOLDER = 'ru-blanchefort-rurewiews2'

#'DeepPavlov/rubert-base-cased-sentence'
#'sberbank-ai/sbert_large_mt_nlu_ru'

#PRE_TRAINED_MODEL_NAME = 'DeepPavlov/rubert-base-cased-sentence'
#MODEL_FOLDER = 'rubert-base-cased-sentence'

# Checkpoint identifier passed to AutoTokenizer / AutoModel.from_pretrained.
PRE_TRAINED_MODEL_NAME = 'sberbank-ai/sbert_large_mt_nlu_ru'
# Short model tag embedded in the names of the output CSV files.
MODEL_FOLDER = 'sbert_large_mt_nlu_ru'


# Token limit handed to the tokenizer (titles are truncated to it); also embedded in the output file names.
MAX_LENGTH = 24

In [13]:
# Load the tokenizer that matches the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Load the encoder and keep it on the CPU; chaining .cpu() avoids echoing the model repr in the cell output.
model = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME).cpu()

In [14]:
#dir(tokenizer)

In [15]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dim 1, counting only unmasked positions.

    Args:
        model_output: Indexable model result; element 0 is used as the
            token-embeddings tensor.
        attention_mask: Mask tensor; it receives a trailing axis and is
            expanded to the size of the token embeddings, so positions whose
            mask value is 0 contribute nothing to the average.

    Returns:
        The masked sum of the embeddings over dim 1 divided by the mask sum
        over dim 1. The divisor is clamped to at least 1e-9 so that a fully
        masked row does not divide by zero.
    """
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()

    masked_total = (embeddings * mask).sum(dim=1)
    mask_count = mask.sum(dim=1).clamp(min=1e-9)

    return masked_total / mask_count

In [16]:
def ttl_to_emb(inp_text: str) -> np.ndarray:
    """Turn one title into a sentence-embedding vector.

    The text is tokenized (truncated to MAX_LENGTH tokens), passed through the
    module-level `model` without gradient tracking, and the token embeddings
    are mean-pooled using the attention mask.

    Args:
        inp_text: Title text to embed.

    Returns:
        The first row of the pooled embeddings as a NumPy array.
    """
    tokens = tokenizer(
        inp_text,
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors='pt',
    )

    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        outputs = model(**tokens)

    pooled = mean_pooling(outputs, tokens['attention_mask'])
    first_row = pooled[0]

    return first_row.cpu().detach().numpy()

## Делаем эмбеддинги из заголовков. Трейн

In [17]:
# Keep only the id and the title. The explicit .copy() makes this an independent frame,
# so adding the 'ttl_emb' column afterwards does not trigger SettingWithCopyWarning.
df_train = df_train[['document_id', 'true_title']].copy()

In [18]:
# Embed every train title one by one (with a progress bar); each cell of 'ttl_emb' holds one vector.
df_train['ttl_emb'] = df_train['true_title'].progress_apply(ttl_to_emb)

  0%|          | 0/7000 [00:00<?, ?it/s]

In [19]:
# Number of principal components kept when reducing the title embeddings; also embedded in the output file names.
PCA_COMPONENTS = 64

In [20]:
%%time
ttl_pca = PCA(n_components = PCA_COMPONENTS)
ttl_pca.fit(df_train.ttl_emb.to_list())

col_names = [f'tt_emb{idx}' for idx in range(PCA_COMPONENTS)]
emb_train = pd.DataFrame(ttl_pca.transform(df_train.ttl_emb.to_list()), columns = col_names)

CPU times: user 2.72 s, sys: 96 ms, total: 2.82 s
Wall time: 1.2 s


In [21]:
# Append the PCA feature columns to the right of the id/title/raw-embedding columns.
df_train = pd.concat((df_train, emb_train), axis='columns')

In [22]:
# The raw embedding vectors are no longer needed once the PCA columns exist.
df_train = df_train.drop(columns=['ttl_emb'])

In [23]:
# Peek at the first three rows to check that the tt_emb* columns were appended.
df_train.head(3)

Unnamed: 0,document_id,true_title,tt_emb0,tt_emb1,tt_emb2,tt_emb3,tt_emb4,tt_emb5,tt_emb6,tt_emb7,...,tt_emb54,tt_emb55,tt_emb56,tt_emb57,tt_emb58,tt_emb59,tt_emb60,tt_emb61,tt_emb62,tt_emb63
0,624ac09c9a7947db3d80c98eIDE7mtH4RBqGn-8MXfGffQ,Европейский банк развития приостановил доступ ...,-0.199399,-6.211632,-2.930263,-2.131648,-5.542072,1.078428,-0.414764,-1.777834,...,0.235061,-1.248508,0.723421,-0.330917,0.487549,0.895777,-1.563121,-0.574027,0.395415,0.879678
1,620f6b899a7947701cf489e1KtVJsteHStO5oditt3Uvzw,Кремль назвал регулярным процессом учебные зап...,-0.311326,3.762954,-1.01292,4.694713,1.666154,-3.403344,2.706218,0.113334,...,0.382633,0.862265,0.69595,-0.71526,0.898461,0.371864,0.805597,-0.379342,-1.202755,-0.232504
2,620730cf9a7947ab96a44e27hk7puWJwSziw0m3sfTkKWA,Госсекретарь Швеции заявила о нежелании вступа...,-6.245545,-0.18836,-3.48082,-4.119542,2.000154,6.295955,1.584439,-0.174699,...,-0.657501,0.854272,-0.417625,0.63227,0.097934,-1.525795,-0.279884,0.053434,-0.012992,0.363561


Сохраняем только эмбеддинги, без остальных признаков

In [24]:
# The file name records the model tag, token limit and PCA size used to produce the features.
train_csv_name = f'ttl_cln_emb_train_{MODEL_FOLDER}_{MAX_LENGTH}_pca{PCA_COMPONENTS}.csv'
df_train.to_csv(os.path.join(DIR_DATA, train_csv_name), index=False)

## Выполняем то же с тестом

In [25]:
# Keep only the id and the title. The explicit .copy() makes this an independent frame,
# so adding the 'ttl_emb' column afterwards does not trigger SettingWithCopyWarning.
df_test = df_test[['document_id', 'true_title']].copy()

In [26]:
# Embed every test title one by one (with a progress bar); each cell of 'ttl_emb' holds one vector.
df_test['ttl_emb'] = df_test['true_title'].progress_apply(ttl_to_emb)

  0%|          | 0/3000 [00:00<?, ?it/s]

Сокращаем размерность

In [27]:
#col_names = [f'tt_emb{idx}' for idx in range(df_test.ttl_emb[0].shape[0])]
# Project the test embeddings with the PCA already fitted on train (no refit here).
# Reuse df_test's index so the column-wise concat that follows aligns row for row
# even if df_test does not carry a default RangeIndex.
emb_test = pd.DataFrame(
    ttl_pca.transform(df_test.ttl_emb.to_list()),
    columns=col_names,
    index=df_test.index,
)
#emb_test = pd.DataFrame(df_test.ttl_emb.to_list(), columns = col_names)

In [28]:
# Append the PCA feature columns to the right of the id/title/raw-embedding columns.
df_test = pd.concat((df_test, emb_test), axis='columns')

In [29]:
# The raw embedding vectors are no longer needed once the PCA columns exist.
df_test = df_test.drop(columns=['ttl_emb'])

In [30]:
# Sanity check: with PCA_COMPONENTS = 64 the frame should have 66 columns
# (document_id, true_title and the 64 tt_emb* features).
df_test.shape

(3000, 66)

Сохраняем только эмбеддинги, без остальных признаков

In [31]:
# The file name records the model tag, token limit and PCA size used to produce the features.
test_csv_name = f'ttl_cln_emb_test_{MODEL_FOLDER}_{MAX_LENGTH}_pca{PCA_COMPONENTS}.csv'
df_test.to_csv(os.path.join(DIR_DATA, test_csv_name), index=False)

In [32]:
#ray.shutdown()

In [33]:
# Report the wall-clock time elapsed since `notebookstart` was recorded, in minutes.
elapsed_minutes = (time.time() - notebookstart) / 60
print("Notebook Runtime: %0.2f Minutes" % elapsed_minutes)

Notebook Runtime: 25.77 Minutes
