In [1]:
import os
import pickle as pkl
from typing import List, Tuple, Optional

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

import re
#from tqdm import tqdm
from tqdm.auto import tqdm  # for notebooks
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch

from transformers import AutoTokenizer, AutoModel

In [3]:
DIR_DATA  = os.path.join(os.getcwd(), 'data')

## Загружаем и подготавливаем данные

In [4]:
#'_extended' после парсинга данных с РБК и извлечения данных из спарсенных страниц
df_train = pd.read_csv(os.path.join(DIR_DATA, 'train_extended.csv'))#, index_col= 0)
df_test  = pd.read_csv(os.path.join(DIR_DATA, 'test_extended.csv'))#, index_col= 0)

In [5]:
# sberbank-ai/sbert_large_mt_nlu_ru       1024  1.71Gb
# DeepPavlov/rubert-base-cased-sentence   768   0.7Gb
# DeepPavlov/rubert-base-cased-conversational  768
# DeepPavlov/rubert-base-cased            768
# sberbank-ai/sbert_large_nlu_ru          1024  1.71Gb

## Загружаем модель

In [6]:
#PRE_TRAINED_MODEL_NAME = 'blanchefort/rubert-base-cased-sentiment-rurewiews'
#MODEL_FOLDER = 'ru-blanchefort-rurewiews2'

#'DeepPavlov/rubert-base-cased-sentence'
#'sberbank-ai/sbert_large_mt_nlu_ru'

#PRE_TRAINED_MODEL_NAME = 'DeepPavlov/rubert-base-cased-sentence'
#MODEL_FOLDER = 'rubert-base-cased-sentence'

PRE_TRAINED_MODEL_NAME = 'sberbank-ai/sbert_large_mt_nlu_ru'
MODEL_FOLDER = 'sbert_large_mt_nlu_ru'


MAX_LENGTH = 24

In [7]:
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
model = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [8]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    
    return sum_embeddings / sum_mask

In [9]:
def ttl_to_emb(inp_text: str) -> np.ndarray:
    
    encoded_input = tokenizer(inp_text, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors='pt')

    #Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    
    return sentence_embeddings[0].cpu().detach().numpy()

## Делаем эмбеддинга из заголовков. Трейн

In [10]:
def get_ttl_emb(inp_df: pd.DataFrame, inp_PCA: PCA, inp_names: List[str], bTrainPCA: Optional[bool] = False) -> Tuple[pd.DataFrame, PCA]:
    
    inp_df = inp_df[['document_id', 'true_title']]
    inp_df['ttl_emb'] = inp_df.true_title.progress_apply(lambda x: ttl_to_emb(x))
    
    if bTrainPCA:
        print('fitting PCA')
        inp_PCA.fit(inp_df.ttl_emb.to_list())
    
    emb_train = pd.DataFrame(inp_PCA.transform(inp_df.ttl_emb.to_list()), columns = inp_names)
    
    inp_df = pd.concat([inp_df, emb_train], axis=1)
    inp_df.drop('ttl_emb', axis = 1, inplace = True)
    
    return (inp_df, inp_PCA)

In [11]:
PCA_COMPONENTS = 64
ttl_pca = PCA(n_components = PCA_COMPONENTS)
col_names = [f'tt_emb{idx}' for idx in range(PCA_COMPONENTS)]

In [12]:
df_train, ttl_pca = get_ttl_emb(df_train, ttl_pca, col_names, True)
df_test, _ = get_ttl_emb(df_test, ttl_pca, col_names)
print(df_train.shape, df_test.shape)

100%|██████████████████████████████████████████████████████████████████████████████| 7000/7000 [24:04<00:00,  4.85it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inp_df['ttl_emb'] = inp_df.true_title.progress_apply(lambda x: ttl_to_emb(x))


fitting PCA


100%|██████████████████████████████████████████████████████████████████████████████| 3000/3000 [10:13<00:00,  4.89it/s]

(7000, 66) (3000, 66)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inp_df['ttl_emb'] = inp_df.true_title.progress_apply(lambda x: ttl_to_emb(x))


In [13]:
df_train.head(3)

Unnamed: 0,document_id,true_title,tt_emb0,tt_emb1,tt_emb2,tt_emb3,tt_emb4,tt_emb5,tt_emb6,tt_emb7,...,tt_emb54,tt_emb55,tt_emb56,tt_emb57,tt_emb58,tt_emb59,tt_emb60,tt_emb61,tt_emb62,tt_emb63
0,624ac09c9a7947db3d80c98eIDE7mtH4RBqGn-8MXfGffQ,Европейский банк развития приостановил доступ ...,-0.199392,-6.211633,-2.930265,-2.131648,-5.542071,1.078423,-0.414759,-1.777834,...,0.225387,-1.217202,0.70187,-0.373144,0.448349,0.942661,-1.501726,-0.472269,0.296259,0.955972
1,620f6b899a7947701cf489e1KtVJsteHStO5oditt3Uvzw,Кремль назвал регулярным процессом учебные зап...,-0.311328,3.762958,-1.012922,4.694715,1.666159,-3.403345,2.706213,0.113333,...,0.405914,0.860465,0.697311,-0.705877,0.951299,0.41555,0.871835,-0.52084,-1.120788,-0.232487
2,620730cf9a7947ab96a44e27hk7puWJwSziw0m3sfTkKWA,Госсекретарь Швеции заявила о нежелании вступа...,-6.245548,-0.188366,-3.480821,-4.11954,2.000155,6.295955,1.584439,-0.174699,...,-0.644506,0.842702,-0.398745,0.656338,0.127074,-1.540774,-0.230054,-0.026394,-0.000502,0.371672


Сохраняем только эмбеддинги, без остальных признаков

In [14]:
df_train.to_csv(os.path.join(DIR_DATA, f'ttl_cln_emb_train_{MODEL_FOLDER}_{MAX_LENGTH}_pca{PCA_COMPONENTS}.csv'), index = False)

## Выполняем тоже с тестом

Сокращаем размерность

Сохраняем только эмбеддинги, без остальных признаков

In [15]:
df_test.to_csv(os.path.join(DIR_DATA, f'ttl_cln_emb_test_{MODEL_FOLDER}_{MAX_LENGTH}_pca{PCA_COMPONENTS}.csv'), index = False)