In [1]:
import re
import string

import pandas as pd
import numpy as np
import datetime

import transformers
import torch

from nltk import WordNetLemmatizer

from tqdm.notebook import tqdm

# Загрузка и подготовка данных

In [8]:
# Load the raw product table and drop fully duplicated rows.
data = pd.read_csv('../data/products.csv')
data = data.drop_duplicates()
# BERT tokenizer built from the local vocabulary file.
tokenizer = transformers.BertTokenizer('../model/vocab.txt')

# Previously generated embeddings are an optional cache. Only a missing file is
# tolerated here; any other failure (corrupt CSV, missing index column,
# interrupt, ...) is allowed to surface instead of being silently swallowed.
# As before, the first missing file skips the remaining loads, so the later
# `embedded_*` names may be undefined after this cell.
try:
    embedded_description = pd.read_csv('../data/embedded_description', index_col='Unnamed: 0')
    embedded_product_composition = pd.read_csv('../data/embedded_product_composition', index_col='Unnamed: 0')
    embedded_product_usage = pd.read_csv('../data/embedded_product_usage', index_col='Unnamed: 0')
    embedded_3_in_1 = pd.read_csv('../data/embedded_3_in_1')
except FileNotFoundError as error:
    print(f'Cached embeddings were not loaded: {error}')

# Alternative (currently unused) model configuration:
# model_class, tokenizer_class, pretrained_weights = (transformers.DistilBertModel,
                                                    # transformers.DistilBertTokenizer,
                                                    # 'distilbert-base-uncased')

data.info()

  data = pd.read_csv('../data/products.csv')


<class 'pandas.core.frame.DataFrame'>
Int64Index: 40559 entries, 0 to 40579
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   40559 non-null  int64  
 1   sku                  40559 non-null  object 
 2   name                 40559 non-null  object 
 3   brand                40559 non-null  object 
 4   brand_type           40559 non-null  object 
 5   dimension17          37838 non-null  object 
 6   dimension18          39810 non-null  object 
 7   dimension19          9360 non-null   object 
 8   dimension20          9841 non-null   object 
 9   country              34488 non-null  object 
 10  price                40559 non-null  int64  
 11  currency             0 non-null      float64
 12  old_price            40559 non-null  int64  
 13  category_type        40555 non-null  object 
 14  url                  40559 non-null  object 
 15  images               40559 non-null 

In [9]:
# Translation table that deletes every character in ``string.punctuation``.
# Built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def text_processing(text: str) -> str:
    """Normalize a free-text field.

    Lower-cases the text, removes parenthesised fragments, deletes every
    character listed in ``string.punctuation`` and collapses runs of
    whitespace into single spaces.

    Args:
        text: Raw text, or a missing value (``None``, ``np.nan``, ``pd.NA``
            or any other float NaN).

    Returns:
        The cleaned string, or ``np.nan`` when ``text`` is missing.
    """
    # Missing values pass through as NaN. ``pd.isna`` is used instead of
    # ``text is np.nan`` because the identity check only recognises the
    # ``np.nan`` singleton and lets other NaN objects / None fall through to
    # ``.lower()``, where they raise AttributeError.
    if pd.isna(text):
        return np.nan
    text = text.lower()
    # Drop "(...)" fragments (non-greedy). Raw string: "\(" is not a valid
    # escape sequence in a normal string literal.
    text = re.sub(r'\(.*?\)', '', text)
    # Only ASCII punctuation is removed; other symbols (e.g. "•") are kept.
    text = text.translate(_PUNCTUATION_TABLE)
    # Collapse repeated whitespace and trim both ends.
    return ' '.join(text.split())

In [10]:
text_columns = ['description', 'product_usage', 'product_composition']

# Run the text normaliser over every free-text column and write the cleaned
# values back into the same column of `data`.
for column in text_columns:
    cleaned_column = data[column].apply(text_processing)
    data[column] = cleaned_column

# Preview the first rows after cleaning.
data.head()

Unnamed: 0,id,sku,name,brand,brand_type,dimension17,dimension18,dimension19,dimension20,country,...,main_product_sku,main_product_id,best_loyality_price,dimension29,dimension28,description,product_usage,product_composition,category,category_ru
0,203730,19000039636,03,Ecooking,standard,Жидкое мыло,Унисекс,,,Дания,...,19000039636,203730,,False,False,нежное мыло,намочите руки нанесите на них мыло очистите ру...,aqua sodium laureth sulfate cocamidopropyl bet...,organika,органика
1,229474,19000031882,Anti-stress,Botavikos,standard,Сыворотки,Женский,Увлажнение и питание,Лицо,Россия,...,19000031882,229474,,False,False,• пробуждает внутреннюю энергию клеток создава...,равномерно распределите на коже когда чувствуе...,aqua niacinamide glycerin gluconolactone xanth...,organika,органика
2,229480,19000031888,Dry oil,Botavikos,standard,Масло,Женский,,Лицо,Россия,...,19000031888,229480,,False,False,действие,встряхните перед использованием и распылите ма...,capryliccapric triglyceride olea europaea frui...,organika,органика
3,200485,19000046442,Catnip Chaser,Petstages,standard,игрушка для животных,,,,США,...,19000046442,200485,,False,False,игрушка трек с пластиковым мячиком тубом кошач...,подбирайте игрушки в соответствии с весом и дв...,пластик,tovary-dlja-zhivotnyh,товары для животных
4,202556,19000025382,SALT FACIAL SCRUB ORIGINAL,Kosette,standard,Скраб,Унисекс,Очищение,Лицо,,...,19000025382,202556,,False,True,нежный скраб,нанесите на чистую и влажную кожу затем аккура...,glycerin sea salt water silica cocoglucoside s...,azija,азия


In [11]:
def lower(text: str) -> str:
    """Lower-case a string while letting missing values pass through.

    Args:
        text: A string, or a missing value (``None``, ``np.nan``, ``pd.NA``
            or any other float NaN).

    Returns:
        ``text.lower()``, or ``np.nan`` when ``text`` is missing.
    """
    # ``pd.isna`` instead of ``text is np.nan``: the identity check only
    # matches the ``np.nan`` singleton, so other NaN objects / None would reach
    # ``.lower()`` and raise AttributeError.
    if pd.isna(text):
        return np.nan
    return text.lower()

In [12]:
columns_to_lower = [
    'name', 'brand',
    'dimension17', 'dimension18', 'dimension19', 'dimension20',
    'country', 'category_type',
]

# Lower-case each categorical/text column, storing the result back in `data`.
for column in columns_to_lower:
    lowered_column = data[column].apply(lower)
    data[column] = lowered_column

# Preview the first rows after lower-casing.
data.head()

Unnamed: 0,id,sku,name,brand,brand_type,dimension17,dimension18,dimension19,dimension20,country,...,main_product_sku,main_product_id,best_loyality_price,dimension29,dimension28,description,product_usage,product_composition,category,category_ru
0,203730,19000039636,03,ecooking,standard,жидкое мыло,унисекс,,,дания,...,19000039636,203730,,False,False,нежное мыло,намочите руки нанесите на них мыло очистите ру...,aqua sodium laureth sulfate cocamidopropyl bet...,organika,органика
1,229474,19000031882,anti-stress,botavikos,standard,сыворотки,женский,увлажнение и питание,лицо,россия,...,19000031882,229474,,False,False,• пробуждает внутреннюю энергию клеток создава...,равномерно распределите на коже когда чувствуе...,aqua niacinamide glycerin gluconolactone xanth...,organika,органика
2,229480,19000031888,dry oil,botavikos,standard,масло,женский,,лицо,россия,...,19000031888,229480,,False,False,действие,встряхните перед использованием и распылите ма...,capryliccapric triglyceride olea europaea frui...,organika,органика
3,200485,19000046442,catnip chaser,petstages,standard,игрушка для животных,,,,сша,...,19000046442,200485,,False,False,игрушка трек с пластиковым мячиком тубом кошач...,подбирайте игрушки в соответствии с весом и дв...,пластик,tovary-dlja-zhivotnyh,товары для животных
4,202556,19000025382,salt facial scrub original,kosette,standard,скраб,унисекс,очищение,лицо,,...,19000025382,202556,,False,True,нежный скраб,нанесите на чистую и влажную кожу затем аккура...,glycerin sea salt water silica cocoglucoside s...,azija,азия


# Генерация эмбеддингов

In [124]:
def lemmatization(data):
    """Lemmatize every whitespace-separated word of a text.

    Args:
        data: Text to lemmatize, or a missing value (``None``, ``np.nan``,
            ``pd.NA`` or any other float NaN).

    Returns:
        The lemmatized words joined by single spaces, or ``''`` when ``data``
        is missing.
    """
    # ``pd.isna`` instead of ``data is np.nan``: the identity check only
    # matches the ``np.nan`` singleton, so other NaN objects / None would reach
    # ``.split()`` and raise AttributeError.
    if pd.isna(data):
        return ''
    # One lemmatizer per call rather than a new instance for every word.
    # NOTE(review): WordNet is an English lexical resource, while the product
    # texts shown in this notebook are largely Russian — confirm this
    # lemmatizer is the intended one.
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in data.split())

Загрузим RuBERT для генерации эмбеддингов

In [13]:
# Model architecture description stored next to the weights.
config = transformers.BertConfig.from_json_file(
    '../model/bert_config.json')
# Prefer the first CUDA device, but fall back to the CPU so this cell does not
# crash on a machine without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = transformers.BertModel.from_pretrained(
    '../model/pytorch_model.bin', config=config).to(device)

Some weights of the model checkpoint at ../model/pytorch_model.bin were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
# Number of texts passed through the model per forward pass.
batch_size = 1
# Send inputs to whichever device the model already lives on instead of
# assuming 'cuda:0'.
device = model.device

# `text_columns`, `data`, `tokenizer` and `model` come from earlier cells.
for column in text_columns:
    # Missing texts are encoded as empty strings.
    text = data[column].fillna('')
    # lemmas = data[column].apply(lemmatization)
    # truncation=True makes the 512-token cap explicit; passing max_length
    # alone relies on a deprecated implicit truncation and emits a warning.
    vector = text.apply(lambda x: tokenizer.encode(
        x, add_special_tokens=True, max_length=512, truncation=True))
    # Right-pad every token-id list with 0 up to the longest list.
    n = len(max(vector, key=len))
    padded = np.array([i + [0]*(n - len(i)) for i in vector.values])

    # Attention mask: 1 for real tokens, 0 for padding.
    attention_mask = np.where(padded != 0, 1, 0)

    embeddings = []
    # Iterate over batch start offsets so the last, possibly shorter, batch is
    # processed too; `range(len // batch_size)` silently dropped it whenever
    # batch_size did not divide the number of rows.
    for start in tqdm(range(0, padded.shape[0], batch_size)):
        stop = start + batch_size
        # Token ids for this batch.
        batch = torch.LongTensor(padded[start:stop]).to(device)
        # Matching attention mask.
        attention_mask_batch = torch.LongTensor(attention_mask[start:stop]).to(device)
        with torch.no_grad():
            batch_embeddings = model(batch, attention_mask=attention_mask_batch)

        # First model output, position 0 of every sequence (the [CLS] slot for
        # BERT-style inputs), moved to the CPU as a numpy array.
        embeddings.append(batch_embeddings[0][:, 0, :].cpu().numpy())

    # One row per text, written without the index column.
    features = pd.DataFrame(np.concatenate(embeddings))
    features.to_csv(f'../data/embedded_{column}_no_lemmas', index=False)


  0%|          | 0/40559 [00:00<?, ?it/s]

  0%|          | 0/40559 [00:00<?, ?it/s]

  0%|          | 0/40559 [00:00<?, ?it/s]

In [16]:
# Number of texts passed through the model per forward pass.
batch_size = 1
# Send inputs to whichever device the model already lives on instead of
# assuming 'cuda:0'.
device = model.device

# Build one combined text per product from the three free-text fields.
# Missing fields are skipped and the rest are joined with a single space.
# Plain `d + pu + pc` glued the last word of one field to the first word of
# the next, producing tokens that exist in neither text.
# NOTE: embeddings generated after this fix differ from files written earlier.
combined_columns = ['description', 'product_usage', 'product_composition']
data['3_in_1'] = [
    ' '.join(part for part in row if not pd.isna(part))
    for row in data[combined_columns].values
]
text = data['3_in_1']
# lemmas = data['3_in_1'].apply(lemmatization)
# truncation=True makes the 512-token cap explicit; passing max_length alone
# relies on a deprecated implicit truncation and emits a warning.
vector = text.apply(lambda x: tokenizer.encode(
    x, add_special_tokens=True, max_length=512, truncation=True))
# Right-pad every token-id list with 0 up to the longest list.
n = len(max(vector, key=len))
padded = np.array([i + [0]*(n - len(i)) for i in vector.values])

# Attention mask: 1 for real tokens, 0 for padding.
attention_mask = np.where(padded != 0, 1, 0)

embeddings = []
# Iterate over batch start offsets so the last, possibly shorter, batch is
# processed too; `range(len // batch_size)` silently dropped it whenever
# batch_size did not divide the number of rows.
for start in tqdm(range(0, padded.shape[0], batch_size)):
    stop = start + batch_size
    # Token ids for this batch.
    batch = torch.LongTensor(padded[start:stop]).to(device)
    # Matching attention mask.
    attention_mask_batch = torch.LongTensor(attention_mask[start:stop]).to(device)
    with torch.no_grad():
        batch_embeddings = model(batch, attention_mask=attention_mask_batch)

    # First model output, position 0 of every sequence (the [CLS] slot for
    # BERT-style inputs), moved to the CPU as a numpy array.
    embeddings.append(batch_embeddings[0][:, 0, :].cpu().numpy())

features = pd.DataFrame(np.concatenate(embeddings))

# NOTE(review): the per-column cell writes files with the suffix `_no_lemmas`
# while this one uses `_no_lemma` — confirm which name downstream code expects.
features.to_csv('../data/embedded_3_in_1_no_lemma', index=False)

  0%|          | 0/40559 [00:00<?, ?it/s]

In [13]:
# Preview the first five rows of the cached description embeddings.
embedded_description.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.119271,0.136464,-0.008355,0.026413,0.099878,0.022849,-0.137604,0.196135,0.13059,0.283869,...,0.015723,0.044034,-0.101652,-0.010436,-0.07342,0.046543,0.078297,-0.060893,0.071176,-0.12658
1,0.438349,-0.415116,0.85278,-0.137081,-0.502088,0.008114,-0.632642,-0.282202,-0.164272,-0.228508,...,0.223047,0.003482,-0.43483,-0.219574,-0.037204,0.345793,-0.080911,0.495437,0.50058,0.012712
2,0.065119,0.017991,0.075553,-0.02885,0.134829,0.002659,0.054221,0.103911,0.126278,0.249699,...,0.001731,0.025728,-0.057699,0.003478,0.028238,0.210666,-0.004904,0.004242,0.091198,-0.19705
3,0.018086,-0.016375,0.492378,-0.407476,-0.094135,-0.257374,-0.079157,0.205838,-0.196295,-0.033481,...,0.098162,-0.044465,-0.297261,-0.297032,-0.079317,-0.214961,0.618069,0.343269,-0.123517,-0.134644
4,0.177492,0.092044,0.084529,-0.081504,0.267774,0.140675,-0.129266,0.110915,0.39659,0.027469,...,-0.055115,0.156429,0.088761,0.14044,-0.070732,0.089746,0.000989,-0.047724,0.266879,-0.184299
