In [2]:
#Basic libraries
import pandas as pd 
import numpy as np 

#Visualization libraries
import matplotlib.pyplot as plt 
import seaborn as sns

#NLTK libraries
import nltk
import string
import pymorphy2
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
nltk.download('stopwords')
nltk.download('punkt')


# Machine Learning libraries
import sklearn 
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


#Metrics libraries
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score, f1_score

import joblib

[nltk_data] Downloading package stopwords to /home/artem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/artem/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [1]:
import datasets
from datasets import DatasetDict, Dataset

from transformers import AutoTokenizer, AutoModel

import torch

import umap.umap_ as umap
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

In [3]:
df_categories_tree = pd.read_csv('categories_tree.csv')

In [4]:
df_categories_tree.head(5)

Unnamed: 0,id,title,parent_id
0,1,Все категории,0
1,114,Урбеч,1913
2,115,Варенье и джемы,328
3,128,Сухие завтраки,2475
4,131,Масла,2475


In [5]:
df_train = pd.read_parquet('train.parquet')  

In [6]:
df_train.head(5)

Unnamed: 0,id,title,short_description,name_value_characteristics,rating,feedback_quantity,category_id
0,1267423,Muhle Manikure Песочные колпачки для педикюра ...,Muhle Manikure Колпачок песочный шлифовальный ...,,0.0,0,2693
1,128833,"Sony Xperia L1 Защитное стекло 2,5D",,,4.666667,9,13408
2,569924,"Конверт для денег Прекрасная роза, 16,5 х 8 см","Конверт для денег «Прекрасная роза», 16,5 × 8 см",,5.0,6,11790
3,1264824,Серьги,,,0.0,0,14076
4,1339052,Наклейки на унитаз для туалета на крышку бачок...,"Водостойкая, интересная наклейка на унитаз раз...",,0.0,0,12401


In [7]:
df_test = pd.read_parquet('test.parquet')

In [8]:
df_test.head()

Unnamed: 0,id,title,short_description,name_value_characteristics,rating,feedback_quantity
0,1070974,Браслет из натуральных камней LOTUS,,,0.0,0
1,450413,Fusion Life - Шампунь для сухих и окрашенных в...,,,4.333333,6
2,126857,"Микрофон для ПК jack 3,5мм всенаправленный","универсальный 3,5 мм микрофон запишет ваш звук",,3.708333,24
3,1577569,Серьги гвоздики сердце,Серьги гвоздики сердце,,0.0,0
4,869328,"Чёрно-красная стильная брошь ""Тюльпаны"" из акр...",Стильная и яркая брошь ручной работы! Великоле...,,0.0,0


In [9]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283452 entries, 0 to 283451
Data columns (total 7 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   id                          283452 non-null  int64  
 1   title                       283452 non-null  object 
 2   short_description           133130 non-null  object 
 3   name_value_characteristics  50360 non-null   object 
 4   rating                      283452 non-null  float64
 5   feedback_quantity           283452 non-null  int64  
 6   category_id                 283452 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 15.1+ MB


In [10]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70864 entries, 0 to 70863
Data columns (total 6 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          70864 non-null  int64  
 1   title                       70864 non-null  object 
 2   short_description           33346 non-null  object 
 3   name_value_characteristics  12576 non-null  object 
 4   rating                      70864 non-null  float64
 5   feedback_quantity           70864 non-null  int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 3.2+ MB


In [11]:
#checking the null in data
df_train.isnull().sum() 

id                                 0
title                              0
short_description             150322
name_value_characteristics    233092
rating                             0
feedback_quantity                  0
category_id                        0
dtype: int64

In [12]:
df_test.isnull().sum() 

id                                0
title                             0
short_description             37518
name_value_characteristics    58288
rating                            0
feedback_quantity                 0
dtype: int64

In [9]:
df_train['category_id'].nunique()

1231

In [10]:
df_train['category_id'].value_counts()

11937    14967
13408     7153
13061     6434
13143     6145
13253     3390
         ...  
13756        2
13007        2
2598         2
11917        2
13787        2
Name: category_id, Length: 1231, dtype: int64

In [11]:
cat_ids_list = sorted(list(set(df_train['category_id'])))
df_train['category_id_label'] = df_train['category_id'].apply(lambda cat_id: cat_ids_list.index(cat_id))

In [12]:
cat_ids_list[36]

2693

# Extract all root-to-leaf paths

In [13]:
import networkx as nx

# build the graph
G = nx.from_pandas_edgelist(df_categories_tree, source='parent_id', target='id',
                            create_using=nx.DiGraph)
# map id to name
node_names = df_categories_tree.set_index('id')['title'].to_dict()

# get path from root (0) to the node
def get_path(node):
    # this is a tree, so exactly one simple path for each node
    for path in nx.simple_paths.all_simple_paths(G, 0, node):
        return [node_names.get(i) for i in path[1:]]

df_categories_tree['path'] = df_categories_tree['id'].apply(get_path)

In [14]:
df_train = df_train.drop('id',axis=1)

In [15]:
df_train = pd.merge(df_train, df_categories_tree[['id','path']], how = 'left', 
         left_on = 'category_id', right_on = 'id').drop('id',axis=1)

In [16]:
df_train.head()

Unnamed: 0,title,short_description,name_value_characteristics,rating,feedback_quantity,category_id,category_id_label,path
0,Muhle Manikure Песочные колпачки для педикюра ...,Muhle Manikure Колпачок песочный шлифовальный ...,,0.0,0,2693,36,"[Все категории, Красота, Маникюр и педикюр, Ин..."
1,"Sony Xperia L1 Защитное стекло 2,5D",,,4.666667,9,13408,856,"[Все категории, Электроника, Смартфоны и телеф..."
2,"Конверт для денег Прекрасная роза, 16,5 х 8 см","Конверт для денег «Прекрасная роза», 16,5 × 8 см",,5.0,6,11790,207,"[Все категории, Товары для дома, Товары для пр..."
3,Серьги,,,0.0,0,14076,1116,"[Все категории, Аксессуары, Женские аксессуары..."
4,Наклейки на унитаз для туалета на крышку бачок...,"Водостойкая, интересная наклейка на унитаз раз...",,0.0,0,12401,460,"[Все категории, Товары для дома, Декор и интер..."


In [17]:
df_train['short_description'] = df_train['short_description'].fillna("")

In [18]:
# Concatenate title and short_description columns 
df_train['title+short_description'] = df_train['title'] + " " + df_train['short_description']

# Features and Label

In [19]:
X = df_train['title+short_description']
y = df_train['category_id_label']

# Train Test Split

In [21]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=101)

# Text preprocessing

In [22]:
def preprocess(text):
    
    # Lowercase
    text = text.lower()
    
    # Removing Punctuation
    text_p = text.translate(str.maketrans('', '', string.punctuation + '«»'))
    
    return text_p 

In [23]:
X_train_preprocessed = X_train.apply(preprocess)
X_val_preprocessed = X_val.apply(preprocess)

# Vectorization

In [24]:
vectorization = TfidfVectorizer()
X_train_tfidf = vectorization.fit_transform(X_train_preprocessed)
X_val_tfidf = vectorization.transform(X_val_preprocessed)

# LogisticRegression (multinomial)

In [23]:
log = LogisticRegression(solver='saga', multi_class='multinomial',verbose=10, max_iter=1000)
log.fit(X_train_tfidf,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 25 epochs took 590 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  9.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  9.9min finished


LogisticRegression(max_iter=1000, multi_class='multinomial', solver='saga',
                   verbose=10)

In [25]:
# save the model
filename = 'log_reg_model1.sav'

In [None]:
joblib.dump(log, filename)

# Performance Evaluation

In [27]:
loaded_model = joblib.load(filename)

In [28]:
preds_labels = loaded_model.predict(X_val_tfidf)

In [29]:
preds = []
for label in preds_labels:
    preds.append(cat_ids_list[label])

In [30]:
y_val_cat = []
for i in y_val:
    y_val_cat.append(cat_ids_list[i])

In [31]:
print(classification_report(y_val_cat,preds))

              precision    recall  f1-score   support

        2598       0.00      0.00      0.00         1
        2599       0.93      0.99      0.96        77
        2600       1.00      0.70      0.82        10
        2601       0.55      0.38      0.44        64
        2602       0.78      0.78      0.78        27
        2603       0.00      0.00      0.00         1
        2604       0.00      0.00      0.00         2
        2605       1.00      0.17      0.30        23
        2607       0.84      0.81      0.82       109
        2608       0.60      0.25      0.35        12
        2610       0.83      0.93      0.88       160
        2631       0.94      0.84      0.89       108
        2632       0.96      0.94      0.95        50
        2633       0.00      0.00      0.00         6
        2634       0.62      0.98      0.76       163
        2635       1.00      0.57      0.73        28
        2636       0.86      0.89      0.88        64
        2662       0.93    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Calculation of hierarchical metrics 

In [32]:
results = pd.DataFrame(preds,columns = ['Predicted Label'])

In [33]:
results['Predicted Path'] = results['Predicted Label'].apply(get_path)

In [34]:
results['True Label'] = y_val_cat

In [35]:
results['True Path'] = results['True Label'].apply(get_path)

In [36]:
results

Unnamed: 0,Predicted Label,Predicted Path,True Label,True Path
0,12171,"[Все категории, Электроника, Смартфоны и телеф...",12171,"[Все категории, Электроника, Смартфоны и телеф..."
1,13495,"[Все категории, Товары для дома, Товары для пр...",13495,"[Все категории, Товары для дома, Товары для пр..."
2,13408,"[Все категории, Электроника, Смартфоны и телеф...",13408,"[Все категории, Электроника, Смартфоны и телеф..."
3,12128,"[Все категории, Одежда, Женская одежда, Шорты ...",12128,"[Все категории, Одежда, Женская одежда, Шорты ..."
4,12049,"[Все категории, Электроника, Аксессуары для эл...",12171,"[Все категории, Электроника, Смартфоны и телеф..."
...,...,...,...,...
56686,12781,"[Все категории, Электроника, Смартфоны и телеф...",12781,"[Все категории, Электроника, Смартфоны и телеф..."
56687,14434,"[Все категории, Красота, Парфюмерия, Миниатюры...",14434,"[Все категории, Красота, Парфюмерия, Миниатюры..."
56688,13143,"[Все категории, Одежда, Женская одежда, Колгот...",13143,"[Все категории, Одежда, Женская одежда, Колгот..."
56689,2895,"[Все категории, Товары для дома, Мебель, Пуфик...",2895,"[Все категории, Товары для дома, Мебель, Пуфик..."


In [37]:
categories = df_categories_tree.set_index('id')
categories

Unnamed: 0_level_0,title,parent_id,path
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Все категории,0,[Все категории]
114,Урбеч,1913,"[Все категории, Продукты питания, Здоровое пит..."
115,Варенье и джемы,328,"[Все категории, Продукты питания, Мед, варенье..."
128,Сухие завтраки,2475,"[Все категории, Продукты питания, Бакалея, Сух..."
131,Масла,2475,"[Все категории, Продукты питания, Бакалея, Масла]"
...,...,...,...
14555,Насадки и запчасти,11691,"[Все категории, Бытовая техника, Товары для ку..."
14556,Швейные машины,10062,"[Все категории, Бытовая техника, Техника для д..."
14557,Матрасы,2894,"[Все категории, Товары для дома, Мебель, Матрасы]"
14558,Ледянки и тюбинги,10092,"[Все категории, Спорт и отдых, Зимний спорт, Л..."


In [38]:
# Calculate hierarchical metrics for every category_id
cat_ids = list(results.groupby('True Label').groups.keys())

dict_with_f1 = {}

for cat_id in cat_ids:
    
    cat_id_group_df = results[(results['True Label'] == cat_id) | (results['Predicted Label'] == cat_id)].copy()
    
    true_path_set = set(categories['path'].loc[cat_id])
    func_1 = lambda v: len(set(v).intersection(true_path_set))
    func_2 = lambda t, p: len(set(t).intersection(set(p)))
    
    cat_id_group_df['|Ti|'] = cat_id_group_df['True Path'].apply(func_1)
    cat_id_group_df['|Pi|'] = cat_id_group_df['Predicted Path'].apply(func_1)
    cat_id_group_df['|Pi Ω Ti|'] = cat_id_group_df[['True Path', 'Predicted Path']].apply(lambda x: func_2(*x), 
                                                                                          axis=1)
    
    hP = (cat_id_group_df['|Pi Ω Ti|'].sum()) / (cat_id_group_df['|Pi|'].sum())
    hR = (cat_id_group_df['|Pi Ω Ti|'].sum()) / (cat_id_group_df['|Ti|'].sum())
    hF = (2*hP*hR)/(hP+hR)
    
    dict_with_f1[cat_id] = hF

In [39]:
len(dict_with_f1.keys())

1146

In [40]:
hf1_per_category = pd.DataFrame.from_dict({ "category_id": list(dict_with_f1.keys()),
  "hF1": list(dict_with_f1.values())})

In [41]:
hf1_per_category

Unnamed: 0,category_id,hF1
0,2598,0.888889
1,2599,0.980344
2,2600,0.936170
3,2601,0.841379
4,2602,0.952381
...,...,...
1141,14551,0.816901
1142,14552,0.829268
1143,14553,0.933333
1144,14557,0.666667


In [42]:
items_per_category = results.groupby('True Label').count()

In [43]:
items_per_category = items_per_category.reset_index()

In [44]:
df_for_cal_of_weighted_hF1 = pd.merge(hf1_per_category, items_per_category[['True Label','Predicted Label']], how = 'left', 
         left_on = 'category_id', right_on = 'True Label').drop('True Label',axis=1)
df_for_cal_of_weighted_hF1.rename(columns = {'Predicted Label':'count'}, inplace = True)

In [45]:
df_for_cal_of_weighted_hF1

Unnamed: 0,category_id,hF1,count
0,2598,0.888889,1
1,2599,0.980344,77
2,2600,0.936170,10
3,2601,0.841379,64
4,2602,0.952381,27
...,...,...,...
1141,14551,0.816901,16
1142,14552,0.829268,4
1143,14553,0.933333,4
1144,14557,0.666667,1


In [46]:
weighted_hF1 = (df_for_cal_of_weighted_hF1['count'] * df_for_cal_of_weighted_hF1['hF1']).sum() / len(results)
print('Weighted hierarchical F1 is equal to', f'{weighted_hF1*100:.2f}%')

Weighted hierarchical F1 is equal to 92.42%


# Predictions 

In [47]:
df_test['short_description'] = df_test['short_description'].fillna("")
df_test['title+short_description'] = df_test['title'] + df_test['short_description']

In [48]:
X_test = df_test['title+short_description'].apply(preprocess)

In [49]:
X_test_tfidf = vectorization.transform(X_test)

In [50]:
X_test_tfidf

<70864x127245 sparse matrix of type '<class 'numpy.float64'>'
	with 568452 stored elements in Compressed Sparse Row format>

In [52]:
predictions = loaded_model.predict(X_test_tfidf)

In [53]:
predictions

array([ 130,  242,  809, ...,  820,  721, 1098])

In [54]:
final_predictions = []
for pred in predictions:
    final_predictions.append(cat_ids_list[pred])

In [55]:
df_test['predicted_category_id'] = final_predictions

In [56]:
result_table = df_test[['id','predicted_category_id']]
result_table

Unnamed: 0,id,predicted_category_id
0,1070974,11574
1,450413,11878
2,126857,13299
3,1577569,13061
4,869328,12813
...,...,...
70859,967535,13143
70860,1488636,12350
70861,827510,13324
70862,529244,13069


In [57]:
result_table.to_parquet('result.parquet',index=False)

In [58]:
pd.read_parquet('result.parquet')  

Unnamed: 0,id,predicted_category_id
0,1070974,11574
1,450413,11878
2,126857,13299
3,1577569,13061
4,869328,12813
...,...,...
70859,967535,13143
70860,1488636,12350
70861,827510,13324
70862,529244,13069


# TRANSFORMERS

## The Dataset

In [59]:
train = pd.DataFrame(zip(X_train, y_train), columns=[X_train.name, "labels"])
val = pd.DataFrame(zip(X_val, y_val), columns=[X_val.name, "labels"])

# Create a DatasetDict object and add to it train, validation and test datasets 
dataset = DatasetDict()
dataset["train"] = Dataset.from_pandas(train)
dataset["validation"] = Dataset.from_pandas(val)

In [60]:
dataset

DatasetDict({
    train: Dataset({
        features: ['title+short_description', 'labels'],
        num_rows: 226761
    })
    validation: Dataset({
        features: ['title+short_description', 'labels'],
        num_rows: 56691
    })
})

## Tokenization of the dataset

In [61]:
model_ckpt = "bert-base-multilingual-uncased"

In [62]:
# Load the tokenizer associated with a pretrained model. We use BERT base multilingual model (uncased)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [63]:
# Check the model's maximum content size
tokenizer.model_max_length

512

In [64]:
# Check the names of the fields that the model expects in its forward pass
tokenizer.model_input_names

['input_ids', 'token_type_ids', 'attention_mask']

In [65]:
# Create a function that applies the tokenizer to a batch of examples,padding=True will pad the examples with 
# zeros to the size of the longest one in a batch, and truncation=True will truncate the examples to the model’s 
# maximum context size

def tokenize(batch):
    return tokenizer(batch["title+short_description"], padding=True, truncation=True)

In [66]:
# We can apply this function across all dataset
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [67]:
# This operation has added new input_ids, attention_mask and token_type_ids columns to the dataset

print(dataset_encoded["train"].column_names)

['title+short_description', 'labels', 'input_ids', 'token_type_ids', 'attention_mask']


## Training a Text Classifier

We train a model with fine-tuning: end-to-end, which also updates the parameters of the pretrained model.

### Fine-Tuning Transformers

In [None]:
import os
os.environ["WANDB_START_METHOD"] = "thread"

In [72]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_labels = 1231
model = (AutoModelForSequenceClassification
         .from_pretrained('bert-base-multilingual-uncased', num_labels=num_labels)
         .to(device))

In [73]:
# We will monitor the F1-score and the accuracy of the model during training. 

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [76]:
batch_size = 64
logging_steps = len(dataset_encoded["train"]) // batch_size
model_name = f"{model_ckpt}-finetuned-items-classification"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=1,
                                  learning_rate=2e-5,
                                  lr_scheduler_type="linear",
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="eval_loss",
                                  greater_is_better=False,
                                  weight_decay=0.01,
                                  save_strategy="epoch",
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  log_level="error",
                                  report_to="none",
                                  push_to_hub=False)

In [77]:
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=dataset_encoded["train"],
                  eval_dataset=dataset_encoded["validation"],
                  tokenizer=tokenizer)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,3.1921,2.133701,0.661675,0.581314


TrainOutput(global_step=3544, training_loss=3.191546302112177, metrics={'train_runtime': 2776.7598, 'train_samples_per_second': 81.664, 'train_steps_per_second': 1.276, 'total_flos': 2.1913739902746756e+16, 'train_loss': 3.191546302112177, 'epoch': 1.0})

In [78]:
preds_output = trainer.predict(dataset_encoded['validation'])

In [79]:
preds_output.metrics

{'test_loss': 2.1337006092071533,
 'test_accuracy': 0.6616746926319874,
 'test_f1': 0.5813144041747497,
 'test_runtime': 167.946,
 'test_samples_per_second': 337.555,
 'test_steps_per_second': 5.276}

Due to limited compute power, only one training epoch was completed. Only 1 epoch of training of the transformer-based model resulted in a worse result in terms of F1 score compared to the first approach (Tf-idf + Miltinomial Logistic Regression). Therefore, it was decided to choose the first approach for predic of the test set.  