In [2]:
import pathlib
import pandas as pd
import numpy as np
import sys

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import (
    train_test_split,
    cross_validate,
    cross_val_predict,
)

from sklearn.metrics import (
    f1_score,
    accuracy_score,
    classification_report,
)

# Notebook-wide configuration. Paths are resolved from the kernel's current
# working directory, so the notebook must be started from its own folder.
RANDOM_SEED = 42
ROOT_DIR = pathlib.Path.cwd()
DATA_DIR = ROOT_DIR / ".." / ".." / "data"

## Data load and review

In [3]:
# Read the two input tables from DATA_DIR: the trend description table and
# the training table (text, tags and the trend_id_res* label columns).
trends_path = DATA_DIR / "trends_description.csv"
train_path = DATA_DIR / "train.csv"
df_trends = pd.read_csv(trends_path)
df = pd.read_csv(train_path)

In [4]:
# Preview the first five rows of the training frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,assessment,tags,text,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,0,5652,6.0,"{ASSORTMENT,PROMOTIONS,DELIVERY}","Маленький выбор товаров, хотелось бы ассортиме...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,18092,4.0,"{ASSORTMENT,PRICE,PRODUCTS_QUALITY,DELIVERY}",Быстро,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,13845,6.0,"{DELIVERY,PROMOTIONS,PRICE,ASSORTMENT,SUPPORT}",Доставка постоянно задерживается,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,25060,6.0,"{PRICE,PROMOTIONS,ASSORTMENT}",Наценка и ассортимент расстраивают,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1428,6.0,"{PRICE,PROMOTIONS}",Можно немного скинуть минимальную сумму заказа...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Model training

### Data preprocessing

In [5]:
# Show the first five training rows again before the preprocessing steps.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,assessment,tags,text,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,0,5652,6.0,"{ASSORTMENT,PROMOTIONS,DELIVERY}","Маленький выбор товаров, хотелось бы ассортиме...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,18092,4.0,"{ASSORTMENT,PRICE,PRODUCTS_QUALITY,DELIVERY}",Быстро,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,13845,6.0,"{DELIVERY,PROMOTIONS,PRICE,ASSORTMENT,SUPPORT}",Доставка постоянно задерживается,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,25060,6.0,"{PRICE,PROMOTIONS,ASSORTMENT}",Наценка и ассортимент расстраивают,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1428,6.0,"{PRICE,PROMOTIONS}",Можно немного скинуть минимальную сумму заказа...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Build the string that is fed to the sentence encoder: the `text` column,
# then the literal "\ntags: ", then the `tags` column.
# Missing values are replaced with '' first, mirroring the `.fillna('')`
# applied to test.csv, so that a NaN in either column cannot turn the whole
# concatenated string into NaN and train/test inputs are built the same way.
df['emb'] = df['text'].fillna('') + '\ntags: ' + df['tags'].fillna('')

In [7]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model by its hub identifier.
ENCODER_NAME = "deepvk/USER-base"
model = SentenceTransformer(ENCODER_NAME)

In [9]:
# Encode every combined text/tags string; the result has one row per
# training sample.
emb_texts = df['emb'].to_list()
vectors = model.encode(emb_texts)

In [10]:
# Features are the sentence embeddings; targets are the 50 trend_id_res* columns.
target_columns = [f"trend_id_res{i}" for i in range(50)]
X = vectors
y = df[target_columns]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

# Report the shape of each part of the split.
print(f"X_train.shape is {X_train.shape}")
print(f"y_train.shape is {y_train.shape}")
print(f"X_test.shape is {X_test.shape}")
print(f"y_test.shape is {y_test.shape}")

X_train.shape is (3698, 768)
y_train.shape is (3698, 50)
X_test.shape is (925, 768)
y_test.shape is (925, 50)


### Checking quality on train data

In [13]:
from catboost import CatBoostClassifier

In [37]:
# Sanity check: (rows, embedding dimensions) of the training features.
X_train.shape

(3698, 768)

In [38]:
from catboost import Pool

# Wrap both parts of the split in CatBoost pools (features + label matrix).
train_pool = Pool(data=X_train, label=y_train)
test_pool = Pool(data=X_test, label=y_test)

In [54]:
# Fit a multi-label CatBoost model on the training pool and track the metric
# on the held-out pool after every iteration.
# NOTE(review): the saved log below runs to iteration 249 although
# `iterations=200` is requested here, so that output presumably comes from an
# earlier version of this cell — re-run to refresh it.
# NOTE(review): with `eval_set` given, CatBoost may keep the iteration that
# scores best on that set; since the set is the test pool, treat the reported
# "best" test score as optimistic — confirm against `use_best_model`.
clf = CatBoostClassifier(
    loss_function='MultiLogloss',
    eval_metric='Accuracy',
    class_names=[f"trend_id_res{i}" for i in range(50)],
    iterations=200,
    # Pin the seed explicitly (same constant as the data split) so the run
    # does not depend on the library's default seed.
    random_seed=RANDOM_SEED,
)
clf.fit(train_pool, eval_set=test_pool, metric_period=1, plot=True, verbose=50)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.080001
0:	learn: 0.0016225	test: 0.0000000	best: 0.0000000 (0)	total: 2.74s	remaining: 11m 22s
50:	learn: 0.2174148	test: 0.1578378	best: 0.1578378 (50)	total: 2m 4s	remaining: 8m 7s
100:	learn: 0.6406165	test: 0.2583784	best: 0.2583784 (99)	total: 4m 6s	remaining: 6m 3s
150:	learn: 0.8585722	test: 0.3167568	best: 0.3167568 (145)	total: 6m 5s	remaining: 3m 59s
200:	learn: 0.9515955	test: 0.3178378	best: 0.3189189 (195)	total: 8m 4s	remaining: 1m 58s
249:	learn: 0.9851271	test: 0.3308108	best: 0.3308108 (249)	total: 10m	remaining: 0us

bestTest = 0.3308108108
bestIteration = 249



<catboost.core.CatBoostClassifier at 0x756385966890>

In [56]:
# Predict labels for the training split itself; this measures fit on data
# the model has already seen, not generalisation.
y_pred = clf.predict(data=X_train)

In [57]:
# Accuracy of the training-split predictions.
# NOTE(review): with 2-D label targets this should be sklearn's exact-match
# accuracy over whole label rows — confirm.
accuracy_score(y_true=y_train, y_pred=y_pred)

0.9851270957274202

### Training the final model

In [14]:
from catboost import Pool

# Pool over the complete labelled data set (no hold-out) for the final fit.
pool = Pool(data=X, label=y)

In [15]:
# Train the final model on the full labelled pool; no evaluation set is
# passed, so only the learn metric is logged (every 10th iteration).
clf = CatBoostClassifier(
    loss_function='MultiLogloss',
    eval_metric='Accuracy',
    class_names=[f"trend_id_res{i}" for i in range(50)],
    iterations=200,
    # Pin the seed explicitly (same constant as the data split) so the run
    # does not depend on the library's default seed.
    random_seed=RANDOM_SEED,
)
clf.fit(pool, metric_period=1, plot=True, verbose=10)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.086659
0:	learn: 0.0015142	total: 2.67s	remaining: 8m 50s
10:	learn: 0.0129786	total: 27.3s	remaining: 7m 48s
20:	learn: 0.0523470	total: 51.7s	remaining: 7m 20s
30:	learn: 0.1111832	total: 1m 16s	remaining: 6m 56s
40:	learn: 0.1784555	total: 1m 41s	remaining: 6m 32s
50:	learn: 0.2437811	total: 2m 5s	remaining: 6m 7s
60:	learn: 0.3184080	total: 2m 30s	remaining: 5m 42s
70:	learn: 0.3984426	total: 2m 55s	remaining: 5m 17s
80:	learn: 0.4847502	total: 3m 19s	remaining: 4m 53s
90:	learn: 0.5608912	total: 3m 44s	remaining: 4m 29s
100:	learn: 0.6303266	total: 4m 9s	remaining: 4m 4s
110:	learn: 0.6882976	total: 4m 33s	remaining: 3m 39s
120:	learn: 0.7348042	total: 4m 58s	remaining: 3m 14s
130:	learn: 0.7709280	total: 5m 22s	remaining: 2m 49s
140:	learn: 0.8051049	total: 5m 46s	remaining: 2m 24s
150:	learn: 0.8362535	total: 6m 10s	remaining: 2m
160:	learn: 0.8671858	total: 6m 34s	remaining: 1m 35s
170:	learn: 0.8870863	total: 6m 58s	remaining: 1m 11s
180:	learn: 0.904391

<catboost.core.CatBoostClassifier at 0x765cd0205870>

### Prediction and saving the submission

In [16]:
# Load the test table; missing values become empty strings so the later
# string concatenation of `text` and `tags` never produces NaN.
test_path = DATA_DIR / "test.csv"
test = pd.read_csv(test_path).fillna('')

In [17]:
# Preview the first five rows of the test frame.
test.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,assessment,tags,text
0,1,3135,3.0,{DELIVERY},"Последнее время думаю плохо, сроки доставки да..."
1,3,4655,2.0,"{PRICE,DELIVERY,ASSORTMENT}",Цены намного выше магазинных но радуют акции
2,5,22118,2.0,"{CATALOG_NAVIGATION,ASSORTMENT,DELIVERY}","Доставка за [NUM] минут, заказ даже не начали ..."
3,7,23511,0.0,{DELIVERY},Ужасно долгая доставка
4,8,45,6.0,"{ASSORTMENT,PROMOTIONS}",Добрый вечер! Вы большие молодцы. Меня всё уст...


In [18]:
# Preview the combined text/tags strings that will be passed to the encoder.
test_emb_preview = test['text'] + '\ntags: ' + test['tags']
test_emb_preview

0       Последнее время думаю плохо, сроки доставки да...
1       Цены намного выше магазинных но радуют акции\n...
2       Доставка за [NUM] минут, заказ даже не начали ...
3                Ужасно долгая доставка\ntags: {DELIVERY}
4       Добрый вечер! Вы большие молодцы. Меня всё уст...
                              ...                        
9010    Задержка с доставкой не даете промокод на скид...
9011    Очень удобный формат сервиса и очень маленький...
9012    Сумма заказа почти всегда высокая, что зачасту...
9013    Часто, заказываю у вас молочную продукцию, при...
9014    Изначально подкупало то, что заказ от [NUM] ру...
Length: 9015, dtype: object

In [19]:
# Inspect column dtypes and non-null counts of the test frame (as loaded with
# missing values already replaced by '').
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9015 entries, 0 to 9014
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  9015 non-null   int64  
 1   index       9015 non-null   int64  
 2   assessment  9015 non-null   float64
 3   tags        9015 non-null   object 
 4   text        9015 non-null   object 
dtypes: float64(1), int64(2), object(2)
memory usage: 352.3+ KB


In [20]:
# Encode the test rows with the same "text + tags" format used for training.
test_texts = (test['text'] + '\ntags: ' + test['tags']).to_list()
test_ve = model.encode(test_texts)

In [21]:
# Predict the trend labels for every encoded test row.
pred_test = clf.predict(data=test_ve)

In [22]:
# Assemble the result frame: the original `index` column as the first
# column, followed by the 50 predicted trend columns.
index_column = test["index"].values.reshape(-1, 1)
trend_columns = [f"trend_id_res{i}" for i in range(50)]
res = pd.DataFrame(
    np.hstack([index_column, pred_test]),
    columns=["index", *trend_columns],
)

In [23]:
# Preview the first five rows of the prediction frame.
res.head(n=5)

Unnamed: 0,index,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,trend_id_res5,trend_id_res6,trend_id_res7,trend_id_res8,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,3135,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4655,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22118,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,23511,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,45,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# For every row, concatenate " <id>" for each trend column whose value is
# non-zero (e.g. " 3 17"); rows with no predicted trend stay "".
# The first `+=` turns the empty string into a Series via element-wise
# string concatenation.
target = ""
for trend_id in range(50):
    suffix = " " + str(trend_id)
    flags = res[f'trend_id_res{trend_id}']
    # `suffix` is bound as a default argument so each lambda carries its own id.
    target += flags.apply(lambda val, suffix=suffix: "" if val == 0 else suffix)

In [25]:
# Drop the leading space left by the " <id>" concatenation.
target = target.str.strip()

In [26]:
# Attach the space-separated trend-id strings to the result frame as the
# `target` column.
res['target'] = target

In [27]:
# Write only the id and the predicted trend list; the DataFrame's positional
# index is left out of the file.
submission = res[['index', 'target']]
submission.to_csv("submission2.csv", index=False)