In [1]:
import pathlib
import pandas as pd
import numpy as np
import sys

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import (
    train_test_split,
    cross_validate,
    cross_val_predict,
)

from sklearn.metrics import (
    f1_score,
    accuracy_score,
    classification_report,
)

# Notebook-wide configuration.
# ROOT_DIR is the current working directory as an absolute path; the data
# folder is addressed relative to it, two levels up.
ROOT_DIR = pathlib.Path.cwd()
DATA_DIR = ROOT_DIR.joinpath("../../data")
# Seed reused wherever a random_state is required.
RANDOM_SEED = 42

## Data load and review

In [2]:
# Read the trend description table and the training table from DATA_DIR.
df_trends = pd.read_csv(DATA_DIR.joinpath("trends_description.csv"))
df = pd.read_csv(DATA_DIR.joinpath("train.csv"))

In [3]:
# Preview the first five rows of the training table.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,assessment,tags,text,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,0,5652,6.0,"{ASSORTMENT,PROMOTIONS,DELIVERY}","Маленький выбор товаров, хотелось бы ассортиме...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,18092,4.0,"{ASSORTMENT,PRICE,PRODUCTS_QUALITY,DELIVERY}",Быстро,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,13845,6.0,"{DELIVERY,PROMOTIONS,PRICE,ASSORTMENT,SUPPORT}",Доставка постоянно задерживается,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,25060,6.0,"{PRICE,PROMOTIONS,ASSORTMENT}",Наценка и ассортимент расстраивают,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1428,6.0,"{PRICE,PROMOTIONS}",Можно немного скинуть минимальную сумму заказа...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Model training

### Data preprocessing

In [4]:
# Same five-row preview of the training table, shown again before preprocessing.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,assessment,tags,text,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,0,5652,6.0,"{ASSORTMENT,PROMOTIONS,DELIVERY}","Маленький выбор товаров, хотелось бы ассортиме...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,18092,4.0,"{ASSORTMENT,PRICE,PRODUCTS_QUALITY,DELIVERY}",Быстро,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,13845,6.0,"{DELIVERY,PROMOTIONS,PRICE,ASSORTMENT,SUPPORT}",Доставка постоянно задерживается,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,25060,6.0,"{PRICE,PROMOTIONS,ASSORTMENT}",Наценка и ассортимент расстраивают,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1428,6.0,"{PRICE,PROMOTIONS}",Можно немного скинуть минимальную сумму заказа...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Features are the raw review text; targets are the 50 binary trend columns.
target_columns = [f"trend_id_res{i}" for i in range(50)]

# Missing reviews become empty strings, the same convention the inference cell
# applies to the test set via fillna(''). A bare astype("str") would instead
# turn them into the literal text "nan", which the vectorizer would then treat
# as a real word.
X = df[['text']].fillna('').astype("str")
y = df[target_columns]

# 80/20 hold-out split, seeded so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)

# Report the shape of every part in the original order.
for part_name, part in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{part_name}.shape is {part.shape}")

X_train.shape is (3698, 1)
y_train.shape is (3698, 50)
X_test.shape is (925, 1)
y_test.shape is (925, 50)


### Checking quality on train data

In [6]:
from catboost import CatBoostClassifier

In [7]:
# Print the column dtypes and non-null counts of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3698 entries, 1538 to 860
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    3698 non-null   object
dtypes: object(1)
memory usage: 57.8+ KB


In [90]:
# TF-IDF over character n-grams of length 1-3 ('char_wb' analyzer).
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(1, 3))

# Learn the vocabulary / IDF weights from the training texts only and vectorize
# them in the same pass; the hold-out texts are transformed with the fitted
# vectorizer so nothing from them influences the features.
X_tr_1 = vectorizer.fit_transform(X_train['text'])
X_te_1 = vectorizer.transform(X_test['text'])

In [91]:
from catboost import Pool

# Wrap the vectorized texts and their label matrices in CatBoost pools.
train_pool = Pool(data=X_tr_1, label=y_train)
test_pool = Pool(data=X_te_1, label=y_test)

In [93]:
# Multi-label CatBoost model (MultiLogloss) with one output per trend column,
# limited to 500 boosting iterations.
clf = CatBoostClassifier(
    iterations=500,
    loss_function='MultiLogloss',
    eval_metric='Accuracy',
    class_names=list(map("trend_id_res{}".format, range(50))),
)
# The hold-out pool is supplied as eval_set so its metric is reported next to
# the training metric; the log is printed every 50 iterations.
clf.fit(
    train_pool,
    eval_set=test_pool,
    verbose=50,
    metric_period=1,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.059176
0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 3.75s	remaining: 31m 11s
50:	learn: 0.1441320	test: 0.1556757	best: 0.1556757 (50)	total: 3m 9s	remaining: 27m 49s
100:	learn: 0.2495944	test: 0.2518919	best: 0.2518919 (98)	total: 6m 14s	remaining: 24m 39s
150:	learn: 0.3547864	test: 0.3048649	best: 0.3048649 (150)	total: 9m 16s	remaining: 21m 27s
200:	learn: 0.4413196	test: 0.3437838	best: 0.3437838 (199)	total: 12m 22s	remaining: 18m 23s
250:	learn: 0.5205516	test: 0.3654054	best: 0.3654054 (249)	total: 15m 28s	remaining: 15m 21s
300:	learn: 0.5781504	test: 0.3762162	best: 0.3805405 (294)	total: 18m 34s	remaining: 12m 16s
350:	learn: 0.6308816	test: 0.3891892	best: 0.3891892 (350)	total: 21m 39s	remaining: 9m 11s
400:	learn: 0.6730665	test: 0.3989189	best: 0.3989189 (398)	total: 24m 47s	remaining: 6m 7s
450:	learn: 0.7184965	test: 0.4032432	best: 0.4043243 (449)	total: 27m 55s	remaining: 3m 2s
499:	learn: 0.7536506	test: 0.4043243	best: 0.40

<catboost.core.CatBoostClassifier at 0x7e9363fc29b0>

In [67]:
# Predict labels for the training matrix itself (quality check on train data).
y_pred = clf.predict(data=X_tr_1)

In [68]:
# Accuracy of the training-set predictions against the training labels.
accuracy_score(y_true=y_train, y_pred=y_pred)

0.5229853975121688

### Training the final model

In [94]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Dedicated vectorizer for the final model, fitted on every available text
# (train and hold-out parts together).
full_vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(1, 3))

X_al = full_vectorizer.fit_transform(X['text'])

# Later cells transform the test set through the name `vectorizer`; point it at
# the full-data vectorizer so those features match what the final model is
# trained on.
vectorizer = full_vectorizer

In [95]:
from catboost import Pool

# Pool holding the full vectorized text matrix with the full label matrix.
pool = Pool(data=X_al, label=y)

In [97]:
# Final multi-label model: same loss, metric and class names as the evaluation
# run, but no explicit iteration limit and no eval_set.
clf = CatBoostClassifier(
    eval_metric='Accuracy',
    loss_function='MultiLogloss',
    class_names=list(map("trend_id_res{}".format, range(50))),
)
# Train on the full pool; the log is printed every 50 iterations.
clf.fit(
    pool,
    verbose=50,
    metric_period=1,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.019809
0:	learn: 0.0006489	total: 4.67s	remaining: 1h 17m 43s


: 

### Prediction and saving the solution

In [80]:
# Read the test table from DATA_DIR.
test = pd.read_csv(DATA_DIR.joinpath("test.csv"))

In [81]:
# Preview the first five rows of the test table.
test.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,assessment,tags,text
0,1,3135,3.0,{DELIVERY},"Последнее время думаю плохо, сроки доставки да..."
1,3,4655,2.0,"{PRICE,DELIVERY,ASSORTMENT}",Цены намного выше магазинных но радуют акции
2,5,22118,2.0,"{CATALOG_NAVIGATION,ASSORTMENT,DELIVERY}","Доставка за [NUM] минут, заказ даже не начали ..."
3,7,23511,0.0,{DELIVERY},Ужасно долгая доставка
4,8,45,6.0,"{ASSORTMENT,PROMOTIONS}",Добрый вечер! Вы большие молодцы. Меня всё уст...


In [82]:
# Replace missing test reviews with empty strings, then vectorize them with the
# already-fitted vectorizer.
test_ve = vectorizer.transform(raw_documents=test['text'].fillna(value=''))

In [83]:
# Predict the trend labels for the vectorized test reviews.
pred_test = clf.predict(data=test_ve)

In [84]:
# Put the test-set "index" values in front of the prediction matrix as a single
# column and label the remaining columns with the trend column names.
res = pd.DataFrame(
    np.concatenate([test["index"].values[:, np.newaxis], pred_test], axis=1),
    columns=["index", *(f"trend_id_res{i}" for i in range(50))],
)

In [85]:
# Preview the first five rows of the prediction table.
res.head(n=5)

Unnamed: 0,index,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,trend_id_res5,trend_id_res6,trend_id_res7,trend_id_res8,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,3135,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4655,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22118,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,23511,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,45,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [86]:
# Encode every row of predictions as the concatenation of " <trend id>" for each
# trend column whose value is non-zero (e.g. " 2 17"); rows with no predicted
# trend yield an empty string. The leading space is kept here and stripped in a
# later cell.
#
# Done in one pass over a boolean matrix instead of redefining a helper inside
# a 50-iteration loop and accumulating a Series onto a str.
is_active = res[[f"trend_id_res{i}" for i in range(50)]].to_numpy() != 0
target = pd.Series(
    ["".join(f" {trend_id}" for trend_id in np.flatnonzero(row)) for row in is_active],
    index=res.index,  # keep alignment with `res` for the later column assignment
    dtype=object,
)

In [87]:
# Remove the surrounding whitespace from every target string.
target = target.map(str.strip)

In [88]:
# Store the target strings as a new "target" column of the prediction table.
res['target'] = target

In [89]:
# Write only the "index" and "target" columns to the submission file, without
# the DataFrame's own row index.
res.loc[:, ['index', 'target']].to_csv(path_or_buf="submission4.csv", index=False)