## Импорты


In [166]:
from datetime import datetime
from functools import partial, reduce
from itertools import product
from json import load as json_load
from multiprocessing import cpu_count
from operator import add
from pickle import dump as pickle_dump
from pickle import load as pickle_load
from timeit import default_timer as timer

from catboost import CatBoostClassifier, Pool
from numpy import array, int64, hstack as np_stack, logspace, mean, ndarray, vstack, zeros
from pandas import concat, DataFrame, read_csv
from scipy.sparse import hstack
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from tabulate import tabulate
from tqdm.notebook import tqdm
from transliterate import translit

In [2]:
%%time
df_train = read_csv("../data/train_preprocessed.csv")
df_test = read_csv("../data/val_preprocessed.csv")

# df_train['description'] = df_train['description'].replace(r'[\W_]+', ' ', regex = True).str.lower()
# df_val['description'] = df_val['description'].replace(r'[\W_]+', ' ', regex = True).str.lower()
y_test = df_test.is_bad
y_train = df_train.is_bad

df_train = df_train.loc[:, df_train.columns != 'is_bad']
df_test = df_test.loc[:, df_test.columns != 'is_bad']

CPU times: user 6.36 s, sys: 369 ms, total: 6.73 s
Wall time: 6.95 s


## Делим трейн на трейн и вал

In [3]:
df_train, df_val, y_train, y_val = train_test_split(df_train, y_train, test_size=0.05, random_state=2021)

## Преобразование текста в TF-IDF...
...в надежде что количество оборотов, указывающих на наличие контактов, сильно ограничено, и его можно как-то удобно выделить с относительно небольшим словарём. Если выгорит, можно будет потом проводить кластерный анализ и остальные приколы векторных моделей.

In [4]:
text_transformer = TfidfVectorizer(min_df=1)

In [5]:
with open('../lib/text_transformer.pickle', 'rb') as f:
    text_transformer = pickle_load(f)
    f.close()

In [6]:
%%time
# X_train_text = text_transformer.fit_transform(df_train['title_and_description'])
X_train_text = text_transformer.transform(df_train['title_and_description'])
X_test_text = text_transformer.transform(df_test['title_and_description'])
X_val_text = text_transformer.transform(df_val['title_and_description'])

CPU times: user 27.6 s, sys: 160 ms, total: 27.7 s
Wall time: 27.7 s


Ужасно, почему нельзя зафиттить его с параллелизацией?

In [7]:
with open('../lib/text_transformer.pickle', 'wb') as f:
    pickle_dump(text_transformer, f)
    f.close()

## Векторизуем категории
В общем-то тут можно использовать два энкодера, но зачем усложнять, если всё за нас придумали?

In [6]:
# with open("../lib/cat_transformer.pickle", "rb") as f:
#     vectorizer = pickle_load(f)
#     f.close()

In [8]:
categories = ['subcategory', 'category' , 'region', 'city']

## Регулярочки. Довольно круто бустят модель!
Вообще-то список можно посмотреть в файлике, там есть интересные регулярки на номер телефона формата 8п9а0к5о5р7в0д9г6а4т7ь и ещё куча всякой всячины.\
Не зря же я их придумывал...

In [8]:
# %%time


# vectorizer = DictVectorizer(sparse=False)
# df_transformed = concat([df_train[categories], df_test[categories], df_val[categories]])
# df_transformed = vectorizer.fit_transform(df_transformed.to_dict('records'))

# X_train_categ = df_transformed[:X_train_text.shape[0]]
# X_test_categ = df_transformed[X_train_text.shape[0]:-X_val_text.shape[0]]
# X_val_categ = df_transformed[-X_val_text.shape[0]:]

CPU times: user 4.49 s, sys: 1.78 s, total: 6.27 s
Wall time: 6.28 s


In [9]:
# with open("../lib/cat_transformer.pickle", "wb") as f:
#     pickle_dump(vectorizer, f)
#     f.close()

In [10]:
# del df_transformed

In [11]:
# !free -mh

              total        used        free      shared  buff/cache   available
Mem:           31Gi        10Gi        19Gi       0,0Ki       1,3Gi        20Gi
Swap:         2,0Gi       1,2Gi       767Mi


In [9]:
%%time
with open('../lib/models/regexps/regexp.json') as json_file:
    regexps = json_load(json_file)
    json_file.close()

CPU times: user 294 µs, sys: 7 µs, total: 301 µs
Wall time: 367 µs


In [10]:
regexp_names = list(product(regexps.keys(), ['_description', '_title']))
add_reduce = partial(reduce, add)
regexp_names = list(map(add_reduce, regexp_names))

# Обучаем модели

## Обучаем логрегрессию
Я сделал логистическую регрессию для каждой категории. Вот что получилось:

In [21]:
%%time
%%capture output

logreg_grid = {
    "C": [3.1, 3.3, 3.5, 3.7, 3.9, 4.1, 4.3, 4.5, 4.7],
    "fit_intercept": [True, False],
}

LogReg = LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state=2021, max_iter=100)
grid_cv_logreg = GridSearchCV(LogReg, logreg_grid, cv=3, scoring='roc_auc', verbose=5, n_jobs=cpu_count())

grid_cv_logreg.fit(X_train, y_train)

  and should_run_async(code)


CPU times: user 8min 33s, sys: 6.7 s, total: 8min 40s
Wall time: 25min 47s


Эта штука очень долго работала, внизу выписал лучший результат

In [23]:
grid_cv_logreg.best_params_

  and should_run_async(code)


{'C': 3.9, 'fit_intercept': False}

In [15]:
# X_train_regexp = df_train[regexp_names]
# X_test_regexp = df_test[regexp_names]
# X_val_regexp = df_val[regexp_names]

## Не забудем и про остальные значения

In [16]:
# X_train_others = df_train[['time', 'price']].values
# X_test_others = df_test[['time', 'price']].values
# X_val_others = df_val[['time', 'price']].values

In [13]:
with open('../lib/models/logreg.pickle', 'rb') as f:
    logreg = pickle_load(f)
    f.close()

In [11]:
logreg_params = {
        "C": 3.9,
        "fit_intercept": False,
        "multi_class": 'multinomial',
        "random_state": 2021,
        "max_iter": 100,
        "n_jobs": cpu_count(),
    }

Сохраним наши логистические регрессии

In [12]:
logreg = LogisticRegression(**logreg_params)
logreg

LogisticRegression(C=3.9, fit_intercept=False, multi_class='multinomial',
                   n_jobs=16, random_state=2021)

In [29]:
%%time
logreg.fit(X_train_text, y_train)

CPU times: user 88.7 ms, sys: 775 ms, total: 864 ms
Wall time: 1min 9s


LogisticRegression(C=3.9, fit_intercept=False, multi_class='multinomial',
                   n_jobs=16, random_state=2021)

In [42]:
with open('../lib/models/logreg.pickle', 'wb') as f:
    pickle_dump(logreg, f)
    f.close()

Код для загрузки моделек с ужасным comprehension

Посмотрим на метрички на датасетах, полученных из тренировочного

In [14]:
def metrics_printer(X_train, X_val, y_train, y_val, df_train, df_val, model, dataset_types):
    overall_table = []
    cats_table = []
    for dataset, ds_type, labels, df in zip([X_train, X_val], dataset_types, [y_train, y_val], [df_train, df_val]):
        pred_probas = zeros(labels.shape)
        labels_pred = zeros(labels.shape)
        for category in df["category"].unique():
            cat_name = translit(category.lower().replace(" ", "_"), 'ru', reversed=True)
            pred_probas[df["category"] == category] = model.predict_proba(dataset[df["category"] == category])[:, 1]
            labels_pred[df["category"] == category] = model.predict(dataset[df["category"] == category])
            rocauc_category = roc_auc_score(labels[df["category"] == category], pred_probas[df["category"] == category])
            f1_category = f1_score(labels[df["category"] == category], labels_pred[df["category"] == category])
            accuracy_category = accuracy_score(labels[df["category"] == category], labels_pred[df["category"] == category])
            cats_table.append([cat_name, ds_type, rocauc_category, f1_category, accuracy_category])
        rocauc = roc_auc_score(labels, pred_probas)
        f1 = f1_score(labels, labels_pred)
        accuracy = accuracy_score(labels, labels_pred)
        overall_table.append([ds_type, rocauc, f1, accuracy])
    print("Categories table:")
    print(tabulate(cats_table, headers=["Category", "Type", "AUC", 'f1', 'accuracy'], tablefmt='orgtbl'))
    print(" _____________________________________________")
    print(".____________________________________________.>")
    print("Overall table:")
    print(tabulate(overall_table, headers=['Dataset', 'AUC', 'f1', 'accuracy'], tablefmt='orgtbl'))

In [15]:
metrics_printer(X_train_text, X_val_text, y_train, y_val, df_train, df_val, logreg, ['train', 'val'])

Categories table:
| Category             | Type   |      AUC |       f1 |   accuracy |
|----------------------+--------+----------+----------+------------|
| nedvizhimost'        | train  | 0.978165 | 0.915131 |   0.936195 |
| bytovaja_elektronika | train  | 0.974727 | 0.831686 |   0.956036 |
| transport            | train  | 0.985856 | 0.914734 |   0.955413 |
| zhivotnye            | train  | 0.976538 | 0.935894 |   0.935086 |
| rabota               | train  | 0.947191 | 0.828981 |   0.890272 |
| lichnye_veschi       | train  | 0.963465 | 0.803799 |   0.952854 |
| dlja_doma_i_dachi    | train  | 0.970043 | 0.863782 |   0.944144 |
| hobbi_i_otdyh        | train  | 0.969774 | 0.832887 |   0.948138 |
| uslugi               | train  | 0.951537 | 0.878637 |   0.891324 |
| dlja_biznesa         | train  | 0.942121 | 0.729667 |   0.931157 |
| transport            | val    | 0.955289 | 0.815977 |   0.90864  |
| nedvizhimost'        | val    | 0.943069 | 0.843215 |   0.887789 |
| dlja_doma_i_da

## Обучим наивный Байес

In [39]:
%%time
parameters_nb = {'alpha': [1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]}
NaiveBayes = MultinomialNB()
grid_cv_nb = GridSearchCV(NaiveBayes, parameters_nb, cv=3, scoring='roc_auc', verbose=5, n_jobs=cpu_count())
grid_cv_nb.fit(X_train, y_train)

Fitting 3 folds for each of 7 candidates, totalling 21 fits


  and should_run_async(code)


CPU times: user 445 ms, sys: 616 ms, total: 1.06 s
Wall time: 5.9 s


GridSearchCVProgressBar(cv=3, estimator=MultinomialNB(), n_jobs=16,
                        param_grid={'alpha': [1, 0.1, 0.01, 0.001, 0.0001,
                                              1e-05, 1e-06]},
                        scoring='roc_auc', verbose=5)

In [46]:
grid_cv_nb.best_params_

{'alpha': 0.01}

In [32]:
nb_params = {'alpha': 0.01}

In [33]:
naive_bayes = MultinomialNB(**nb_params)
naive_bayes.fit(X_train_text, y_train)

MultinomialNB(alpha=0.01)

In [41]:
with open('../lib/models/naive_bayes.pickle', 'wb') as f:
    pickle_dump(naive_bayes, f)
    f.close()

In [16]:
with open('../lib/models/naive_bayes.pickle', 'rb') as f:
    naive_bayes = pickle_load(f)
    f.close()

In [35]:
metrics_printer(X_train_text, X_val_text, y_train, y_val, df_train, df_val, naive_bayes, ['train', 'val'])

Categories table:
| Category             | Type   |      AUC |       f1 |   accuracy |
|----------------------+--------+----------+----------+------------|
| nedvizhimost'        | train  | 0.971003 | 0.817283 |   0.831864 |
| bytovaja_elektronika | train  | 0.960092 | 0.797805 |   0.952038 |
| transport            | train  | 0.983468 | 0.895895 |   0.943744 |
| zhivotnye            | train  | 0.964057 | 0.802991 |   0.751323 |
| rabota               | train  | 0.923818 | 0.784402 |   0.847083 |
| lichnye_veschi       | train  | 0.939711 | 0.75886  |   0.946201 |
| dlja_doma_i_dachi    | train  | 0.959698 | 0.848667 |   0.939028 |
| hobbi_i_otdyh        | train  | 0.965777 | 0.839153 |   0.950992 |
| uslugi               | train  | 0.935246 | 0.846308 |   0.854385 |
| dlja_biznesa         | train  | 0.94959  | 0.739379 |   0.937075 |
| transport            | val    | 0.909726 | 0.672708 |   0.84652  |
| nedvizhimost'        | val    | 0.872301 | 0.728861 |   0.751297 |
| dlja_doma_i_da

## Обучим Random Forest над SVD
Ну и т.к. деревья плохо работают с разреженными матрицами, сделаем SVD-разложение над TF-IDF.

In [20]:
%%time
svd = TruncatedSVD(n_components=100)
X_svd = svd.fit_transform(X_train_text)

CPU times: user 3min, sys: 35 s, total: 3min 35s
Wall time: 1min 21s


In [None]:
random_forest = RandomForestClassifier()
param_grid = {
    'n_estimators': [150, 200, 250],
    'max_depth': [40, 50, 60],
}
rf_search = GridSearchCV(random_forest, param_grid, scoring='roc_auc', n_jobs=cpu_count(), verbose=10)
rf_search.fit(X_svd, y_train)
print("Best parameter (CV score=%0.3f):" % rf_search.best_score_)
print(rf_search.best_params_)

In [None]:
rf_search.best_params_

In [24]:
best_params = {
    'random_forest__max_depth': 40,
    'random_forest__n_estimators': 250,
    'random_forest__n_jobs': cpu_count(),
}

In [25]:
random_forest = RandomForestClassifier()
rf_pipeline = Pipeline(steps=[('svd', svd), ('random_forest', random_forest)])
rf_pipeline.set_params(**best_params)

Pipeline(steps=[('svd', TruncatedSVD(n_components=100)),
                ('random_forest',
                 RandomForestClassifier(max_depth=40, n_estimators=250,
                                        n_jobs=16))])

In [26]:
%%time
rf_pipeline.fit(X_train_text, y_train)

CPU times: user 1h 12min 42s, sys: 30 s, total: 1h 13min 12s
Wall time: 5min 45s


Pipeline(steps=[('svd', TruncatedSVD(n_components=100)),
                ('random_forest',
                 RandomForestClassifier(max_depth=40, n_estimators=250,
                                        n_jobs=16))])

In [32]:
with open('../lib/models/random_forest.pickle', 'wb') as f:
    pickle_dump(rf_pipeline, f)
    f.close()

In [31]:
with open("../lib/models/random_forest.pickle", "rb") as f:
    rf_pipeline = pickle_load(f)
    f.close()

## Посмотрим на метрички моделей

In [46]:
print("NAIVE BAYES BY CATEGORY")
metrics_printer(X_test_text, X_val_text, y_test, y_val, df_test, df_val, naive_bayes, ['test', 'val'])

NAIVE BAYES BY CATEGORY
Categories table:
| Category             | Type   |      AUC |       f1 |   accuracy |
|----------------------+--------+----------+----------+------------|
| transport            | test   | 0.955143 | 0.680882 |   0.842096 |
| dlja_biznesa         | test   | 0.851259 | 0.390244 |   0.913495 |
| dlja_doma_i_dachi    | test   | 0.890658 | 0.536204 |   0.876434 |
| lichnye_veschi       | test   | 0.744132 | 0.252874 |   0.889518 |
| uslugi               | test   | 0.715488 | 0.559292 |   0.629464 |
| bytovaja_elektronika | test   | 0.891001 | 0.282158 |   0.937948 |
| nedvizhimost'        | test   | 0.810619 | 0.694633 |   0.71827  |
| hobbi_i_otdyh        | test   | 0.810228 | 0.377358 |   0.917189 |
| rabota               | test   | 0.678244 | 0.59887  |   0.608456 |
| zhivotnye            | test   | 0.698594 | 0.714829 |   0.642857 |
| transport            | val    | 0.909726 | 0.672708 |   0.84652  |
| nedvizhimost'        | val    | 0.872301 | 0.728861 |   0.7

In [47]:
print("LOGISTIC REGRESSION BY CATEGORY")
metrics_printer(X_test_text, X_val_text, y_test, y_val, df_test, df_val, logreg, ['test', 'val'])

LOGISTIC REGRESSION BY CATEGORY
Categories table:
| Category             | Type   |      AUC |       f1 |   accuracy |
|----------------------+--------+----------+----------+------------|
| transport            | test   | 0.967385 | 0.800268 |   0.891395 |
| dlja_biznesa         | test   | 0.845865 | 0.561404 |   0.913495 |
| dlja_doma_i_dachi    | test   | 0.899725 | 0.68652  |   0.895725 |
| lichnye_veschi       | test   | 0.853649 | 0.532934 |   0.911615 |
| uslugi               | test   | 0.855622 | 0.734043 |   0.776786 |
| bytovaja_elektronika | test   | 0.917485 | 0.474576 |   0.933286 |
| nedvizhimost'        | test   | 0.877664 | 0.685157 |   0.760956 |
| hobbi_i_otdyh        | test   | 0.847091 | 0.430556 |   0.897114 |
| rabota               | test   | 0.823418 | 0.711864 |   0.75     |
| zhivotnye            | test   | 0.844172 | 0.753488 |   0.747619 |
| transport            | val    | 0.955289 | 0.815977 |   0.90864  |
| nedvizhimost'        | val    | 0.943069 | 0.843215

In [48]:
print("RANDOM FOREST BY CATEGORY")
metrics_printer(X_test_text, X_val_text, y_test, y_val, df_test, df_val, rf_pipeline, ['test', 'val'])

RANDOM FOREST BY CATEGORY
Categories table:
| Category             | Type   |      AUC |       f1 |   accuracy |
|----------------------+--------+----------+----------+------------|
| transport            | test   | 0.95466  | 0.704587 |   0.853556 |
| dlja_biznesa         | test   | 0.803776 | 0.578947 |   0.944637 |
| dlja_doma_i_dachi    | test   | 0.863622 | 0.587121 |   0.88634  |
| lichnye_veschi       | test   | 0.774427 | 0.290598 |   0.905949 |
| uslugi               | test   | 0.79325  | 0.543933 |   0.675595 |
| bytovaja_elektronika | test   | 0.853378 | 0.322314 |   0.941176 |
| nedvizhimost'        | test   | 0.715688 | 0.517155 |   0.671599 |
| hobbi_i_otdyh        | test   | 0.817962 | 0.465116 |   0.942284 |
| rabota               | test   | 0.718336 | 0.518337 |   0.637868 |
| zhivotnye            | test   | 0.819819 | 0.706468 |   0.719048 |
| transport            | val    | 0.935876 | 0.727313 |   0.876872 |
| nedvizhimost'        | val    | 0.892311 | 0.732906 |   0

Уже неплохо, но бейзлайн не все модельки бьют

## Stacking
Давайте-ка сверху накатим катбуст и дадим ему наш предобработанный массив: у нас есть данные про время, цену, результат применения регулярных выражений и категориях

In [49]:
prediction_cols = ['logreg', 'naive_bayes', 'random_forest']
columns_no_text = ['time', 'price'] + categories + regexp_names + prediction_cols

In [33]:
%%time
logreg_probas_train = logreg.predict_proba(X_train_text)
nb_probas_train = naive_bayes.predict_proba(X_train_text)
rf_probas_train = rf_pipeline.predict_proba(X_train_text)

df_train[prediction_cols] = vstack([item[:,1].T for item in [logreg_probas_train, nb_probas_train, rf_probas_train]]).T
X_train_no_text = df_train[columns_no_text]

CPU times: user 2min 55s, sys: 504 ms, total: 2min 56s
Wall time: 14 s


In [85]:
logreg_probas_test = logreg.predict_proba(X_test_text)
nb_probas_test = naive_bayes.predict_proba(X_test_text)
rf_probas_test = rf_pipeline.predict_proba(X_test_text)

df_test[prediction_cols] = vstack([item[:,1].T for item in [logreg_probas_test, nb_probas_test, rf_probas_test]]).T
X_test_no_text = df_test[columns_no_text]

In [86]:
logreg_probas_val = logreg.predict_proba(X_val_text)
nb_probas_val = naive_bayes.predict_proba(X_val_text)
rf_probas_val = rf_pipeline.predict_proba(X_val_text)

df_val[prediction_cols] = vstack([item[:,1].T for item in [logreg_probas_val, nb_probas_val, rf_probas_val]]).T
X_val_no_text = df_val[columns_no_text]

In [110]:
cols_to_drop = ['time', 'random_forest', 'impulse_description', 'subcategory', 'phone_description', 'category', 'region', 'social_media_description']
cols_to_drop_2 = ['price', 'site_description', 'phone_normal_title', 'social_media_title', 'home_phone_title', 'phone_title', 'phone_operators_title', 'phone_biased_title', 'youtube_title']
cols_to_drop_3 = ['city', 'site_title', 'email_title']
cols_to_drop.extend(cols_to_drop_2, cols_to_drop_3)

categories_cb = set(categories).difference(cols_to_drop)

X_train_no_text = X_train_no_text.drop(cols_to_drop, axis=1)
X_test_no_text = X_test_no_text.drop(cols_to_drop, axis=1)
X_val_no_text = X_val_no_text.drop(cols_to_drop, axis=1)

In [189]:
cols_to_drop = ['time', 'random_forest', 'impulse_description', 'subcategory', 'phone_description', 'category', 'region', 'social_media_description']
cols_to_drop_2 = ['price', 'site_description', 'phone_normal_title', 'social_media_title', 'home_phone_title', 'phone_title', 'phone_operators_title', 'phone_biased_title', 'youtube_title']
cols_to_drop_3 = ['city', 'site_title', 'email_title']
cols_to_drop.extend(cols_to_drop_2)
cols_to_drop.extend(cols_to_drop_3)

categories_cb = set(categories).difference(cols_to_drop)

### Тут я повыкидывал фичи
Выкинул все с отрицательным `feature_importance`, после чего модель стала работать заметно лучше

In [132]:
train_pool = Pool(data=X_train_no_text, label=y_train, cat_features=categories_cb)
test_pool = Pool(data=X_test_no_text, label=y_test, cat_features=categories_cb)
val_pool = Pool(data=X_val_no_text, label=y_val, cat_features=categories_cb)

In [190]:
train_pool_text = Pool(data=df_train.drop(set(categories).union(cols_to_drop), axis=1).drop(['logreg', 'naive_bayes'], axis=1), label=y_train, text_features=['title_and_description'])
test_pool_text = Pool(data=df_test.drop(set(categories).union(cols_to_drop), axis=1).drop(['logreg', 'naive_bayes'], axis=1), label=y_test, text_features=['title_and_description'])
val_pool_text = Pool(data=df_val.drop(set(categories).union(cols_to_drop), axis=1).drop(['logreg', 'naive_bayes'], axis=1), label=y_val, text_features=['title_and_description'])

In [196]:
catboost = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.01,
    random_strength=0.01,
    depth=8,
    l2_leaf_reg=6,
    eval_metric='AUC',
#     leaf_estimation_method='Newton',
    verbose=0,
    task_type='GPU',
    devices='0',
)

In [None]:
grid = {
    'learning_rate': [0.01, 0.03, 0.07, 0.1],
    'l2_leaf_reg': [10, 15],
}

grid_search_result = catboost.grid_search(
    grid, X=train_pool, verbose=1, cv=5, plot=True,
)

In [203]:
catboost = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    random_strength=0.01,
    depth=8,
    l2_leaf_reg=15,
    eval_metric='AUC',
    verbose=100,
    task_type='CPU',
    thread_count=cpu_count(),
)
catboost.fit(train_pool_text, plot=True, eval_set=val_pool_text)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.9146290	best: 0.9146290 (0)	total: 280ms	remaining: 4m 39s
100:	test: 0.9593407	best: 0.9593407 (100)	total: 26s	remaining: 3m 51s
200:	test: 0.9640006	best: 0.9640006 (200)	total: 50.9s	remaining: 3m 22s
300:	test: 0.9662385	best: 0.9662385 (300)	total: 1m 14s	remaining: 2m 54s
400:	test: 0.9676950	best: 0.9676950 (400)	total: 1m 38s	remaining: 2m 27s
500:	test: 0.9687266	best: 0.9687266 (500)	total: 2m 2s	remaining: 2m 2s
600:	test: 0.9694828	best: 0.9694828 (600)	total: 2m 25s	remaining: 1m 36s
700:	test: 0.9700675	best: 0.9700675 (700)	total: 2m 49s	remaining: 1m 12s
800:	test: 0.9706713	best: 0.9706713 (800)	total: 3m 12s	remaining: 47.7s
900:	test: 0.9710795	best: 0.9710795 (900)	total: 3m 35s	remaining: 23.6s
999:	test: 0.9713814	best: 0.9713832 (998)	total: 3m 58s	remaining: 0us

bestTest = 0.9713832082
bestIteration = 998

Shrink model to first 999 iterations.


<catboost.core.CatBoostClassifier at 0x7f97f4b81340>

In [135]:
metrics_printer(X_test_no_text, X_val_no_text, y_test, y_val, df_test, df_val, catboost, ['test', 'val'])

Categories table:
| Category             | Type   |      AUC |       f1 |   accuracy |
|----------------------+--------+----------+----------+------------|
| transport            | test   | 0.974439 | 0.856961 |   0.918137 |
| dlja_biznesa         | test   | 0.81252  | 0.545455 |   0.913495 |
| dlja_doma_i_dachi    | test   | 0.900591 | 0.695522 |   0.893639 |
| lichnye_veschi       | test   | 0.819764 | 0.523077 |   0.912181 |
| uslugi               | test   | 0.815251 | 0.729412 |   0.760417 |
| bytovaja_elektronika | test   | 0.919574 | 0.423313 |   0.932568 |
| nedvizhimost'        | test   | 0.890941 | 0.79082  |   0.818441 |
| hobbi_i_otdyh        | test   | 0.827621 | 0.425    |   0.884567 |
| rabota               | test   | 0.865822 | 0.739958 |   0.773897 |
| zhivotnye            | test   | 0.88263  | 0.825112 |   0.814286 |
| transport            | val    | 0.97233  | 0.861325 |   0.929077 |
| nedvizhimost'        | val    | 0.963742 | 0.889095 |   0.91537  |
| dlja_doma_i_da

In [192]:
metrics_printer(df_test.drop(set(categories).union(cols_to_drop), axis=1).drop(['logreg', 'naive_bayes'], axis=1), df_val.drop(set(categories).union(cols_to_drop), axis=1).drop(['logreg', 'naive_bayes'], axis=1), y_test, y_val, df_test, df_val, catboost, ['test', 'val'])

Categories table:
| Category             | Type   |      AUC |       f1 |   accuracy |
|----------------------+--------+----------+----------+------------|
| transport            | test   | 0.982713 | 0.894915 |   0.937966 |
| dlja_biznesa         | test   | 0.874959 | 0.586207 |   0.916955 |
| dlja_doma_i_dachi    | test   | 0.924983 | 0.779911 |   0.922315 |
| lichnye_veschi       | test   | 0.831717 | 0.614251 |   0.911048 |
| uslugi               | test   | 0.847534 | 0.752166 |   0.787202 |
| bytovaja_elektronika | test   | 0.941605 | 0.56213  |   0.946915 |
| nedvizhimost'        | test   | 0.902552 | 0.743484 |   0.787137 |
| hobbi_i_otdyh        | test   | 0.903458 | 0.484848 |   0.89335  |
| rabota               | test   | 0.856772 | 0.719457 |   0.772059 |
| zhivotnye            | test   | 0.910113 | 0.833333 |   0.838095 |
| transport            | val    | 0.980878 | 0.898255 |   0.947491 |
| nedvizhimost'        | val    | 0.975228 | 0.910998 |   0.933993 |
| dlja_doma_i_da

In [204]:
metrics_printer(df_test.drop(set(categories).union(cols_to_drop), axis=1).drop(['logreg', 'naive_bayes'], axis=1), df_val.drop(set(categories).union(cols_to_drop), axis=1).drop(['logreg', 'naive_bayes'], axis=1), y_test, y_val, df_test, df_val, catboost, ['test', 'val'])

Categories table:
| Category             | Type   |      AUC |       f1 |   accuracy |
|----------------------+--------+----------+----------+------------|
| transport            | test   | 0.982273 | 0.895256 |   0.938148 |
| dlja_biznesa         | test   | 0.86744  | 0.551724 |   0.910035 |
| dlja_doma_i_dachi    | test   | 0.925065 | 0.771723 |   0.919187 |
| lichnye_veschi       | test   | 0.830585 | 0.607843 |   0.909348 |
| uslugi               | test   | 0.846278 | 0.759099 |   0.793155 |
| bytovaja_elektronika | test   | 0.940853 | 0.56305  |   0.946557 |
| nedvizhimost'        | test   | 0.903055 | 0.746228 |   0.789414 |
| hobbi_i_otdyh        | test   | 0.904997 | 0.494118 |   0.892095 |
| rabota               | test   | 0.858516 | 0.725624 |   0.777574 |
| zhivotnye            | test   | 0.911111 | 0.829268 |   0.833333 |
| transport            | val    | 0.980883 | 0.900216 |   0.948503 |
| nedvizhimost'        | val    | 0.975121 | 0.911746 |   0.934465 |
| dlja_doma_i_da

In [205]:
catboost.get_feature_importance(val_pool_text, type='LossFunctionChange', prettified=True)

Unnamed: 0,Feature Id,Importances
0,title_and_description,0.247596
1,phone_normal_description,0.016012
2,home_phone_description,0.014952
3,phone_biased_description,0.004366
4,email_description,0.000818
5,youtube_description,0.000746
6,phone_operators_description,6.8e-05


## Вывод
В общем, я там сделал стекинг катбуста над тремя модельками, но он не полетел. А вот катбуст просто на лемматизации — полетел. Теперь сижу и грущу, что две недели на это угрохал.

In [206]:
catboost.save_model('../lib/models/catboost_classifier.cbm')

In [208]:
catboost = CatBoostClassifier().load_model('../lib/models/catboost_classifier.cbm')