# Чистый ноутбук для тренировки катбуста
Оставил только тренировку и вывод метричек

In [20]:
from json import load as json_load
from time import time

from bayes_opt import BayesianOptimization, SequentialDomainReductionTransformer
from bayes_opt.event import Events
from bayes_opt.logger import JSONLogger
from catboost import CatBoostClassifier, Pool
from numpy import zeros
from pandas import DataFrame, read_csv
from pandarallel import pandarallel
from re import sub
from sklearn.metrics import accuracy_score, f1_score, make_scorer, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper
from skopt.space import Real, Categorical, Integer
from tabulate import tabulate
from transliterate import translit

# Initialise pandarallel with its progress bar turned off.
# NOTE(review): no pandarallel call (e.g. parallel_apply) is visible in the
# cells below — confirm this initialisation is still needed.
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [21]:
%%time
# Read the preprocessed splits from disk.
df_train = read_csv("../data/train_preprocessed.csv")
df_test = read_csv("../data/val_preprocessed.csv")

# Pull the `is_bad` target out of each frame ...
y_train, y_test = df_train['is_bad'], df_test['is_bad']

# ... and keep every remaining column as features.
df_train = df_train.drop(columns=['is_bad'])
df_test = df_test.drop(columns=['is_bad'])

CPU times: user 7.36 s, sys: 352 ms, total: 7.71 s
Wall time: 7.71 s


In [22]:
# Hold out 1% of the training rows as a validation set, stratified by the
# `category` column; the random state is fixed so the split is repeatable.
# NOTE(review): this rebinds df_train / y_train, so re-running the cell without
# reloading the data splits the already-reduced frame again.
df_train, df_val, y_train, y_val = train_test_split(
    df_train,
    y_train,
    test_size=0.01,
    random_state=2021,
    stratify=df_train['category'],
)

In [23]:
%%time
# Parse the JSON file into `regexps`. The `with` block closes the file on exit,
# so no explicit close() call is needed. JSON is read as UTF-8 explicitly
# instead of relying on the platform's default encoding.
with open('../lib/models/regexps/regexp.json', encoding='utf-8') as json_file:
    regexps = json_load(json_file)

CPU times: user 463 µs, sys: 0 ns, total: 463 µs
Wall time: 244 µs


In [24]:
# Keys of the loaded `regexps` mapping (a dict view, not a copied list).
regexp_names = regexps.keys()

Тут я не удаляю категории сразу, чтобы потом вывести метрички по ним

In [30]:
# Build one CatBoost Pool per split. `category` is dropped only from the data
# handed to the Pool, so the frames themselves keep it for per-category metrics.
train_pool_text, val_pool_text, test_pool_text = (
    Pool(
        data=frame.drop(['category'], axis=1),
        label=target,
        text_features=['title_and_description', 'text', 'numbers'],
    )
    for frame, target in (
        (df_train, y_train),
        (df_val, y_val),
        (df_test, y_test),
    )
)

In [3]:
# Load a previously saved classifier from disk into `catboost`.
# NOTE(review): a later cell trains a new model under the same name and saves it
# to this same path — confirm whether this cell is still meant to run.
catboost = CatBoostClassifier().load_model('../lib/models/catboost_classifier.cbm')

## Оптимизация гиперпараметров
Напишем функции для Байесовской оптимизации

In [132]:
def get_model(**params):
    """Build an unfitted CatBoostClassifier from the given keyword parameters."""
    classifier = CatBoostClassifier(**params)
    return classifier

def validate(model, X_train, y_train, X_val, y_val, verbose=0):
    """Fit `model` and return its ROC AUC on the validation data.

    The model is fitted on (X_train, y_train) with (X_val, y_val) as the eval
    set and plotting disabled. The fitted model's parameters are always
    printed; the score is printed only when `verbose` is truthy.

    Returns:
        ROC AUC computed from the second column of `predict_proba(X_val)`.
    """
    model.fit(X_train, y_train, eval_set=(X_val, y_val), plot=False)
    print(model.get_params())

    positive_probs = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, positive_probs)
    if verbose:
        print('score: ', auc, '\n')

    return auc

def evaluate_model(**params):
    """Objective function for the Bayesian search: train and score one model.

    The keyword arguments override the default parameters below. The model is
    fitted and scored by `validate` on the notebook-level `X_train`, `y_train`,
    `X_val` and `y_val`, which must be defined before this function is called.

    Args:
        **params: CatBoost parameters proposed by the optimiser.

    Returns:
        The validation ROC AUC returned by `validate`.
    """
    params_init = {
        'iterations': 1000,
        'verbose': 0,
        'use_best_model': True,
        'early_stopping_rounds': 100,
        'loss_function': 'Logloss',
        'task_type': 'GPU',
        # NOTE(review): the Pool cells in this notebook name the text columns
        # 'text' and 'numbers' — confirm these names match the current data.
        'text_features': ['title_and_description', 'filtered_text', 'filtered_numbers'],
        'bagging_temperature': 0,
    }
    params_init.update(params)

    # The search bounds are continuous ranges, so depth arrives as a float and
    # is rounded to an integer. Cast only when depth was supplied, so a search
    # space without it does not fail with KeyError.
    if 'depth' in params_init:
        params_init['depth'] = int(round(params_init['depth']))

    model = get_model(**params_init)
    current_score = validate(model, X_train, y_train, X_val, y_val, verbose=1)
    return current_score

In [86]:
# Feature frames used by evaluate_model: the same three columns are removed
# from both the training and the validation frame.
X_train, X_val = (
    frame.drop(['category', 'phone_normal', 'youtube'], axis=1)
    for frame in (df_train, df_val)
)

In [84]:
# Search ranges (lower, upper) for the first, global optimisation pass.
pbounds = dict(
    depth=(4, 12),
    l2_leaf_reg=(0.001, 100.0),
    random_strength=(0.01, 10),
    bagging_temperature=(0, 10),
)

Сначала поищем максимум ROC AUC глобально, для этого пока уберём Domain Reduction

In [None]:
# First pass: search the full bounds, with no bounds transformer attached.
optimizer = BayesianOptimization(
    f=evaluate_model,
    pbounds=pbounds,
    random_state=4,
)

# Subscribe a JSON logger to every optimisation step.
logger = JSONLogger(path="bayesian_search_logs/logs.json")
optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

optimizer.maximize(init_points=2, n_iter=50, kappa=5)

# Evaluated points; the steps are also logged to bayesian_search_logs/logs.json
optimizer.res

In [118]:
# Best target value and its parameters from the first search.
optimizer.max

{'target': 0.9746248876329148,
 'params': {'bagging_temperature': 0.0,
  'depth': 12.0,
  'l2_leaf_reg': 5.4609410259597775,
  'random_strength': 0.01}}

Теперь будем искать в окрестностях найденного максимума, используя Domain Reduction

In [None]:
# Second pass: bounds narrowed for depth and l2_leaf_reg, learning_rate added,
# and bagging_temperature left out of the search.
pbounds_2 = dict(
    depth=(11, 13),
    l2_leaf_reg=(0.001, 10.0),
    random_strength=(0.01, 10),
    learning_rate=(0.001, 0.1),
)

# This pass attaches a SequentialDomainReductionTransformer to the optimiser.
bounds_transformer = SequentialDomainReductionTransformer()
optimizer = BayesianOptimization(
    f=evaluate_model,
    pbounds=pbounds_2,
    random_state=4,
    bounds_transformer=bounds_transformer,
)

# Subscribe a JSON logger to every optimisation step.
logger = JSONLogger(path="bayesian_search_logs/logs_2.json")
optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

optimizer.maximize(init_points=4, n_iter=30, kappa=2.0)

# Evaluated points; the steps are also logged to bayesian_search_logs/logs_2.json
optimizer.res

In [135]:
# Best target value and its parameters from the second search.
optimizer.max

{'target': 0.978803933112792,
 'params': {'depth': 12.934059678027353,
  'l2_leaf_reg': 5.472775259508048,
  'learning_rate': 0.09729575163652356,
  'random_strength': 7.151011776806903}}

## Обучение модели

In [31]:
catboost = CatBoostClassifier(
    # Hyperparameters matching the best point reported by the second search
    # (depth 12.93 rounded to 13).
    depth=13,
    learning_rate=0.09729575163652356,
    l2_leaf_reg=5.472775259508048,
    random_strength=7.151011776806903,
    bagging_temperature=0.0,
    # Training length, evaluation metric, logging period and device.
    iterations=1500,
    eval_metric='AUC',
    verbose=500,
    task_type='GPU',
)
catboost.fit(train_pool_text, eval_set=val_pool_text, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.9363608	test: 0.9406977	best: 0.9406977 (0)	total: 487ms	remaining: 12m 9s
500:	learn: 0.9818082	test: 0.9781525	best: 0.9781525 (500)	total: 3m 24s	remaining: 6m 48s
1000:	learn: 0.9866583	test: 0.9801734	best: 0.9801981 (992)	total: 6m 45s	remaining: 3m 22s
1499:	learn: 0.9890984	test: 0.9810519	best: 0.9810859 (1478)	total: 10m 2s	remaining: 0us
bestTest = 0.9810858965
bestIteration = 1478
Shrink model to first 1479 iterations.


<catboost.core.CatBoostClassifier at 0x7f3cd83a7160>

## Вывод метрик

In [27]:
def category_metrics_getter(X, y, category, pred_probas, labels_pred, cats_table, model):
    """Predict for one category and record its ROC AUC, F1 and accuracy.

    The rows of `X` whose "category" equals `category` are scored by `model`.
    Their predictions are written into `pred_probas` and `labels_pred` in
    place, and one row of metrics is appended to `cats_table`.

    Args:
        X: feature frame that includes a "category" column.
        y: true labels for the rows of X.
        category: category value to evaluate.
        pred_probas: array receiving the second column of predict_proba.
        labels_pred: array receiving the predicted labels.
        cats_table: list of [name, AUC, f1, accuracy] rows, extended in place.
        model: fitted classifier providing predict_proba and predict.

    Returns:
        The same (pred_probas, labels_pred, cats_table) objects after updating.
    """
    cat_name = translit(category.lower().replace(" ", "_"), 'ru', reversed=True)

    # Compute the row mask and the feature subset once instead of rebuilding
    # them for every prediction and metric call.
    in_category = X["category"] == category
    features = X[in_category].drop(['category'], axis=1)

    pred_probas[in_category] = model.predict_proba(features)[:, 1]
    labels_pred[in_category] = model.predict(features)

    # Read the predictions back from the shared arrays so the metrics see
    # exactly the values stored for the overall scores.
    y_category = y[in_category]
    probas_category = pred_probas[in_category]
    labels_category = labels_pred[in_category]

    rocauc_category = roc_auc_score(y_category, probas_category)
    f1_category = f1_score(y_category, labels_category)
    accuracy_category = accuracy_score(y_category, labels_category)

    cats_table.append([cat_name, rocauc_category, f1_category, accuracy_category])
    return pred_probas, labels_pred, cats_table

def metrics_printer(X, y, model):
    """Print per-category and overall ROC AUC, F1 and accuracy for `model`.

    Predictions are gathered category by category through
    `category_metrics_getter`; the overall scores are then computed on the
    combined prediction arrays. Both tables are printed in `orgtbl` format.
    """
    pred_probas, labels_pred = zeros(y.shape), zeros(y.shape)
    cats_table = []
    for category in X["category"].unique():
        pred_probas, labels_pred, cats_table = category_metrics_getter(
            X, y, category, pred_probas, labels_pred, cats_table, model,
        )

    # Single-row table holding the scores over all categories together.
    overall_table = [[
        roc_auc_score(y, pred_probas),
        f1_score(y, labels_pred),
        accuracy_score(y, labels_pred),
    ]]

    categories_report = tabulate(
        cats_table, headers=["Category", "AUC", 'f1', 'accuracy'], tablefmt='orgtbl'
    )
    overall_report = tabulate(
        overall_table, headers=['AUC', 'f1', 'accuracy'], tablefmt='orgtbl'
    )

    print("Categories table:")
    print(categories_report + '\n')
    print("Overall table:")
    print(overall_report)

In [32]:
%%time
# Per-category and overall metrics on df_test (read from val_preprocessed.csv).
metrics_printer(df_test, y_test, catboost)

Categories table:
| Category             |      AUC |       f1 |   accuracy |
|----------------------+----------+----------+------------|
| transport            | 0.987587 | 0.90463  |   0.943424 |
| dlja_biznesa         | 0.886074 | 0.653061 |   0.941176 |
| dlja_doma_i_dachi    | 0.957238 | 0.830357 |   0.940563 |
| lichnye_veschi       | 0.840291 | 0.690355 |   0.930878 |
| uslugi               | 0.894389 | 0.814691 |   0.834821 |
| bytovaja_elektronika | 0.93794  | 0.59116  |   0.946915 |
| nedvizhimost'        | 0.952774 | 0.829236 |   0.853728 |
| hobbi_i_otdyh        | 0.924595 | 0.560976 |   0.909661 |
| rabota               | 0.917464 | 0.801706 |   0.829044 |
| zhivotnye            | 0.923628 | 0.851675 |   0.852381 |

Overall table:
|      AUC |       f1 |   accuracy |
|----------+----------+------------|
| 0.960391 | 0.833941 |   0.921414 |
CPU times: user 5.64 s, sys: 172 ms, total: 5.81 s
Wall time: 960 ms


In [30]:
%%time
# NOTE(review): same call as the previous cell, but with an earlier execution
# count (In [30]); the output below presumably comes from a different model —
# confirm which run is current.
metrics_printer(df_test, y_test, catboost)

Categories table:
| Category             |      AUC |       f1 |   accuracy |
|----------------------+----------+----------+------------|
| transport            | 0.988839 | 0.914286 |   0.948699 |
| dlja_biznesa         | 0.89964  | 0.666667 |   0.941176 |
| dlja_doma_i_dachi    | 0.952568 | 0.837838 |   0.943691 |
| lichnye_veschi       | 0.831012 | 0.663391 |   0.92238  |
| uslugi               | 0.894585 | 0.81198  |   0.831845 |
| bytovaja_elektronika | 0.944732 | 0.650138 |   0.954448 |
| nedvizhimost'        | 0.958663 | 0.828817 |   0.85259  |
| hobbi_i_otdyh        | 0.935211 | 0.594937 |   0.919699 |
| rabota               | 0.912598 | 0.807611 |   0.832721 |
| zhivotnye            | 0.940499 | 0.889952 |   0.890476 |

Overall table:
|      AUC |       f1 |   accuracy |
|----------+----------+------------|
| 0.962959 | 0.842064 |   0.924801 |
CPU times: user 5.45 s, sys: 120 ms, total: 5.57 s
Wall time: 974 ms


## Отбор фичей

In [34]:
# Feature importances of type 'LossFunctionChange', computed on the validation
# pool and returned in prettified (table) form.
catboost.get_feature_importance(val_pool_text, type='LossFunctionChange', prettified=True)

Unnamed: 0,Feature Id,Importances
0,title_and_description,0.137108
1,home_phone,0.028413
2,numbers,0.027423
3,text,0.018337
4,email,0.001409
5,phone_biased,0.000693
6,phone_operators,0.000122


In [35]:
# Save the trained model to disk.
# NOTE(review): this is the same path an earlier cell loads the model from, so
# saving replaces that file.
catboost.save_model('../lib/models/catboost_classifier.cbm')