## Подключаем необходимые библиотеки

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import GridSearchCV
from time import time
import pandas as pd
import pickle

## Загружаем предобработанный текст

In [5]:
X_df = pd.read_csv('labeled_clean.zip', compression='infer')

In [6]:
X_df

Unnamed: 0.1,Unnamed: 0,абсолютно,абсолютное,абу,аварии,августа,августе,австралии,авто,автобус,...,япония,японские,японский,японцы,ярко,ясен,ясно,ящик,ёмкости,toxic
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14407,14407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
14408,14408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
14409,14409,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14410,14410,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Разделим данные на две части - для обучения и валидации и попробуем обучить две модели
Используем метрику **roc-auc**
* Модель 1 - логистическая регрессия
* Модель 2 - CatBoost

In [94]:
X = X_df.drop(['toxic'], axis=1)
y = X_df['toxic']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.25, random_state=1, stratify=y)

In [101]:
lr = LogisticRegression()
t0 = time()
lr.fit(X_train, y_train)
print(f"finished in {time() - t0:.3f} s")

finished in 0.256 s


In [96]:
roc_auc_score(y_valid, lr.predict_proba(X_valid)[:,1])

0.9062523772705958

In [98]:
t0 = time()
cb = CatBoostClassifier()
cb.fit(X_train, y_train, verbose=False)
print(f"finished in {time() - t0:.3f} s")

finished in 29.498 s


In [99]:
roc_auc_score(y_valid, cb.predict_proba(X_valid)[:,1])

0.8702904108338532

## Попробуем перебрать параметры для обоих моделей и выбрать наилучший вариант

In [109]:
params = {
    'C': [i/10 for i in range(1, 10)],
    'max_iter': [i for i in range(100, 1100, 100)]
    }
lr = LogisticRegression()
gs = GridSearchCV(lr, param_grid=params, scoring='roc_auc', cv=5, 
                  n_jobs=8)

In [110]:
t0 = time()
gs.fit(X, y)
print(f"finished in {time() - t0:.3f} s")

finished in 425.179 s


In [111]:
gs.best_score_

0.906465714258054

In [112]:
gs.best_params_

{'C': 0.9, 'max_iter': 100}

In [114]:
cb = CatBoostClassifier()
params = {
    'depth': [5, 6, 7]
}
cv_cb = GridSearchCV(cb, param_grid=params, scoring='roc_auc', cv=5, 
                  n_jobs=8)

In [115]:
t0=time()
cv_cb.fit(X, y, verbose=False)
print(f"finished in {time() - t0:.3f} s")

finished in 589.425 s


In [116]:
cv_cb.best_score_

0.8647645587581534

## Логистическая регрессия показала лучший результат, сохраняем модель в файл

In [None]:
with open('model.pkl', 'wb') as f:
    pickle.dump(gs.best_estimator_, f)