In [None]:
!pip install category_encoders
!pip install catboost
import pandas as pd
import numpy as np 
import random
import pickle
from tqdm import tqdm
from collections import Counter
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from category_encoders.leave_one_out import LeaveOneOutEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from catboost import Pool
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score


np.random.seed(123)
random.seed(123)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


###Загрузка данных

In [None]:
# Получаем доступ к гугл диску, чтобы взять оттуда данные
drive.mount('/drive')

merge_without_drop = "/drive/My Drive/merge_filled_without drop.csv"
merge_without_drop = pd.read_csv(merge_without_drop)

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


###Разделяем данные на трейн и тест

In [None]:
# Отделяем целевую переменную
y = merge_without_drop['isFraud']
X = merge_without_drop.drop('isFraud', axis=1)

# Делим на трейн и тест
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

# Удаляем ненужный столбец с ID транзакции
X_test.drop('TransactionID', axis=1, inplace=True)
X_train.drop('TransactionID', axis=1, inplace=True)

# Ресетим индексы
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

# Удаляем ненужные данные
del X, y, merge_without_drop

Также найдем категориальные колонки для последующей их обработки

In [None]:
categor_cols = []

for col in X_train.columns:
  if X_train[col].dtypes == 'O':
    categor_cols.append(col)

###Undersampling

Чтобы нивелировать сильный дисбаланс классов в целевой переменной будем производить отбор объектов для нашей выборки используя метод undersampling

* $\color{Crimson}{Random\,Undersampler:}$

In [None]:
# Определяем метод
rus = RandomUnderSampler(random_state=123) 

# Отбираем объекты
X_rus, y_rus = rus.fit_resample(X_train, y_train)

# Оценим распределение классов до и после отбора
print(f'До отбора: {Counter(y_train)}')
print(f'После отбора: {Counter(y_rus)}')

До отбора: Counter({0: 427399, 1: 15506})
После отбора: Counter({0: 15506, 1: 15506})


* $\color{Crimson}{Near\,Miss:}$

Сначала необходимо обработать категориальные признаки

In [None]:
loo = LeaveOneOutEncoder(cols=categor_cols, sigma = 0.48)
X_train_loo = loo.fit_transform(X_train, y_train)

X_test_loo = loo.transform(X_test)

Попробуем подобрать лучшее количество соседей

In [None]:
# Разделим тренировочные данные для подбора гиперпараметров
X_train_2, X_val, y_train_2, y_val = train_test_split(X_train_loo, y_train, random_state=123)

neighbors = [x for x in range(3, 25, 3)]

for n in neighbors:
  undersample = NearMiss(version=1, n_neighbors=n)

  # Трансформируем данные
  X_nm, y_nm = undersample.fit_resample(X_train_2, y_train_2)

  # Оценим качество предсказания для данного гиперпараметра
  log = LogisticRegression(max_iter=6000)
  log.fit(X_nm, y_nm)

  pred = log.predict(X_val)

  ras = roc_auc_score(y_val, pred)
  f = f1_score(y_val, pred, average='micro')

  print(f'For n_neighbors = {n} results are:')
  print(f'\t roc_auc = {ras}, f1 = {f}')

For n_neighbors = 3 results are:
	 roc_auc = 0.7127729942186086, f1 = 0.6737200502135884
For n_neighbors = 6 results are:
	 roc_auc = 0.7080975306033956, f1 = 0.6900665600982597
For n_neighbors = 9 results are:
	 roc_auc = 0.7091703348562013, f1 = 0.7011659306221608
For n_neighbors = 12 results are:
	 roc_auc = 0.7075905073043476, f1 = 0.6951872623659993
For n_neighbors = 15 results are:
	 roc_auc = 0.707359017366411, f1 = 0.6725369602716591
For n_neighbors = 18 results are:
	 roc_auc = 0.701739956775897, f1 = 0.6687619099226024
For n_neighbors = 21 results are:
	 roc_auc = 0.7063286571637879, f1 = 0.6751831080043711
For n_neighbors = 24 results are:
	 roc_auc = 0.7043361913797811, f1 = 0.6718234938181292


Наиболее хорошие результаты для двух метрик при количестве соседей равных 9

Произведем отбор при помощи найденного гиперпараметра:

In [None]:
# Определяем метод
undersample = NearMiss(version=1, n_neighbors=9)

# Отбираем объекты
X_nm, y_nm = undersample.fit_resample(X_train_loo, y_train)

# Оценим распределение классов до и после отбора
print(f'До отбора: {Counter(y_train)}')
print(f'После отбора: {Counter(y_nm)}')

До отбора: Counter({0: 427399, 1: 15506})
После отбора: Counter({0: 15506, 1: 15506})


In [None]:
# сохраним модель
pickle.dump(undersample, open('near_miss.pkl', 'wb'))

# загрузим
# undersample = pickle.load(open('near_miss.pkl', 'rb'))

###Logistic Regression

Для удобства создадим Pipeline со всеми используемыми методами.

* Сначала подберем гиперпараметры для паплайна, используя данные, отобранные методом $\color{Crimson}{RandomSampler}$:

In [None]:
pipe_lr_rs = Pipeline([
    ('encoder', LeaveOneOutEncoder(cols=categor_cols)),
    ('scaler', StandardScaler()),
    ('selector', SelectFromModel(LogisticRegression(penalty='l1', max_iter=6000, class_weight='balanced',  solver='liblinear'), threshold=0.1)), 
    ('model', LogisticRegression(max_iter=6000, class_weight='balanced'))])

In [None]:
params = {'encoder__sigma': [x/100 for x in range(48, 63, 1)],
          'model__C' : np.arange(10, 100, 10)}

gs = GridSearchCV(pipe_lr_rs, params, cv=3, scoring='roc_auc', verbose=2)

gs.fit(X_rus, y_rus)

gs.best_score_, gs.best_estimator_

Fitting 3 folds for each of 135 candidates, totalling 405 fits
[CV] END ...................encoder__sigma=0.48, model__C=10; total time= 2.3min
[CV] END ...................encoder__sigma=0.48, model__C=10; total time=  57.4s
[CV] END ...................encoder__sigma=0.48, model__C=10; total time=  50.7s
[CV] END ...................encoder__sigma=0.48, model__C=20; total time= 1.3min
[CV] END ...................encoder__sigma=0.48, model__C=20; total time=  57.4s
[CV] END ...................encoder__sigma=0.48, model__C=20; total time= 1.9min
[CV] END ...................encoder__sigma=0.48, model__C=30; total time= 2.2min
[CV] END ...................encoder__sigma=0.48, model__C=30; total time=  53.5s
[CV] END ...................encoder__sigma=0.48, model__C=30; total time=  57.6s
[CV] END ...................encoder__sigma=0.48, model__C=40; total time= 1.4min
[CV] END ...................encoder__sigma=0.48, model__C=40; total time=  58.9s
[CV] END ...................encoder__sigma=0.4

(0.8559840620509086, Pipeline(steps=[('encoder',
                  LeaveOneOutEncoder(cols=['ProductCD', 'card4', 'card6',
                                           'P_emaildomain', 'R_emaildomain',
                                           'M1', 'M2', 'M3', 'M4', 'M5', 'M6',
                                           'M7', 'M8', 'M9', 'id_12', 'id_15',
                                           'id_16', 'id_23', 'id_27', 'id_28',
                                           'id_29', 'id_30', 'id_31', 'id_33',
                                           'id_34', 'id_35', 'id_36', 'id_37',
                                           'id_38', 'DeviceType', ...],
                                     sigma=0.48)),
                 ('scaler', StandardScaler()),
                 ('selector',
                  SelectFromModel(estimator=LogisticRegression(class_weight='balanced',
                                                               max_iter=6000,
                                       

Применим подобранные гиперпараметры и посмотрим на качество предсказания:

In [None]:
pipe_lr_rs = Pipeline([
    ('encoder', LeaveOneOutEncoder(cols=categor_cols, sigma=0.48)),
    ('scaler', StandardScaler()),
    ('selector', SelectFromModel(LogisticRegression(penalty='l1', max_iter=6000, class_weight='balanced',  solver='liblinear'), threshold=0.1)), 
    ('model', LogisticRegression(C=50, max_iter=6000, class_weight='balanced'))])

pipe_lr_rs.fit(X_rus, y_rus)

In [None]:
# сохраним модель
# pickle.dump(pipe_lr_rs, open('pipe_lr_rs.pkl', 'wb'))

# загрузим
pipe_lr_rs = pickle.load(open('pipe_lr_rs.pkl', 'rb'))

In [None]:
pred = pipe_lr_rs.predict(X_test)

# Оценим качество
ras = roc_auc_score(y_test, pred)
f = f1_score(y_test, pred, average='micro')
cm = confusion_matrix(y_test, pred)
pr = precision_score(y_test, pred)
rc = recall_score(y_test, pred)


print(f'ROC_AUC: {ras}, f1: {f}')
print(f'precision_score: {pr}')
print(f'recall_score: {rc}')
print(f'\nconfusion_matrix:\n{cm}')

ROC_AUC: 0.7704267842385979, f1: 0.8360957767467063
precision_score: 0.1374385924825774
recall_score: 0.699825479930192

confusion_matrix:
[[119828  22650]
 [  1548   3609]]


* Теперь будем производить подбор гиперпараметров для данных отобранных методом $\color{Crimson}{NearMiss}$:

In [None]:
pipe_lr_nm = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectFromModel(LogisticRegression(penalty='l1', max_iter=6000, class_weight='balanced',  solver='liblinear'), threshold=0.1)), 
    ('model', LogisticRegression(max_iter=6000, class_weight='balanced'))])

In [None]:
params = {'model__C' : np.arange(10, 100, 10)}

gs = GridSearchCV(pipe_lr_nm, params, cv=3, scoring='roc_auc', verbose=2)

gs.fit(X_nm, y_nm)

gs.best_score_, gs.best_estimator_

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] END ........................................model__C=10; total time=  23.1s
[CV] END ........................................model__C=10; total time=  19.5s
[CV] END ........................................model__C=10; total time=  12.3s
[CV] END ........................................model__C=20; total time=  16.5s
[CV] END ........................................model__C=20; total time=  14.4s
[CV] END ........................................model__C=20; total time=  14.9s
[CV] END ........................................model__C=30; total time=  18.1s
[CV] END ........................................model__C=30; total time=  15.2s
[CV] END ........................................model__C=30; total time=  14.9s
[CV] END ........................................model__C=40; total time=  27.0s
[CV] END ........................................model__C=40; total time=  15.3s
[CV] END ........................................

(0.9181075738012341, Pipeline(steps=[('scaler', StandardScaler()),
                 ('selector',
                  SelectFromModel(estimator=LogisticRegression(class_weight='balanced',
                                                               max_iter=6000,
                                                               penalty='l1',
                                                               solver='liblinear'),
                                  threshold=0.1)),
                 ('model',
                  LogisticRegression(C=30, class_weight='balanced',
                                     max_iter=6000))]))

Применим данные гиперпараметры:

In [None]:
pipe_lr_nm = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectFromModel(LogisticRegression(penalty='l1', max_iter=6000, class_weight='balanced',  solver='liblinear'), threshold=0.1)), 
    ('model', LogisticRegression(C=30, max_iter=6000, class_weight='balanced'))])

pipe_lr_nm.fit(X_nm, y_nm)

In [None]:
# сохраним модель
# pickle.dump(pipe_lr_nm, open('pipe_lt_nm.pkl', 'wb'))

# загрузим
pipe_lr_nm = pickle.load(open('pipe_lt_nm.pkl', 'rb'))

In [None]:
pred = pipe_lr_nm.predict(X_test_loo)

# Оценим качество
ras = roc_auc_score(y_test, pred)
f = f1_score(y_test, pred, average='micro')
cm = confusion_matrix(y_test, pred)
pr = precision_score(y_test, pred)
rc = recall_score(y_test, pred)


print(f'ROC_AUC: {ras}, f1: {f}')
print(f'precision_score: {pr}')
print(f'recall_score: {rc}')
print(f'\nconfusion_matrix:\n{cm}')

ROC_AUC: 0.7215573083260822, f1: 0.6919903816845598
precision_score: 0.08079274632949299
recall_score: 0.7533449680046539

confusion_matrix:
[[98277 44201]
 [ 1272  3885]]


###SVC

* Подберем гиперпараметры для паплайна, используя данные, отобранные методом $\color{Crimson}{RandomSampler}$:

In [None]:
pipe = Pipeline([
    ('encoder', LeaveOneOutEncoder(cols=categor_cols)),
    ('scaler', StandardScaler()),
    ('selector', SelectFromModel(LogisticRegression(penalty='l1', max_iter=6000, class_weight='balanced',  solver='liblinear'), threshold=0.1)), 
    ('model', SVC(class_weight='balanced'))])

In [None]:
params = {'encoder__sigma': [x/100 for x in range(50, 58, 5)],
          'model__C' : [5, 10, 15],
          'model__gamma': [1, 0.1, 0.01],
          'model__kernel': ['linear', 'rbf']}

gs = GridSearchCV(pipe, params, cv=2, scoring='roc_auc', verbose=2)

gs.fit(X_rus, y_rus)

gs.best_score_, gs.best_estimator_

Fitting 2 folds for each of 36 candidates, totalling 72 fits
[CV] END encoder__sigma=0.5, model__C=5, model__gamma=1, model__kernel=linear; total time= 3.9min
[CV] END encoder__sigma=0.5, model__C=5, model__gamma=1, model__kernel=linear; total time= 4.3min
[CV] END encoder__sigma=0.5, model__C=5, model__gamma=1, model__kernel=rbf; total time= 4.8min
[CV] END encoder__sigma=0.5, model__C=5, model__gamma=1, model__kernel=rbf; total time= 4.0min
[CV] END encoder__sigma=0.5, model__C=5, model__gamma=0.1, model__kernel=linear; total time= 5.0min
[CV] END encoder__sigma=0.5, model__C=5, model__gamma=0.1, model__kernel=linear; total time= 4.3min
[CV] END encoder__sigma=0.5, model__C=5, model__gamma=0.1, model__kernel=rbf; total time= 3.0min
[CV] END encoder__sigma=0.5, model__C=5, model__gamma=0.1, model__kernel=rbf; total time= 2.9min
[CV] END encoder__sigma=0.5, model__C=5, model__gamma=0.01, model__kernel=linear; total time= 3.8min
[CV] END encoder__sigma=0.5, model__C=5, model__gamma=0.01

(0.8679204601426719, Pipeline(steps=[('encoder',
                  LeaveOneOutEncoder(cols=['ProductCD', 'card4', 'card6',
                                           'P_emaildomain', 'R_emaildomain',
                                           'M1', 'M2', 'M3', 'M4', 'M5', 'M6',
                                           'M7', 'M8', 'M9', 'id_12', 'id_15',
                                           'id_16', 'id_23', 'id_27', 'id_28',
                                           'id_29', 'id_30', 'id_31', 'id_33',
                                           'id_34', 'id_35', 'id_36', 'id_37',
                                           'id_38', 'DeviceType', ...],
                                     sigma=0.55)),
                 ('scaler', StandardScaler()),
                 ('selector',
                  SelectFromModel(estimator=LogisticRegression(class_weight='balanced',
                                                               max_iter=6000,
                                       

(0.8679204601426719, Pipeline(steps=[('encoder',
                  LeaveOneOutEncoder(cols=['ProductCD', 'card4', 'card6',
                                           'P_emaildomain', 'R_emaildomain',
                                           'M1', 'M2', 'M3', 'M4', 'M5', 'M6',
                                           'M7', 'M8', 'M9', 'id_12', 'id_15',
                                           'id_16', 'id_23', 'id_27', 'id_28',
                                           'id_29', 'id_30', 'id_31', 'id_33',
                                           'id_34', 'id_35', 'id_36', 'id_37',
                                           'id_38', 'DeviceType', ...],
                                     sigma=0.55)),
                 ('scaler', StandardScaler()),
                 ('selector',
                  SelectFromModel(estimator=LogisticRegression(class_weight='balanced',
                                                               max_iter=6000,
                                       

Применим подобранные гиперпараметры:

In [None]:
pipe_svc_rs = Pipeline([
    ('encoder', LeaveOneOutEncoder(cols=categor_cols, sigma=0.55)),
    ('scaler', StandardScaler()),
    ('selector', SelectFromModel(LogisticRegression(penalty='l1', max_iter=6000, class_weight='balanced',  solver='liblinear'), threshold=0.1)), 
    ('model', SVC(C=10, class_weight='balanced',  gamma=0.01, kernel='rbf'))])

pipe_svc_rs.fit(X_rus, y_rus)

In [None]:
# сохраним модель
#pickle.dump(pipe_svc_rs, open('pipe_svc_rs.pkl', 'wb'))

# загрузим
pipe_svc_rs = pickle.load(open('pipe_svc_rs.pkl', 'rb'))

In [None]:
pred = pipe_svc_rs.predict(X_test)

# Оценим качество
ras = roc_auc_score(y_test, pred)
f = f1_score(y_test, pred, average='micro')
cm = confusion_matrix(y_test, pred)
pr = precision_score(y_test, pred)
rc = recall_score(y_test, pred)


print(f'ROC_AUC: {ras}, f1: {f}')
print(f'precision_score: {pr}')
print(f'recall_score: {rc}')
print(f'\nconfusion_matrix:\n{cm}')

ROC_AUC: 0.805156854101528, f1: 0.8713855115656857
precision_score: 0.17686089435073127
recall_score: 0.7339538491370952

confusion_matrix:
[[124862  17616]
 [  1372   3785]]


* Посмотрим на качество предсказания для данных отобранных методом $\color{Crimson}{NearMiss}$:

In [None]:
pipe_svc_nm = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectFromModel(LogisticRegression(penalty='l1', max_iter=6000, class_weight='balanced', solver='liblinear'), threshold=0.1)), 
    ('model', SVC(C=10, gamma=0.1, kernel='rbf'))])

pipe_svc_nm.fit(X_nm, y_nm)

In [None]:
# сохраним модель
#pickle.dump(pipe_svc_nm, open('pipe_svc_nm.pkl', 'wb'))

# загрузим
pipe_svc_nm = pickle.load(open('pipe_svc_nm.pkl', 'rb'))

In [None]:
pred = pipe_svc_nm.predict(X_test_loo)

# Оценим качество
ras = roc_auc_score(y_test, pred)
f = f1_score(y_test, pred, average='micro')
cm = confusion_matrix(y_test, pred)
pr = precision_score(y_test, pred)
rc = recall_score(y_test, pred)


print(f'ROC_AUC: {ras}, f1: {f}')
print(f'precision_score: {pr}')
print(f'recall_score: {rc}')
print(f'\nconfusion_matrix:\n{cm}')

ROC_AUC: 0.7370811457828585, f1: 0.642232532935957
precision_score: 0.07683429220825343
recall_score: 0.8390537133992632

confusion_matrix:
[[90489 51989]
 [  830  4327]]


### CatBoost

Подберем гиперпараметры, разделив трейн на две части:

In [None]:
# Разделяем трейн
X_train_2, X_val, y_train_2, y_val = train_test_split(X_train, y_train, random_state=123)

# Применим Random Undersampler для данных, на которых будем тренероваться
rus = RandomUnderSampler() 
X_rus, y_rus = rus.fit_resample(X_train_2, y_train_2)

# Выбираем промежутки гиперпараметров
iterat = [x for x in range(2500, 5500, 1000)]
l_r = [x/100 for x in range(9, 60, 10)]

# Отбираем гиперпараметры
for i in iterat:
  for l in l_r:
    cat_boo = CatBoostClassifier(eval_metric='Logloss', iterations=i, learning_rate=l)

    # Закодируем категориальные признаки
    pool_train = Pool(X_rus, y_rus,
                      cat_features=categor_cols,
                      feature_names=list(X_rus.columns))

    pool_val = Pool(X_val,
                    cat_features=categor_cols,
                    feature_names=list(X_val.columns))
    

    cat_boo.fit(pool_train, verbose=False)
    pred = cat_boo.predict(pool_val)

    # Оценим качество модели
    ras = roc_auc_score(y_val, pred)
    f = f1_score(y_val, pred, average='micro')

    print(f'For iter {i} and l_r {l} results are:')
    print(f'\t\t * ROC_AUC: {ras} \n\t\t * f1 {f}')

For iter 2500 and l_r 0.09 results are:
		 * ROC_AUC: 0.8801078237708918 
		 * f1 0.8953552430753113
For iter 2500 and l_r 0.19 results are:
		 * ROC_AUC: 0.8829077081924781 
		 * f1 0.8907583516215557
For iter 2500 and l_r 0.29 results are:
		 * ROC_AUC: 0.8811968533869559 
		 * f1 0.8847706521444634
For iter 2500 and l_r 0.39 results are:
		 * ROC_AUC: 0.8783574258468911 
		 * f1 0.8812394447605372
For iter 2500 and l_r 0.49 results are:
		 * ROC_AUC: 0.8743993968356328 
		 * f1 0.8782320481905949
For iter 2500 and l_r 0.59 results are:
		 * ROC_AUC: 0.8740817613592544 
		 * f1 0.8763987103416512
For iter 3500 and l_r 0.09 results are:
		 * ROC_AUC: 0.8823537625988678 
		 * f1 0.8962764276102486
For iter 3500 and l_r 0.19 results are:
		 * ROC_AUC: 0.8846247840074348 
		 * f1 0.8918782230169697
For iter 3500 and l_r 0.29 results are:
		 * ROC_AUC: 0.8820391383557792 
		 * f1 0.884933214121217
For iter 3500 and l_r 0.39 results are:
		 * ROC_AUC: 0.8791152970121683 
		 * f1 0.88197097

Наиболее хорошие гиперпараметры: 3500 итераций и learnin rate = 0.19

Применим подобранные гиперпараметры:

In [None]:
cat_boo = CatBoostClassifier(eval_metric='Logloss', iterations=3500, learning_rate=0.19)

# Закодируем категориальные признаки
pool_train = Pool(X_rus, y_rus,
                  cat_features=categor_cols,
                  feature_names=list(X_rus.columns))

pool_test = Pool(X_test,
                cat_features=categor_cols,
                feature_names=list(X_test.columns))


cat_boo.fit(pool_train)

0:	learn: 0.6120213	total: 427ms	remaining: 24m 53s
1:	learn: 0.5607972	total: 772ms	remaining: 22m 29s
2:	learn: 0.5308029	total: 1.09s	remaining: 21m 12s
3:	learn: 0.5089092	total: 1.37s	remaining: 19m 56s
4:	learn: 0.4939633	total: 1.74s	remaining: 20m 14s
5:	learn: 0.4832403	total: 2.04s	remaining: 19m 45s
6:	learn: 0.4736755	total: 2.38s	remaining: 19m 48s
7:	learn: 0.4664802	total: 2.69s	remaining: 19m 34s
8:	learn: 0.4593286	total: 3.05s	remaining: 19m 43s
9:	learn: 0.4535384	total: 3.36s	remaining: 19m 34s
10:	learn: 0.4488778	total: 3.7s	remaining: 19m 33s
11:	learn: 0.4462314	total: 4s	remaining: 19m 23s
12:	learn: 0.4409793	total: 4.3s	remaining: 19m 14s
13:	learn: 0.4374476	total: 4.6s	remaining: 19m 4s
14:	learn: 0.4346175	total: 4.9s	remaining: 18m 59s
15:	learn: 0.4324369	total: 5.23s	remaining: 18m 58s
16:	learn: 0.4299560	total: 5.56s	remaining: 18m 58s
17:	learn: 0.4270235	total: 5.91s	remaining: 19m 3s
18:	learn: 0.4251142	total: 6.23s	remaining: 19m 1s
19:	learn: 0.

In [None]:
# сохраним модель
pickle.dump(cat_boo, open('cat_boo.pkl', 'wb'))

# загрузим
# cat_boo = pickle.load(open('cat_boo.pkl', 'rb'))

In [None]:
pred = cat_boo.predict(pool_test)

# Оценим качество модели
ras = roc_auc_score(y_test, pred)
f = f1_score(y_test, pred, average='micro')
cm = confusion_matrix(y_test, pred)
pr = precision_score(y_test, pred)
rc = recall_score(y_test, pred)


print(f'ROC_AUC: {ras}, f1: {f}')
print(f'precision_score: {pr}')
print(f'recall_score: {rc}')
print(f'\nconfusion_matrix:\n{cm}')

ROC_AUC: 0.8927433504507, f1: 0.8984929047989977
precision_score: 0.24097401570653032
recall_score: 0.8865619546247818

confusion_matrix:
[[128077  14401]
 [   585   4572]]


--------------------------------------------------

### Результаты

Undersampling | Model | ROC-AUC | F1-score |precision_score | recall_score 
--- | --- | --- | --- | ---| --- 
Random Sampler | Logistic Regression | 0.77 | 0.83 | 0.14 | 0.70 
Near Miss | Logistic Regression | 0.72 | 0.69 | 0.08 | 0.75
Random Sampler | SVC | 0.81 | 0.87 | 0.18 | 0.73
Near Miss | SVC | 0.73 | 0.64 | 0.07 | 0.83
Random Sampler | CatBoost Classifier | 0.89 | 0.90 | 0.24 | 0.89