In [None]:
import pandas as pd
import numpy as np 
import random
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from category_encoders.woe import WOEEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from xgboost import XGBClassifier

np.random.seed(42)
random.seed(42)

In [None]:
df_identity = pd.read_csv("identity.csv")
df_identity

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0.0,70787.0,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144228,3577521,-15.0,145955.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 66.0 for android,,,,F,F,T,F,mobile,F3111 Build/33.3.A.1.97
144229,3577526,-5.0,172059.0,,,1.0,-5.0,,,,...,chrome 55.0 for android,32.0,855x480,match_status:2,T,F,T,F,mobile,A574BL Build/NMF26F
144230,3577529,-20.0,632381.0,,,-1.0,-36.0,,,,...,chrome 65.0 for android,,,,F,F,T,F,mobile,Moto E (4) Plus Build/NMA26.42-152
144231,3577531,-5.0,55528.0,0.0,0.0,0.0,-7.0,,,0.0,...,chrome 66.0,24.0,2560x1600,match_status:2,T,F,T,F,desktop,MacOS


In [None]:
df_transaction = pd.read_csv("transaction.csv")

In [None]:
merge_df = pd.merge(df_transaction, df_identity, on= 'TransactionID', how = 'left')

In [None]:
# Наш файл очень много весит, попробуем уменьшить его

In [None]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
main_df = reduce_mem_usage(merge_df)

Mem. usage decreased to 650.48 Mb (66.8% reduction)


# Preprocess

In [None]:
# Добавим новый признак, а именно переведем секунды в часы

def make_hour_feature(f):
    #Creates an hour of the day feature, encoded as 0-23.  
    hours = f / (3600)        
    encoded_hours = np.floor(hours) % 24
    return encoded_hours

main_df['hour'] = make_hour_feature(main_df['TransactionDT'])

  main_df['hour'] = make_hour_feature(main_df['TransactionDT'])


In [None]:
# Создаим списики с категориальными и числовыми признаками

list_cat_col = main_df.select_dtypes(include=["object"]).columns.to_list()

exclude = ['TransactionID', 'TransactionDT', 'isFraud']

num_features = [f for f in main_df.columns if (f not in list_cat_col) & (f not in exclude)]

In [None]:
# Уберем признаки с 90% пропусками
col_na = main_df.isna().sum()
to_drop = col_na[(col_na / main_df.shape[0]) > 0.9].index

use_cols = [f for f in main_df.columns if f not in to_drop]
list_cat_col = [f for f in list_cat_col if f not in to_drop]
num_features = [f for f in num_features if f not in to_drop]

main_df[list_cat_col] = main_df[list_cat_col].astype(str)
main_df[num_features] = main_df[num_features].astype(np.float64)
main_df = main_df[use_cols]

In [None]:
# Заполним средним числом числовые признаки, а категориальные 'missing'
median_values = main_df[num_features].median() 

main_df[num_features] = main_df[num_features].fillna(median_values)

main_df[list_cat_col] = main_df[list_cat_col].replace("nan", "missing")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df[num_features] = main_df[num_features].fillna(median_values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df[list_cat_col] = main_df[list_cat_col].replace("nan", "missing")


In [None]:
main_df

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,hour
0,2987000,0,86400,68.500000,W,13926.0,361.0,150.0,discover,142.0,...,24.0,missing,missing,missing,missing,missing,missing,missing,missing,0.0
1,2987001,0,86401,29.000000,W,2755.0,404.0,150.0,mastercard,102.0,...,24.0,missing,missing,missing,missing,missing,missing,missing,missing,0.0
2,2987002,0,86469,59.000000,W,4663.0,490.0,150.0,visa,166.0,...,24.0,missing,missing,missing,missing,missing,missing,missing,missing,0.0
3,2987003,0,86499,50.000000,W,18132.0,567.0,150.0,mastercard,117.0,...,24.0,missing,missing,missing,missing,missing,missing,missing,missing,0.0
4,2987004,0,86506,50.000000,H,4497.0,514.0,150.0,mastercard,102.0,...,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,15811047,49.000000,W,6550.0,361.0,150.0,visa,226.0,...,24.0,missing,missing,missing,missing,missing,missing,missing,missing,23.0
590536,3577536,0,15811049,39.500000,W,10444.0,225.0,150.0,mastercard,224.0,...,24.0,missing,missing,missing,missing,missing,missing,missing,missing,23.0
590537,3577537,0,15811079,30.953125,W,12037.0,595.0,150.0,mastercard,224.0,...,24.0,missing,missing,missing,missing,missing,missing,missing,missing,23.0
590538,3577538,0,15811088,117.000000,W,7826.0,481.0,150.0,mastercard,224.0,...,24.0,missing,missing,missing,missing,missing,missing,missing,missing,23.0


# Split data

In [None]:
y = main_df['isFraud']
main_df = main_df.drop('isFraud', axis=1)
data = main_df.drop(columns=['TransactionID', 'TransactionDT'])

X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=42)

# Undersampling

In [None]:
# На таргет не сбалансирован, попробуем это исправит с помощью метода RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus= rus.fit_resample(X_train, y_train)

print(f'До отбора: {Counter(y_train)}')
print(f'После отбора: {Counter(y_train_rus)}')

До отбора: Counter({0: 455826, 1: 16606})
После отбора: Counter({0: 16606, 1: 16606})


In [None]:
X_train_rus

Unnamed: 0,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,...,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,hour
0,53.937500,W,11333.0,555.0,150.0,visa,226.0,debit,433.0,87.0,...,24.0,missing,missing,missing,missing,missing,missing,missing,missing,13.0
1,200.000000,W,18132.0,567.0,150.0,mastercard,117.0,debit,272.0,87.0,...,24.0,missing,missing,missing,missing,missing,missing,missing,missing,0.0
2,100.000000,H,15497.0,490.0,150.0,visa,226.0,debit,299.0,87.0,...,24.0,2880x1800,match_status:2,T,F,T,T,desktop,Windows,21.0
3,39.000000,W,17188.0,321.0,150.0,visa,226.0,debit,315.0,87.0,...,24.0,missing,missing,missing,missing,missing,missing,missing,missing,19.0
4,226.000000,W,17570.0,555.0,150.0,visa,226.0,debit,181.0,87.0,...,24.0,missing,missing,missing,missing,missing,missing,missing,missing,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33207,40.000000,W,3507.0,361.0,150.0,visa,226.0,credit,204.0,87.0,...,24.0,missing,missing,missing,missing,missing,missing,missing,missing,21.0
33208,335.000000,W,6019.0,583.0,150.0,visa,226.0,credit,325.0,87.0,...,24.0,missing,missing,missing,missing,missing,missing,missing,missing,13.0
33209,141.000000,W,7919.0,194.0,150.0,mastercard,202.0,debit,469.0,87.0,...,24.0,missing,missing,missing,missing,missing,missing,missing,missing,5.0
33210,3.828125,C,9917.0,142.0,185.0,visa,138.0,debit,299.0,87.0,...,24.0,missing,missing,F,F,T,F,mobile,RNE-L03 Build/HUAWEIRNE-L03,8.0


# Закодируем фичи с помощью ohe и woe, отберем признаки по Log Reg, и обучим на модели Log Reg и Random Forest

Категориальные фичи попробуем закодировать с помощью двух методов. One Hot Encoded - для фичией с менее 5ти уникальными признаками и Weight of Evidence - для фичей с более 5ти уникальными признками

In [None]:
# для начала отберем эти признаки
to_ohe=[]
to_emb=[]
for c in list_cat_col:
    if X_train_rus[c].nunique() < 5:
        to_ohe.append(c)
    else:
        to_emb.append(c)

In [None]:
scaler = StandardScaler()

# Категориальный фичи < 5 уникальных значений будем кадировать OHE
ohe = OneHotEncoder(handle_unknown='ignore')

# Категориальный фичи >= 5 уникальных значений будем кадировать Weight of Evidence
woe = WOEEncoder()


column_trans = ColumnTransformer(
    [ ('scaler',scaler, num_features),
    ('ohe', ohe, to_ohe),
    ('woe', woe, to_emb)], remainder='passthrough', n_jobs=-1)



train_X_transformed = column_trans.fit_transform(X_train_rus, y_train_rus)
test_X_transformed = column_trans.transform(X_test)


print(train_X_transformed.shape)

(33212, 465)


In [None]:
train_X_transformed = pd.DataFrame(train_X_transformed)

Выберем признаки с помощью модели Log Reg

In [None]:
feature_log_reg = LogisticRegression(penalty='l1', max_iter=5000, solver='liblinear').fit(train_X_transformed, y_train_rus)
model = SelectFromModel(feature_log_reg, threshold=0.1, prefit=True)
X_train_new = model.transform(train_X_transformed)
X_test_new = model.transform(test_X_transformed)


Используем модель Log Reg для обучения модели, подбираем нужные параметры с помощью GridSearch

In [None]:
grid={"C":np.logspace(0,3,100), "penalty":["l1","l2"]}# l1 lasso l2 ridge
logreg = LogisticRegression(max_iter=10000)
logreg_cv = GridSearchCV(logreg,grid, scoring='roc_auc', cv=3, verbose=2)
logreg_cv.fit(X_train_new,y_train_rus)
print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

Fitting 3 folds for each of 200 candidates, totalling 600 fits
[CV] END ..................................C=1.0, penalty=l1; total time=   0.0s
[CV] END ..................................C=1.0, penalty=l1; total time=   0.0s
[CV] END ..................................C=1.0, penalty=l1; total time=   0.0s
[CV] END ..................................C=1.0, penalty=l2; total time=   1.8s
[CV] END ..................................C=1.0, penalty=l2; total time=   1.4s
[CV] END ..................................C=1.0, penalty=l2; total time=   1.5s
[CV] END ...................C=1.0722672220103233, penalty=l1; total time=   0.0s
[CV] END ...................C=1.0722672220103233, penalty=l1; total time=   0.0s
[CV] END ...................C=1.0722672220103233, penalty=l1; total time=   0.0s
[CV] END ...................C=1.0722672220103233, penalty=l2; total time=   1.8s
[CV] END ...................C=1.0722672220103233, penalty=l2; total time=   1.7s
[CV] END ...................C=1.07226722201032

300 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
300 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.87252193   

tuned hpyerparameters :(best parameters)  {'C': 6.1359072734131725, 'penalty': 'l2'}
accuracy : 0.8727298168641365


In [None]:
logreg_cv.best_params_

In [None]:
best_estim = logreg_cv.best_estimator_
best_estim.fit(X_train_new,y_train_rus)
prediction = best_estim.predict(X_test_new)

In [None]:
ras = roc_auc_score(y_test, prediction)
f = f1_score(y_test, prediction, average='micro')
cm = confusion_matrix(y_test, prediction)
pr = precision_score(y_test, prediction)
rc = recall_score(y_test, prediction)


print(f'ROC_AUC: {ras}, f1: {f}')
print(f'precision_score: {pr}')
print(f'recall_score: {rc}')
print(f'\nconfusion_matrix:\n{cm}')

ROC_AUC: 0.7875878016136968, f1: 0.2311445806278096
precision_score: 0.13691397360036414
recall_score: 0.7414345575548434

confusion_matrix:
[[95089 18962]
 [ 1049  3008]]


Далее попробуем обучить на RandomForest

In [None]:
clf_rf_down = RandomForestClassifier(random_state=42)
model_rf_down = clf_rf_down.fit(train_X_transformed, y_train_rus)
x_prob_rf = model_rf_down.predict(test_X_transformed)


In [None]:
ras = roc_auc_score(y_test, x_prob_rf)
f = f1_score(y_test, x_prob_rf, average='micro')
cm = confusion_matrix(y_test, x_prob_rf)
pr = precision_score(y_test, x_prob_rf)
rc = recall_score(y_test, x_prob_rf)


print(f'ROC_AUC: {ras}, f1: {f}')
print(f'precision_score: {pr}')
print(f'recall_score: {rc}')
print(f'\nconfusion_matrix:\n{cm}')

ROC_AUC: 0.8519471504113528, f1: 0.8699326040573034
precision_score: 0.18703283317645755
recall_score: 0.8326349519349273

confusion_matrix:
[[99368 14683]
 [  679  3378]]


# Попробуем использовать PCA для выбора фичей, вместо Log Reg

In [None]:
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(train_X_transformed)
X_test_pca = pca.transform(test_X_transformed)

Для начала обучим на Log Reg

In [None]:
grid={"C":np.logspace(0,3,20), "penalty":["l1","l2"]}# l1 lasso l2 ridge
logreg = LogisticRegression(max_iter=5000)
logreg_cv = GridSearchCV(logreg,grid, scoring='roc_auc', cv=3, verbose=2)
logreg_cv.fit(X_train_pca,y_train_rus)
print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

Fitting 3 folds for each of 40 candidates, totalling 120 fits
[CV] END ..................................C=1.0, penalty=l1; total time=   0.0s
[CV] END ..................................C=1.0, penalty=l1; total time=   0.0s
[CV] END ..................................C=1.0, penalty=l1; total time=   0.0s
[CV] END ..................................C=1.0, penalty=l2; total time=   0.4s
[CV] END ..................................C=1.0, penalty=l2; total time=   0.5s
[CV] END ..................................C=1.0, penalty=l2; total time=   0.5s
[CV] END ...................C=1.4384498882876628, penalty=l1; total time=   0.0s
[CV] END ...................C=1.4384498882876628, penalty=l1; total time=   0.0s
[CV] END ...................C=1.4384498882876628, penalty=l1; total time=   0.0s
[CV] END ...................C=1.4384498882876628, penalty=l2; total time=   0.4s
[CV] END ...................C=1.4384498882876628, penalty=l2; total time=   0.5s
[CV] END ...................C=1.438449888287662

60 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.8503982      

tuned hpyerparameters :(best parameters)  {'C': 18.329807108324356, 'penalty': 'l2'}
accuracy : 0.8504361993563411


In [None]:
best_estim_pca = logreg_cv.best_estimator_
best_estim_pca.fit(X_train_pca,y_train_rus)
prediction_pca = best_estim_pca.predict(X_test_pca)

In [None]:
ras = roc_auc_score(y_test, prediction_pca)
f = f1_score(y_test, prediction_pca, average='micro')
cm = confusion_matrix(y_test, prediction_pca)
pr = precision_score(y_test, prediction_pca)
rc = recall_score(y_test, prediction_pca)


print(f'ROC_AUC: {ras}, f1: {f}')
print(f'precision_score: {pr}')
print(f'recall_score: {rc}')
print(f'\nconfusion_matrix:\n{cm}')

ROC_AUC: 0.7632612095898952, f1: 0.8203169980018288
precision_score: 0.12457897729758104
recall_score: 0.7019965491742667

confusion_matrix:
[[94038 20013]
 [ 1209  2848]]


Обучим на модели RandomForest

In [None]:
clf_rf_down = RandomForestClassifier(random_state=42)
model_rf_down = clf_rf_down.fit(X_train_pca, y_train_rus)
# y_pred = model_rf_down.predict(X_test)
x_prob_rf = model_rf_down.predict(X_test_pca)

ROC_AUC: 0.8123089139834864


In [None]:
ras = roc_auc_score(y_test, x_prob_rf)
f = f1_score(y_test, x_prob_rf, average='micro')
cm = confusion_matrix(y_test, x_prob_rf)
pr = precision_score(y_test, x_prob_rf)
rc = recall_score(y_test, x_prob_rf)


print(f'ROC_AUC: {ras}, f1: {f}')
print(f'precision_score: {pr}')
print(f'recall_score: {rc}')
print(f'\nconfusion_matrix:\n{cm}')

ROC_AUC: 0.8519471504113528, f1: 0.8699326040573034
precision_score: 0.18703283317645755
recall_score: 0.8326349519349273

confusion_matrix:
[[99368 14683]
 [  679  3378]]


# XGBoost Model

In [None]:
# подбираем оптимальные параметры для XGBoost
parameters = {
              'max_depth': [5, 10, 15, 20, 25],
              'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
              'n_estimators': range(100, 1000,200 ),
              'min_child_weight': [0, 2, 5, 10, 20],
              'max_delta_step': [0, 0.2, 0.6, 1, 2],
              'subsample': [0.6, 0.7, 0.8, 0.85, 0.95],
              'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
              'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
              'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],
              'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1]}
 


estimator = XGBClassifier(
    objective= 'binary:logistic',
    nthread=4,
    seed=42
)

clf = GridSearchCV(estimator=estimator, 
                   param_grid=params,
                   scoring='neg_mean_squared_error', 
                   cv=4,
                   verbose=2)
clf.fit(train_X_transformed, y_train_rus)

print("Best parameters:", clf.best_params_)
print("accuracy :",clf.best_score_)

Fitting 4 folds for each of 54 candidates, totalling 216 fits
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   1.6s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   1.6s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   1.6s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   1.5s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=500; total time=   7.1s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=500; total time=   7.2s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=500; total time=   7.2s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=500; total time=   7.3s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=1000; total time=  14.3s
[CV] END colsample_bytree=0.

In [None]:
best_estim_XGB = clf.best_estimator_
best_estim_XGB.fit(train_X_transformed,y_train_rus)
prediction_XGB = best_estim_XGB.predict(test_X_transformed)

In [None]:
ras = roc_auc_score(y_test, prediction_XGB)
f = f1_score(y_test, prediction_XGB, average='micro')
cm = confusion_matrix(y_test, prediction_XGB)
pr = precision_score(y_test, prediction_XGB)
rc = recall_score(y_test, prediction_XGB)


print(f'ROC_AUC: {ras}, f1: {f}')
print(f'precision_score: {pr}')
print(f'recall_score: {rc}')
print(f'\nconfusion_matrix:\n{cm}')

ROC_AUC: 0.8971257441192427, f1: 0.9094388187083009
precision_score: 0.2596481065817102
recall_score: 0.8839043628296771

confusion_matrix:
[[103826  10225]
 [   471   3586]]


### Результаты

Undersampling| Encoder | Feature selection | Model | ROC-AUC | F1-score |precision_score | recall_score 
---| --- |---| --- | --- | --- | ---| --- 
Random Sampler| ohe и woe | Logistic Regression | Logistic Regression | 0.78 | 0.23 | 0.13 | 0.74 
Random Sampler| ohe и woe| PCA | Logistic Regression | 0.76 | 0.82 | 0.12 | 0.70 
Random Sampler| ohe и woe | Logistic Regression | RandomForest | 0.85 | 0.87 | 0.18 | 0.83
Random Sampler| ohe и woe| PCA |  RandomForest | 0.85 | 0.87 | 0.19 | 0.83
Random Sampler| ohe и woe | | XGBoost | 0.9 | 0.91 | 0.25 | 0.88