In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

import random
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.utils import class_weight
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample

In [2]:
df = pd.read_csv('data/df_tuned.csv')
df.shape

(3000000, 316)

In [3]:
df = df.drop(['id'], axis=1)

In [4]:
# Преобразуем все колонки, кроме новых фичей, в которых нецелочисленные значения, в int8
rate_cols = ['no_delays_rate',
 'no_delays',
 'no_delays_over_530',
 'is_zero_util_rate',
 'is_zero_over2limit_rate',
 'is_zero_maxover2limit_rate',
 'pclose_flag_rate',
 'fclose_flag_rate',
 'is_zero_loans5_rate',
 'is_zero_loans530_rate',
 'is_zero_loans3060_rate',
 'is_zero_loans6090_rate',
 'is_zero_loans90_rate',             
 'enc_paym_10_3_minus_is_zero_loans530',
 'is_zero_loans530_minus_enc_paym_21_3',
 'pre_maxover2limit_infrequent_sklearn_plus_enc_paym_3_3_rate',
 'enc_paym_17_infrequent_sklearn_plus_enc_paym_4_3_rate',
 'enc_paym_13_infrequent_sklearn_plus_enc_paym_4_3_rate',
 'enc_paym_4_3_plus_enc_paym_1_1_rate',
 'enc_paym_4_3_plus_enc_paym_12_1_rate',
 'enc_paym_4_3_plus_pclose_flag_rate_rate',
 'enc_paym_1_infrequent_sklearn_plus_enc_paym_6_3_rate',
 'fclose_flag_rate_plus_enc_paym_4_3_rate',
 'pre_util_6_plus_enc_loans_credit_type_5_rate',
 'enc_paym_17_infrequent_sklearn_minus_enc_paym_4_3_rate',
 'enc_paym_9_3_minus_pre_maxover2limit_17_rate',
 'enc_paym_17_3_minus_is_zero_loans5_rate',
 'pre_util_5_minus_enc_paym_5_3_rate',
 'pre_util_4_plus_enc_paym_10_3_minus_is_zero_loans530_rate',
 'enc_paym_10_3_minus_is_zero_loans530_plus_pre_util_5_rate',
 'fclose_flag_rate_plus_enc_paym_10_3_minus_is_zero_loans530_rate',
 'enc_paym_0_1_plus_enc_paym_1_infrequent_sklearn_plus_enc_paym_6_3_rate_rate',
 'enc_paym_10_3_minus_is_zero_loans530_plus_pre_maxover2limit_infrequent_sklearn_plus_enc_paym_3_3_rate_rate',
 'is_zero_loans530_minus_enc_paym_21_3_plus_pre_loans_credit_limit_2_rate',
 'enc_paym_4_3_plus_enc_paym_12_1_rate_plus_enc_paym_0_1_rate',
 'enc_paym_0_1_minus_is_zero_loans530_minus_enc_paym_21_3_rate',
 'enc_paym_10_3_minus_is_zero_loans530_minus_pre_util_5_rate',
 'enc_paym_10_3_minus_is_zero_loans530_minus_pre_util_4_rate',
 'is_zero_loans530_minus_enc_paym_21_3_minus_fclose_flag_rate_rate',
 'pre_util_5_minus_enc_paym_5_3_rate_minus_enc_loans_credit_type_5_rate']

df_sep = df[rate_cols]
df=df.drop(rate_cols, axis = 1)
df=df.astype('int8')
df[rate_cols] = df_sep

In [5]:
df_train, df_test = train_test_split(df, stratify=df['flag'], test_size=0.2, random_state=42)

X_train, y_train = df_train.drop('flag', axis = 1), df_train['flag']
X_test, y_test =  df_test.drop('flag', axis = 1), df_test['flag']

In [9]:
print('Размер тренировочного датасета', df_train.shape)
print('Размер тестовой выборки', df_test.shape)

Размер тренировочного датасета (2400000, 315)
Размер тестовой выборки (600000, 315)


**Обучение**

Обучим три вида классификаторов: XGBClassifier, LGBMClassifier, CatBoostClassifier
1) Сначала с дефолтными параметрами
2) Затем попробуем балансировку классов
3) Затем подберем параметры и обучим по 2 сетки с лучшими параметрами
4) Затем попробуем downsampling
5) Соберем результаты полученных сетей в один датасет и возьмем среднее

In [9]:
xgbc = xgb.XGBClassifier()
xgbc.fit(X_train, y_train)
print('Train', roc_auc_score(y_train, xgbc.predict_proba(X_train)[:, 1]))
print('Test', roc_auc_score(y_test, xgbc.predict_proba(X_test)[:, 1]))

Train 0.789461164701711
Test 0.7543490142883822


In [11]:
lgbm = lgb.LGBMClassifier(verbosity= -1)
lgbm.fit(X_train, y_train)
print('Train', roc_auc_score(y_train, lgbm.predict_proba(X_train)[:, 1]))
print('Test', roc_auc_score(y_test, lgbm.predict_proba(X_test)[:, 1]))

Train 0.7648197933301526
Test 0.7539722388600716


In [13]:
catb = CatBoostClassifier(random_state=42, verbose = 0)
catb.fit(X_train, y_train)
print('Train', roc_auc_score(y_train, catb.predict_proba(X_train)[:, 1]))
print('Test', roc_auc_score(y_test, catb.predict_proba(X_test)[:, 1]))

Train 0.789179134365892
Test 0.7569423092732506


In [15]:
# С балансировкой
classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_train
)
xgbc_w = xgb.XGBClassifier()
xgbc_w.fit(X_train, y_train, sample_weight=classes_weights)
print('Train', roc_auc_score(y_train, xgbc_w.predict_proba(X_train)[:, 1]))
print('Test', roc_auc_score(y_test, xgbc_w.predict_proba(X_test)[:, 1]))

Train 0.7954344447897019
Test 0.7523089057586027


In [17]:
# lgbm с балансировкой
lgbm_bal = lgb.LGBMClassifier(verbosity= -1, class_weight = 'balanced')
lgbm_bal.fit(X_train, y_train)
print('Train', roc_auc_score(y_train, lgbm_bal.predict_proba(X_train)[:, 1]))
print('Test', roc_auc_score(y_test, lgbm_bal.predict_proba(X_test)[:, 1]))

Train 0.764702649126794
Test 0.7545980570946549


In [19]:
# # подбор параметров без балансировки
import random
params =[
    [0.01, 0.03, 0.05],
    [2000, 3000],
    [9,12,15]
]
num_iterations = 20
param_nums=[]
roc_aucs=[]
for i in range(num_iterations):
    learning_rate = random.choice(params[0])
    n_estimators = random.choice(params[1])
    max_depth = random.choice(params[2])
    
    lgbm_search = lgb.LGBMClassifier(learning_rate = learning_rate,
                         n_estimators = n_estimators,
                         max_depth = max_depth,
                         verbosity= -1
                        )

    lgbm_search.fit(X_train, y_train)
    print(f'learning_rate:{learning_rate}, n_estimators: {n_estimators}, max_depth: {max_depth}')
    res = [(roc_auc_score(y_test, lgbm_search.predict_proba(X_test)[:, 1])), (roc_auc_score(y_train, lgbm_search.predict_proba(X_train)[:, 1]))]
    print('Test number', i, res)
    roc_aucs.append(res)
    param_nums.append([learning_rate, n_estimators,max_depth,res])
    
print(f'Лучший результат: {max(b[0] for b in roc_aucs)}')

learning_rate:0.01, n_estimators: 2000, max_depth: 9
Test number 0 [0.757256836926527, 0.778898935420988]
learning_rate:0.01, n_estimators: 2000, max_depth: 15
Test number 1 [0.7572847996372658, 0.7809994844326763]
learning_rate:0.01, n_estimators: 2000, max_depth: 12
Test number 2 [0.7573539776985168, 0.7804076362505721]
learning_rate:0.05, n_estimators: 2000, max_depth: 12
Test number 3 [0.7596164454216148, 0.8386206996394783]
learning_rate:0.01, n_estimators: 2000, max_depth: 9
Test number 4 [0.757256836926527, 0.778898935420988]
learning_rate:0.03, n_estimators: 2000, max_depth: 9
Test number 5 [0.7595800628902072, 0.8117320738038477]
learning_rate:0.01, n_estimators: 3000, max_depth: 12
Test number 6 [0.7585750888781233, 0.791483612816729]
learning_rate:0.05, n_estimators: 2000, max_depth: 15
Test number 7 [0.7594045615910867, 0.8405407459169181]
learning_rate:0.03, n_estimators: 3000, max_depth: 15
Test number 8 [0.7604453366231851, 0.8367237780580887]
learning_rate:0.05, n_estim

In [21]:
#learning_rate:0.03, n_estimators: 3000, max_depth: 15
lgbm_params = lgb.LGBMClassifier(learning_rate=0.03, n_estimators= 3000, max_depth= 15, verbosity = -1)
lgbm_params.fit(X_train, y_train)
print('Train', roc_auc_score(y_train, lgbm_params.predict_proba(X_train)[:, 1]))
print('Test', roc_auc_score(y_test, lgbm_params.predict_proba(X_test)[:, 1]))

Train 0.8367237780580887
Test 0.7604453366231851


In [23]:
# learning_rate:0.03, n_estimators: 3000, max_depth: 12
lgbm_params2 = lgb.LGBMClassifier(learning_rate=0.03, n_estimators= 3000, max_depth= 12, verbosity = -1)
lgbm_params2.fit(X_train, y_train)
print('Train', roc_auc_score(y_train, lgbm_params2.predict_proba(X_train)[:, 1]))
print('Test', roc_auc_score(y_test, lgbm_params2.predict_proba(X_test)[:, 1]))

Train 0.8350764351305816
Test 0.7604433675285278


In [25]:
import random
params =[
    [0.01, 0.03, 0.05],
    [1000, 2000, 3000],
    [5, 9, 12]
]
num_iterations = 15
param_nums=[]
roc_aucs=[]
for i in range(num_iterations):
    learning_rate = random.choice(params[0])
    n_estimators = random.choice(params[1])
    max_depth = random.choice(params[2])
    
    xgb_search = xgb.XGBClassifier(learning_rate = learning_rate,
                         n_estimators = n_estimators,
                         max_depth = max_depth
                        )

    xgb_search.fit(X_train, y_train)
    print(f'learning_rate:{learning_rate}, n_estimators: {n_estimators}, max_depth: {max_depth}')
    res = [(roc_auc_score(y_test, xgb_search.predict_proba(X_test)[:, 1])), (roc_auc_score(y_train, xgb_search.predict_proba(X_train)[:, 1]))]
    print('Test number', i, res)
    roc_aucs.append(res)
    param_nums.append([learning_rate, n_estimators,max_depth,res])
    
print(f'Лучший результат: {max(b[0] for b in roc_aucs)}')

learning_rate:0.03, n_estimators: 2000, max_depth: 5
Test number 0 [0.7594931922798909, 0.7930977056588887]
learning_rate:0.01, n_estimators: 3000, max_depth: 9
Test number 1 [0.7619943748130169, 0.8734494269699737]
learning_rate:0.05, n_estimators: 1000, max_depth: 12
Test number 2 [0.7422257477673508, 0.9830276169344443]
learning_rate:0.01, n_estimators: 1000, max_depth: 9
Test number 3 [0.7583111766209832, 0.8275908512895171]
learning_rate:0.01, n_estimators: 3000, max_depth: 12
Test number 4 [0.7555114815781554, 0.9656475619924184]
learning_rate:0.05, n_estimators: 1000, max_depth: 12
Test number 5 [0.7422257477673508, 0.9830276169344443]
learning_rate:0.03, n_estimators: 3000, max_depth: 5
Test number 6 [0.7603930504370214, 0.8043179578179689]
learning_rate:0.05, n_estimators: 2000, max_depth: 9
Test number 7 [0.7528555778087725, 0.9463126717597031]
learning_rate:0.05, n_estimators: 3000, max_depth: 5
Test number 8 [0.760756571764475, 0.8227655752248237]
learning_rate:0.03, n_esti

In [27]:
#learning_rate:0.05, n_estimators: 3000, max_depth: 5
xgbc_params = xgb.XGBClassifier(learning_rate=0.05, n_estimators= 3000, max_depth= 5)
xgbc_params.fit(X_train, y_train)
print('Train', roc_auc_score(y_train, xgbc_params.predict_proba(X_train)[:, 1]))
print('Test', roc_auc_score(y_test, xgbc_params.predict_proba(X_test)[:, 1]))

Train 0.8227655752248237
Test 0.760756571764475


In [28]:
#learning_rate:0.01, n_estimators: 3000, max_depth: 9
xgbc_params2 = xgb.XGBClassifier(learning_rate=0.01, n_estimators= 3000, max_depth= 9)
xgbc_params2.fit(X_train, y_train)
print('Train', roc_auc_score(y_train, xgbc_params2.predict_proba(X_train)[:, 1]))
print('Test', roc_auc_score(y_test, xgbc_params2.predict_proba(X_test)[:, 1]))\



Train 0.8734494269699737
Test 0.7619943748130169


In [29]:
params =[
    [0.01, 0.03, 0.05, 0.07, 0.1],
    [500, 1000, 2000],
    [5,9,12,15]
]
num_iterations = 15
param_nums=[]
roc_aucs=[]
for i in range(num_iterations):
    learning_rate = random.choice(params[0])
    iterations = random.choice(params[1])
    depth = random.choice(params[2])
    #loss_function = random.choice(params[3])
    
    catb_search = CatBoostClassifier(learning_rate = learning_rate,
                         iterations = iterations,
                         depth = depth,
                         random_state = 42,
                         verbose = 0
                        )

    catb_search.fit(X_train, y_train)
    print(f'learning_rate:{learning_rate}, iterations: {iterations}, depth: {depth}')
    res = [(roc_auc_score(y_test, catb_search.predict_proba(X_test)[:, 1])), (roc_auc_score(y_train, catb_search.predict_proba(X_train)[:, 1]))]
    print('Test number', i, res)
    roc_aucs.append(res)
    param_nums.append([learning_rate, iterations,depth, res])
    
print(f'Лучший результат: {max(b[0] for b in roc_aucs)}')

learning_rate:0.1, iterations: 500, depth: 5
Test number 0 [0.7533464541898325, 0.7599912105345181]
learning_rate:0.03, iterations: 2000, depth: 9
Test number 1 [0.7593660335391406, 0.7866381481822466]
learning_rate:0.03, iterations: 1000, depth: 9
Test number 2 [0.7562807046700771, 0.7732312004121744]
learning_rate:0.03, iterations: 1000, depth: 5
Test number 3 [0.7515611758196599, 0.7562057071691959]
learning_rate:0.01, iterations: 2000, depth: 9
Test number 4 [0.7546040166935473, 0.7676922705198728]
learning_rate:0.07, iterations: 500, depth: 15
Test number 5 [0.7482513010017051, 0.9397600186951003]
learning_rate:0.07, iterations: 1000, depth: 5
Test number 6 [0.7549494793892466, 0.7629455751852985]
learning_rate:0.05, iterations: 2000, depth: 9
Test number 7 [0.7605474138700651, 0.801207124105541]
learning_rate:0.07, iterations: 500, depth: 9
Test number 8 [0.7565585508785312, 0.7753096896233471]
learning_rate:0.07, iterations: 500, depth: 15
Test number 9 [0.7482513010017051, 0.93

In [37]:
#learning_rate:0.07, iterations: 2000, depth: 9
catb_params = CatBoostClassifier(random_state=42, verbose = 0, learning_rate=0.07, iterations= 2000, depth= 9)
catb_params.fit(X_train, y_train)
print('Train', roc_auc_score(y_train, catb_params.predict_proba(X_train)[:, 1]))
print('Test', roc_auc_score(y_test, catb_params.predict_proba(X_test)[:, 1]))

Train 0.8134247039884972
Test 0.7606514806261913


In [39]:
# learning_rate:0.05, iterations: 2000, depth: 9
catb_params2 = CatBoostClassifier(random_state=42, verbose = 0, learning_rate=0.05, iterations= 2000, depth= 9)
catb_params2.fit(X_train, y_train)
print('Train', roc_auc_score(y_train, catb_params2.predict_proba(X_train)[:, 1]))
print('Test', roc_auc_score(y_test, catb_params2.predict_proba(X_test)[:, 1]))

Train 0.801207124105541
Test 0.7605474138700651


In [43]:
# Балансировка - Downsampling

df_min = df_train[df_train['flag'] == 1]
df_maj = df_train[df_train['flag'] == 0]

df_maj_downsample = resample(df_maj, replace=False, n_samples=len(df_min)*10, random_state=42)

print(df_maj_downsample.shape)
print(df_min.shape)
df_2 = pd.concat([df_maj_downsample, df_min], ignore_index=True).sample(frac=1.)

X_train_down, y_train_down = df_2.drop('flag', axis = 1), df_2['flag']

(851540, 315)
(85154, 315)


In [45]:
xgbc_down = xgb.XGBClassifier()
xgbc_down.fit(X_train_down, y_train_down)
print('Train', roc_auc_score(y_train_down, xgbc_down.predict_proba(X_train_down)[:, 1]))
print('Test', roc_auc_score(y_test, xgbc_down.predict_proba(X_test)[:, 1]))

Train 0.7927692500992204
Test 0.7548625267147179


In [47]:
lgbm_down = lgb.LGBMClassifier()
lgbm_down.fit(X_train_down, y_train_down)
print('Train', roc_auc_score(y_train_down, lgbm_down.predict_proba(X_train_down)[:, 1]))
print('Test', roc_auc_score(y_test, lgbm_down.predict_proba(X_test)[:, 1]))

Train 0.7651910130806205
Test 0.7541398423107472


**Комментарий**

Видно, что downsampling не дает никаких значимых улучшений на сетях с дефолтными параметрами относительно таких же сетей, обученных на полном датасете. 
Поэтому надежнее будет обучаться на полном датасете без downsampling.

In [51]:
models = [lgbm_params,  lgbm_params2, xgbc_params, xgbc_params2,catb_params, catb_params2]

In [53]:
# Создадим функцию, которая собирает predict_proba ото всех моделей в один датафрейм с target
# принимает список моделей и датасет, для которого нужно посчитать predict_proba
def concat_preds(models_list, x_data, y_data):
    df_concat = pd.DataFrame()
    df_concat['target'] = pd.DataFrame(y_data, index = y_data.index)
    for i in range(len(models_list)):
        name = 'model_' + str(i)
        df_concat[name] =  pd.DataFrame(models_list[i].predict_proba(x_data)[:, 1], index = x_data.index)
    df_concat = df_concat.drop('target', axis = 1)
    return df_concat

In [55]:
dfpred_train = concat_preds(models, X_train, y_train)
dfpred_test =  concat_preds(models, X_test, y_test)
dfpred_test

Unnamed: 0,model_0,model_1,model_2,model_3,model_4,model_5
1679350,0.016800,0.016862,0.016114,0.016212,0.014676,0.017901
385353,0.010856,0.010227,0.010593,0.011846,0.011407,0.011933
2137282,0.057073,0.076651,0.087243,0.065408,0.085347,0.063499
896053,0.048449,0.052035,0.045918,0.044282,0.044912,0.049687
2824317,0.058286,0.057235,0.048759,0.062458,0.057794,0.058169
...,...,...,...,...,...,...
1567263,0.080318,0.079402,0.079717,0.089144,0.093278,0.111644
2359992,0.022177,0.022427,0.023625,0.018691,0.021443,0.023752
410308,0.010111,0.007110,0.005289,0.005479,0.011597,0.009659
2452516,0.010836,0.010793,0.009925,0.010305,0.012926,0.011674


In [61]:
def mean_count(row, datafr):
    probs_sum = 0
    for col in datafr.columns:
        probs_sum+=row[col]
    return  probs_sum / (datafr.shape[1])
dfpred_train['mean'] = dfpred_train.apply(lambda x: mean_count(x, dfpred_train), axis = 1)
dfpred_test['mean'] = dfpred_test.apply(lambda x: mean_count(x, dfpred_test), axis = 1)
dfpred_test.head()

Unnamed: 0,model_0,model_1,model_2,model_3,model_4,model_5,mean
1679350,0.0168,0.016862,0.016114,0.016212,0.014676,0.017901,0.016428
385353,0.010856,0.010227,0.010593,0.011846,0.011407,0.011933,0.011143
2137282,0.057073,0.076651,0.087243,0.065408,0.085347,0.063499,0.072537
896053,0.048449,0.052035,0.045918,0.044282,0.044912,0.049687,0.047547
2824317,0.058286,0.057235,0.048759,0.062458,0.057794,0.058169,0.057117


In [62]:
print('Train', roc_auc_score(y_train, dfpred_train['mean']))
print('Test', roc_auc_score(y_test, dfpred_test['mean']))

Train 0.8417153428260082
Test 0.7637745638221034
