In [10]:
import pickle
import optuna
from eli5.sklearn import PermutationImportance
import eli5

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, \
    recall_score, f1_score, log_loss, auc, classification_report, confusion_matrix, \
    precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split, StratifiedKFold
import gc
import numpy as np
import pandas as pd

import warnings
from warnings import simplefilter
warnings.filterwarnings("ignore")
simplefilter("ignore", category=RuntimeWarning)

In [11]:
N_FOLDS = 5
RAND = 10

In [12]:
def get_metrics(y_test: np.ndarray, y_pred: np.ndarray, y_score: np.ndarray, name: str) -> pd.DataFrame:
    """
    Вывод метрик классификации
    """
    df_metrics = pd.DataFrame()

    df_metrics['model'] = [name]

    df_metrics['Accuracy'] = [accuracy_score(y_test, y_pred)]
    df_metrics['ROC_AUC'] = [roc_auc_score(y_test, y_score[:, 1])]
    df_metrics['Precision'] = [precision_score(y_test, y_pred)]
    df_metrics['Recall'] = [recall_score(y_test, y_pred)]
    df_metrics['f1'] = [f1_score(y_test, y_pred)]
    df_metrics['Logloss'] = [log_loss(y_test, y_score)]
    df_metrics['amex'] = [amex_metric(y_test, y_score[:, 1])]

    return df_metrics

In [13]:
def amex_metric(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Вычисление метрики соревнования
    """

    if isinstance(y_true, np.ndarray):
        y_true = pd.DataFrame(y_true, columns=["target"])

    if isinstance(y_pred, np.ndarray):
        y_pred = pd.DataFrame(y_pred, columns=["prediction"])

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))

        df['weight'] = df["target"].apply(lambda x: 20 if x == 0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df["target"] == 1).sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df["target"].apply(lambda x: 20 if x == 0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df["target"] * df['weight']).sum()
        df['cum_pos_found'] = (df["target"] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    d = top_four_percent_captured(y_true, y_pred)
    g = normalized_weighted_gini(y_true, y_pred)

    return 0.5 * (g + d)

In [14]:
def custom_xg_amex_metric(y_pred: np.ndarray, dtrain):
    """
    eval metric для xgboost
    """
    y_true = dtrain.get_label()
    score = amex_metric(y_true, y_pred)
    return 'amex', -score

In [15]:
def custom_lg_amex_metric(y_true: np.ndarray, y_pred: np.ndarray):
    """
    eval metric для lgbm
    """
    score = amex_metric(y_true, y_pred)
    return 'custom_lg_amex_metric', score, True

In [16]:
class CatBoostEvalMetricCustom(object):
    """
    eval metric для catbost
    """
    def get_final_error(self, error, weight):
        return error

    def is_max_optimal(self):
        # the larger metric value the better
        return True

    def evaluate(self, approxes, target, weight):
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])
        preds = np.array(approxes[0])
        target = np.array(target)
        score = amex_metric(target, preds)
        return score, 0

In [17]:
def check_overfitting(model, X_train, y_train, X_test, y_test):
    """
    Проверка на overfitting
    """
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    y_score_train = model.predict_proba(X_train)
    y_score_test = model.predict_proba(X_test)

    print(f'f1 train: %.3f' % f1_score(y_train, y_pred_train))
    print(f'f1 test: %.3f' % f1_score(y_test, y_pred_test))

    print(f'roc-auc train: %.3f' % roc_auc_score(y_train, y_score_train[:, 1]))
    print(f'roc-auc test: %.3f' % roc_auc_score(y_test, y_score_test[:, 1]))

In [18]:
train_bin = pd.read_parquet('train_bin_3.parquet')
train_data_grouped = pd.read_parquet('train_data_grouped_3.parquet')

In [19]:
train_bin.fillna(method="ffill", inplace=True)
train_bin.fillna(method="bfill", inplace=True)

In [20]:
train_data_grouped.fillna(method="ffill", inplace=True)
train_data_grouped.fillna(method="bfill", inplace=True)

In [21]:
# бинаризованные данные
X_bin = train_bin.drop(columns='target')
y = train_bin['target']
X_train_bin, X_test_bin, y_train, y_test = train_test_split(
    X_bin, y,
    stratify=y,
    shuffle=True,
    test_size=0.25,
    random_state=RAND)
X_train_bin_, X_val_bin, y_train_, y_val = train_test_split(X_train_bin,
                                                            y_train,
                                                            shuffle=True,
                                                            test_size=0.16,
                                                            random_state=RAND)


# без бинаризации
X = train_data_grouped.drop(columns='target')
X_train = X.loc[X_train_bin.index]
X_test = X.loc[X_test_bin.index]
X_train_ = X_train.loc[X_train_bin_.index]
X_val = X_train.loc[X_val_bin.index]

### Baseline

#### LogisticRegression

In [22]:
lr = LogisticRegression(class_weight='balanced', random_state=RAND)
lr.fit(X_train_bin, y_train)

y_pred = lr.predict(X_test_bin)
y_score = lr.predict_proba(X_test_bin)

metrics = get_metrics(y_test.values, y_pred,
                      y_score, name='LogisticRegression_Baseline')
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss,amex
0,LogisticRegression_Baseline,0.883787,0.95423,0.722824,0.893998,0.79935,0.282031,0.764476


In [23]:
check_overfitting(lr, X_train_bin, y_train, X_test_bin, y_test)

f1 train: 0.797
f1 test: 0.799
roc-auc train: 0.954
roc-auc test: 0.954


#### Random Forest

In [24]:
rf = RandomForestClassifier(
    random_state=RAND, class_weight='balanced', n_jobs=-1)
rf.fit(X_train_bin, y_train)

y_pred = rf.predict(X_test_bin)
y_score = rf.predict_proba(X_test_bin)

metrics = metrics.append(get_metrics(y_test.values, y_pred,
                                     y_score, name='RandomForest_Baseline'))
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss,amex
0,LogisticRegression_Baseline,0.883787,0.95423,0.722824,0.893998,0.79935,0.282031,0.764476
0,RandomForest_Baseline,0.892451,0.953522,0.816657,0.753896,0.784023,0.264326,0.757461


In [25]:
check_overfitting(rf, X_train_bin, y_train, X_test_bin, y_test)

f1 train: 1.000
f1 test: 0.784
roc-auc train: 1.000
roc-auc test: 0.954


По метрикам можно заметить явное переобучение у Random Forest

In [15]:
perm = PermutationImportance(rf, random_state=RAND, n_iter=2)
perm.fit(X_train_bin, y_train)
eli5.show_weights(perm, feature_names=X_train_bin.columns.tolist())

Weight,Feature
0.0120  ± 0.0001,P_2_last
0.0032  ± 0.0001,P_2_min
0.0023  ± 0.0000,P_2_mean
0.0020  ± 0.0000,B_9_last
0.0014  ± 0.0001,B_1_last
0.0014  ± 0.0001,P_2_max
0.0013  ± 0.0000,D_44_last
0.0012  ± 0.0000,B_2_last
0.0010  ± 0.0001,B_7_last
0.0008  ± 0.0001,R_1_last


#### XGBoost

In [26]:
ratio = float(np.sum(
    train_bin['target'] == 0)) / np.sum(train_bin['target'] == 1)

In [27]:
xg = XGBClassifier(scale_pos_weight=ratio, random_state=RAND)
eval_set = [(X_val_bin, y_val)]
xg.fit(X_train_bin_, y_train_,
       eval_set=eval_set,
       verbose=False,
       eval_metric=custom_xg_amex_metric,
       early_stopping_rounds=100)

y_pred = xg.predict(X_test_bin)
y_score = xg.predict_proba(X_test_bin)

metrics = metrics.append(get_metrics(y_test.values, y_pred,
                                     y_score, name='XGBoost_Baseline'))
metrics



Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss,amex
0,LogisticRegression_Baseline,0.883787,0.95423,0.722824,0.893998,0.79935,0.282031,0.764476
0,RandomForest_Baseline,0.892451,0.953522,0.816657,0.753896,0.784023,0.264326,0.757461
0,XGBoost_Baseline,0.884022,0.95792,0.717815,0.909718,0.802453,0.260512,0.776851


In [28]:
check_overfitting(xg, X_train_bin, y_train, X_test_bin, y_test)

f1 train: 0.821
f1 test: 0.802
roc-auc train: 0.968
roc-auc test: 0.958


#### LightGBM

In [29]:
eval_set = [(X_val, y_val)]

In [30]:
lg = LGBMClassifier(scale_pos_weight=ratio, random_state=RAND)
lg.fit(X_train_, y_train_,
       eval_set=eval_set,
       verbose=False,
       eval_metric=custom_lg_amex_metric,
       early_stopping_rounds=100)

y_pred = lg.predict(X_test)
y_score = lg.predict_proba(X_test)

metrics = metrics.append(get_metrics(y_test.values, y_pred,
                                     y_score, name='LGBM_Baseline'))
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss,amex
0,LogisticRegression_Baseline,0.883787,0.95423,0.722824,0.893998,0.79935,0.282031,0.764476
0,RandomForest_Baseline,0.892451,0.953522,0.816657,0.753896,0.784023,0.264326,0.757461
0,XGBoost_Baseline,0.884022,0.95792,0.717815,0.909718,0.802453,0.260512,0.776851
0,LGBM_Baseline,0.881181,0.959101,0.708026,0.92086,0.800538,0.261274,0.78352


In [31]:
check_overfitting(lg, X_train, y_train, X_test, y_test)

f1 train: 0.805
f1 test: 0.801
roc-auc train: 0.963
roc-auc test: 0.959


#### Catboost

In [32]:
cat_feat = X_train.select_dtypes('category').columns.tolist()

In [33]:
cat = CatBoostClassifier(random_state=RAND,
                         scale_pos_weight=ratio,
                         eval_metric=CatBoostEvalMetricCustom(),
                         cat_features=cat_feat)
cat.fit(X_train_, y_train_,
        eval_set=eval_set,
        verbose=False,
        early_stopping_rounds=100)

y_pred = cat.predict(X_test)
y_score = cat.predict_proba(X_test)

metrics = metrics.append(get_metrics(y_test.values, y_pred,
                                     y_score, name='CatBoost_Baseline'))
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss,amex
0,LogisticRegression_Baseline,0.883787,0.95423,0.722824,0.893998,0.79935,0.282031,0.764476
0,RandomForest_Baseline,0.892451,0.953522,0.816657,0.753896,0.784023,0.264326,0.757461
0,XGBoost_Baseline,0.884022,0.95792,0.717815,0.909718,0.802453,0.260512,0.776851
0,LGBM_Baseline,0.881181,0.959101,0.708026,0.92086,0.800538,0.261274,0.78352
0,CatBoost_Baseline,0.887884,0.959946,0.72538,0.912445,0.80823,0.2536,0.784212


In [34]:
check_overfitting(cat, X_train, y_train, X_test, y_test)

f1 train: 0.829
f1 test: 0.808
roc-auc train: 0.970
roc-auc test: 0.960


### Подбор параметров

После получения бейзлайна было принято решение подобрать параметры для Catboost и LGBM, а затем сделать ручной стекинг моделей catboost, lgbm c финальной моделью logistic regression. 

Подбор параметров осуществлялся с помощью библиотеки Optuna.

#### CatBoost

In [34]:
def objective_cat(trial, X, y, N_FOLDS, random_state, cat_feat):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 1000, 6000, step=500),
        "scale_pos_weight": trial.suggest_categorical("scale_pos_weight", [2.7971]),
        "eval_metric": CatBoostEvalMetricCustom(),
        "random_state": random_state
    }

    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

    cv_predicts = np.empty(N_FOLDS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        train_data = Pool(data=X_train, label=y_train, cat_features=cat_feat)
        eval_data = Pool(data=X_test, label=y_test, cat_features=cat_feat)

        model = CatBoostClassifier(**params)
        model.fit(train_data,
                  eval_set=eval_data,
                  early_stopping_rounds=100,
                  verbose=0)

        preds = model.predict_proba(X_test.values)
        cv_predicts[idx] = amex_metric(y_test.values, preds[:, 1])
    return np.mean(cv_predicts)

In [35]:
study_cat = optuna.create_study(direction="maximize", study_name="CAT")


def func(trial): return objective_cat(
    trial, X_train, y_train, N_FOLDS=N_FOLDS, random_state=RAND, cat_feat=cat_feat)


study_cat.optimize(func, n_trials=5, show_progress_bar=True)

[32m[I 2022-09-03 15:51:11,443][0m A new study created in memory with name: CAT[0m


  0%|          | 0/5 [00:00<?, ?it/s]

[32m[I 2022-09-03 16:22:21,107][0m Trial 0 finished with value: 0.7860718073265622 and parameters: {'n_estimators': 1500, 'scale_pos_weight': 2.7971}. Best is trial 0 with value: 0.7860718073265622.[0m
[32m[I 2022-09-03 17:01:06,704][0m Trial 1 finished with value: 0.7857187833739177 and parameters: {'n_estimators': 6000, 'scale_pos_weight': 2.7971}. Best is trial 0 with value: 0.7860718073265622.[0m
[32m[I 2022-09-03 17:32:42,856][0m Trial 2 finished with value: 0.7860718073265622 and parameters: {'n_estimators': 1500, 'scale_pos_weight': 2.7971}. Best is trial 0 with value: 0.7860718073265622.[0m
[32m[I 2022-09-03 18:16:31,229][0m Trial 3 finished with value: 0.785470297957661 and parameters: {'n_estimators': 5000, 'scale_pos_weight': 2.7971}. Best is trial 0 with value: 0.7860718073265622.[0m
[32m[I 2022-09-03 18:55:35,692][0m Trial 4 finished with value: 0.7858459120487378 and parameters: {'n_estimators': 5500, 'scale_pos_weight': 2.7971}. Best is trial 0 with value: 

In [36]:
print(f"\tBest value (amex): {study_cat.best_value:.5f}")
print(f"\tBest params:")

for key, value in study_cat.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value (amex): 0.78607
	Best params:
		n_estimators: 1500
		scale_pos_weight: 2.7971


In [14]:
def objective_cat(trial, X, y, N_FOLDS, random_state, cat_feat):
    params = {
        "n_estimators":
        trial.suggest_categorical("n_estimators", [1500]),
        "learning_rate":
        trial.suggest_float("learning_rate", 0.01, 0.3),
        "l2_leaf_reg":
        trial.suggest_uniform("l2_leaf_reg", 1e-5, 1e2),
        "scale_pos_weight":
        trial.suggest_categorical("scale_pos_weight", [2.7971]),
        "eval_metric": CatBoostEvalMetricCustom(),
        "random_state":
        random_state
    }

    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

    cv_predicts = np.empty(N_FOLDS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        train_data = Pool(data=X_train, label=y_train, cat_features=cat_feat)
        eval_data = Pool(data=X_test, label=y_test, cat_features=cat_feat)

        model = CatBoostClassifier(**params)
        model.fit(train_data, eval_set=eval_data,
                  early_stopping_rounds=100, verbose=0)

        preds = model.predict_proba(X_test)
        cv_predicts[idx] = amex_metric(y_test.values, preds[:, 1])
    return np.mean(cv_predicts)

In [18]:
study_cat = optuna.create_study(direction="maximize", study_name="CAT")


def func(trial): return objective_cat(
    trial, X_train, y_train, N_FOLDS=N_FOLDS, random_state=RAND, cat_feat=cat_feat)


study_cat.optimize(func, n_trials=6, show_progress_bar=True)

[32m[I 2022-09-05 10:15:08,709][0m A new study created in memory with name: CAT[0m


  0%|          | 0/6 [00:00<?, ?it/s]

[32m[I 2022-09-05 10:30:27,020][0m Trial 0 finished with value: 0.7825432442587201 and parameters: {'n_estimators': 1500, 'learning_rate': 0.29380977563146177, 'l2_leaf_reg': 47.76055040187856, 'scale_pos_weight': 2.7971}. Best is trial 0 with value: 0.7825432442587201.[0m
[32m[I 2022-09-05 10:45:53,028][0m Trial 1 finished with value: 0.7827937261676883 and parameters: {'n_estimators': 1500, 'learning_rate': 0.27786553073545717, 'l2_leaf_reg': 6.417439662214108, 'scale_pos_weight': 2.7971}. Best is trial 1 with value: 0.7827937261676883.[0m
[32m[I 2022-09-05 11:01:22,502][0m Trial 2 finished with value: 0.7839691607606913 and parameters: {'n_estimators': 1500, 'learning_rate': 0.24638367391819627, 'l2_leaf_reg': 44.18256107098331, 'scale_pos_weight': 2.7971}. Best is trial 2 with value: 0.7839691607606913.[0m
[32m[I 2022-09-05 11:22:17,201][0m Trial 3 finished with value: 0.7853368307235217 and parameters: {'n_estimators': 1500, 'learning_rate': 0.14368320308892796, 'l2_lea

In [19]:
study_cat.optimize(func, n_trials=2, show_progress_bar=True)

  0%|          | 0/2 [00:00<?, ?it/s]

[32m[I 2022-09-05 13:02:03,157][0m Trial 6 finished with value: 0.7831263764665102 and parameters: {'n_estimators': 1500, 'learning_rate': 0.269021853975694, 'l2_leaf_reg': 24.495710830253117, 'scale_pos_weight': 2.7971}. Best is trial 5 with value: 0.7860954610908688.[0m
[32m[I 2022-09-05 13:28:55,872][0m Trial 7 finished with value: 0.7852732100477003 and parameters: {'n_estimators': 1500, 'learning_rate': 0.12450714908433726, 'l2_leaf_reg': 13.883620088800601, 'scale_pos_weight': 2.7971}. Best is trial 5 with value: 0.7860954610908688.[0m


In [41]:
print(f"\tBest value (amex metric): {study_cat.best_value:.5f}")
print(f"\tBest params:")

for key, value in study_cat.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value (amex metric): 0.78626
	Best params:
		n_estimators: 1500
		learning_rate: 0.131187922183935
		l2_leaf_reg: 12.752475266490995
		leaf_estimation_iterations: 6
		scale_pos_weight: 2.7971


In [20]:
study_cat.best_params

{'n_estimators': 1500,
 'learning_rate': 0.03880660425195467,
 'l2_leaf_reg': 79.87185969430031,
 'scale_pos_weight': 2.7971}

In [35]:
cat_params = {
    'n_estimators': 1500,
    'learning_rate': 0.03880660425195467,
    'l2_leaf_reg': 79.87185969430031,
    'scale_pos_weight': 2.861,
    'eval_metric': CatBoostEvalMetricCustom(),
    'random_state': RAND
}

In [37]:
cat_optuna = CatBoostClassifier(**cat_params)
                               
cat_optuna.fit(X_train_,
               y_train_,
               cat_features=cat_feat,
               eval_set=eval_set,
               verbose=False,
               early_stopping_rounds=100)

y_pred = cat_optuna.predict(X_test)
y_score = cat_optuna.predict_proba(X_test)

metrics = metrics.append(get_metrics(y_test.values, y_pred, 
                                     y_score, name='CatBoost_Optuna'))

In [38]:
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss,amex
0,LogisticRegression_Baseline,0.883787,0.95423,0.722824,0.893998,0.79935,0.282031,0.764476
0,RandomForest_Baseline,0.892451,0.953522,0.816657,0.753896,0.784023,0.264326,0.757461
0,XGBoost_Baseline,0.884022,0.95792,0.717815,0.909718,0.802453,0.260512,0.776851
0,LGBM_Baseline,0.881181,0.959101,0.708026,0.92086,0.800538,0.261274,0.78352
0,CatBoost_Baseline,0.887884,0.959946,0.72538,0.912445,0.80823,0.2536,0.784212
0,CatBoost_Optuna,0.884388,0.96006,0.714857,0.920793,0.804861,0.258429,0.786262


In [42]:
filename = 'cat_optuna.sav'
pickle.dump(cat_optuna, open(filename, 'wb'))

In [87]:
check_overfitting(cat_optuna, X_train, y_train, X_test, y_test)

f1 train: 0.808
f1 test: 0.805
roc-auc train: 0.962
roc-auc test: 0.960


#### LightGBM

In [65]:
def objective_lgb(trial, X, y, N_FOLDS, random_state):
    lgb_params = {
        "n_estimators": trial.suggest_int("n_estimators", 1000, 6000, step=500),
        "scale_pos_weight": trial.suggest_categorical("scale_pos_weight", [2.7971]),
        "random_state": random_state
    }

    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True,
                         random_state=random_state)

    cv_predicts = np.empty(N_FOLDS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        lg = LGBMClassifier(**lgb_params)
        lg.fit(X_train,
               y_train,
               eval_metric=custom_lg_amex_metric,
               eval_set=[(X_test, y_test)],
               early_stopping_rounds=100,
               verbose=0)

        preds = lg.predict_proba(X_test)
        cv_predicts[idx] = amex_metric(y_test.values, preds[:, 1])

    return np.mean(cv_predicts)

In [66]:
study_lgb = optuna.create_study(direction="maximize", study_name="lgb")


def func(trial): return objective_lgb(
    trial, X_train, y_train, N_FOLDS=N_FOLDS, random_state=RAND)


study_lgb.optimize(func, n_trials=2, show_progress_bar=True)

[32m[I 2022-09-06 14:48:00,115][0m A new study created in memory with name: lgb[0m


  0%|          | 0/2 [00:00<?, ?it/s]

[32m[I 2022-09-06 14:51:10,888][0m Trial 0 finished with value: 0.7839919517601184 and parameters: {'n_estimators': 2500, 'scale_pos_weight': 2.7971}. Best is trial 0 with value: 0.7839919517601184.[0m
[32m[I 2022-09-06 14:54:19,658][0m Trial 1 finished with value: 0.7839919517601184 and parameters: {'n_estimators': 4000, 'scale_pos_weight': 2.7971}. Best is trial 0 with value: 0.7839919517601184.[0m


In [67]:
print(f"\tBest value (amex): {study_lgb.best_value:.5f}")
print(f"\tBest params:")

for key, value in study_lgb.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value (amex): 0.78399
	Best params:
		n_estimators: 2500
		scale_pos_weight: 2.7971


In [68]:
def objective_lgb(trial, X, y, N_FOLDS, random_state):
    lgb_params = {
        "n_estimators": trial.suggest_categorical("n_estimators", [1500]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "max_bin": trial.suggest_int("max_bin", 200, 300),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.95, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.95, step=0.1
        ),
        "scale_pos_weight":
        trial.suggest_categorical("scale_pos_weight", [2.7971]),
        "random_state": random_state
    }

    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True,
                         random_state=random_state)

    cv_predicts = np.empty(N_FOLDS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        lg = LGBMClassifier(**lgb_params, n_jobs=-1)
        lg.fit(X_train,
               y_train,
               eval_metric=custom_lg_amex_metric,
               eval_set=[(X_test, y_test)],
               early_stopping_rounds=100,
               verbose=0)

        preds = lg.predict_proba(X_test)
        cv_predicts[idx] = amex_metric(y_test.values, preds[:, 1])
    return np.mean(cv_predicts)

In [69]:
study_lgb = optuna.create_study(direction="maximize", study_name="lgb")


def func(trial): return objective_lgb(
    trial, X_train, y_train, N_FOLDS=N_FOLDS, random_state=RAND)


study_lgb.optimize(func, n_trials=10, show_progress_bar=True)

[32m[I 2022-09-06 14:55:36,470][0m A new study created in memory with name: lgb[0m


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2022-09-06 14:58:48,216][0m Trial 0 finished with value: 0.7774589162213543 and parameters: {'n_estimators': 1500, 'learning_rate': 0.2906812123100373, 'num_leaves': 2500, 'max_depth': 11, 'min_data_in_leaf': 6400, 'max_bin': 253, 'lambda_l1': 95, 'lambda_l2': 35, 'min_gain_to_split': 9.991756062643026, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.8, 'scale_pos_weight': 2.7971}. Best is trial 0 with value: 0.7774589162213543.[0m


[32m[I 2022-09-06 15:02:26,914][0m Trial 1 finished with value: 0.7792480683185289 and parameters: {'n_estimators': 1500, 'learning_rate': 0.20408781484167854, 'num_leaves': 3000, 'max_depth': 7, 'min_data_in_leaf': 3100, 'max_bin': 271, 'lambda_l1': 85, 'lambda_l2': 60, 'min_gain_to_split': 14.717431079246522, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.4, 'scale_pos_weight': 2.7971}. Best is trial 1 with value: 0.7792480683185289.[0m
[32m[I 2022-09-06 15:06:30,368][0m Trial 2 finished with value: 0.7786708966750441 and parameters: {'n_estimators': 1500, 'learning_rate': 0.1606220192794014, 'num_leaves': 60, 'max_depth': 7, 'min_data_in_leaf': 7900, 'max_bin': 284, 'lambda_l1': 45, 'lambda_l2': 20, 'min_gain_to_split': 12.979518692104142, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.5, 'scale_pos_weight': 2.7971}. Best is trial 1 with value: 0.7792480683185289.[0m


[32m[I 2022-09-06 15:13:52,336][0m Trial 3 finished with value: 0.7840331143942321 and parameters: {'n_estimators': 1500, 'learning_rate': 0.04766138222880641, 'num_leaves': 1460, 'max_depth': 9, 'min_data_in_leaf': 7700, 'max_bin': 227, 'lambda_l1': 40, 'lambda_l2': 60, 'min_gain_to_split': 4.599800997237139, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.2, 'scale_pos_weight': 2.7971}. Best is trial 3 with value: 0.7840331143942321.[0m
[32m[I 2022-09-06 15:22:11,023][0m Trial 4 finished with value: 0.7838209906125501 and parameters: {'n_estimators': 1500, 'learning_rate': 0.04876875843081777, 'num_leaves': 2980, 'max_depth': 4, 'min_data_in_leaf': 8600, 'max_bin': 240, 'lambda_l1': 20, 'lambda_l2': 40, 'min_gain_to_split': 0.2392398301358345, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.4, 'scale_pos_weight': 2.7971}. Best is trial 3 with value: 0.7840331143942321.[0m


[32m[I 2022-09-06 15:28:16,993][0m Trial 5 finished with value: 0.7845067564405482 and parameters: {'n_estimators': 1500, 'learning_rate': 0.060637787392848454, 'num_leaves': 1400, 'max_depth': 7, 'min_data_in_leaf': 1800, 'max_bin': 250, 'lambda_l1': 35, 'lambda_l2': 55, 'min_gain_to_split': 8.858659446060356, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.2, 'scale_pos_weight': 2.7971}. Best is trial 5 with value: 0.7845067564405482.[0m
[32m[I 2022-09-06 15:37:44,780][0m Trial 6 finished with value: 0.7787462167197505 and parameters: {'n_estimators': 1500, 'learning_rate': 0.03752961385280169, 'num_leaves': 1640, 'max_depth': 10, 'min_data_in_leaf': 8500, 'max_bin': 207, 'lambda_l1': 60, 'lambda_l2': 35, 'min_gain_to_split': 11.212955298075196, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.9, 'scale_pos_weight': 2.7971}. Best is trial 5 with value: 0.7845067564405482.[0m


[32m[I 2022-09-06 15:42:40,968][0m Trial 7 finished with value: 0.7787050390569223 and parameters: {'n_estimators': 1500, 'learning_rate': 0.10895225335220222, 'num_leaves': 2500, 'max_depth': 3, 'min_data_in_leaf': 6900, 'max_bin': 242, 'lambda_l1': 80, 'lambda_l2': 35, 'min_gain_to_split': 2.989552229413193, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.2, 'scale_pos_weight': 2.7971}. Best is trial 5 with value: 0.7845067564405482.[0m
[32m[I 2022-09-06 15:47:19,285][0m Trial 8 finished with value: 0.7793250171989528 and parameters: {'n_estimators': 1500, 'learning_rate': 0.15290548615771435, 'num_leaves': 2160, 'max_depth': 6, 'min_data_in_leaf': 6600, 'max_bin': 243, 'lambda_l1': 40, 'lambda_l2': 60, 'min_gain_to_split': 10.625931297378965, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.2, 'scale_pos_weight': 2.7971}. Best is trial 5 with value: 0.7845067564405482.[0m


[32m[I 2022-09-06 15:51:23,013][0m Trial 9 finished with value: 0.7836614757243543 and parameters: {'n_estimators': 1500, 'learning_rate': 0.1400361195952957, 'num_leaves': 2180, 'max_depth': 12, 'min_data_in_leaf': 3400, 'max_bin': 236, 'lambda_l1': 35, 'lambda_l2': 15, 'min_gain_to_split': 5.116791589891137, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.5, 'scale_pos_weight': 2.7971}. Best is trial 5 with value: 0.7845067564405482.[0m


In [71]:
print(f"\tBest value (amex): {study_lgb.best_value:.5f}")
print(f"\tBest params:")

for key, value in study_lgb.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value (amex): 0.78451
	Best params:
		n_estimators: 1500
		learning_rate: 0.060637787392848454
		num_leaves: 1400
		max_depth: 7
		min_data_in_leaf: 1800
		max_bin: 250
		lambda_l1: 35
		lambda_l2: 55
		min_gain_to_split: 8.858659446060356
		bagging_fraction: 0.8
		bagging_freq: 1
		feature_fraction: 0.2
		scale_pos_weight: 2.7971


In [74]:
study_lgb.best_params

{'n_estimators': 1500,
 'learning_rate': 0.060637787392848454,
 'num_leaves': 1400,
 'max_depth': 7,
 'min_data_in_leaf': 1800,
 'max_bin': 250,
 'lambda_l1': 35,
 'lambda_l2': 55,
 'min_gain_to_split': 8.858659446060356,
 'bagging_fraction': 0.8,
 'bagging_freq': 1,
 'feature_fraction': 0.2,
 'scale_pos_weight': 2.7971}

In [39]:
lg_params = {
    'n_estimators': 1500,
    'learning_rate': 0.060637787392848454,
    'num_leaves': 1400,
    'max_depth': 7,
    'min_data_in_leaf': 1800,
    'max_bin': 250,
    'lambda_l1': 35,
    'lambda_l2': 55,
    'min_gain_to_split': 8.858659446060356,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'feature_fraction': 0.2,
    'scale_pos_weight': 2.7971
}

In [40]:
lg_optuna = LGBMClassifier(**lg_params)
lg_optuna.fit(X_train_, y_train_,
              eval_set=eval_set,
              verbose=False,
              eval_metric=custom_lg_amex_metric,
              early_stopping_rounds=100)

y_pred = lg_optuna.predict(X_test)
y_score = lg_optuna.predict_proba(X_test)

metrics = metrics.append(get_metrics(y_test.values, y_pred,
                                     y_score, name='LGBM_Optuna'))
metrics



Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss,amex
0,LogisticRegression_Baseline,0.883787,0.95423,0.722824,0.893998,0.79935,0.282031,0.764476
0,RandomForest_Baseline,0.892451,0.953522,0.816657,0.753896,0.784023,0.264326,0.757461
0,XGBoost_Baseline,0.884022,0.95792,0.717815,0.909718,0.802453,0.260512,0.776851
0,LGBM_Baseline,0.881181,0.959101,0.708026,0.92086,0.800538,0.261274,0.78352
0,CatBoost_Baseline,0.887884,0.959946,0.72538,0.912445,0.80823,0.2536,0.784212
0,CatBoost_Optuna,0.884388,0.96006,0.714857,0.920793,0.804861,0.258429,0.786262
0,LGBM_Optuna,0.883953,0.959533,0.714821,0.9181,0.803808,0.258686,0.783749


In [43]:
filename = 'lg_optuna.sav'
pickle.dump(lg_optuna, open(filename, 'wb'))

In [88]:
check_overfitting(lg_optuna, X_train, y_train, X_test, y_test)

f1 train: 0.806
f1 test: 0.804
roc-auc train: 0.961
roc-auc test: 0.960


### Стекинг

#### Catboost

In [53]:
meta_X = pd.DataFrame()
meta_X_test = pd.DataFrame()

pred_val = []
pred_score_val = []

cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)
for fold, (train_idx, test_idx) in enumerate(cv.split(X_train, y_train)):
    X_train_, X_val = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_, y_val = y_train.iloc[train_idx], y_train.iloc[test_idx]

    model = CatBoostClassifier(**cat_params, cat_features=cat_feat)

    train_data = Pool(data=X_train_, label=y_train_, cat_features=cat_feat)
    eval_data = Pool(data=X_val, label=y_val, cat_features=cat_feat)

    model.fit(train_data,
              eval_set=eval_data,
              early_stopping_rounds=100,
              verbose=0)

    y_pred_val = model.predict(X_val)
    y_score_val = model.predict_proba(X_val)

    print("Fold:", fold + 1,
          "amex %.3f" % amex_metric(y_val.values, y_score_val[:, 1]))
    print("---")

    # holdout list
    pred_val.append(y_pred_val)
    pred_score_val.append(y_score_val)

Fold: 1 amex 0.785
---
Fold: 2 amex 0.781
---
Fold: 3 amex 0.786
---
Fold: 4 amex 0.784
---
Fold: 5 amex 0.788
---


In [54]:
model = CatBoostClassifier(**cat_params, cat_features=cat_feat)
model.fit(X_train, y_train)

0:	learn: 0.6540242	total: 748ms	remaining: 18m 41s
1:	learn: 0.6820336	total: 1.43s	remaining: 17m 51s
2:	learn: 0.6924801	total: 2.12s	remaining: 17m 36s
3:	learn: 0.6988763	total: 2.81s	remaining: 17m 30s
4:	learn: 0.7074218	total: 3.53s	remaining: 17m 34s
5:	learn: 0.7065983	total: 4.25s	remaining: 17m 37s
6:	learn: 0.7137614	total: 4.95s	remaining: 17m 36s
7:	learn: 0.7185483	total: 5.65s	remaining: 17m 33s
8:	learn: 0.7223040	total: 6.35s	remaining: 17m 32s
9:	learn: 0.7250234	total: 7.07s	remaining: 17m 32s
10:	learn: 0.7283030	total: 7.78s	remaining: 17m 32s
11:	learn: 0.7290827	total: 8.49s	remaining: 17m 33s
12:	learn: 0.7306880	total: 9.21s	remaining: 17m 33s
13:	learn: 0.7324371	total: 9.92s	remaining: 17m 32s
14:	learn: 0.7326194	total: 10.6s	remaining: 17m 31s
15:	learn: 0.7329800	total: 11.3s	remaining: 17m 30s
16:	learn: 0.7336731	total: 12s	remaining: 17m 30s
17:	learn: 0.7345077	total: 12.8s	remaining: 17m 32s
18:	learn: 0.7352006	total: 13.5s	remaining: 17m 32s
19:	l

154:	learn: 0.7736080	total: 1m 49s	remaining: 15m 49s
155:	learn: 0.7734806	total: 1m 50s	remaining: 15m 48s
156:	learn: 0.7737769	total: 1m 50s	remaining: 15m 48s
157:	learn: 0.7737959	total: 1m 51s	remaining: 15m 47s
158:	learn: 0.7738639	total: 1m 52s	remaining: 15m 46s
159:	learn: 0.7739926	total: 1m 52s	remaining: 15m 45s
160:	learn: 0.7738215	total: 1m 53s	remaining: 15m 45s
161:	learn: 0.7739630	total: 1m 54s	remaining: 15m 44s
162:	learn: 0.7741361	total: 1m 55s	remaining: 15m 43s
163:	learn: 0.7741465	total: 1m 55s	remaining: 15m 42s
164:	learn: 0.7741608	total: 1m 56s	remaining: 15m 42s
165:	learn: 0.7741282	total: 1m 57s	remaining: 15m 41s
166:	learn: 0.7741847	total: 1m 57s	remaining: 15m 40s
167:	learn: 0.7743962	total: 1m 58s	remaining: 15m 39s
168:	learn: 0.7744340	total: 1m 59s	remaining: 15m 39s
169:	learn: 0.7746007	total: 1m 59s	remaining: 15m 38s
170:	learn: 0.7746075	total: 2m	remaining: 15m 37s
171:	learn: 0.7746883	total: 2m 1s	remaining: 15m 36s
172:	learn: 0.7

304:	learn: 0.7829044	total: 3m 36s	remaining: 14m 6s
305:	learn: 0.7829507	total: 3m 36s	remaining: 14m 5s
306:	learn: 0.7830015	total: 3m 37s	remaining: 14m 5s
307:	learn: 0.7831026	total: 3m 38s	remaining: 14m 4s
308:	learn: 0.7831376	total: 3m 38s	remaining: 14m 3s
309:	learn: 0.7831596	total: 3m 39s	remaining: 14m 2s
310:	learn: 0.7831945	total: 3m 40s	remaining: 14m 2s
311:	learn: 0.7833035	total: 3m 40s	remaining: 14m 1s
312:	learn: 0.7831950	total: 3m 41s	remaining: 14m
313:	learn: 0.7832648	total: 3m 42s	remaining: 13m 59s
314:	learn: 0.7833729	total: 3m 43s	remaining: 13m 59s
315:	learn: 0.7834154	total: 3m 43s	remaining: 13m 58s
316:	learn: 0.7835063	total: 3m 44s	remaining: 13m 57s
317:	learn: 0.7834660	total: 3m 45s	remaining: 13m 56s
318:	learn: 0.7835672	total: 3m 45s	remaining: 13m 56s
319:	learn: 0.7836898	total: 3m 46s	remaining: 13m 55s
320:	learn: 0.7837277	total: 3m 47s	remaining: 13m 54s
321:	learn: 0.7837881	total: 3m 47s	remaining: 13m 53s
322:	learn: 0.7838620	

455:	learn: 0.7884248	total: 5m 21s	remaining: 12m 16s
456:	learn: 0.7885619	total: 5m 22s	remaining: 12m 15s
457:	learn: 0.7886240	total: 5m 22s	remaining: 12m 14s
458:	learn: 0.7886417	total: 5m 23s	remaining: 12m 14s
459:	learn: 0.7886769	total: 5m 24s	remaining: 12m 13s
460:	learn: 0.7886554	total: 5m 25s	remaining: 12m 12s
461:	learn: 0.7887618	total: 5m 25s	remaining: 12m 11s
462:	learn: 0.7887021	total: 5m 26s	remaining: 12m 11s
463:	learn: 0.7886979	total: 5m 27s	remaining: 12m 10s
464:	learn: 0.7886487	total: 5m 27s	remaining: 12m 9s
465:	learn: 0.7886102	total: 5m 28s	remaining: 12m 8s
466:	learn: 0.7886344	total: 5m 29s	remaining: 12m 8s
467:	learn: 0.7886299	total: 5m 29s	remaining: 12m 7s
468:	learn: 0.7886164	total: 5m 30s	remaining: 12m 6s
469:	learn: 0.7885904	total: 5m 31s	remaining: 12m 5s
470:	learn: 0.7886346	total: 5m 31s	remaining: 12m 5s
471:	learn: 0.7885959	total: 5m 32s	remaining: 12m 4s
472:	learn: 0.7886678	total: 5m 33s	remaining: 12m 3s
473:	learn: 0.78874

606:	learn: 0.7925446	total: 7m 6s	remaining: 10m 28s
607:	learn: 0.7925068	total: 7m 7s	remaining: 10m 27s
608:	learn: 0.7925254	total: 7m 8s	remaining: 10m 26s
609:	learn: 0.7925842	total: 7m 9s	remaining: 10m 26s
610:	learn: 0.7926071	total: 7m 9s	remaining: 10m 25s
611:	learn: 0.7926534	total: 7m 10s	remaining: 10m 24s
612:	learn: 0.7927176	total: 7m 11s	remaining: 10m 23s
613:	learn: 0.7926248	total: 7m 11s	remaining: 10m 23s
614:	learn: 0.7927655	total: 7m 12s	remaining: 10m 22s
615:	learn: 0.7926864	total: 7m 13s	remaining: 10m 21s
616:	learn: 0.7927773	total: 7m 14s	remaining: 10m 21s
617:	learn: 0.7927994	total: 7m 14s	remaining: 10m 20s
618:	learn: 0.7929119	total: 7m 15s	remaining: 10m 19s
619:	learn: 0.7927820	total: 7m 16s	remaining: 10m 19s
620:	learn: 0.7928141	total: 7m 16s	remaining: 10m 18s
621:	learn: 0.7928192	total: 7m 17s	remaining: 10m 17s
622:	learn: 0.7927637	total: 7m 18s	remaining: 10m 16s
623:	learn: 0.7928011	total: 7m 18s	remaining: 10m 16s
624:	learn: 0.7

759:	learn: 0.7956429	total: 8m 53s	remaining: 8m 39s
760:	learn: 0.7957100	total: 8m 54s	remaining: 8m 39s
761:	learn: 0.7956753	total: 8m 55s	remaining: 8m 38s
762:	learn: 0.7958334	total: 8m 55s	remaining: 8m 37s
763:	learn: 0.7958823	total: 8m 56s	remaining: 8m 36s
764:	learn: 0.7958593	total: 8m 57s	remaining: 8m 36s
765:	learn: 0.7959096	total: 8m 58s	remaining: 8m 35s
766:	learn: 0.7959041	total: 8m 58s	remaining: 8m 34s
767:	learn: 0.7958980	total: 8m 59s	remaining: 8m 34s
768:	learn: 0.7958937	total: 9m	remaining: 8m 33s
769:	learn: 0.7958768	total: 9m	remaining: 8m 32s
770:	learn: 0.7958759	total: 9m 1s	remaining: 8m 31s
771:	learn: 0.7958743	total: 9m 2s	remaining: 8m 31s
772:	learn: 0.7959986	total: 9m 2s	remaining: 8m 30s
773:	learn: 0.7961184	total: 9m 3s	remaining: 8m 29s
774:	learn: 0.7960947	total: 9m 4s	remaining: 8m 29s
775:	learn: 0.7960829	total: 9m 4s	remaining: 8m 28s
776:	learn: 0.7960428	total: 9m 5s	remaining: 8m 27s
777:	learn: 0.7961885	total: 9m 6s	remainin

912:	learn: 0.7984756	total: 10m 40s	remaining: 6m 51s
913:	learn: 0.7985418	total: 10m 40s	remaining: 6m 50s
914:	learn: 0.7985433	total: 10m 41s	remaining: 6m 50s
915:	learn: 0.7985306	total: 10m 42s	remaining: 6m 49s
916:	learn: 0.7985640	total: 10m 43s	remaining: 6m 48s
917:	learn: 0.7985415	total: 10m 43s	remaining: 6m 48s
918:	learn: 0.7985470	total: 10m 44s	remaining: 6m 47s
919:	learn: 0.7985262	total: 10m 45s	remaining: 6m 46s
920:	learn: 0.7985532	total: 10m 45s	remaining: 6m 46s
921:	learn: 0.7985182	total: 10m 46s	remaining: 6m 45s
922:	learn: 0.7985182	total: 10m 47s	remaining: 6m 44s
923:	learn: 0.7985423	total: 10m 47s	remaining: 6m 43s
924:	learn: 0.7985690	total: 10m 48s	remaining: 6m 43s
925:	learn: 0.7985723	total: 10m 49s	remaining: 6m 42s
926:	learn: 0.7985723	total: 10m 50s	remaining: 6m 41s
927:	learn: 0.7985537	total: 10m 50s	remaining: 6m 41s
928:	learn: 0.7985537	total: 10m 51s	remaining: 6m 40s
929:	learn: 0.7985537	total: 10m 52s	remaining: 6m 39s
930:	learn

1062:	learn: 0.8003887	total: 12m 22s	remaining: 5m 5s
1063:	learn: 0.8004148	total: 12m 23s	remaining: 5m 4s
1064:	learn: 0.8004812	total: 12m 24s	remaining: 5m 3s
1065:	learn: 0.8005209	total: 12m 24s	remaining: 5m 3s
1066:	learn: 0.8005067	total: 12m 25s	remaining: 5m 2s
1067:	learn: 0.8005068	total: 12m 26s	remaining: 5m 1s
1068:	learn: 0.8005325	total: 12m 26s	remaining: 5m 1s
1069:	learn: 0.8005325	total: 12m 27s	remaining: 5m
1070:	learn: 0.8004915	total: 12m 28s	remaining: 4m 59s
1071:	learn: 0.8005192	total: 12m 28s	remaining: 4m 58s
1072:	learn: 0.8005416	total: 12m 29s	remaining: 4m 58s
1073:	learn: 0.8005679	total: 12m 30s	remaining: 4m 57s
1074:	learn: 0.8005436	total: 12m 30s	remaining: 4m 56s
1075:	learn: 0.8005519	total: 12m 31s	remaining: 4m 56s
1076:	learn: 0.8005546	total: 12m 32s	remaining: 4m 55s
1077:	learn: 0.8005194	total: 12m 32s	remaining: 4m 54s
1078:	learn: 0.8005626	total: 12m 33s	remaining: 4m 54s
1079:	learn: 0.8005626	total: 12m 34s	remaining: 4m 53s
108

1210:	learn: 0.8020096	total: 14m 3s	remaining: 3m 21s
1211:	learn: 0.8020822	total: 14m 3s	remaining: 3m 20s
1212:	learn: 0.8021139	total: 14m 4s	remaining: 3m 19s
1213:	learn: 0.8021417	total: 14m 5s	remaining: 3m 19s
1214:	learn: 0.8021862	total: 14m 6s	remaining: 3m 18s
1215:	learn: 0.8022467	total: 14m 6s	remaining: 3m 17s
1216:	learn: 0.8021708	total: 14m 7s	remaining: 3m 17s
1217:	learn: 0.8021708	total: 14m 8s	remaining: 3m 16s
1218:	learn: 0.8021643	total: 14m 8s	remaining: 3m 15s
1219:	learn: 0.8021643	total: 14m 9s	remaining: 3m 14s
1220:	learn: 0.8022238	total: 14m 10s	remaining: 3m 14s
1221:	learn: 0.8022238	total: 14m 10s	remaining: 3m 13s
1222:	learn: 0.8022120	total: 14m 11s	remaining: 3m 12s
1223:	learn: 0.8022357	total: 14m 12s	remaining: 3m 12s
1224:	learn: 0.8021870	total: 14m 12s	remaining: 3m 11s
1225:	learn: 0.8022064	total: 14m 13s	remaining: 3m 10s
1226:	learn: 0.8021640	total: 14m 14s	remaining: 3m 10s
1227:	learn: 0.8021957	total: 14m 14s	remaining: 3m 9s
122

1358:	learn: 0.8034524	total: 15m 43s	remaining: 1m 37s
1359:	learn: 0.8035009	total: 15m 44s	remaining: 1m 37s
1360:	learn: 0.8035437	total: 15m 44s	remaining: 1m 36s
1361:	learn: 0.8036100	total: 15m 45s	remaining: 1m 35s
1362:	learn: 0.8036426	total: 15m 46s	remaining: 1m 35s
1363:	learn: 0.8036426	total: 15m 46s	remaining: 1m 34s
1364:	learn: 0.8036426	total: 15m 47s	remaining: 1m 33s
1365:	learn: 0.8036517	total: 15m 48s	remaining: 1m 33s
1366:	learn: 0.8037246	total: 15m 48s	remaining: 1m 32s
1367:	learn: 0.8037165	total: 15m 49s	remaining: 1m 31s
1368:	learn: 0.8037425	total: 15m 50s	remaining: 1m 30s
1369:	learn: 0.8037448	total: 15m 50s	remaining: 1m 30s
1370:	learn: 0.8037596	total: 15m 51s	remaining: 1m 29s
1371:	learn: 0.8038252	total: 15m 52s	remaining: 1m 28s
1372:	learn: 0.8037783	total: 15m 52s	remaining: 1m 28s
1373:	learn: 0.8037783	total: 15m 53s	remaining: 1m 27s
1374:	learn: 0.8037990	total: 15m 54s	remaining: 1m 26s
1375:	learn: 0.8037971	total: 15m 54s	remaining:

<catboost.core.CatBoostClassifier at 0x2651de32520>

In [55]:
meta_X['cat_01'] = np.concatenate(pred_score_val)[:, 1]
meta_X_test['cat_01'] = model.predict_proba(X_test)[:, 1]

In [None]:
filename = 'cat_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [89]:
check_overfitting(cat_model, X_train, y_train, X_test, y_test)

f1 train: 0.815
f1 test: 0.807
roc-auc train: 0.965
roc-auc test: 0.961


#### LightGBM

In [56]:
pred_val = []
pred_score_val = []

cv = StratifiedKFold(n_splits=N_FOLDS)
for fold, (train_idx, test_idx) in enumerate(cv.split(X_train, y_train)):
    X_train_, X_val = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_, y_val = y_train.iloc[train_idx], y_train.iloc[test_idx]

    model = LGBMClassifier(**lg_params, n_jobs=-1)
    model.fit(X_train_,
              y_train_,
              eval_set=[(X_val, y_val)],
              eval_metric=custom_lg_amex_metric,
              early_stopping_rounds=100,
              verbose=0)

    y_pred_val = model.predict(X_val)
    y_score_val = model.predict_proba(X_val)

    print("Fold:", fold + 1,
          "amex %.3f" % amex_metric(y_val.values, y_score_val[:, 1]))
    print("---")

    # holdout list
    pred_val.append(y_pred_val)
    pred_score_val.append(y_score_val)

model = LGBMClassifier(**lg_params, n_jobs=-1)
model.fit(X_train, y_train)
meta_X['lgb_01'] = np.concatenate(pred_score_val)[:, 1]
meta_X_test['lgb_01'] = model.predict_proba(X_test)[:, 1]

Fold: 1 amex 0.784
---
Fold: 2 amex 0.776
---
Fold: 3 amex 0.785
---
Fold: 4 amex 0.786
---
Fold: 5 amex 0.789
---


In [None]:
filename = 'lgb_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [91]:
check_overfitting(lgb_model, X_train, y_train, X_test, y_test)

f1 train: 0.808
f1 test: 0.805
roc-auc train: 0.962
roc-auc test: 0.960


#### Final model: Logistic Regression

In [57]:
meta_X

Unnamed: 0,cat_01,lgb_01
0,0.938353,0.019090
1,0.012066,0.889008
2,0.002914,0.015307
3,0.034886,0.005202
4,0.075366,0.997805
...,...,...
344179,0.025993,0.201998
344180,0.632716,0.040039
344181,0.991914,0.856492
344182,0.001866,0.003952


In [84]:
final_clf = LogisticRegression(random_state=RAND)
final_clf.fit(meta_X, y_train)

LogisticRegression(random_state=10)

In [59]:
filename = 'final_model_1.sav'
pickle.dump(final_clf, open(filename, 'wb'))

In [81]:
y_pred_final = final_clf.predict(meta_X_test)
y_proba_final = final_clf.predict_proba(meta_X_test)

In [72]:
metrics = metrics.append(
    get_metrics(y_test.values,
                y_pred_final,
                y_proba_final,
                name='StackingClassifier_hand_tune'))

In [73]:
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss,amex
0,LogisticRegression_Baseline,0.883787,0.95423,0.722824,0.893998,0.79935,0.282031,0.764476
1,RandomForest_Baseline,0.892451,0.953522,0.816657,0.753896,0.784023,0.264326,0.757461
2,XGBoost_Baseline,0.884022,0.95792,0.717815,0.909718,0.802453,0.260512,0.776851
3,LGBM_Baseline,0.881181,0.959101,0.708026,0.92086,0.800538,0.261274,0.78352
4,CatBoost_Baseline,0.887884,0.959946,0.72538,0.912445,0.80823,0.2536,0.784212
5,CatBoost_Optuna,0.884388,0.96006,0.714857,0.920793,0.804861,0.258429,0.786262
6,LGBM_Optuna,0.883953,0.959533,0.714821,0.9181,0.803808,0.258686,0.783749
0,StackingClassifier_hand_tune,0.900531,0.959923,0.785815,0.846602,0.815076,0.233997,0.78498


In [79]:
check_overfitting(final_clf, meta_X, y_train, meta_X_test, y_test)

f1 train: 0.808
f1 test: 0.815
roc-auc train: 0.957
roc-auc test: 0.960


Стекинг моделей дал лишь небольшее улучшение метрики соревнования по сравнению с бейзлайном, но показал лучший результат по logloss, f1. 

По метрике соревнования лидирует CatBoost с подобранными параметрами.