In [None]:
# Optional: build LightGBM from source with GPU support (cmake -DUSE_GPU=1).
# Kept commented out; uncomment both lines to run the build.
# ! git clone --recursive https://github.com/Microsoft/LightGBM
# ! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
# Install the remaining third-party dependencies (versions are not pinned).
# The recorded output of this cell shows catboost 1.0.6 being installed and
# optuna 3.0.0 being downloaded.
!pip install catboost
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.1 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.6
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.0.0-py3-none-any.whl (348 kB)
[K     |████████████████████████████████| 348 kB 4.7 MB/s 
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 9.8 MB/s 
[?25hCollecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting alembic
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 66.7 MB/s 
Collecting Mako
  Do

In [None]:
"""
Please download methods.py
from https://drive.google.com/file/d/12tC3SOtcZUZCDdGnz-pk0X_uj044styJ/view?usp=sharing
and place it in the working directory before running the import cell below.
"""

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from typing import List
import methods
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')
import pyarrow.feather as feather
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, \
    recall_score, f1_score, log_loss, auc, classification_report, confusion_matrix, \
    precision_recall_curve, roc_curve
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
import optuna
from sklearn.model_selection import StratifiedKFold
from scipy import stats
import random
import joblib
from google.colab import drive
drive.mount('/content/drive')
from typing import Any

Mounted at /content/drive


# Constants

In [None]:
# Seed passed to the data split, the models and the CV splitters in this notebook.
RAND = 10
# Number of stratified folds used for cross-validation.
N_FOLDS = 6

# Load Data

In [None]:
# Read the preprocessed dataset from Google Drive.
df = feather.read_feather(
    '/content/drive/MyDrive/Colab Notebooks/Pet_project/Processed data/df.feather'
)

# Convert every object-dtype column to the pandas 'category' dtype.
object_columns = df.select_dtypes('object').columns
df[object_columns] = df[object_columns].astype('category')

# Preparing samples

In [None]:
# Split the dataset into train/test features and labels with the project helper.
# NOTE(review): the positional arguments presumably mean
# (data, <boolean flag>, test_size=0.2, random_state=RAND) — confirm against
# methods.data_split.
x_train, y_train, x_test, y_test = methods.data_split(df, False, 0.2, RAND)

x_train: (285958, 19)
y_train: (285958,)
x_test: (71490, 19)
y_test: (71490,)


# Baseline

In [None]:
# Compute the class-imbalance ratio (count of label 0 / count of label 1)
# and collect the names of the categorical feature columns.
negative_count = float(np.sum(y_train == 0))
positive_count = float(np.sum(y_train == 1))
class_ratio = negative_count / positive_count
cat_features = x_train.select_dtypes(include='category').columns.to_list()

In [None]:
# Baseline models: library defaults plus the shared seed, the class-imbalance
# weight, the categorical column names and GPU training.
baseline_catboost = CatBoostClassifier(
    random_state=RAND,
    scale_pos_weight=class_ratio,
    cat_features=cat_features,
    task_type='GPU',
)
baseline_lgbm = LGBMClassifier(
    random_state=RAND,
    scale_pos_weight=class_ratio,
    categorical_feature=cat_features,
    device='gpu',
    verbose=-1,
)

# Model name -> unfitted estimator, in the order they will be compared.
cat_models = {'Catboost': baseline_catboost, 'Lightgbm': baseline_lgbm}

In [None]:
# Fit every baseline model with the project helper and display the
# resulting metrics table.
baseline_inputs = dict(
    dict_of_models=cat_models,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)
metrics = methods.base_models_fit_compare(**baseline_inputs)
metrics

  0%|          | 0/2 [00:00<?, ?it/s]

Model: Catboost
F1 Train: 0.521
F1 Test: 0.506
No overfitting
------------

Model: Lightgbm
F1 Train: 0.427
F1 Test: 0.416
No overfitting
------------



Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost,0.910309,0.966486,0.350887,0.907459,0.506085,0.24297
0,Lightgbm,0.870835,0.955693,0.269805,0.90884,0.416087,0.280837


In [None]:
# Persist the baseline metrics table to the project's report folder.
metrics_path = '/content/drive/MyDrive/Colab Notebooks/Pet_project/report/metrics.feather'
feather.write_feather(metrics, metrics_path)

# Tuning models

## Catboost

In [None]:
def objective_cat(trial,
                  x: pd.DataFrame,
                  y: pd.DataFrame,
                  N_Folds: int,
                  random_state: int) -> float:
    """
    Optuna objective for CatBoost: mean F1 over stratified K-fold CV.

    Relies on two notebook-level globals: ``class_ratio`` (passed to the
    model as ``scale_pos_weight``) and ``cat_features`` (names of the
    categorical columns).

    :param trial: optuna trial used to sample the hyperparameters
    :param x: train data
    :param y: train labels (1-D, aligned row by row with ``x``)
    :param N_Folds: number of folds for cross validation
    :param random_state: seed for the model and for the fold splitter
    :return: F1 fold average
    """

    catboost_params = {'iterations': trial.suggest_categorical('iterations', [1000, 2000]),
                       'learning_rate': trial.suggest_float(
                           'learning_rate', 0.001, 0.3, log=True),
                       'max_depth': trial.suggest_int('max_depth', 4, 10),
                       # suggest_float replaces the deprecated suggest_uniform
                       # (same uniform distribution over the same range).
                       'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 1e2),
                       'random_strength': trial.suggest_float('random_strength', 1, 10),
                       'bootstrap_type': trial.suggest_categorical(
                           'bootstrap_type', ['Bayesian',
                                              'Bernoulli',
                                              'MVS',
                                              'No']),
                       'min_data_in_leaf': trial.suggest_categorical(
                           'min_data_in_leaf', [100, 500, 1000]),
                       'leaf_estimation_iterations': trial.suggest_int(
                           'leaf_estimation_iterations', 1, 15),
                       'loss_function': trial.suggest_categorical(
                           'loss_function', ['Logloss']),
                       'eval_metric': trial.suggest_categorical('eval_metric', ['F1']),
                       # Fixed (not tuned) settings: they are NOT part of
                       # study.best_params and must be re-supplied on refit.
                       'random_state': random_state,
                       'scale_pos_weight': class_ratio}

    # Bootstrap-specific parameters are sampled only for the bootstrap type
    # that was actually chosen for this trial.
    if catboost_params['bootstrap_type'] == 'Bayesian':
        catboost_params['bagging_temperature'] = trial.suggest_float(
            'bagging_temperature', 0, 10)
    elif catboost_params['bootstrap_type'] == 'Bernoulli':
        catboost_params['subsample'] = trial.suggest_float(
            'subsample', 0.1, 1, log=True)

    cv = StratifiedKFold(n_splits=N_Folds,
                         shuffle=True,
                         random_state=random_state)

    # The splitter yields positional indices. Features are sliced with .iloc,
    # so labels are sliced positionally too; plain y[idx] on a Series is
    # label-based and breaks (or misaligns) on a non-default index.
    labels = np.asarray(y)

    fold_scores = np.empty(N_Folds)

    for fold, (train_index, valid_index) in enumerate(cv.split(x, y)):

        x_fit, x_valid = x.iloc[train_index], x.iloc[valid_index]
        y_fit, y_valid = labels[train_index], labels[valid_index]

        model = CatBoostClassifier(**catboost_params,
                                   cat_features=cat_features,
                                   task_type='GPU')

        # NOTE: early stopping monitors the same fold that is scored below,
        # so the fold score is somewhat optimistic.
        model.fit(x_fit,
                  y_fit,
                  eval_set=[(x_valid, y_valid)],
                  early_stopping_rounds=100,
                  verbose=0)

        fold_predictions = model.predict(x_valid)
        fold_scores[fold] = f1_score(y_valid, fold_predictions)

    return np.mean(fold_scores)

In [None]:
# Search CatBoost hyperparameters with Optuna, maximizing the mean CV F1.
# TPE is Optuna's default sampler; seeding it makes the sequence of suggested
# hyperparameters repeatable across runs.
study_catboost = optuna.create_study(
    direction='maximize',
    study_name='Catboost',
    sampler=optuna.samplers.TPESampler(seed=RAND),
)
func = lambda trial: objective_cat(trial, x_train, y_train, N_FOLDS, RAND)
study_catboost.optimize(func, n_trials=15, show_progress_bar=True)

[32m[I 2022-08-25 08:11:26,023][0m A new study created in memory with name: Catboost[0m


  0%|          | 0/15 [00:00<?, ?it/s]

[32m[I 2022-08-25 08:14:37,869][0m Trial 0 finished with value: 0.40601870031518894 and parameters: {'iterations': 1000, 'learning_rate': 0.06038335292018286, 'max_depth': 4, 'l2_leaf_reg': 69.38824881846611, 'random_strength': 2.7609182888116863, 'bootstrap_type': 'Bernoulli', 'min_data_in_leaf': 100, 'leaf_estimation_iterations': 13, 'loss_function': 'Logloss', 'eval_metric': 'F1', 'subsample': 0.29650054648550417}. Best is trial 0 with value: 0.40601870031518894.[0m
[32m[I 2022-08-25 08:18:41,376][0m Trial 1 finished with value: 0.2145494649871614 and parameters: {'iterations': 1000, 'learning_rate': 0.012900438384339559, 'max_depth': 9, 'l2_leaf_reg': 30.208028144716, 'random_strength': 9.502666357383756, 'bootstrap_type': 'MVS', 'min_data_in_leaf': 100, 'leaf_estimation_iterations': 15, 'loss_function': 'Logloss', 'eval_metric': 'F1'}. Best is trial 0 with value: 0.40601870031518894.[0m
[32m[I 2022-08-25 08:21:46,689][0m Trial 2 finished with value: 0.3785839410718858 and 

In [None]:
# Hyperparameters of the best trial. Only values drawn through trial.suggest_*
# appear here; the fixed random_state and scale_pos_weight are not included.
study_catboost.best_params

{'iterations': 2000,
 'learning_rate': 0.23703590654359807,
 'max_depth': 7,
 'l2_leaf_reg': 2.5844535648594356,
 'random_strength': 7.489861283846848,
 'bootstrap_type': 'Bernoulli',
 'min_data_in_leaf': 1000,
 'leaf_estimation_iterations': 8,
 'loss_function': 'Logloss',
 'eval_metric': 'F1',
 'subsample': 0.9976875196067346}

In [None]:
# Objective value of the best trial, i.e. the mean CV F1 returned by objective_cat.
study_catboost.best_value

0.689273209583421

In [None]:
# Check the tuned model for overfitting: refit it on every stratified fold of
# the training set and average the validation F1.
cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True,
                     random_state=RAND)
predict_score = np.empty(N_FOLDS)

# The splitter yields positional indices, so slice the labels positionally
# (matching the .iloc used for the features).
y_train_values = np.asarray(y_train)

# Wrap the split itself and pass total so the progress bar knows its length.
for fold, (train_index, test_index) in enumerate(
        tqdm_notebook(cv.split(x_train, y_train), total=N_FOLDS)):

    x_train_, x_val = x_train.iloc[train_index], x_train.iloc[test_index]
    y_train_, y_val = y_train_values[train_index], y_train_values[test_index]

    # best_params holds only the suggested values, so the fixed settings used
    # during tuning (seed and class weighting) are passed explicitly here.
    model_catboost = CatBoostClassifier(**study_catboost.best_params,
                                        random_state=RAND,
                                        scale_pos_weight=class_ratio,
                                        task_type="GPU",
                                        cat_features=cat_features)

    model_catboost.fit(x_train_, y_train_, verbose=0, eval_set=[
              (x_val, y_val)], early_stopping_rounds=50)

    pred = model_catboost.predict(x_val)
    predict_score[fold] = f1_score(y_val, pred)

np.mean(predict_score)

0it [00:00, ?it/s]

0.7399169494783092

In [None]:
# Predict on the hold-out test set with the model fitted in the last CV fold,
# then report train vs. test F1 through the project helper.
pred_catboost = model_catboost.predict(x_test)
methods.check_overfitting(model_catboost, x_train, y_train, x_test, y_test)

F1 Train: 0.803
F1 Test: 0.741
No overfitting


In [None]:
# Save the fitted model and its tuned hyperparameters.
# Both files are joblib pickles regardless of their extensions (the '.json'
# file is not JSON text), so read them back with joblib.load.
catboost_model_path = '/content/drive/MyDrive/Colab Notebooks/Pet_project/models/Catboost.pkl'
catboost_params_path = '/content/drive/MyDrive/Colab Notebooks/Pet_project/models/Catboost_best_params.json'

joblib.dump(model_catboost, catboost_model_path)
joblib.dump(study_catboost.best_params, catboost_params_path)

['/content/drive/MyDrive/Colab Notebooks/Pet_project/models/Catboost_best_params.json']

## Lightgbm

In [None]:
def objective_lgbm(trial,
                   X: pd.DataFrame,
                   y: pd.DataFrame,
                   N_Folds: int,
                   random_state: int) -> float:
    """
    Optuna objective for LightGBM: mean F1 over stratified K-fold CV.

    Relies on the notebook-level global ``class_ratio`` (passed to the model
    as ``scale_pos_weight``) and on ``methods.lgb_f1_score`` as the
    evaluation metric.

    :param trial: optuna trial used to sample the hyperparameters
    :param X: train data
    :param y: train labels (1-D, aligned row by row with ``X``)
    :param N_Folds: number of folds for cross validation
    :param random_state: seed for the model and for the fold splitter
    :return: F1 fold average
    """
    lgbm_params = {"n_estimators": trial.suggest_categorical("n_estimators",
                                                             [1000, 2000, 3000]),
                   "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                   "num_leaves": trial.suggest_int("num_leaves", 20, 300),
                   "max_depth": trial.suggest_int("max_depth", 3, 12),
                   "max_bin": trial.suggest_categorical("max_bin", [200]),
                   "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 300),
                   "lambda_l1": trial.suggest_int("lambda_l1", 0, 100),
                   "lambda_l2": trial.suggest_int("lambda_l2", 0, 100),
                   "min_gain_to_split": trial.suggest_int("min_gain_to_split", 0, 15),
                   "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.99),
                   "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
                   "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.99),
                   # Fixed (not tuned) settings: they are NOT part of
                   # study.best_params and must be re-supplied on refit.
                   "random_state": random_state,
                   'scale_pos_weight': class_ratio
                   }

    cv = StratifiedKFold(n_splits=N_Folds, shuffle=True, random_state=random_state)

    # The splitter yields positional indices. Features are sliced with .iloc,
    # so labels are sliced positionally too; plain y[idx] on a Series is
    # label-based and breaks (or misaligns) on a non-default index.
    labels = np.asarray(y)

    cv_predicts = np.empty(N_Folds)

    for fold, (train_index, valid_index) in enumerate(cv.split(X, y)):

        x_fit, x_valid = X.iloc[train_index], X.iloc[valid_index]
        y_fit, y_valid = labels[train_index], labels[valid_index]

        model = LGBMClassifier(**lgbm_params,
                               verbose=-1,
                               device='gpu')

        model.fit(x_fit,
                  y_fit,
                  eval_set=[(x_valid, y_valid)],
                  eval_metric=methods.lgb_f1_score)

        preds = model.predict(x_valid)
        cv_predicts[fold] = f1_score(y_valid, preds)

    return np.mean(cv_predicts)

In [None]:
# Search LightGBM hyperparameters with Optuna, maximizing the mean CV F1.
# TPE is Optuna's default sampler; seeding it makes the sequence of suggested
# hyperparameters repeatable across runs.
study_lgbm = optuna.create_study(
    direction="maximize",
    study_name="LGBM",
    sampler=optuna.samplers.TPESampler(seed=RAND),
)
func = lambda trial: objective_lgbm(trial,
                                    x_train,
                                    y_train,
                                    N_Folds=N_FOLDS,
                                    random_state=RAND)
study_lgbm.optimize(func, n_trials=15, show_progress_bar=True)

[32m[I 2022-08-26 03:10:34,689][0m A new study created in memory with name: LGBM[0m


  0%|          | 0/15 [00:00<?, ?it/s]

[32m[I 2022-08-26 03:21:59,846][0m Trial 0 finished with value: 0.5670001793394461 and parameters: {'n_estimators': 2000, 'learning_rate': 0.258748743624555, 'num_leaves': 207, 'max_depth': 12, 'max_bin': 200, 'min_data_in_leaf': 14, 'lambda_l1': 41, 'lambda_l2': 56, 'min_gain_to_split': 4, 'bagging_fraction': 0.7233847191466822, 'bagging_freq': 1, 'feature_fraction': 0.8757853975556864}. Best is trial 0 with value: 0.5670001793394461.[0m
[32m[I 2022-08-26 03:37:18,351][0m Trial 1 finished with value: 0.3865210674660979 and parameters: {'n_estimators': 3000, 'learning_rate': 0.0404981111804727, 'num_leaves': 111, 'max_depth': 6, 'max_bin': 200, 'min_data_in_leaf': 163, 'lambda_l1': 81, 'lambda_l2': 70, 'min_gain_to_split': 14, 'bagging_fraction': 0.8724880858851887, 'bagging_freq': 1, 'feature_fraction': 0.41529207505114135}. Best is trial 0 with value: 0.5670001793394461.[0m
[32m[I 2022-08-26 03:53:27,579][0m Trial 2 finished with value: 0.5247013863762631 and parameters: {'n_

In [None]:
# Hyperparameters of the best trial. Only values drawn through trial.suggest_*
# appear here; the fixed random_state and scale_pos_weight are not included.
study_lgbm.best_params

{'n_estimators': 1000,
 'learning_rate': 0.27261296782200084,
 'num_leaves': 199,
 'max_depth': 11,
 'max_bin': 200,
 'min_data_in_leaf': 125,
 'lambda_l1': 52,
 'lambda_l2': 64,
 'min_gain_to_split': 0,
 'bagging_fraction': 0.8034604758717647,
 'bagging_freq': 1,
 'feature_fraction': 0.6797918047926013}

In [None]:
# Objective value of the best trial, i.e. the mean CV F1 returned by objective_lgbm.
study_lgbm.best_value

0.6590898836333825

In [None]:
# Hard-coded LightGBM hyperparameters.
# NOTE(review): no cell visible in this notebook reads this dict (the CV check
# that follows uses study_lgbm.best_params), and these values differ from the
# recorded study_lgbm.best_params output — confirm whether this is a stale
# snapshot from an earlier run or is meant to replace the study's parameters.
lgbm_best_params = {'n_estimators': 2000,
                    'learning_rate': 0.2985544138753638,
                    'num_leaves': 133,
                    'max_depth': 3,
                    'max_bin': 200,
                    'min_data_in_leaf': 14,
                    'lambda_l1': 3,
                    'lambda_l2': 30,
                    'min_gain_to_split': 0,
                    'bagging_fraction': 0.9817006796341091,
                    'bagging_freq': 1,
                    'feature_fraction': 0.9799120301557371}

In [None]:
# Check the tuned model for overfitting: refit it on every stratified fold of
# the training set and average the validation F1.
cv = StratifiedKFold(n_splits=N_FOLDS,
                     shuffle=True,
                     random_state=RAND)

predict_score = np.empty(N_FOLDS)

# The splitter yields positional indices, so slice the labels positionally
# (matching the .iloc used for the features).
y_train_values = np.asarray(y_train)

# Wrap the split itself and pass total so the progress bar knows its length.
for fold, (train_index, test_index) in enumerate(
        tqdm_notebook(cv.split(x_train, y_train), total=N_FOLDS)):

    x_train_, x_val = x_train.iloc[train_index], x_train.iloc[test_index]
    y_train_, y_val = y_train_values[train_index], y_train_values[test_index]

    # best_params holds only the suggested values, so the fixed settings used
    # during tuning (seed and class weighting) are passed explicitly.
    model_lgbm = LGBMClassifier(**study_lgbm.best_params,
                                device='gpu',
                                verbose=-1,
                                scale_pos_weight=class_ratio,
                                random_state=RAND)

    model_lgbm.fit(x_train_,
                   y_train_,
                   eval_set=[(x_val, y_val)],
                   eval_metric=methods.lgb_f1_score)

    pred = model_lgbm.predict(x_val)
    predict_score[fold] = f1_score(y_val, pred)

np.mean(predict_score)

0it [00:00, ?it/s]

0.6592939878176122

In [None]:
# Predict on the hold-out test set with the model fitted in the last CV fold,
# then report train vs. test F1 through the project helper.
pred_lgbm = model_lgbm.predict(x_test)
methods.check_overfitting(model_lgbm, x_train, y_train, x_test, y_test)

F1 Train: 0.701
F1 Test: 0.664
No overfitting


In [None]:
# Save the fitted model and its tuned hyperparameters.
# Both files are joblib pickles regardless of their extensions (neither the
# '.txt' nor the '.json' file is plain text), so read them back with joblib.load.
lgbm_model_path = '/content/drive/MyDrive/Colab Notebooks/Pet_project/models/Lightgbm.txt'
lgbm_params_path = '/content/drive/MyDrive/Colab Notebooks/Pet_project/models/Lightgbm_best_params.json'

joblib.dump(model_lgbm, lgbm_model_path)
joblib.dump(study_lgbm.best_params, lgbm_params_path)

['/content/drive/MyDrive/Colab Notebooks/Pet_project/models/Lightgbm_best_params.json']