In [None]:
! git clone --recursive https://github.com/Microsoft/LightGBM
! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
!pip install catboost

Cloning into 'LightGBM'...
remote: Enumerating objects: 27318, done.[K
remote: Counting objects: 100% (432/432), done.[K
remote: Compressing objects: 100% (228/228), done.[K
remote: Total 27318 (delta 257), reused 302 (delta 194), pack-reused 26886[K
Receiving objects: 100% (27318/27318), 19.62 MiB | 29.50 MiB/s, done.
Resolving deltas: 100% (20168/20168), done.
Submodule 'include/boost/compute' (https://github.com/boostorg/compute) registered for path 'external_libs/compute'
Submodule 'eigen' (https://gitlab.com/libeigen/eigen.git) registered for path 'external_libs/eigen'
Submodule 'external_libs/fast_double_parser' (https://github.com/lemire/fast_double_parser.git) registered for path 'external_libs/fast_double_parser'
Submodule 'external_libs/fmt' (https://github.com/fmtlib/fmt.git) registered for path 'external_libs/fmt'
Cloning into '/content/LightGBM/external_libs/compute'...
remote: Enumerating objects: 21733, done.        
remote: Counting objects: 100% (5/5), done.       

In [None]:
"""
Please download methods.py 
from https://drive.google.com/file/d/12tC3SOtcZUZCDdGnz-pk0X_uj044styJ/view?usp=sharing
before import
"""

'\nPlease download methods.py \nfrom https://drive.google.com/file/d/12tC3SOtcZUZCDdGnz-pk0X_uj044styJ/view?usp=sharing\nbefore import\n'

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from typing import List
import methods
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')
import pyarrow.feather as feather
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, \
    recall_score, f1_score, log_loss, auc, classification_report, confusion_matrix, \
    precision_recall_curve, roc_curve
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from scipy import stats
import random
import joblib
# Mount Google Drive: every data, model and report path used below lives
# under /content/drive/MyDrive/...
from google.colab import drive
drive.mount('/content/drive')
from typing import Any

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Constants

In [None]:
# RAND    - seed passed as random_state to the data split, CV splitter and models.
# N_FOLDS - number of StratifiedKFold splits used for out-of-fold predictions.
RAND, N_FOLDS = 10, 6

# Load data

In [None]:
# Read the preprocessed dataset from Drive.
df = feather.read_feather(
    '/content/drive/MyDrive/Colab Notebooks/Pet_project/Processed data/df.feather'
)

# Re-cast every object-typed column to pandas `category`; the categorical
# columns are later picked up by dtype and handed to the boosting models.
object_cols = df.select_dtypes('object').columns
df[object_cols] = df[object_cols].astype('category')

In [None]:
# Split `df` into train / hold-out features and targets with the project
# helper (it also prints the resulting shapes).
# NOTE(review): the positional arguments are defined in methods.data_split;
# presumably 0.2 is the hold-out fraction and RAND the seed, and the meaning
# of `False` is not visible from this notebook - confirm against methods.py.
x_train, y_train, x_test, y_test = methods.data_split(df, False, 0.2, RAND)

x_train: (285958, 19)
y_train: (285958,)
x_test: (71490, 19)
y_test: (71490,)


In [None]:
# Class-imbalance coefficient: number of class-0 rows divided by the number
# of class-1 rows in the training target (used as scale_pos_weight below).
class_ratio = float((y_train == 0).sum()) / float((y_train == 1).sum())

# Names of the categorical feature columns, selected by dtype.
cat_features = list(x_train.select_dtypes(include='category').columns)

In [None]:
# Load the metrics table produced earlier; each model evaluated in this
# notebook appends one more row to it.
metrics = feather.read_feather(
    '/content/drive/MyDrive/Colab Notebooks/Pet_project/report/metrics.feather')

# Feature importances of every fitted base model are collected here.
feature_importance = []

# LightGBM

## Model with best params

In [None]:
# Load the tuned LightGBM hyper-parameters; they are unpacked as keyword
# arguments when the classifier is constructed.
# NOTE: despite the .json extension the file is read with joblib.load, i.e.
# it is a joblib (pickle-based) artifact - only load it from a trusted source.
lgbm_params = joblib.load(
    '/content/drive/MyDrive/Colab Notebooks/Pet_project/models/Lightgbm_best_params.json')

In [None]:
# LightGBM classifier with the tuned hyper-parameters.
# - scale_pos_weight=class_ratio: weight for the positive class, set to the
#   negative/positive count ratio computed from y_train.
# - categorical_feature: the category-dtype columns of x_train.
# - device='gpu': requires the GPU-enabled LightGBM build from the first cell.
# NOTE: lgbm_params must not repeat any of the explicitly passed keywords,
# otherwise this call raises a duplicate-keyword TypeError.
model_lgbm = LGBMClassifier(**lgbm_params, 
                           random_state=RAND, 
                           scale_pos_weight=class_ratio, 
                           categorical_feature=cat_features, 
                           verbose=-1, 
                           device='gpu')

In [None]:
# Meta-feature containers for stacking:
#   meta_X      - out-of-fold (OOF) predictions of each base model on the train set
#   meta_X_test - predictions of each base model on the hold-out set
meta_X = pd.DataFrame()
meta_X_test = pd.DataFrame()

# `cv.split` yields *positional* indices, so index the target through a plain
# array; this is correct whether y_train is an ndarray or a Series whose
# index is not 0..n-1.
y_train_values = np.asarray(y_train)

pred_val = []        # per-fold predictions on the validation part
val_positions = []   # per-fold row positions those predictions belong to

cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

# The same estimator instance is refitted on every fold.
model = model_lgbm

for train_idx, val_idx in cv.split(x_train, y_train):
    X_train_, X_val = x_train.iloc[train_idx], x_train.iloc[val_idx]
    y_train_, y_val = y_train_values[train_idx], y_train_values[val_idx]

    model.fit(X_train_,
              y_train_,
              eval_set=[(X_val, y_val)],
              eval_metric=methods.lgb_f1_score)

    pred_val.append(model.predict(X_val))
    val_positions.append(val_idx)

# Scatter the fold predictions back to their original row positions.
# Concatenating them in fold order (as was done before) does not reproduce
# the row order of x_train / y_train, so the meta-model would be trained on
# features that do not belong to the labels they are paired with.
oof_stacked = np.concatenate(pred_val)
oof_pred = np.empty_like(oof_stacked)
oof_pred[np.concatenate(val_positions)] = oof_stacked

# Refit on the whole training set for hold-out evaluation.
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Assumes methods.get_metrics returns a one-row DataFrame - TODO confirm.
metrics = pd.concat(
    [metrics,
     methods.get_metrics(y_test,
                         y_pred,
                         y_score[:, 1],
                         'LGBM__best_params_CV')],
    ignore_index=True)

feature_importance.append(model.feature_importances_)

meta_X['lgb_01'] = oof_pred
# Same values as model.predict(x_test); reuse instead of predicting twice.
meta_X_test['lgb_01'] = y_pred



In [None]:
# Display the metrics table, now including the tuned LightGBM row.
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost,0.910309,0.966486,0.350887,0.907459,0.506085,0.24297
1,Lightgbm,0.870835,0.955693,0.269805,0.90884,0.416087,0.280837
0,LGBM__best_params_CV,0.954777,0.986134,0.529986,0.944751,0.679043,0.117704


## Baseline model

In [None]:
# Baseline LightGBM classifier: library-default hyper-parameters, with only
# the seed, the class weight (negative/positive ratio), the categorical
# columns, logging verbosity and the GPU device set explicitly.
base_lgbm = LGBMClassifier(random_state=RAND, 
                          scale_pos_weight=class_ratio, 
                          categorical_feature=cat_features, 
                          verbose=-1,
                          device='gpu')

In [None]:
# Out-of-fold predictions of the baseline LightGBM model plus its hold-out
# evaluation; results go into the shared `metrics`, `feature_importance`,
# `meta_X` and `meta_X_test` objects.

# `cv.split` yields *positional* indices, so index the target through a plain
# array; this is correct whether y_train is an ndarray or a Series whose
# index is not 0..n-1.
y_train_values = np.asarray(y_train)

pred_val = []        # per-fold predictions on the validation part
val_positions = []   # per-fold row positions those predictions belong to

# NOTE(review): unlike the tuned-LightGBM run, this splitter is not shuffled.
cv = StratifiedKFold(n_splits=N_FOLDS)

# The same estimator instance is refitted on every fold.
model = base_lgbm

for train_idx, val_idx in cv.split(x_train, y_train):
    X_train_, X_val = x_train.iloc[train_idx], x_train.iloc[val_idx]
    y_train_, y_val = y_train_values[train_idx], y_train_values[val_idx]

    model.fit(X_train_,
              y_train_,
              eval_set=[(X_val, y_val)],
              eval_metric=methods.lgb_f1_score)

    pred_val.append(model.predict(X_val))
    val_positions.append(val_idx)

# Scatter the fold predictions back to their original row positions.
# Stratified folds are not contiguous row blocks, so concatenating them in
# fold order (as was done before) does not reproduce the row order of
# x_train / y_train and misaligns the meta-features with the labels.
oof_stacked = np.concatenate(pred_val)
oof_pred = np.empty_like(oof_stacked)
oof_pred[np.concatenate(val_positions)] = oof_stacked

# Refit on the whole training set for hold-out evaluation.
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Assumes methods.get_metrics returns a one-row DataFrame - TODO confirm.
metrics = pd.concat(
    [metrics,
     methods.get_metrics(y_test,
                         y_pred,
                         y_score[:, 1],
                         'LGBM__Baseline_CV')],
    ignore_index=True)

feature_importance.append(model.feature_importances_)

meta_X['lgb_02'] = oof_pred
# Same values as model.predict(x_test); reuse instead of predicting twice.
meta_X_test['lgb_02'] = y_pred

In [None]:
# Display the metrics table, now including the baseline LightGBM row.
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost,0.910309,0.966486,0.350887,0.907459,0.506085,0.24297
1,Lightgbm,0.870835,0.955693,0.269805,0.90884,0.416087,0.280837
0,LGBM__best_params_CV,0.954777,0.986134,0.529986,0.944751,0.679043,0.117704
0,LGBM__Baseline_CV,0.871758,0.955323,0.270857,0.905801,0.417016,0.281926


# Catboost

## Model with best params

In [None]:
# Load the tuned CatBoost hyper-parameters; they are unpacked as keyword
# arguments when the classifier is constructed.
# NOTE: despite the .json extension the file is read with joblib.load, i.e.
# it is a joblib (pickle-based) artifact - only load it from a trusted source.
catboost_params = joblib.load(
    '/content/drive/MyDrive/Colab Notebooks/Pet_project/models/Catboost_best_params.json'
)

In [None]:
# CatBoost classifier with the tuned hyper-parameters, trained on GPU.
# - scale_pos_weight=class_ratio: weight for the positive class, set to the
#   negative/positive count ratio computed from y_train.
# - cat_features: the category-dtype columns of x_train.
# NOTE: catboost_params must not repeat any of the explicitly passed
# keywords, otherwise this call raises a duplicate-keyword TypeError.
model_catboost = CatBoostClassifier(**catboost_params, 
                                   random_state=RAND, 
                                   scale_pos_weight=class_ratio, 
                                   cat_features=cat_features, 
                                   task_type='GPU')

In [None]:
# Out-of-fold predictions of the tuned CatBoost model plus its hold-out
# evaluation; results go into the shared `metrics`, `feature_importance`,
# `meta_X` and `meta_X_test` objects.

# `cv.split` yields *positional* indices, so index the target through a plain
# array; this is correct whether y_train is an ndarray or a Series whose
# index is not 0..n-1.
y_train_values = np.asarray(y_train)

pred_val = []        # per-fold predictions on the validation part
val_positions = []   # per-fold row positions those predictions belong to

# NOTE(review): unlike the tuned-LightGBM run, this splitter is not shuffled.
cv = StratifiedKFold(n_splits=N_FOLDS)

# The same estimator instance is refitted on every fold.
model = model_catboost

for train_idx, val_idx in cv.split(x_train, y_train):
    X_train_, X_val = x_train.iloc[train_idx], x_train.iloc[val_idx]
    y_train_, y_val = y_train_values[train_idx], y_train_values[val_idx]

    # The validation fold doubles as the early-stopping set.
    model.fit(X_train_,
              y_train_,
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=100,
              verbose=False)

    pred_val.append(model.predict(X_val))
    val_positions.append(val_idx)

# Scatter the fold predictions back to their original row positions.
# Stratified folds are not contiguous row blocks, so concatenating them in
# fold order (as was done before) does not reproduce the row order of
# x_train / y_train and misaligns the meta-features with the labels.
oof_stacked = np.concatenate(pred_val)
oof_pred = np.empty_like(oof_stacked)
oof_pred[np.concatenate(val_positions)] = oof_stacked

# Refit on the whole training set for hold-out evaluation.
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Assumes methods.get_metrics returns a one-row DataFrame - TODO confirm.
metrics = pd.concat(
    [metrics,
     methods.get_metrics(y_test,
                         y_pred,
                         y_score[:, 1],
                         'Catboost__best_params_CV')],
    ignore_index=True)

feature_importance.append(model.feature_importances_)

meta_X['cat_01'] = oof_pred
# Same values as model.predict(x_test); reuse instead of predicting twice.
meta_X_test['cat_01'] = y_pred

In [None]:
# Display the metrics table, now including the tuned CatBoost row.
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost,0.910309,0.966486,0.350887,0.907459,0.506085,0.24297
1,Lightgbm,0.870835,0.955693,0.269805,0.90884,0.416087,0.280837
0,LGBM__best_params_CV,0.954777,0.986134,0.529986,0.944751,0.679043,0.117704
0,LGBM__Baseline_CV,0.871758,0.955323,0.270857,0.905801,0.417016,0.281926
0,Catboost__best_params_CV,0.969758,0.987876,0.636109,0.94116,0.759135,0.08453


## Baseline model

In [None]:
# Baseline CatBoost classifier trained on GPU: library-default
# hyper-parameters, with only the seed, the class weight (negative/positive
# ratio), the categorical columns and F1 as the evaluation metric set.
base_catboost = CatBoostClassifier(random_state=RAND, 
                                  scale_pos_weight=class_ratio, 
                                  eval_metric='F1', 
                                  cat_features=cat_features,
                                  task_type='GPU')

In [None]:
# Out-of-fold predictions of the baseline CatBoost model plus its hold-out
# evaluation; results go into the shared `metrics`, `feature_importance`,
# `meta_X` and `meta_X_test` objects.

# `cv.split` yields *positional* indices, so index the target through a plain
# array; this is correct whether y_train is an ndarray or a Series whose
# index is not 0..n-1.
y_train_values = np.asarray(y_train)

pred_val = []        # per-fold predictions on the validation part
val_positions = []   # per-fold row positions those predictions belong to

# NOTE(review): unlike the tuned-LightGBM run, this splitter is not shuffled.
cv = StratifiedKFold(n_splits=N_FOLDS)

# The same estimator instance is refitted on every fold.
model = base_catboost

for train_idx, val_idx in cv.split(x_train, y_train):
    X_train_, X_val = x_train.iloc[train_idx], x_train.iloc[val_idx]
    y_train_, y_val = y_train_values[train_idx], y_train_values[val_idx]

    # The validation fold doubles as the early-stopping set.
    model.fit(X_train_,
              y_train_,
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=100,
              verbose=False)

    pred_val.append(model.predict(X_val))
    val_positions.append(val_idx)

# Scatter the fold predictions back to their original row positions.
# Stratified folds are not contiguous row blocks, so concatenating them in
# fold order (as was done before) does not reproduce the row order of
# x_train / y_train and misaligns the meta-features with the labels.
oof_stacked = np.concatenate(pred_val)
oof_pred = np.empty_like(oof_stacked)
oof_pred[np.concatenate(val_positions)] = oof_stacked

# Refit on the whole training set for hold-out evaluation.
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Assumes methods.get_metrics returns a one-row DataFrame - TODO confirm.
metrics = pd.concat(
    [metrics,
     methods.get_metrics(y_test,
                         y_pred,
                         y_score[:, 1],
                         'Catboost__baseline_CV')],
    ignore_index=True)

feature_importance.append(model.feature_importances_)

meta_X['cat_02'] = oof_pred
# Same values as model.predict(x_test); reuse instead of predicting twice.
meta_X_test['cat_02'] = y_pred

In [None]:
# Display the metrics table, now including the baseline CatBoost row.
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost,0.910309,0.966486,0.350887,0.907459,0.506085,0.24297
1,Lightgbm,0.870835,0.955693,0.269805,0.90884,0.416087,0.280837
0,LGBM__best_params_CV,0.954777,0.986134,0.529986,0.944751,0.679043,0.117704
0,LGBM__Baseline_CV,0.871758,0.955323,0.270857,0.905801,0.417016,0.281926
0,Catboost__best_params_CV,0.969758,0.987876,0.636109,0.94116,0.759135,0.08453
0,Catboost__baseline_CV,0.889159,0.958245,0.300925,0.898619,0.450866,0.281354


In [None]:
# Persist the stacked base-model predictions (train OOF and hold-out) so the
# meta-model can be experimented with without retraining the base models.
for _frame, _dest in (
    (meta_X,
     '/content/drive/MyDrive/Colab Notebooks/Pet_project/Processed data/meta_X.feather'),
    (meta_X_test,
     '/content/drive/MyDrive/Colab Notebooks/Pet_project/Processed data/meta_X_test.feather'),
):
    feather.write_feather(_frame, _dest)

# Meta model

In [None]:
# для мета модели выберем байесовскую модель
final_clf = GaussianNB()
final_clf.fit(meta_X, y_train)

GaussianNB()

In [None]:
# Evaluate the meta-model on the hold-out meta-features and add its row to
# the metrics table.
y_pred_final = final_clf.predict(meta_X_test)
y_score_final = final_clf.predict_proba(meta_X_test)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Assumes methods.get_metrics returns a one-row DataFrame - TODO confirm.
metrics = pd.concat(
    [metrics,
     methods.get_metrics(y_test,
                         y_pred_final,
                         y_score_final[:, 1],
                         'Meta_model')],
    ignore_index=True)

In [None]:
# Compare the meta-model's F1 on its train and hold-out meta-features using
# the project helper (it prints both scores and a verdict).
# NOTE(review): a train F1 noticeably below the test F1 is unusual and can
# indicate that the rows of meta_X are not aligned with y_train - verify how
# the out-of-fold predictions were assembled.
methods.check_overfitting(final_clf, meta_X, y_train, meta_X_test, y_test)

F1 Train: 0.675
F1 Test: 0.764
No overfitting


In [None]:
# Display the final metrics table, including the meta-model row.
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost,0.910309,0.966486,0.350887,0.907459,0.506085,0.24297
1,Lightgbm,0.870835,0.955693,0.269805,0.90884,0.416087,0.280837
0,LGBM__best_params_CV,0.954777,0.986134,0.529986,0.944751,0.679043,0.117704
0,LGBM__Baseline_CV,0.871758,0.955323,0.270857,0.905801,0.417016,0.281926
0,Catboost__best_params_CV,0.969758,0.987876,0.636109,0.94116,0.759135,0.08453
0,Catboost__baseline_CV,0.889159,0.958245,0.300925,0.898619,0.450866,0.281354
0,Meta_model,0.971912,0.973262,0.664289,0.900276,0.764485,0.396266


In [None]:
# Save the accumulated metrics table (all base models plus the meta-model).
feather.write_feather(
    metrics, 
    '/content/drive/MyDrive/Colab Notebooks/Pet_project/report/metrics_all.feather')

In [None]:
# Save the collected feature importances (one array per base model, in the
# order the models were trained).
# NOTE: the file is written with joblib.dump, so despite the .txt extension
# it is a binary joblib artifact and must be read back with joblib.load.
joblib.dump(
    feature_importance, 
    '/content/drive/MyDrive/Colab Notebooks/Pet_project/report/feature_importances.txt')

['/content/drive/MyDrive/Colab Notebooks/Pet_project/report/feature_importances.txt']