In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 100
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.simplefilter("ignore")

from sklearn.model_selection import train_test_split   # разделение выборки на train и test

from sklearn.model_selection import cross_val_score     # кросс-валидация
from sklearn.model_selection import cross_val_predict

import xgboost as xgb, lightgbm as lgbm, catboost as catb

from sklearn.metrics import roc_auc_score

from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from bayes_opt import BayesianOptimization

In [2]:
path_2 = 'C:/Tefi/Alfa/Data/'

train_target = pd.read_csv(f'{path_2}alfabattle2_train_target.csv')
test_target = pd.read_csv(f'{path_2}alfabattle2_test_target_contest.csv')
sample = pd.read_csv(f'{path_2}alfabattle2_alpha_sample.csv')

path = 'C:/Tefi/Alfa/Data_pro/'

train_data_1 = pd.read_parquet(f'{path}train_data_1_pro.parquet', engine='pyarrow')
train_data_2 = pd.read_parquet(f'{path}train_data_2_pro.parquet', engine='pyarrow')
train_data_3 = pd.read_parquet(f'{path}train_data_3_pro.parquet', engine='pyarrow')
train_data_4 = pd.read_parquet(f'{path}train_data_4_pro.parquet', engine='pyarrow')
train_data_5 = pd.read_parquet(f'{path}train_data_5_pro.parquet', engine='pyarrow')

train_data = train_data_1.copy()
train_data = pd.concat([train_data, train_data_2], axis=0, ignore_index=True)
train_data = pd.concat([train_data, train_data_3], axis=0, ignore_index=True)
train_data = pd.concat([train_data, train_data_4], axis=0, ignore_index=True)
train_data = pd.concat([train_data, train_data_5], axis=0, ignore_index=True)

train_data = pd.merge(train_data, train_target, on='app_id', how='inner')
train_target = train_data['flag']
train_data.drop(['flag'], axis = 1, inplace = True)


In [6]:
x_train, x_valid, y_train, y_valid = train_test_split(train_data, train_target, test_size=0.30)
x_valid, x_test, y_valid, y_test = train_test_split(x_valid, y_valid, test_size=0.30)

In [7]:
cb_params = {
    "n_estimators": 600,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 10,
    "max_depth": 7,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 42,
}

model_catb = catb.CatBoostClassifier(**cb_params)
model_catb.fit(X = x_train, y = y_train, eval_set=[(x_valid, y_valid)])

0:	test: 0.5561963	best: 0.5561963 (0)	total: 287ms	remaining: 2m 51s
10:	test: 0.5983422	best: 0.5998506 (8)	total: 1.04s	remaining: 55.7s
20:	test: 0.6084929	best: 0.6117935 (15)	total: 1.72s	remaining: 47.4s
30:	test: 0.6373023	best: 0.6373023 (30)	total: 2.5s	remaining: 45.8s
40:	test: 0.6536189	best: 0.6536189 (40)	total: 3.26s	remaining: 44.4s
50:	test: 0.6630649	best: 0.6630649 (50)	total: 4.04s	remaining: 43.5s
60:	test: 0.6776042	best: 0.6776042 (60)	total: 4.79s	remaining: 42.4s
70:	test: 0.6849677	best: 0.6849677 (70)	total: 5.57s	remaining: 41.5s
80:	test: 0.6922105	best: 0.6922105 (80)	total: 6.35s	remaining: 40.7s
90:	test: 0.6988855	best: 0.6988855 (90)	total: 7.14s	remaining: 40s
100:	test: 0.7024561	best: 0.7024561 (100)	total: 8s	remaining: 39.5s
110:	test: 0.7064194	best: 0.7064194 (110)	total: 8.82s	remaining: 38.9s
120:	test: 0.7095203	best: 0.7095203 (120)	total: 9.61s	remaining: 38s
130:	test: 0.7126196	best: 0.7126196 (130)	total: 10.4s	remaining: 37.3s
140:	tes

<catboost.core.CatBoostClassifier at 0x20788a42bb0>

In [8]:
train_score = roc_auc_score(y_train, model_catb.predict(x_train, prediction_type = "Probability")[:,1])
valid_score = roc_auc_score(y_valid, model_catb.predict(x_valid, prediction_type = "Probability")[:,1])
test_score = roc_auc_score(y_test, model_catb.predict(x_test, prediction_type = "Probability")[:,1])

print(f"Train-score: {round(train_score, 3)}, Valid-score: {round(valid_score, 3)}, Test-score: {round(test_score, 3)})")

# Train-score: 0.863, Valid-score: 0.739, Test-score: 0.734)

Train-score: 0.863, Valid-score: 0.739, Test-score: 0.734)


In [9]:
params = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.1,
    "n_estimators": 600,
    "reg_lambda": 100,
    "max_depth": 7,
    "gamma": 10,
    "nthread": 6,
    "seed": 27,
    'colsample_bytree': 0.555,
    'min_child_weights': 369
}

dtrain = xgb.DMatrix(
    data=x_train, label=y_train
)
dvalid = xgb.DMatrix(
    data=x_valid, label=y_valid
)

In [10]:
model_xgb = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=600,
    evals=[(dtrain, "train"), (dvalid, "valid")],
    early_stopping_rounds=50,
    verbose_eval=10,
)

Parameters: { min_child_weights, n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.50000	valid-auc:0.50000
[10]	train-auc:0.50000	valid-auc:0.50000
[20]	train-auc:0.61425	valid-auc:0.59282
[30]	train-auc:0.69767	valid-auc:0.67248
[40]	train-auc:0.72200	valid-auc:0.68750
[50]	train-auc:0.73904	valid-auc:0.69932
[60]	train-auc:0.75362	valid-auc:0.70760
[70]	train-auc:0.76658	valid-auc:0.71257
[80]	train-auc:0.77573	valid-auc:0.71894
[90]	train-auc:0.78201	valid-auc:0.72191
[100]	train-auc:0.78904	valid-auc:0.72476
[110]	train-auc:0.79589	valid-auc:0.72800
[120]	train-auc:0.80002	valid-auc:0.72983
[130]	train-auc:0.80315	valid-auc:0.73144
[140]	train-auc:0.80479	valid-auc:0.73246
[150]	train-auc:0.80735	valid-auc:0.73242
[160]	train-auc:0.80998	valid-au

In [14]:
train_score = roc_auc_score(y_train, model_xgb.predict(xgb.DMatrix(x_train)))
valid_score = roc_auc_score(y_valid, model_xgb.predict(xgb.DMatrix(x_valid)))
test_score = roc_auc_score(y_test, model_xgb.predict(xgb.DMatrix(x_test)))

print(f"Train-score: {round(train_score, 3)}, Valid-score: {round(valid_score, 3)}, Test-score: {round(test_score, 3)})")

# Train-score: 0.817, Valid-score: 0.734, Test-score: 0.735)

Train-score: 0.817, Valid-score: 0.734, Test-score: 0.735)


In [15]:
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.01,
    "n_estimators": 600,
    "n_jobs": 6,
    "seed": 27,
}

dtrain = lgbm.Dataset(
    data=x_train, label=y_train
)
dvalid = lgbm.Dataset(
    data=x_valid, label=y_valid
)

In [16]:
model_lgb = lgbm.train(
    params=params,
    train_set=dtrain,
    num_boost_round=600,
    valid_sets=[dtrain, dvalid],
    categorical_feature="auto",
    early_stopping_rounds=50,
    verbose_eval=10
)

[LightGBM] [Info] Number of positive: 1834, number of negative: 75656
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 93665
[LightGBM] [Info] Number of data points in the train set: 77490, number of used features: 377
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.023668 -> initscore=-3.719697
[LightGBM] [Info] Start training from score -3.719697
Training until validation scores don't improve for 50 rounds
[10]	training's auc: 0.709059	valid_1's auc: 0.679748
[20]	training's auc: 0.722157	valid_1's auc: 0.68524
[30]	training's auc: 0.737049	valid_1's auc: 0.687125
[40]	training's auc: 0.760307	valid_1's auc: 0.691832
[50]	training's auc: 0.77489	valid_1's auc: 0.696832
[60]	training's auc: 0.784791	valid_1's auc: 0.69922
[70]	training's auc: 0.797003	valid_1's auc: 0.699651
[80]	training's auc: 0.810468	valid_1's auc: 0.700432
[90]	training's auc: 0.821337	valid_1's auc: 0.701743
[100]	training's auc: 0.833728	valid_1's auc: 0.704253
[110]	trai

In [18]:
train_score = roc_auc_score(y_train, model_lgb.predict(x_train))
valid_score = roc_auc_score(y_valid, model_lgb.predict(x_valid))
test_score = roc_auc_score(y_test, model_lgb.predict(x_test))

print(f"Train-score: {round(train_score, 3)}, Valid-score: {round(valid_score, 3)}, Test-score: {round(test_score, 3)})")

# Train-score: 0.968, Valid-score: 0.722, Test-score: 0.73)

Train-score: 0.968, Valid-score: 0.722, Test-score: 0.73)


In [None]:
x_pred_train_cat = model_catb.predict(x_test, prediction_type =  "Probability")[:,1]
x_pred_train_xgb = model_xgb.predict(xgb.DMatrix(data=x_test))
x_pred_train_lgb = model_lgb.predict(x_test)