In [3]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 100
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.simplefilter("ignore")

from sklearn.model_selection import train_test_split   # разделение выборки на train и test

from sklearn.model_selection import cross_val_score     # кросс-валидация
from sklearn.model_selection import cross_val_predict

import xgboost as xgb, lightgbm as lgbm, catboost as catb

from sklearn.metrics import roc_auc_score

from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from bayes_opt import BayesianOptimization

In [4]:
path_2 = 'C:/Tefi/Alfa/Data/'

test_target = pd.read_csv(f'{path_2}alfabattle2_test_target_contest.csv')
sample = pd.read_csv(f'{path_2}alfabattle2_alpha_sample.csv')

path = 'C:/Tefi/Alfa/Data_pro/'

train_data = pd.read_parquet(f'{path}train_data_pro.parquet', engine='pyarrow')
train_target = pd.read_csv(f'{path}train_target_pro.csv')
test_data = pd.read_parquet(f'{path}test_data_pro.parquet', engine='pyarrow')


In [5]:
x_train, x_valid, y_train, y_valid = train_test_split(train_data, train_target, test_size=0.30)
x_valid, x_test, y_valid, y_test = train_test_split(x_valid, y_valid, test_size=0.30)

In [6]:
cb_params = {
    "n_estimators": 600,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 10,
    "max_depth": 7,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 42,
}

model_catb = catb.CatBoostClassifier(**cb_params)
model_catb.fit(X = x_train, y = y_train, eval_set=[(x_valid, y_valid)])

0:	test: 0.5653660	best: 0.5653660 (0)	total: 668ms	remaining: 6m 39s
10:	test: 0.6677542	best: 0.6677542 (10)	total: 4.64s	remaining: 4m 8s
20:	test: 0.7000551	best: 0.7000551 (20)	total: 9.15s	remaining: 4m 12s
30:	test: 0.7168629	best: 0.7168629 (30)	total: 13.9s	remaining: 4m 15s
40:	test: 0.7270894	best: 0.7270894 (40)	total: 19s	remaining: 4m 18s
50:	test: 0.7318175	best: 0.7318502 (49)	total: 24s	remaining: 4m 18s
60:	test: 0.7344009	best: 0.7344009 (60)	total: 29.2s	remaining: 4m 18s
70:	test: 0.7361951	best: 0.7361951 (70)	total: 34.2s	remaining: 4m 15s
80:	test: 0.7373863	best: 0.7373863 (80)	total: 39.2s	remaining: 4m 11s
90:	test: 0.7383728	best: 0.7385234 (88)	total: 44s	remaining: 4m 6s
100:	test: 0.7398692	best: 0.7398692 (100)	total: 48.9s	remaining: 4m 1s
110:	test: 0.7411893	best: 0.7411893 (110)	total: 53.4s	remaining: 3m 55s
120:	test: 0.7423510	best: 0.7423747 (118)	total: 58.1s	remaining: 3m 50s
130:	test: 0.7433357	best: 0.7433357 (130)	total: 1m 2s	remaining: 3m

<catboost.core.CatBoostClassifier at 0x2848d1381c0>

In [7]:
train_score = roc_auc_score(y_train, model_catb.predict(x_train, prediction_type = "Probability")[:,1])
valid_score = roc_auc_score(y_valid, model_catb.predict(x_valid, prediction_type = "Probability")[:,1])
test_score = roc_auc_score(y_test, model_catb.predict(x_test, prediction_type = "Probability")[:,1])

print(f"Train-score: {round(train_score, 3)}, Valid-score: {round(valid_score, 3)}, Test-score: {round(test_score, 3)})")

# Train-score: 0.791, Valid-score: 0.766, Test-score: 0.771)

Train-score: 0.791, Valid-score: 0.766, Test-score: 0.771)


In [8]:
params = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.1,
    "n_estimators": 600,
    "reg_lambda": 100,
    "max_depth": 7,
    "gamma": 10,
    "nthread": 6,
    "seed": 27,
    'colsample_bytree': 0.555,
    'min_child_weights': 369
}

dtrain = xgb.DMatrix(
    data=x_train, label=y_train
)
dvalid = xgb.DMatrix(
    data=x_valid, label=y_valid
)

In [9]:
model_xgb = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=600,
    evals=[(dtrain, "train"), (dvalid, "valid")],
    early_stopping_rounds=50,
    verbose_eval=10,
)

Parameters: { min_child_weights, n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.50000	valid-auc:0.50000
[10]	train-auc:0.66051	valid-auc:0.65768
[20]	train-auc:0.70674	valid-auc:0.69666
[30]	train-auc:0.73293	valid-auc:0.71602
[40]	train-auc:0.75509	valid-auc:0.73222
[50]	train-auc:0.76902	valid-auc:0.74112
[60]	train-auc:0.77916	valid-auc:0.74809
[70]	train-auc:0.78785	valid-auc:0.75310
[80]	train-auc:0.79531	valid-auc:0.75714
[90]	train-auc:0.80109	valid-auc:0.76048
[100]	train-auc:0.80629	valid-auc:0.76238
[110]	train-auc:0.80983	valid-auc:0.76413
[120]	train-auc:0.81323	valid-auc:0.76550
[130]	train-auc:0.81631	valid-auc:0.76663
[140]	train-auc:0.81887	valid-auc:0.76789
[150]	train-auc:0.82109	valid-auc:0.76882
[160]	train-auc:0.82337	valid-au

In [10]:
train_score = roc_auc_score(y_train, model_xgb.predict(xgb.DMatrix(x_train)))
valid_score = roc_auc_score(y_valid, model_xgb.predict(xgb.DMatrix(x_valid)))
test_score = roc_auc_score(y_test, model_xgb.predict(xgb.DMatrix(x_test)))

print(f"Train-score: {round(train_score, 3)}, Valid-score: {round(valid_score, 3)}, Test-score: {round(test_score, 3)})")

# Train-score: 0.849, Valid-score: 0.775, Test-score: 0.781)

Train-score: 0.849, Valid-score: 0.775, Test-score: 0.781)


In [11]:
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.01,
    "n_estimators": 600,
    "n_jobs": 6,
    "seed": 27,
}

dtrain = lgbm.Dataset(
    data=x_train, label=y_train
)
dvalid = lgbm.Dataset(
    data=x_valid, label=y_valid
)

In [12]:
model_lgb = lgbm.train(
    params=params,
    train_set=dtrain,
    num_boost_round=600,
    valid_sets=[dtrain, dvalid],
    categorical_feature="auto",
    early_stopping_rounds=50,
    verbose_eval=10
)

[LightGBM] [Info] Number of positive: 18642, number of negative: 656025
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 95670
[LightGBM] [Info] Number of data points in the train set: 674667, number of used features: 387
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.027631 -> initscore=-3.560782
[LightGBM] [Info] Start training from score -3.560782
Training until validation scores don't improve for 50 rounds
[10]	training's auc: 0.700587	valid_1's auc: 0.692223
[20]	training's auc: 0.710047	valid_1's auc: 0.699948
[30]	training's auc: 0.716602	valid_1's auc: 0.70504
[40]	training's auc: 0.721006	valid_1's auc: 0.708416
[50]	training's auc: 0.724575	valid_1's auc: 0.711074
[60]	training's auc: 0.729179	valid_1's auc: 0.714544
[70]	training's auc: 0.732844	valid_1's auc: 0.717705
[80]	training's auc: 0.736204	valid_1's auc: 0.720232
[90]	training's auc: 0.739401	valid_1's auc: 0.722559
[100]	training's auc: 0.742827	valid_1's auc: 0.724987
[110]

In [13]:
train_score = roc_auc_score(y_train, model_lgb.predict(x_train))
valid_score = roc_auc_score(y_valid, model_lgb.predict(x_valid))
test_score = roc_auc_score(y_test, model_lgb.predict(x_test))

print(f"Train-score: {round(train_score, 3)}, Valid-score: {round(valid_score, 3)}, Test-score: {round(test_score, 3)})")

# Train-score: 0.813, Valid-score: 0.761, Test-score: 0.764)

Train-score: 0.813, Valid-score: 0.761, Test-score: 0.764)


In [14]:
x_pred_train_cat = model_catb.predict(x_test, prediction_type =  "Probability")[:,1]
x_pred_train_xgb = model_xgb.predict(xgb.DMatrix(data=x_test))
x_pred_train_lgb = model_lgb.predict(x_test)

In [15]:
y_pred_test_cat = model_catb.predict(test_data, prediction_type =  "Probability")[:,1]
y_pred_test_xgb = model_xgb.predict(xgb.DMatrix(data=test_data))
y_pred_test_lgb = model_lgb.predict(test_data)

In [43]:
result_test_cat = pd.DataFrame({
    "app_id": test_data['app_id'],
    "flag": y_pred_test_cat})
result_test_cat.head(2)

Unnamed: 0,app_id,flag
0,1063620,0.020593
1,1063621,0.043897


In [33]:
result_test_xgb = pd.DataFrame({
    "app_id": test_data['app_id'],
    "flag": y_pred_test_xgb})
result_test_xgb.head(2)

Unnamed: 0,app_id,flag
0,1063620,0.012407
1,1063621,0.019296


In [74]:
filename = 'Tefi.csv'
result_test_xgb.to_csv(filename, index=None)

# 0.7424370

In [34]:
result_test_lgb = pd.DataFrame({
    "app_id": test_data['app_id'],
    "flag": y_pred_test_lgb})
result_test_lgb.head(2)

Unnamed: 0,app_id,flag
0,1063620,0.013461
1,1063621,0.053269


In [None]:
stop 1

In [63]:
result_train_cat = pd.DataFrame({
    "app_id": x_test.reset_index()['app_id'],
    "flag": x_pred_train_cat})
result_train_cat.head(2)

Unnamed: 0,app_id,flag
0,44024,0.015414
1,277968,0.00361


In [64]:
result_train_xgb = pd.DataFrame({
    "app_id": x_test.reset_index()['app_id'],
    "flag": x_pred_train_xgb})
result_train_xgb.head(2)

Unnamed: 0,app_id,flag
0,44024,0.015673
1,277968,0.004268


In [65]:
result_train_lgb = pd.DataFrame({
    "app_id": x_test.reset_index()['app_id'],
    "flag": x_pred_train_lgb})
result_train_lgb.head(2)

Unnamed: 0,app_id,flag
0,44024,0.017624
1,277968,0.007004


In [66]:
res_train = result_train_cat.copy()
res_train['flag'] = (result_train_cat['flag'] + result_train_xgb['flag'] + result_train_lgb['flag']) / 3
res_train.head(2)

Unnamed: 0,app_id,flag
0,44024,0.016237
1,277968,0.00496


In [67]:
r = roc_auc_score(y_test, res_train['flag'])
r

0.7764552338368833

In [68]:
res_train = result_train_cat.copy()
res_train['flag'] = result_train_cat['flag']*0.4 + result_train_xgb['flag']*0.4 + result_train_lgb['flag']*0.2
res_train.head(2)

Unnamed: 0,app_id,flag
0,44024,0.01596
1,277968,0.004552


In [69]:
r = roc_auc_score(y_test, res_train['flag'])
r

0.7777979665084374

In [72]:
res_train = result_train_cat.copy()
res_train['flag'] = result_train_cat['flag']*0.3 + result_train_xgb['flag']*0.5 + result_train_lgb['flag']*0.2
res_train.head(2)

Unnamed: 0,app_id,flag
0,44024,0.015986
1,277968,0.004618


In [73]:
r = roc_auc_score(y_test, res_train['flag'])
r

0.778748837264803

In [75]:
res_test = result_test_cat.copy()
res_test['flag'] = result_test_cat['flag']*0.3 + result_test_xgb['flag']*0.5 + result_test_lgb['flag']*0.2
res_test.head(2)

Unnamed: 0,app_id,flag
0,1063620,0.015074
1,1063621,0.033471


In [76]:
filename = 'Tefi_1.csv'
res_test.to_csv(filename, index=None)

# 0.741411

In [None]:
stop 2

In [23]:
#x_pred_train_cat = pd.Series(x_pred_train_cat) 
#x_pred_train_xgb = pd.Series(x_pred_train_xgb) 
#x_pred_train_lgb = pd.Series(x_pred_train_lgb) 

In [24]:
#x_pred_train_cat.to_pickle("x_pred_train_cat.pkl")
#x_pred_train_xgb.to_pickle("x_pred_train_xgb.pkl")
#x_pred_train_lgb.to_pickle("x_pred_train_lgb.pkl")

In [27]:
#y_pred_test_cat = pd.Series(y_pred_test_cat) 
#y_pred_test_xgb = pd.Series(y_pred_test_xgb) 
#y_pred_test_lgb = pd.Series(y_pred_test_lgb) 

In [28]:
#y_pred_test_cat.to_pickle("y_pred_test_cat.pkl")
#y_pred_test_xgb.to_pickle("y_pred_test_xgb.pkl")
#y_pred_test_lgb.to_pickle("y_pred_test_lgb.pkl")

In [25]:
#a = pd.read_pickle("x_pred_train_cat.pkl")