In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import re
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import joblib

from tqdm._tqdm_notebook import tqdm_notebook

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


In [2]:
data_train = pd.read_csv('train_final.csv', index_col='client_id')
data_test = pd.read_csv('test_final.csv', index_col='client_id')

gender_train = pd.read_csv(os.path.join('.', 'train.csv'), index_col='client_id')
gender_test = pd.read_csv(os.path.join('.', 'test.csv'), index_col='client_id')

In [59]:
# Функции, которыми можно пользоваться для построения классификатора, 
# оценки его результатов и построение прогноза для тестовой части пользователей

# Cross-validation score (среднее значение метрики ROC AUC на тренировочных данных)
def cv_score(params, train, y_true):
    cv_res=xgb.cv(params, xgb.DMatrix(train, y_true),
                  early_stopping_rounds=10, maximize=True, 
                  num_boost_round=10000, nfold=5, stratified=True)
    index_argmax = cv_res['test-auc-mean'].argmax()
    print('Cross-validation, ROC AUC: {:.3f}+-{:.3f}, Trees: {}'.format(cv_res.loc[index_argmax]['test-auc-mean'],
                                                                        cv_res.loc[index_argmax]['test-auc-std'],
                                                                        index_argmax))

from catboost import cv, Pool

def cv_score_catboost(params, train, y_true):
    # Create a Pool object
    pool = Pool(train, y_true)

    # Perform cross-validation
    cv_res = cv(pool, params, fold_count=5, type='Classical', 
                early_stopping_rounds=200, stratified=True, 
                partition_random_seed=0, plot=False, logging_level='Silent')

    # Find the best iteration
    index_argmax = cv_res['test-AUC-mean'].argmax()
    print('Cross-validation, ROC AUC: {:.3f}+-{:.3f}, Trees: {}'.format(cv_res['test-AUC-mean'][index_argmax],
                                                                        cv_res['test-AUC-std'][index_argmax],
                                                                        index_argmax))


# Построение модели + возврат результатов классификации тестовых пользователей
def fit_predict(params, num_trees, train, test, target):
    params['learning_rate'] = params['eta']
    clf = xgb.train(params, xgb.DMatrix(train.values, target, feature_names=list(train.columns)), 
                    num_boost_round=num_trees, maximize=True)
    y_pred = clf.predict(xgb.DMatrix(test.values, feature_names=list(train.columns)))
    submission = pd.DataFrame(index=test.index, data=y_pred, columns=['probability'])
    
    joblib.dump(clf, 'alex_model.pkl')
    return clf, submission

# Отрисовка важности переменных. Важность переменной - количество разбиений выборки, 
# в которых участвует данная переменная. Чем больше - тем она, вероятно, лучше 
def draw_feature_importances(clf, top_k=10):
    plt.figure(figsize=(10, 10))
    
    importances = dict(sorted(clf.get_score().items(), key=lambda x: x[1])[-top_k:])
    y_pos = np.arange(len(importances))
    
    plt.barh(y_pos, list(importances.values()), align='center', color='green')
    plt.yticks(y_pos, importances.keys(), fontsize=12)
    plt.xticks(fontsize=12)
    plt.xlabel('Feature importance', fontsize=15)
    plt.title('Features importances, Sberbank Gender Prediction', fontsize=18)
    plt.ylim(-0.5, len(importances) - 0.5)
    plt.show()

In [13]:
params = {
    'eta': 0.1,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    
    'gamma': 0,
    'lambda': 0,
    'alpha': 0,
    'min_child_weight': 0,
    
    'eval_metric': 'auc',
    'objective': 'binary:logistic' ,
    'booster': 'gbtree',
    'njobs': -1,
    'tree_method': 'approx'
}


In [5]:
target = data_train.join(gender_train, how='inner')['gender']
cv_score(params, data_train, target)

Parameters: { "njobs" } are not used.



Cross-validation, ROC AUC: 0.874+-0.006, Trees: 168


In [60]:
# default
target = data_train.join(gender_train, how='inner')['gender']
params_catboost = {
    'learning_rate': 0.03,  
    'depth': 6,                  
    # 'subsample': 0.8,            
    # 'colsample_bylevel': 0.8,    
    'loss_function': 'Logloss', 
    'eval_metric': 'AUC',           
    # 'bootstrap_type': 'Bernoulli'
}
cv_score_catboost(params_catboost, data_train, target)

Cross-validation, ROC AUC: 0.877+-0.008, Trees: 842


In [61]:
target = data_train.join(gender_train, how='inner')['gender']
params_catboost = {
    'learning_rate': 0.01,  
    'depth': 6,                  
    # 'subsample': 0.8,            
    # 'colsample_bylevel': 0.8,    
    'loss_function': 'Logloss', 
    'eval_metric': 'AUC',           
    # 'bootstrap_type': 'Bernoulli'
}
cv_score_catboost(params_catboost, data_train, target)

Cross-validation, ROC AUC: 0.875+-0.009, Trees: 996


In [62]:
target = data_train.join(gender_train, how='inner')['gender']
params_catboost = {
    'learning_rate': 0.03,  
    'depth': 5,                  
    # 'subsample': 0.8,            
    # 'colsample_bylevel': 0.8,    
    'loss_function': 'Logloss', 
    'eval_metric': 'AUC',           
    # 'bootstrap_type': 'Bernoulli'
}
cv_score_catboost(params_catboost, data_train, target)

KeyboardInterrupt: 

In [None]:
target = data_train.join(gender_train, how='inner')['gender']
params_catboost = {
    'learning_rate': 0.03,  
    'depth': 4,                  
    # 'subsample': 0.8,            
    # 'colsample_bylevel': 0.8,    
    'loss_function': 'Logloss', 
    'eval_metric': 'AUC',           
    # 'bootstrap_type': 'Bernoulli'
}
cv_score_catboost(params_catboost, data_train, target)

In [None]:
target = data_train.join(gender_train, how='inner')['gender']
params_catboost = {
    'learning_rate': 0.03,  
    'depth': 4,                  
    # 'subsample': 0.8,            
    # 'colsample_bylevel': 0.8,    
    'loss_function': 'CrossEntropy', 
    'eval_metric': 'AUC',           
    # 'bootstrap_type': 'Bernoulli'
}
cv_score_catboost(params_catboost, data_train, target)

In [30]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
import catboost as cb

def cv_stacked_model(X, y, xgb_params, cat_params, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    stack_preds = np.zeros(y.shape)
    
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        xgb_model = xgb.XGBClassifier(**xgb_params)
        xgb_model.fit(X_train, y_train)

        cat_model = cb.CatBoostClassifier(**cat_params)
        cat_model.fit(X_train, y_train)

        xgb_preds = xgb_model.predict_proba(X_test)[:, 1]
        cat_preds = cat_model.predict_proba(X_test)[:, 1]

        stack_preds[test_index] = (xgb_preds + cat_preds) / 2

    return stack_preds


xgb_params = params
cat_params = params_catboost

predictions = cv_stacked_model(data_train, target, xgb_params, cat_params)


  y_train, y_test = y[train_index], y[test_index]
Parameters: { "njobs" } are not used.



0:	total: 15.9ms	remaining: 15.9s
1:	total: 30.5ms	remaining: 15.2s
2:	total: 44.4ms	remaining: 14.8s
3:	total: 58.1ms	remaining: 14.5s
4:	total: 72ms	remaining: 14.3s
5:	total: 86.5ms	remaining: 14.3s
6:	total: 101ms	remaining: 14.3s
7:	total: 114ms	remaining: 14.2s
8:	total: 130ms	remaining: 14.3s
9:	total: 144ms	remaining: 14.2s
10:	total: 158ms	remaining: 14.2s
11:	total: 172ms	remaining: 14.2s
12:	total: 186ms	remaining: 14.1s
13:	total: 201ms	remaining: 14.2s
14:	total: 215ms	remaining: 14.1s
15:	total: 231ms	remaining: 14.2s
16:	total: 246ms	remaining: 14.2s
17:	total: 262ms	remaining: 14.3s
18:	total: 280ms	remaining: 14.4s
19:	total: 296ms	remaining: 14.5s
20:	total: 312ms	remaining: 14.6s
21:	total: 329ms	remaining: 14.6s
22:	total: 346ms	remaining: 14.7s
23:	total: 362ms	remaining: 14.7s
24:	total: 378ms	remaining: 14.7s
25:	total: 392ms	remaining: 14.7s
26:	total: 406ms	remaining: 14.6s
27:	total: 421ms	remaining: 14.6s
28:	total: 436ms	remaining: 14.6s
29:	total: 453ms	rem

  y_train, y_test = y[train_index], y[test_index]
Parameters: { "njobs" } are not used.



0:	total: 16.6ms	remaining: 16.6s
1:	total: 31.6ms	remaining: 15.8s
2:	total: 46.7ms	remaining: 15.5s
3:	total: 61.1ms	remaining: 15.2s
4:	total: 75.6ms	remaining: 15.1s
5:	total: 90.1ms	remaining: 14.9s
6:	total: 105ms	remaining: 14.9s
7:	total: 119ms	remaining: 14.8s
8:	total: 133ms	remaining: 14.7s
9:	total: 147ms	remaining: 14.6s
10:	total: 161ms	remaining: 14.5s
11:	total: 176ms	remaining: 14.5s
12:	total: 190ms	remaining: 14.4s
13:	total: 205ms	remaining: 14.4s
14:	total: 220ms	remaining: 14.4s
15:	total: 236ms	remaining: 14.5s
16:	total: 253ms	remaining: 14.7s
17:	total: 271ms	remaining: 14.8s
18:	total: 287ms	remaining: 14.8s
19:	total: 302ms	remaining: 14.8s
20:	total: 317ms	remaining: 14.8s
21:	total: 332ms	remaining: 14.8s
22:	total: 347ms	remaining: 14.7s
23:	total: 362ms	remaining: 14.7s
24:	total: 377ms	remaining: 14.7s
25:	total: 391ms	remaining: 14.6s
26:	total: 407ms	remaining: 14.7s
27:	total: 422ms	remaining: 14.6s
28:	total: 439ms	remaining: 14.7s
29:	total: 481ms	r

  y_train, y_test = y[train_index], y[test_index]
Parameters: { "njobs" } are not used.



0:	total: 16.2ms	remaining: 16.2s
1:	total: 31.5ms	remaining: 15.7s
2:	total: 46.6ms	remaining: 15.5s
3:	total: 60.6ms	remaining: 15.1s
4:	total: 74.8ms	remaining: 14.9s
5:	total: 89.4ms	remaining: 14.8s
6:	total: 104ms	remaining: 14.7s
7:	total: 118ms	remaining: 14.7s
8:	total: 132ms	remaining: 14.6s
9:	total: 148ms	remaining: 14.6s
10:	total: 163ms	remaining: 14.6s
11:	total: 177ms	remaining: 14.5s
12:	total: 191ms	remaining: 14.5s
13:	total: 205ms	remaining: 14.5s
14:	total: 220ms	remaining: 14.5s
15:	total: 237ms	remaining: 14.6s
16:	total: 255ms	remaining: 14.7s
17:	total: 274ms	remaining: 14.9s
18:	total: 292ms	remaining: 15.1s
19:	total: 306ms	remaining: 15s
20:	total: 321ms	remaining: 14.9s
21:	total: 337ms	remaining: 15s
22:	total: 352ms	remaining: 14.9s
23:	total: 368ms	remaining: 15s
24:	total: 382ms	remaining: 14.9s
25:	total: 398ms	remaining: 14.9s
26:	total: 413ms	remaining: 14.9s
27:	total: 430ms	remaining: 14.9s
28:	total: 447ms	remaining: 15s
29:	total: 465ms	remaining

  y_train, y_test = y[train_index], y[test_index]
Parameters: { "njobs" } are not used.



0:	total: 29.7ms	remaining: 29.7s
1:	total: 56.4ms	remaining: 28.2s
2:	total: 80.6ms	remaining: 26.8s
3:	total: 105ms	remaining: 26.3s
4:	total: 132ms	remaining: 26.2s
5:	total: 156ms	remaining: 25.8s
6:	total: 179ms	remaining: 25.5s
7:	total: 203ms	remaining: 25.2s
8:	total: 230ms	remaining: 25.3s
9:	total: 260ms	remaining: 25.8s
10:	total: 289ms	remaining: 25.9s
11:	total: 315ms	remaining: 25.9s
12:	total: 340ms	remaining: 25.8s
13:	total: 366ms	remaining: 25.7s
14:	total: 392ms	remaining: 25.7s
15:	total: 417ms	remaining: 25.6s
16:	total: 445ms	remaining: 25.7s
17:	total: 488ms	remaining: 26.6s
18:	total: 514ms	remaining: 26.6s
19:	total: 540ms	remaining: 26.5s
20:	total: 565ms	remaining: 26.3s
21:	total: 589ms	remaining: 26.2s
22:	total: 614ms	remaining: 26.1s
23:	total: 637ms	remaining: 25.9s
24:	total: 665ms	remaining: 25.9s
25:	total: 693ms	remaining: 25.9s
26:	total: 723ms	remaining: 26s
27:	total: 752ms	remaining: 26.1s
28:	total: 788ms	remaining: 26.4s
29:	total: 817ms	remain

  y_train, y_test = y[train_index], y[test_index]
Parameters: { "njobs" } are not used.



0:	total: 32.5ms	remaining: 32.4s
1:	total: 57.4ms	remaining: 28.7s
2:	total: 83ms	remaining: 27.6s
3:	total: 107ms	remaining: 26.7s
4:	total: 131ms	remaining: 26s
5:	total: 157ms	remaining: 26s
6:	total: 183ms	remaining: 25.9s
7:	total: 208ms	remaining: 25.8s
8:	total: 236ms	remaining: 25.9s
9:	total: 265ms	remaining: 26.3s
10:	total: 292ms	remaining: 26.2s
11:	total: 321ms	remaining: 26.4s
12:	total: 345ms	remaining: 26.2s
13:	total: 369ms	remaining: 26s
14:	total: 394ms	remaining: 25.8s
15:	total: 419ms	remaining: 25.8s
16:	total: 444ms	remaining: 25.6s
17:	total: 472ms	remaining: 25.7s
18:	total: 502ms	remaining: 25.9s
19:	total: 527ms	remaining: 25.8s
20:	total: 554ms	remaining: 25.8s
21:	total: 580ms	remaining: 25.8s
22:	total: 606ms	remaining: 25.7s
23:	total: 632ms	remaining: 25.7s
24:	total: 670ms	remaining: 26.1s
25:	total: 700ms	remaining: 26.2s
26:	total: 726ms	remaining: 26.2s
27:	total: 752ms	remaining: 26.1s
28:	total: 778ms	remaining: 26s
29:	total: 804ms	remaining: 26s

In [32]:
from sklearn.metrics import roc_auc_score


roc_auc_score(target, predictions)

0.8762966119006955

In [33]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
import catboost as cb

def cv_stacked_model(X, y, xgb_params, cat_params, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    stack_preds = np.zeros(y.shape)
    catboost_preds = np.zeros(y.shape)
    xgboost_preds = np.zeros(y.shape)
    
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        xgb_model = xgb.XGBClassifier(**xgb_params)
        xgb_model.fit(X_train, y_train)

        cat_model = cb.CatBoostClassifier(**cat_params)
        cat_model.fit(X_train, y_train)

        xgb_preds = xgb_model.predict_proba(X_test)[:, 1]
        cat_preds = cat_model.predict_proba(X_test)[:, 1]

        stack_preds[test_index] = (xgb_preds + cat_preds) / 2
        catboost_preds[test_index] = cat_preds
        xgboost_preds[test_index] = xgb_preds
        
    return {'stack_preds': stack_preds, 'catboost_preds': catboost_preds, 'xgboost_preds': xgboost_preds}


xgb_params = params
cat_params = params_catboost

predictions_dct = cv_stacked_model(data_train, target, xgb_params, cat_params)


  y_train, y_test = y[train_index], y[test_index]
Parameters: { "njobs" } are not used.



0:	total: 34.2ms	remaining: 34.1s
1:	total: 73.3ms	remaining: 36.6s
2:	total: 103ms	remaining: 34.2s
3:	total: 135ms	remaining: 33.5s
4:	total: 163ms	remaining: 32.3s
5:	total: 194ms	remaining: 32.2s
6:	total: 231ms	remaining: 32.8s
7:	total: 262ms	remaining: 32.5s
8:	total: 294ms	remaining: 32.4s
9:	total: 332ms	remaining: 32.9s
10:	total: 362ms	remaining: 32.6s
11:	total: 392ms	remaining: 32.3s
12:	total: 421ms	remaining: 32s
13:	total: 459ms	remaining: 32.3s
14:	total: 491ms	remaining: 32.3s
15:	total: 523ms	remaining: 32.1s
16:	total: 558ms	remaining: 32.3s
17:	total: 591ms	remaining: 32.2s
18:	total: 620ms	remaining: 32s
19:	total: 647ms	remaining: 31.7s
20:	total: 675ms	remaining: 31.5s
21:	total: 706ms	remaining: 31.4s
22:	total: 747ms	remaining: 31.7s
23:	total: 781ms	remaining: 31.7s
24:	total: 819ms	remaining: 32s
25:	total: 853ms	remaining: 31.9s
26:	total: 884ms	remaining: 31.8s
27:	total: 917ms	remaining: 31.8s
28:	total: 964ms	remaining: 32.3s
29:	total: 1s	remaining: 32.

  y_train, y_test = y[train_index], y[test_index]
Parameters: { "njobs" } are not used.



0:	total: 118ms	remaining: 1m 57s
1:	total: 182ms	remaining: 1m 30s
2:	total: 266ms	remaining: 1m 28s
3:	total: 339ms	remaining: 1m 24s
4:	total: 386ms	remaining: 1m 16s
5:	total: 419ms	remaining: 1m 9s
6:	total: 458ms	remaining: 1m 4s
7:	total: 485ms	remaining: 1m
8:	total: 515ms	remaining: 56.7s
9:	total: 545ms	remaining: 54s
10:	total: 582ms	remaining: 52.3s
11:	total: 613ms	remaining: 50.5s
12:	total: 641ms	remaining: 48.7s
13:	total: 666ms	remaining: 46.9s
14:	total: 693ms	remaining: 45.5s
15:	total: 719ms	remaining: 44.2s
16:	total: 744ms	remaining: 43s
17:	total: 773ms	remaining: 42.2s
18:	total: 812ms	remaining: 41.9s
19:	total: 841ms	remaining: 41.2s
20:	total: 868ms	remaining: 40.5s
21:	total: 895ms	remaining: 39.8s
22:	total: 921ms	remaining: 39.1s
23:	total: 946ms	remaining: 38.5s
24:	total: 972ms	remaining: 37.9s
25:	total: 999ms	remaining: 37.4s
26:	total: 1.02s	remaining: 37s
27:	total: 1.05s	remaining: 36.6s
28:	total: 1.08s	remaining: 36.2s
29:	total: 1.1s	remaining: 3

  y_train, y_test = y[train_index], y[test_index]
Parameters: { "njobs" } are not used.



0:	total: 97.1ms	remaining: 1m 36s
1:	total: 167ms	remaining: 1m 23s
2:	total: 279ms	remaining: 1m 32s
3:	total: 351ms	remaining: 1m 27s
4:	total: 382ms	remaining: 1m 16s
5:	total: 414ms	remaining: 1m 8s
6:	total: 455ms	remaining: 1m 4s
7:	total: 486ms	remaining: 1m
8:	total: 514ms	remaining: 56.6s
9:	total: 551ms	remaining: 54.6s
10:	total: 585ms	remaining: 52.6s
11:	total: 615ms	remaining: 50.7s
12:	total: 655ms	remaining: 49.7s
13:	total: 683ms	remaining: 48.1s
14:	total: 710ms	remaining: 46.6s
15:	total: 736ms	remaining: 45.2s
16:	total: 763ms	remaining: 44.1s
17:	total: 793ms	remaining: 43.2s
18:	total: 823ms	remaining: 42.5s
19:	total: 851ms	remaining: 41.7s
20:	total: 879ms	remaining: 41s
21:	total: 916ms	remaining: 40.7s
22:	total: 943ms	remaining: 40.1s
23:	total: 970ms	remaining: 39.5s
24:	total: 1s	remaining: 39s
25:	total: 1.03s	remaining: 38.6s
26:	total: 1.06s	remaining: 38.1s
27:	total: 1.08s	remaining: 37.7s
28:	total: 1.11s	remaining: 37.3s
29:	total: 1.15s	remaining: 

  y_train, y_test = y[train_index], y[test_index]
Parameters: { "njobs" } are not used.



0:	total: 46.4ms	remaining: 46.3s
1:	total: 78.1ms	remaining: 39s
2:	total: 113ms	remaining: 37.7s
3:	total: 162ms	remaining: 40.2s
4:	total: 189ms	remaining: 37.6s
5:	total: 236ms	remaining: 39s
6:	total: 261ms	remaining: 37s
7:	total: 287ms	remaining: 35.5s
8:	total: 312ms	remaining: 34.3s
9:	total: 337ms	remaining: 33.4s
10:	total: 365ms	remaining: 32.8s
11:	total: 392ms	remaining: 32.3s
12:	total: 428ms	remaining: 32.5s
13:	total: 472ms	remaining: 33.3s
14:	total: 517ms	remaining: 33.9s
15:	total: 568ms	remaining: 35s
16:	total: 619ms	remaining: 35.8s
17:	total: 671ms	remaining: 36.6s
18:	total: 716ms	remaining: 37s
19:	total: 746ms	remaining: 36.5s
20:	total: 773ms	remaining: 36s
21:	total: 800ms	remaining: 35.6s
22:	total: 826ms	remaining: 35.1s
23:	total: 853ms	remaining: 34.7s
24:	total: 879ms	remaining: 34.3s
25:	total: 907ms	remaining: 34s
26:	total: 950ms	remaining: 34.2s
27:	total: 996ms	remaining: 34.6s
28:	total: 1.03s	remaining: 34.6s
29:	total: 1.07s	remaining: 34.7s
30

  y_train, y_test = y[train_index], y[test_index]
Parameters: { "njobs" } are not used.



0:	total: 30.9ms	remaining: 30.9s
1:	total: 58.8ms	remaining: 29.3s
2:	total: 82.8ms	remaining: 27.5s
3:	total: 107ms	remaining: 26.6s
4:	total: 131ms	remaining: 26s
5:	total: 155ms	remaining: 25.6s
6:	total: 179ms	remaining: 25.4s
7:	total: 204ms	remaining: 25.2s
8:	total: 227ms	remaining: 25s
9:	total: 252ms	remaining: 25s
10:	total: 281ms	remaining: 25.2s
11:	total: 307ms	remaining: 25.2s
12:	total: 332ms	remaining: 25.2s
13:	total: 356ms	remaining: 25.1s
14:	total: 381ms	remaining: 25s
15:	total: 406ms	remaining: 25s
16:	total: 436ms	remaining: 25.2s
17:	total: 463ms	remaining: 25.3s
18:	total: 488ms	remaining: 25.2s
19:	total: 517ms	remaining: 25.3s
20:	total: 545ms	remaining: 25.4s
21:	total: 571ms	remaining: 25.4s
22:	total: 599ms	remaining: 25.4s
23:	total: 627ms	remaining: 25.5s
24:	total: 663ms	remaining: 25.9s
25:	total: 690ms	remaining: 25.9s
26:	total: 718ms	remaining: 25.9s
27:	total: 747ms	remaining: 25.9s
28:	total: 775ms	remaining: 26s
29:	total: 803ms	remaining: 26s
3

In [49]:
new_predictions = predictions_dct['catboost_preds'] * 0.8 + predictions_dct['xgboost_preds'] * 0.2
# new_predictions = predictions_dct['stack_preds']
roc_auc_score(target, new_predictions)

0.8772786255125564

In [None]:
# import lightgbm as lgb
# import numpy as np
# 
# def cv_score_lgbm(params, train, y_true):
#     # Prepare dataset
#     lgb_data = lgb.Dataset(train, label=y_true)
# 
#     # Perform cross-validation
#     cv_res = lgb.cv(params, lgb_data, num_boost_round=500, nfold=5,
#                     stratified=True, 
#                     metrics='auc', seed=0)
# 
#     # Best number of iterations and corresponding score
#     best_nrounds = np.argmax(cv_res['auc-mean'])  # The round with the best mean AUC
#     best_score = cv_res['auc-mean'][best_nrounds]
#     std_dev = cv_res['auc-stdv'][best_nrounds]
#     print(f'Cross-validation, ROC AUC: {best_score:.3f}+-{std_dev:.3f}, Trees: {best_nrounds}')
# 
# # Example usage
# lgbm_params = {
#     'objective': 'binary',
#     'metric': 'auc',
#     # Other parameters: learning_rate, num_leaves, etc.
# }
# 
# cv_score_lgbm(lgbm_params, data_train, target)