In [1]:
import numpy as np
import pandas as pd
import os
import pickle
import gc

import ydata_profiling as pdp

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

import lightgbm as lgb

import optuna

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
ap_train = pd.read_csv('application_train.csv')
print(ap_train.shape)
ap_train.head()

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
def reduce_mem_func(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)

            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            pass

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimizatitoin is {:.2f}MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100*(start_mem - end_mem)/start_mem))

    return df

In [4]:
ap_train = reduce_mem_func(ap_train)

Memory usage of dataframe is 286.23 MB
Memory usage after optimizatitoin is 92.38MB
Decreased by 67.7%


In [5]:
#異常値の修正
ap_train['DAYS_EMPLOYED'] = ap_train['DAYS_EMPLOYED'].replace(365243,np.nan)

In [6]:
#特徴量生成

#世帯人数あたりの総所得
ap_train['income_div_person'] = ap_train['AMT_INCOME_TOTAL']/ap_train['CNT_FAM_MEMBERS']
#外部スコアの平均値
ap_train['EXIT_SCORE_MEAN'] = ap_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
ap_train['EXIT_SCORE_STD'] = ap_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
#就労期間を年齢で割る
ap_train['DAYS_EMPLOYED_DIV_BIRTH'] = ap_train['DAYS_EMPLOYED']/ap_train['DAYS_BIRTH']
#年金支払い額を所得で割る
ap_train['AMNNNU_div_IINCOME'] = ap_train['AMT_ANNUITY']/ap_train['AMT_INCOME_TOTAL']
#年金支払い学を借入で割る
ap_train['ANNUITY_div_CREDIT'] = ap_train['AMT_ANNUITY']/ap_train['AMT_CREDIT']

In [7]:
pos = pd.read_csv('POS_CASH_balance.csv')
pos = reduce_mem_func(pos)
pos_ohe = pd.get_dummies(pos, columns=['NAME_CONTRACT_STATUS'],dummy_na=True)
pos_ohe_agg = pos_ohe.groupby('SK_ID_CURR').agg(
    {
        #数値の集約
        'MONTHS_BALANCE' : ['mean', 'std', 'min', 'max'],
        'CNT_INSTALMENT' : ['mean', 'std', 'min', 'max'],
        'CNT_INSTALMENT_FUTURE' : ['mean', 'std', 'min', 'max'],
        'SK_DPD' : ['mean', 'std', 'min', 'max'],
        'SK_DPD_DEF' : ['mean', 'std', 'min', 'max'],
        #カテゴリ変数をoheした値の集約
        'NAME_CONTRACT_STATUS_Active' : ['mean'],
        'NAME_CONTRACT_STATUS_Amortized debt' : ['mean'],
        'NAME_CONTRACT_STATUS_Approved' : ['mean'],
        'NAME_CONTRACT_STATUS_Canceled' : ['mean'],
        'NAME_CONTRACT_STATUS_Completed' : ['mean'],
        'NAME_CONTRACT_STATUS_Demand' : ['mean'],
        'NAME_CONTRACT_STATUS_Returned to the store' : ['mean'],
        'NAME_CONTRACT_STATUS_Signed' : ['mean'],
        'NAME_CONTRACT_STATUS_XNA' : ['mean'],
        'NAME_CONTRACT_STATUS_nan' : ['mean'],
        'SK_ID_PREV' : ['count','nunique'],
    }
)
pos_ohe_agg.columns =  [i + '_' + j for i,j in pos_ohe_agg.columns]
pos_ohe_agg = pos_ohe_agg.reset_index(drop=False)

Memory usage of dataframe is 610.43 MB
Memory usage after optimizatitoin is 238.45MB
Decreased by 60.9%


In [8]:
df_train = pd.merge(ap_train, pos_ohe_agg, on='SK_ID_CURR',how='left')
df_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,NAME_CONTRACT_STATUS_Approved_mean,NAME_CONTRACT_STATUS_Canceled_mean,NAME_CONTRACT_STATUS_Completed_mean,NAME_CONTRACT_STATUS_Demand_mean,NAME_CONTRACT_STATUS_Returned to the store_mean,NAME_CONTRACT_STATUS_Signed_mean,NAME_CONTRACT_STATUS_XNA_mean,NAME_CONTRACT_STATUS_nan_mean,SK_ID_PREV_count,SK_ID_PREV_nunique
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,28.0,3.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,4.0,1.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0.0,0.0,0.095238,0.0,0.047619,0.0,0.0,0.0,21.0,3.0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0.0,0.0,0.045455,0.0,0.0,0.015152,0.0,0.0,66.0,5.0


In [9]:
x_train = df_train.drop(columns =['TARGET','SK_ID_CURR'])
y_train = df_train[['TARGET']]
id_train = df_train[['SK_ID_CURR']]

In [10]:
for col in x_train.columns:
    if x_train[col].dtypes == 'object':
        x_train[col] = x_train[col].astype('category')

In [11]:
#ハイパーパラメータ
params_base = {
    'boosting_type' : 'gbdt',
    'objective' : 'binary',
    'metric' : 'auc',
    'learning_rate' : 0.2,
    'num_leaves' : 16,
    'n_estimators' : 100000,
    'random_state' : 123,
    'importance_type' : 'gain',
    'bagging_freq' : 1,
    'seed' : 123,
}

In [40]:
cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train, y_train))
nfold = 0
idx_tr, idx_va = cv[nfold][0],cv[nfold][1]
x_tr, y_tr, id_tr = x_train.loc[idx_tr,:], y_train.loc[idx_tr,:], id_train.loc[idx_tr,:]
x_va, y_va, id_va = x_train.loc[idx_va,:], y_train.loc[idx_va,:], id_train.loc[idx_va,:]   
print(x_tr.shape, y_tr.shape, id_tr.shape)
print(x_va.shape, y_va.shape, id_va.shape)

(246008, 120) (246008, 1) (246008, 1)
(61503, 120) (61503, 1) (61503, 1)


In [12]:
#クロスバリデーション　層化分割
#モデルはLightGBM
params_base = {
    'boosting_type' : 'gbdt',
    'objective' : 'binary',
    'metric' : 'auc',
    'learning_rate' : 0.2,
    'num_leaves' : 16,
    'n_estimators' : 100000,
    'random_state' : 123,
    'importance_type' : 'gain',
    'bagging_freq' : 1,
    'seed' : 123,
}

def objective(trial):

    params_tuning = {
        'num_leaves' : trial.suggest_int('num_leaves', 8, 256),
        'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 5, 200),
        'min_sum_hession_in_leaf' : trial.suggest_float('min_sum_hession_in_leaf', 1e-5, 1e-2, log=True),
        'feature_fraction' : trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction' : trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'lambda_l1' : trial.suggest_float('lambda_l1', 1e-2, 1e2, log=True),
        'lambda_l2' : trial.suggest_float('lambda_l2', 1e-2, 1e2, log=True),
    }
    params_tuning.update(params_base)
    
    list_metrics = []

    cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train, y_train))

    for nfold in np.arange(5):
        idx_tr, idx_va = cv[nfold][0],cv[nfold][1]
        x_tr, y_tr = x_train.loc[idx_tr,:], y_train.loc[idx_tr,:]
        x_va, y_va = x_train.loc[idx_va,:], y_train.loc[idx_va,:]   

        model = lgb.LGBMClassifier(**params_base)
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr, y_tr), (x_va, y_va)],
                  callbacks=[lgb.early_stopping(stopping_rounds=100)],
                 )
        fname_lgb = 'model_lgb_fold{}.pickle'.format(nfold)
        with open(fname_lgb, 'wb') as f:
            pickle.dump(model, f, protocol=4)

        y_va_pred = model.predict_proba(x_va)[:,1]
        metric_va = roc_auc_score(y_va, y_va_pred)
        list_metrics.append(metric_va)

    metrics = np.mean(list_metrics)

    return metrics

In [30]:
metrics, imp = objective()
print(metrics)
imp.sort_values('imp', ascending=False)[:10]

[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.124577 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17576
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 154
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[77]	training's auc: 0.80498	valid_1's auc: 0.769943
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.122841 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wi

Unnamed: 0,col,imp,imp_std
47,EXIT_SCORE_MEAN,30868.920236,282.370651
11,ANNUITY_div_CREDIT,3917.466322,430.716782
129,ORGANIZATION_TYPE,3588.149917,269.904274
51,EXT_SOURCE_3,2558.916498,232.379753
49,EXT_SOURCE_1,1268.123594,95.697013
32,DAYS_BIRTH,1254.33954,69.157924
21,CNT_INSTALMENT_FUTURE_mean,1192.744202,111.478846
103,MONTHS_BALANCE_std,1132.460783,109.619372
1,AMT_ANNUITY,1057.557646,138.395935
33,DAYS_EMPLOYED,1018.371297,221.070198


In [13]:
#テストデータにモデルを適用してみる
ap_test = pd.read_csv('application_test.csv')
ap_test = reduce_mem_func(ap_test)

Memory usage of dataframe is 45.00 MB
Memory usage after optimizatitoin is 14.60MB
Decreased by 67.6%


In [14]:
#推論用のデータセット生成
#異常値の修正
ap_test['DAYS_EMPLOYED'] = ap_test['DAYS_EMPLOYED'].replace(365243,np.nan)
#世帯人数あたりの総所得
ap_test['income_div_person'] = ap_test['AMT_INCOME_TOTAL']/ap_test['CNT_FAM_MEMBERS']
#外部スコアの平均値
ap_test['EXIT_SCORE_MEAN'] = ap_test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
ap_test['EXIT_SCORE_STD'] = ap_test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
#就労期間を年齢で割る
ap_test['DAYS_EMPLOYED_DIV_BIRTH'] = ap_test['DAYS_EMPLOYED']/ap_test['DAYS_BIRTH']
#年金支払い額を所得で割る
ap_test['AMNNNU_div_IINCOME'] = ap_test['AMT_ANNUITY']/ap_test['AMT_INCOME_TOTAL']
#年金支払い学を借入で割る
ap_test['ANNUITY_div_CREDIT'] = ap_test['AMT_ANNUITY']/ap_test['AMT_CREDIT']

In [15]:
df_test = pd.merge(ap_test,pos_ohe_agg,on='SK_ID_CURR',how='left')

In [16]:
x_test = df_test.drop(columns = ['SK_ID_CURR'])
id_test = df_test[['SK_ID_CURR']]

for col in x_test.columns:
    if x_test[col].dtypes == 'object':
        x_test[col] = x_train[col].astype('category')

In [17]:
#推論関数の定義
def predict_lgb(x_test,
                id_test,
                list_nfold=[0,1,2,3,4],
               ):
    pred = np.zeros((len(x_test), len(list_nfold)))
    for nfold in list_nfold:
        fname_lgb = 'model_lgb_fold{}.pickle'.format(nfold)
        with open(fname_lgb, 'rb') as f:
            model = pickle.load(f)
        pred[:,nfold] = model.predict_proba(x_test)[:,1]

    pred = pd.concat([
        id_test,
        pd.DataFrame({'pred':pred.mean(axis=1)}),
    ],axis=1)

    return pred 

In [24]:
test_pred = predict_lgb(x_test,id_test,list_nfold=[0,1,2,3,4],)



In [25]:
test_pred.head()

Unnamed: 0,SK_ID_CURR,pred
0,100001,0.065789
1,100005,0.042427
2,100013,0.018108
3,100028,0.05069
4,100038,0.190117


In [28]:
df_submit = test_pred.rename(columns={'pred':'TARGET'})
print(df_submit.shape)
display(df_submit.head())

df_submit.to_csv('submission_final.csv', index=None)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.065789
1,100005,0.042427
2,100013,0.018108
3,100028,0.05069
4,100038,0.190117


In [18]:
sampler = optuna.samplers.TPESampler(seed=123)
study = optuna.create_study(sampler=sampler, direction='maximize')
study.optimize(objective, n_trials=5,n_jobs=5)

[I 2024-06-14 18:44:41,887] A new study created in memory with name: no-name-2ec36384-4869-4ad4-bf2a-75bdbda8c018


[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.435707 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17576
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 154
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.305735 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17576
[LightGBM] [Info] Numbe

[I 2024-06-14 18:49:53,152] Trial 1 finished with value: 0.7684218417863979 and parameters: {'num_leaves': 177, 'min_data_in_leaf': 175, 'min_sum_hession_in_leaf': 0.0019362430798816397, 'feature_fraction': 0.5879037489958265, 'bagging_fraction': 0.7503127756038535, 'lambda_l1': 0.051464072931375435, 'lambda_l2': 13.267632100585091}. Best is trial 1 with value: 0.7684218417863979.
[I 2024-06-14 18:49:53,403] Trial 0 finished with value: 0.7684218417863979 and parameters: {'num_leaves': 46, 'min_data_in_leaf': 170, 'min_sum_hession_in_leaf': 0.0009349062100225613, 'feature_fraction': 0.582159752053318, 'bagging_fraction': 0.8965513988297739, 'lambda_l1': 92.82395621157131, 'lambda_l2': 0.03012505007998673}. Best is trial 1 with value: 0.7684218417863979.


Early stopping, best iteration is:
[60]	training's auc: 0.797107	valid_1's auc: 0.763188
Early stopping, best iteration is:
[60]	training's auc: 0.797107	valid_1's auc: 0.763188


[I 2024-06-14 18:49:54,766] Trial 2 finished with value: 0.7684218417863979 and parameters: {'num_leaves': 203, 'min_data_in_leaf': 159, 'min_sum_hession_in_leaf': 1.1415102570305824e-05, 'feature_fraction': 0.6079788804136718, 'bagging_fraction': 0.5973535793926341, 'lambda_l1': 0.03866649407962029, 'lambda_l2': 2.3310244687735175}. Best is trial 1 with value: 0.7684218417863979.
[I 2024-06-14 18:49:54,807] Trial 4 finished with value: 0.7684218417863979 and parameters: {'num_leaves': 78, 'min_data_in_leaf': 37, 'min_sum_hession_in_leaf': 0.0017220420034305271, 'feature_fraction': 0.8131832566444874, 'bagging_fraction': 0.6330932338063777, 'lambda_l1': 0.3552249621665319, 'lambda_l2': 68.39556924892919}. Best is trial 1 with value: 0.7684218417863979.


Early stopping, best iteration is:
[60]	training's auc: 0.797107	valid_1's auc: 0.763188


[I 2024-06-14 18:49:59,345] Trial 3 finished with value: 0.7684218417863979 and parameters: {'num_leaves': 209, 'min_data_in_leaf': 113, 'min_sum_hession_in_leaf': 4.13115475082411e-05, 'feature_fraction': 0.7003424629702033, 'bagging_fraction': 0.7837307925024403, 'lambda_l1': 0.010987796952107947, 'lambda_l2': 30.96124476770603}. Best is trial 1 with value: 0.7684218417863979.


In [19]:
trial = study.best_trial
print('acc(best)={:.4f}'.format(trial.value))
display(trial.params)

acc(best)=0.7684


{'num_leaves': 177,
 'min_data_in_leaf': 175,
 'min_sum_hession_in_leaf': 0.0019362430798816397,
 'feature_fraction': 0.5879037489958265,
 'bagging_fraction': 0.7503127756038535,
 'lambda_l1': 0.051464072931375435,
 'lambda_l2': 13.267632100585091}

In [20]:
#最適なハイパーパラメータの取得
params_best = trial.params
params_best.update(params_base)
display(params_best)

{'num_leaves': 16,
 'min_data_in_leaf': 175,
 'min_sum_hession_in_leaf': 0.0019362430798816397,
 'feature_fraction': 0.5879037489958265,
 'bagging_fraction': 0.7503127756038535,
 'lambda_l1': 0.051464072931375435,
 'lambda_l2': 13.267632100585091,
 'boosting_type': 'gbdt',
 'objective': 'binary',
 'metric': 'auc',
 'learning_rate': 0.2,
 'n_estimators': 100000,
 'random_state': 123,
 'importance_type': 'gain',
 'bagging_freq': 1,
 'seed': 123}

In [21]:
def train_lgb(x_train,
              y_train,
              id_train,
              params,
              list_nfold=[0,1,2,3,4],
              n_splits=5,
             ):

    
    metrics = []
    imp = pd.DataFrame()

    cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train, y_train))

    for nfold in np.arange(5):
        idx_tr, idx_va = cv[nfold][0],cv[nfold][1]
        x_tr, y_tr = x_train.loc[idx_tr,:], y_train.loc[idx_tr,:]
        x_va, y_va = x_train.loc[idx_va,:], y_train.loc[idx_va,:]   

        model = lgb.LGBMClassifier(**params)
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr, y_tr), (x_va, y_va)],
                  callbacks=[lgb.early_stopping(stopping_rounds=100)],
                 )
        fname_lgb = 'model_lgb_fold{}.pickle'.format(nfold)
        with open(fname_lgb, 'wb') as f:
            pickle.dump(model, f, protocol=4)

        y_tr_pred = model.predict_proba(x_tr)[:,1]
        metric_tr = roc_auc_score(y_tr, y_tr_pred)
        y_va_pred = model.predict_proba(x_va)[:,1]
        metric_va = roc_auc_score(y_va, y_va_pred)
        metrics.append([nfold, metric_tr, metric_va])
        imp_fold = pd.DataFrame({'col':x_train.columns, 'imp':model.feature_importances_,'nfold':nfold})
        imp = pd.concat([imp, imp_fold])

    imp = imp.groupby('col')['imp'].agg(['mean', 'std']).reset_index(drop=False)
    imp.columns = ['col','imp','imp_std']


    return metrics,imp

In [23]:
metric, imp = train_lgb(x_train,
              y_train,
              id_train,
              params=params_best,
              list_nfold=[0,1,2,3,4],
              n_splits=5,
             )

[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.104040 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17564
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 148
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[92]	training's auc: 0.800316	valid_1's auc: 0.770745
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.101059 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_w