In [1]:
import os, sys
from typing import Any, List, Tuple
# データ処理
import numpy as np
import pandas as pd
# お絵かき
import matplotlib.pyplot as plt
import seaborn as sns
# モデル
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# 評価値
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, accuracy_score
# 交差検証
from sklearn.model_selection import KFold, StratifiedKFold
# ハイパーパラメターチューニング
import optuna
# ガベージコレクタ
import gc
# 正規表現
import re
# データ形成
from data_shaping import *

In [2]:
# Load the prepared train/test frames and drop every character outside
# [A-Za-z0-9_] from their column names (presumably so the names are accepted
# as LightGBM feature names -- TODO confirm).
train, test = (
    frame.rename(columns=lambda name: re.sub('[^A-Za-z0-9_]+', '', name))
    for frame in preprocess().get_train_test_data()
)

Data Shaping - done in 5s


In [6]:
def kfold_lightgbm(train_df, num_folds, stratified = False, debug= False):
    """Cross-validate a LightGBM classifier and collect feature importances.

    For each fold an ``LGBMClassifier`` is fitted on the training split, with
    both the training and validation splits passed as evaluation sets
    (metric ``'auc'``, early stopping after 200 rounds). The validation-split
    probabilities of the positive class are stored as out-of-fold predictions,
    and the per-fold and overall ROC AUC are printed.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Labelled data. Must contain a ``TARGET`` column; every column other
        than ``TARGET`` and ``SK_ID_CURR`` is used as a feature.
    num_folds : int
        Number of cross-validation folds.
    stratified : bool, default False
        If True use ``StratifiedKFold`` (stratified on ``TARGET``), otherwise
        plain ``KFold``. Both shuffle with ``random_state=1001``.
    debug : bool, default False
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        One row per (feature, fold) with columns ``feature``, ``importance``
        and ``fold`` (1-based fold number).
    """
    # Cross-validation splitter
    splitter_cls = StratifiedKFold if stratified else KFold
    folds = splitter_cls(n_splits= num_folds, shuffle=True, random_state=1001)

    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR']]
    # Slice the feature matrix and the label once instead of re-materialising
    # train_df[feats] several times in every fold.
    features = train_df[feats]
    target = train_df['TARGET']

    # Accumulators: out-of-fold predictions and per-fold importance frames
    oof_preds = np.zeros(train_df.shape[0])
    fold_importances = []

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(features, target)):
        train_x, train_y = features.iloc[train_idx], target.iloc[train_idx]
        valid_x, valid_y = features.iloc[valid_idx], target.iloc[valid_idx]
        # LightGBM parameters (still to be tuned)
        clf = LGBMClassifier(
            num_thread=8,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.95,
            subsample=0.9,
            max_depth=10,
            reg_alpha=0.04,
            reg_lambda=0.07,
            min_split_gain=0.02,
            min_child_weight=39,
            silent=-1,
            verbose=-1, )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric= 'auc', verbose= 200, early_stopping_rounds= 200)

        # Probability of the positive class at the best iteration found
        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]

        # Collect the fold's importances; they are concatenated once after the
        # loop rather than growing a DataFrame on every iteration.
        fold_importances.append(pd.DataFrame({
            "feature": feats,
            "importance": clf.feature_importances_,
            "fold": n_fold + 1,
        }))
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        # Release the per-fold objects before the next fold is built
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(target, oof_preds))
    return pd.concat(fold_importances, axis=0)

In [7]:
# Stack train and test into one frame for cross-validation.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the equivalent call.
# NOTE(review): this assumes both frames carry a TARGET column -- confirm.
df = pd.concat([train, test])
feat_importance = kfold_lightgbm(df, num_folds= 10, stratified= True, debug= False)

Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.777926	training's binary_logloss: 0.241078	valid_1's auc: 0.75401	valid_1's binary_logloss: 0.247954
[400]	training's auc: 0.797363	training's binary_logloss: 0.23433	valid_1's auc: 0.760958	valid_1's binary_logloss: 0.245887
[600]	training's auc: 0.811399	training's binary_logloss: 0.229599	valid_1's auc: 0.762668	valid_1's binary_logloss: 0.245382
[800]	training's auc: 0.823289	training's binary_logloss: 0.225405	valid_1's auc: 0.763666	valid_1's binary_logloss: 0.245136
[1000]	training's auc: 0.833782	training's binary_logloss: 0.221624	valid_1's auc: 0.764164	valid_1's binary_logloss: 0.245054
[1200]	training's auc: 0.843145	training's binary_logloss: 0.218163	valid_1's auc: 0.763773	valid_1's binary_logloss: 0.245131
Early stopping, best iteration is:
[1014]	training's auc: 0.834463	training's binary_logloss: 0.221367	valid_1's auc: 0.764224	valid_1's binary_logloss: 0.245039
Fold  1 AUC : 0.7642

In [8]:
# Importance ranking: average each feature over the folds and show the 30
# highest (a larger importance value means a more important feature).
(
    feat_importance
    .groupby("feature")
    .mean()
    .sort_values("importance", ascending=False)
    .head(30)
)

Unnamed: 0_level_0,importance,fold
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
PAYMENT_RATE,2688.8,5.5
EXT_SOURCE_3,1995.7,5.5
EXT_SOURCE_1,1664.6,5.5
EXT_SOURCE_2,1490.9,5.5
DAYS_BIRTH,1412.6,5.5
DAYS_ID_PUBLISH,1061.6,5.5
DAYS_EMPLOYED,995.8,5.5
DAYS_EMPLOYED_PERC,967.4,5.5
DAYS_LAST_PHONE_CHANGE,962.0,5.5
AMT_ANNUITY,889.5,5.5
