In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter('ignore')


dir_path = "/Users/yuki.tatsuoka/Downloads/home-credit-default-risk (1)/"

app_train = pd.read_csv(dir_path + "application_train.csv")
app_test = pd.read_csv(dir_path + "application_test.csv")

pd.set_option("display.max_rows", 300)
np.set_printoptions(threshold=30)

In [2]:
app_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
app_test.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,


In [4]:
# クロスバリデーションまでいきたいのでEDA割愛
# train,testデータのカラムが異なるので整える
y = app_train["TARGET"]

# TARGETを削除
app_train = app_train.drop("TARGET", axis=1)

In [5]:
# train,testデータを統合する
df = pd.concat([app_train, app_test], axis=0)
df.shape

(356255, 121)

In [6]:
# 数値データと、object型のデータの欠損値を確認する
missing = df.isnull().sum()
missing = missing[missing > 0]
missing

AMT_ANNUITY                         36
AMT_GOODS_PRICE                    278
NAME_TYPE_SUITE                   2203
OWN_CAR_AGE                     235241
OCCUPATION_TYPE                 111996
CNT_FAM_MEMBERS                      2
EXT_SOURCE_1                    193910
EXT_SOURCE_2                       668
EXT_SOURCE_3                     69633
APARTMENTS_AVG                  179948
BASEMENTAREA_AVG                207584
YEARS_BEGINEXPLUATATION_AVG     172863
YEARS_BUILD_AVG                 236306
COMMONAREA_AVG                  248360
ELEVATORS_AVG                   189080
ENTRANCES_AVG                   178407
FLOORSMAX_AVG                   176341
FLOORSMIN_AVG                   241108
LANDAREA_AVG                    210844
LIVINGAPARTMENTS_AVG            242979
LIVINGAREA_AVG                  177902
NONLIVINGAPARTMENTS_AVG         246861
NONLIVINGAREA_AVG               195766
APARTMENTS_MODE                 179948
BASEMENTAREA_MODE               207584
YEARS_BEGINEXPLUATATION_M

In [7]:
# 数値型とオブジェクトを分断する
quantitve = [x for x in df.columns if df.dtypes[x] != object]
objective = [x for x in df.columns if df.dtypes[x] == object]

# 数値型には平均値、object型には最頻値
for i in quantitve:
    df[i].fillna(0, inplace=True)

# modeのみSeriesとDataFrameでは挙動が異なるので, ilocで指定してndarray型に変換する
for i in objective:
    df[i].fillna('NA', inplace=True)

In [8]:
missing.isnull().sum().sum()

0

In [9]:
# get_dummiesでobject型をワンホットで埋める
df = pd.get_dummies(df)
df.shape

(356255, 251)

In [10]:
# trainデータとテストデータに分割する
new_app_train = df.iloc[:app_train.shape[0], :]
new_app_test = df.iloc[app_train.shape[0]:, :]

# 
print("結合前のサイズ{}, {}".format(app_train.shape, app_test.shape))
print("結合後のサイズ{}, {}".format(new_app_train.shape, new_app_test.shape))

結合前のサイズ(307511, 121), (48744, 121)
結合後のサイズ(307511, 251), (48744, 251)


# 【問題1】クロスバリデーション
事前学習期間では検証データをはじめに分割しておき、それに対して指標値を計算することで検証を行っていました。（ホールドアウト法）しかし、分割の仕方により精度は変化します。実践的には クロスバリデーション（交差検証） を行います。分割を複数回行い、それぞれに対して学習と検証を行う方法です。複数回の分割のためにscikit-learnにはKFoldクラスが用意されています。


事前学習期間の課題で作成したベースラインモデルに対してKFoldクラスによるクロスバリデーションを行うコードを作成し実行してください。



In [11]:
# クロスバリデーションを使う
# kfoldを使ってみる
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, train_test_split

kf = KFold(n_splits=5, shuffle=True)

for train_index, test_index in kf.split(new_app_train, y):
    print(train_index)
    print(test_index)
    print(train_index.shape)
    print(test_index.shape)
    print("-"*50)

[     4      5      6 ... 307507 307508 307509]
[     0      1      2 ... 307490 307506 307510]
(246008,)
(61503,)
--------------------------------------------------
[     0      1      2 ... 307508 307509 307510]
[     5      6     11 ... 307494 307497 307505]
(246009,)
(61502,)
--------------------------------------------------
[     0      1      2 ... 307508 307509 307510]
[     9     10     18 ... 307488 307498 307500]
(246009,)
(61502,)
--------------------------------------------------
[     0      1      2 ... 307505 307506 307510]
[     7     21     22 ... 307507 307508 307509]
(246009,)
(61502,)
--------------------------------------------------
[     0      1      2 ... 307508 307509 307510]
[     4      8     19 ... 307495 307496 307502]
(246009,)
(61502,)
--------------------------------------------------


In [12]:
# train,testデータに分ける
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

for train_index, test_index in kf.split(new_app_train, y):
    X_train1, X_test1 = new_app_train.iloc[train_index], new_app_train.iloc[test_index]
    y_train1, y_test1 = y.iloc[train_index], y.iloc[test_index]

In [51]:
# stratifiedKFoldも実施してみる
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=22)

model_LR = LogisticRegression()

for train_index, test_index in skf.split(new_app_train, y):
    X_train2 = new_app_train.iloc[train_index, :]
    X_test2 = new_app_train.iloc[test_index, :]
    y_train2 = y.iloc[train_index]
    y_test2 = y.iloc[test_index]
    
    # 学習
    model_LR.fit(X_train2, y_train2)
    pred = model_LR.predict_proba(X_test2)[:, 1]
    print(roc_auc_score(y_test2,pred))

0.6211080475382352
0.6120328983203771
0.620333952473957


In [14]:
'''
X_train = {}
X_test = {}

for i, (train_index, test_index) in enumerate(skf.split(new_app_train, y)):
    X_train[i] = new_app_train.loc[train_index]
    X_test[i] = new_app_train.iloc[test_index]
'''


'\nX_train = {}\nX_test = {}\n\nfor i, (train_index, test_index) in enumerate(skf.split(new_app_train, y)):\n    X_train[i] = new_app_train.loc[train_index]\n    X_test[i] = new_app_train.iloc[test_index]\n'

In [15]:
# cross_val_scoreでもスコアリング 
scores = cross_val_score(model_LR, X=new_app_train, y=y, cv=5, scoring='roc_auc')

print("分割した各スコア{}".format(scores))
scores[0]

分割した各スコア[0.62171145 0.61912683 0.62096088 0.61749134 0.62347683]


0.6217114516675628

In [16]:
from sklearn import metrics

metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [17]:
# 平均化 
score_mean = scores.mean()
score_mean

0.6205534681377224

#  【問題2】グリッドサーチ
これまで分類器のパラメータには触れず、デフォルトの設定を使用していました。パラメータの詳細は今後のSprintで学んでいくことになります。機械学習の前提として、パラメータは状況に応じて最適なものを選ぶ必要があります。最適なパラメータを探していくことを パラメータチューニング と呼びます。パラメータチューニングをある程度自動化する単純な方法としては グリッドサーチ があります。


scikit-learnのGridSearchCVを使い、グリッドサーチを行うコードを作成してください。そして、ベースラインモデルに対して何らかしらのパラメータチューニングを行なってください。どのパラメータをチューニングするかは、使用した手法の公式ドキュメントを参考にしてください。


sklearn.model_selection.GridSearchCV — scikit-learn 0.21.3 documentation


GridSearchCVクラスには引数としてモデル、探索範囲、さらにクロスバリデーションを何分割で行うかを与えます。クロスバリデーションの機能も含まれているため、これを使用する場合はKFoldクラスを利用する必要はありません


In [18]:
# ロジスティック回帰
from sklearn.model_selection import GridSearchCV
model_LR = LogisticRegression()

params = {"C":[0.01, 0.1, 1.0, 5, 10], "penalty":["l1", "l2"]}

grid = GridSearchCV(model_LR, params,  scoring='roc_auc')
grid.fit(X_train2, y_train2)

print('best parameter{}'.format(grid.best_params_))
print('best parameter{}'.format(grid.best_score_))

best parameter{'C': 1.0, 'penalty': 'l2'}
best parameter0.6178975899180583


In [53]:
model_LR = LogisticRegression(penalty='l2', C=10)

for train_index, test_index in skf.split(new_app_train, y):
    X_train3 = new_app_train.iloc[train_index]
    X_test3 = new_app_train.iloc[test_index]
    y_train3 = y.iloc[train_index]
    y_test3 = y.iloc[test_index]
    
    # 学習
    model_LR.fit(X_train3, y_train3)
    pred = model_LR.predict_proba(X_test3)[:, 1]
    print(roc_auc_score(y_test3, pred))
    
# ROC曲線 結果
# グリッドサーチ前:
0.6211080475382352
0.6120328983203771
0.620333952473957

# グリッドサーチ後:
0.621355624638684
0.613304807767437
0.6197293470988348

'''
考察
最後以外は微妙なれどグリッドサーチによって改善されている
'''

0.621355624638684
0.613304807767437
0.6197293470988348


0.4596345401508278

In [21]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

# JSONエラーが出たので追加
# https://www.kaggle.com/c/data-science-bowl-2019/discussion/122021
X_train3.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_train3.columns]
X_test3.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_test3.columns]

# モデル選定 
model = lgb.LGBMClassifier()

# 学習データの用意
train = lgb.Dataset(X_train3, y_train3) 
test = lgb.Dataset(X_test3, y_test3, reference=train)

params = {'learning_rate': [0.01, 0.05, 0.1, 1], 'n_estimators': [100,200,500]}
grid = GridSearchCV(model, params, cv=5)
grid.fit(X_train3, y_train3)

# 評価
print(grid.best_params_)
print(grid.best_score_)

# gridserchの評価
# ロジスティック回帰：0.9195598535620721
# lightgbm：             0.9192549866669385


{'learning_rate': 0.01, 'n_estimators': 500}
0.9194616783906214


In [22]:
print(X_train3.shape, X_test3.shape)
print(y_train3.shape, y_test3.shape)

(205008, 251) (102503, 251)
(205008,) (102503,)


In [25]:
# パラーメータの調整
params_algo = {
                'task': 'train',
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric':'auc',
                'learning_rate':0.01,
                'n_estimators':500,
            }

# 学習と推定 
model = lgb.train(params_algo, train, valid_sets=test)

[LightGBM] [Info] Number of positive: 16550, number of negative: 188458
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11734
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 240
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432489
[LightGBM] [Info] Start training from score -2.432489
[1]	valid_0's auc: 0.705878
[2]	valid_0's auc: 0.708714
[3]	valid_0's auc: 0.71015
[4]	valid_0's auc: 0.711542
[5]	valid_0's auc: 0.712361
[6]	valid_0's auc: 0.712901
[7]	valid_0's auc: 0.712995
[8]	valid_0's auc: 0.713271
[9]	valid_0's auc: 0.713314
[10]	valid_0's auc: 0.714633
[11]	valid_0's auc: 0.714808
[12]	valid_0's auc: 0.715216
[13]	valid_0's auc: 0.716489
[14]	valid_0's auc: 0.716874
[15]	valid_0's auc: 0.717212
[16]	valid_0's auc: 0.717526
[17]	valid_0's auc: 0.717781
[18]	valid_0's auc: 0.718328
[19]	valid_0's auc: 0.719031
[

[262]	valid_0's auc: 0.749677
[263]	valid_0's auc: 0.749756
[264]	valid_0's auc: 0.749848
[265]	valid_0's auc: 0.749877
[266]	valid_0's auc: 0.749966
[267]	valid_0's auc: 0.75
[268]	valid_0's auc: 0.750032
[269]	valid_0's auc: 0.750088
[270]	valid_0's auc: 0.750178
[271]	valid_0's auc: 0.750233
[272]	valid_0's auc: 0.750316
[273]	valid_0's auc: 0.750366
[274]	valid_0's auc: 0.750391
[275]	valid_0's auc: 0.750468
[276]	valid_0's auc: 0.750549
[277]	valid_0's auc: 0.750593
[278]	valid_0's auc: 0.750638
[279]	valid_0's auc: 0.750715
[280]	valid_0's auc: 0.750769
[281]	valid_0's auc: 0.750808
[282]	valid_0's auc: 0.750832
[283]	valid_0's auc: 0.75085
[284]	valid_0's auc: 0.750921
[285]	valid_0's auc: 0.750991
[286]	valid_0's auc: 0.751066
[287]	valid_0's auc: 0.751109
[288]	valid_0's auc: 0.751149
[289]	valid_0's auc: 0.751243
[290]	valid_0's auc: 0.751261
[291]	valid_0's auc: 0.751327
[292]	valid_0's auc: 0.751353
[293]	valid_0's auc: 0.751428
[294]	valid_0's auc: 0.751535
[295]	valid_0's

# 【問題3】Kaggle Notebooksからの調査
KaggleのNotebooksから様々なアイデアを見つけ出して、列挙してください。





### クロスバリデーション種類
•k分割交差検定<br>
•層化k分割交差検定<br>
•ホールドアウトベースの検証<br>
•リーブワンアウト交差検定→データの中から1つのみ抽出<br>
•グループk分割交差検定→グループに選びたいものだけ抽出する？<br>

 ### for使い繰り返し処理でクロスバリデーションを行なっている
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['outliers'])):
        train_x,  = train_df[feats].iloc[train_idx], 
        train_y = train_df['target'].iloc[train_idx]
        valid_x = train_df[feats].iloc[valid_idx]
        valid_y = train_df['target'].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x,label=train_y,free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x,label=valid_y,free_raw_data=False)

        # params optimized by optuna
        params ={
                        'task': 'train',
                        'boosting': 'goss',
                        'objective': 'regression',
                        'metric': 'rmse',
                        'learning_rate': 0.005,
                        'subsample': 0.9855232997390695,
                        'max_depth': 8,
                        'top_rate': 0.9064148448434349,
                        'num_leaves': 87,
                        'min_child_weight': 41.9612869171337,
                        'other_rate': 0.0721768246018207,
                        'reg_alpha': 9.677537745007898,
                        'colsample_bytree': 0.5665320670155495,
                        'min_split_gain': 9.820197773625843,
                        'reg_lambda': 8.2532317400459,
                        'min_data_in_leaf': 21,
                        'verbose': -1,
                        'seed':int(2**n_fold),
                        'bagging_seed':int(2**n_fold),
                        'drop_seed':int(2**n_fold)
                        }

        reg = lgb.train(
                        params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds= 200,
                        verbose_eval=100
                        )

        oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
        sub_preds += reg.predict(test_df[feats], num_iteration=reg.best_iteration) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

### グリッドサーチ
%%time
parameters = {
    'max_depth': [3, 5, 7, 9], 
    'n_estimators': [5, 10, 15, 20, 25, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1]
}

model_xgb = xgb.XGBClassifier(
    random_state=SEED,
)

model_xgb = GridSearchCV(
    model_xgb, 
    parameters, 
    cv=5,
    scoring='accuracy',
)

model_xgb.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_xgb.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' + 
    f'{model_xgb.best_score_:.3f}'
)
cross_valid_scores['xgboost'] = model_xgb.best_score_
print('-----')
-----
Best parameters {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 50}
Mean cross-validated accuracy score of the best_estimator: 0.848
-----
CPU times: user 16.1 s, sys: 217 ms, total: 16.3 s
Wall time: 16.3 s

# 【問題4】高い汎化性能のモデル作成
問題3で見つけたアイデアと、独自のアイデアを組み合わせ高い汎化性能のモデル作りを進めてください。


その過程として、何を行うことで、クロスバリデーションの結果がどの程度変化したかを表にまとめてください。


In [106]:
# XGBMでクロスバリデーションなしのスコアを確認する
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold


X_train, X_test, y_train, y_test = train_test_split(new_app_train, y, random_state=22)

model = xgb.XGBClassifier()

score_list1 = []

#train、testデータ作成〜予測
model = xgb.XGBClassifier(silent=1)
model.fit(X_train, y_train)

pred = model.predict_proba(X_test)[:, 1]
scores = roc_auc_score(y_test, pred)

score_list1.append(scores)
print(scores)
    

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


0.7447646898536284


In [132]:
X_train.shape

(230633, 251)

In [107]:
# lightGBMをfor文で学習の予定だったが、エラー修正が不可だったので
# ひとまずXGBoostに変更する
# 今回はそのままクロスバリデーションで学習する
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold


skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=22)
score_list2 = []

for i, (train_index, test_index) in enumerate(skf.split(new_app_train, y)):
    X_train4 = new_app_train.loc[train_index]
    y_train4 = y.iloc[train_index]
    X_test4 = new_app_train.iloc[test_index]
    y_test4 = y.iloc[test_index]
    
    #train、testデータ作成〜予測
    model = xgb.XGBClassifier(silent=1)
    model.fit(X_train4, y_train4)
    pred = model.predict_proba(X_test4)[:, 1]
    scores = roc_auc_score( y_test4, pred)
    score_list2.append(scores)
    print(scores)


'''
lightgbmの処理
    
    train = lgb.Dataset(X_train4, label=y_train4, free_raw_data=False)
    test = lgb.Dataset(y_train4, label=y_test4, reference=train, free_raw_data=False)
    
    params = {'task': 'train',
             'boosting_type': 'gbdt',
              'objective': 'binary',
              'metric':'auc',
              'learning_rate':0.05,
              'n_estimators':100,
    }
    
    model = lgb.train(params, train, 
                      valid_sets=test, 
                      num_boost_round=5000, 
                      early_stopping_rounds= 200,
                      verbose_eval=100)

    model.predict(X_test4,num_iteration=model.best_iteration)
    
 '''

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


0.7474700365975426
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


0.7423615342952354
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


0.7514631708370274


"\nlightgbmの処理\n    \n    train = lgb.Dataset(X_train4, label=y_train4, free_raw_data=False)\n    test = lgb.Dataset(y_train4, label=y_test4, reference=train, free_raw_data=False)\n    \n    params = {'task': 'train',\n             'boosting_type': 'gbdt',\n              'objective': 'binary',\n              'metric':'auc',\n              'learning_rate':0.05,\n              'n_estimators':100,\n    }\n    \n    model = lgb.train(params, train, \n                      valid_sets=test, \n                      num_boost_round=5000, \n                      early_stopping_rounds= 200,\n                      verbose_eval=100)\n\n    model.predict(X_test4,num_iteration=model.best_iteration)\n    \n "

In [59]:
# XGBMでグリッドサーチを行う
# params = {'eta':[0.1,0.5], 'gamma':[0.01, 0.001], 'max_depth':[9, 12]}

# grid = GridSearchCV(model, params, verbose=1)
# grid.fit(X_train4, y_train4)


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.






[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed: 63.2min finished




GridSearchCV(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, eta=0.5, gamma=0.01,
                                     gpu_id=-1, importance_type='gain',
                                     interaction_constraints='',
                                     learning_rate=0.5, max_delta_step=0,
                                     max_depth=6, min_child_weight=1,
                                     missing=nan, monotone_constraints='()',
                                     n_estimators=100, n_jobs=8,
                                     num_parallel_tree=1, random_state=0,
                                     reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, subsample=1,
                                     tree_method='exact', validate_parameters=1,
                                     verbosity=None),
       

In [60]:
# グリッドサーチの指標
grid.best_params_

{'eta': 0.1, 'gamma': 0.001, 'max_depth': 12}

In [61]:
# ベストスコア
grid.best_score_

0.897028413629427

In [108]:
# グリッドサーチで評価した指標を投入する
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=22)

score_list3 = []

for i, (train_index, test_index) in enumerate(skf.split(new_app_train, y)):
    X_train4 = new_app_train.loc[train_index]
    y_train4 = y.iloc[train_index]
    X_test4 = new_app_train.iloc[test_index]
    y_test4 = y.iloc[test_index]
    
    #Jsonエラー修正
    X_train4.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_train4.columns]
    X_test4.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_test4.columns]

    #train、testデータ作成〜予測
    model = xgb.XGBClassifier(eta=0.1, gamma=0.001, max_depth=12)
    model.fit(X_train4, y_train4)
    pred = model.predict_proba(X_test4)[:, 1]
    scores = roc_auc_score( y_test4, pred)
    score_list3.append(scores)
    print(scores)



0.7462778968213293
0.7398851605295693
0.7480723300570565


In [122]:
# クロスバリデーションなし　グリッドサーチなし 
# クロスバリデーションあり　グリッドサーチなし
# クロスバリデーションあり　グリッドサーチあり　にて指標をまとめてみる

score_list1 = np.array(score_list1)
score_list2 = np.array(score_list2).reshape(-1,1)
score_list3 = np.array(score_list3).reshape(-1,1)

# 配列が正しくないので整理
zero = np.zeros(2)
score_list1_merge = np.concatenate([score_list1, zero], axis=0)

In [124]:
# 配列を２次元に変更する
score_list1_merge = score_list1_merge.reshape(-1,1)

In [128]:
# 整えてマージする
total_score =np.concatenate([score_list1_merge, score_list2, score_list3], axis=1)
total_score

array([[0.74476469, 0.74747004, 0.7462779 ],
       [0.        , 0.74236153, 0.73988516],
       [0.        , 0.75146317, 0.74807233]])

In [130]:
 # カラム名で、左から順に
# クロスバリデーションなし　グリッドサーチなし 
# クロスバリデーションあり　グリッドサーチなし
# クロスバリデーションあり　グリッドサーチあり
total_score_pd = pd.DataFrame(total_score, columns = ["cross, grid None", "cross only", "cross + grid"])
total_score_pd

Unnamed: 0,"cross, grid None",cross only,cross + grid
0,0.744765,0.74747,0.746278
1,0.0,0.742362,0.739885
2,0.0,0.751463,0.748072


In [None]:
'''
考察
左と真ん中で、クロスバリデーションだけで微妙だけど上昇したことがわかる
真ん中と右で、グリッドサーチで実施した内容は全部スコアが低いので、再度パラメータのチューニングを実施した方が良い状況
Γで少しスコアが悪くなっている可能性がある。
XGBoostでは再度検証をすべきだが、検証時間の兼ね合いでここでは割愛する。

'''

# 【問題5】最終的なモデルの選定
最終的にこれは良いというモデルを選び、推定した結果をKaggleに提出してスコアを確認してください。どういったアイデアを取り入れ、どの程度のスコアになったかを記載してください。



In [178]:
# クロスバリデーションのpredを提出する

pred = model.predict_proba(new_app_test)[:, 1]
submission_id = new_app_test['SK_ID_CURR']

submission = pd.DataFrame(pred, columns=['TARGET']).reset_index(drop=True)
submission = pd.concat([submission_id, submission], axis=1).reset_index(drop=True)
submission.to_csv(dir_path + "submission_gbm.csv", index=False)

In [None]:
# 検証結果
# 前：0.74238
# 今：0.72855
'''
以前はlightGBMを利用し提出を行った
この際は、通常のホールアウト法を利用してグリッドサーチは利用せず出力されたスコア。
今回は、グリッドサーチの結果があまり良くなかったため、XGBoostが好ましくないとは言い切れないが
lightGBMもグリッドサーチを行っていないので、lightGBMに軍ぱいがあると言える。
とはいえlightGBMだけ実施すれば良い物でもないのでアルゴリズムを求める場合両方追求する必要性がある。
'''