# 【問題1】クロスバリデーション
* train_test_splitではなく、クロスバリデーションを用いる
* クロスバリデーションには、scikit-learnのKFold ライブラリを使用する  

In [1]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]])
y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1])

# 2分割
skf = StratifiedKFold(n_splits=2)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print("X_train:", X_train, "\nX_test:", X_test, "\ny_train:", y_train, "\ny_test:", y_test)

X_train: [[3 4]
 [3 4]
 [3 4]
 [3 4]
 [3 4]] 
X_test: [[1 2]
 [1 2]
 [3 4]
 [3 4]
 [3 4]] 
y_train: [0 1 1 1 1] 
y_test: [0 1 1 1 1]
X_train: [[1 2]
 [1 2]
 [3 4]
 [3 4]
 [3 4]] 
X_test: [[3 4]
 [3 4]
 [3 4]
 [3 4]
 [3 4]] 
y_train: [0 1 1 1 1] 
y_test: [0 1 1 1 1]


* Home Credit Default Risk コンペティションのデータセットでやってみる
* stratifyする

In [2]:
import pandas as pd

df_train = pd.read_csv('../input/application_train.csv')

# 説明変数
X = df_train.drop(['SK_ID_CURR', 'TARGET'], axis = 1)

# 目的変数
y = df_train['TARGET']

# 5分割
skf = StratifiedKFold(n_splits=5)

for train_index, test_index in skf.split(X, y):
    print(train_index)
    print(test_index)
    # すげー出てくるのでコメントアウト
    # X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    # y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    #print("X_train:", X_train, "\nX_test:", X_test, "\ny_train:", y_train, "\ny_test:", y_test)

[ 61465  61466  61467 ... 307508 307509 307510]
[    0     1     2 ... 61814 61816 61828]
[     0      1      2 ... 307508 307509 307510]
[ 61465  61466  61467 ... 123093 123094 123095]
[     0      1      2 ... 307508 307509 307510]
[121993 122029 122030 ... 184624 184625 184626]
[     0      1      2 ... 307508 307509 307510]
[183271 183273 183278 ... 246095 246097 246099]
[     0      1      2 ... 246095 246097 246099]
[244997 245014 245015 ... 307508 307509 307510]


# 【問題2】グリッドサーチ
* グリッドサーチをパイプラインの中に組み込む

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import pandas as pd


# 訓練データ
df_train = pd.read_csv('../input/application_train.csv')

# 説明変数(week3のeda結果より抜粋)
X = df_train.loc[:, ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
                    'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'DAYS_BIRTH']]

# 欠損を平均値で埋める
X = X.fillna(X.mean())

# 目的変数
y = df_train['TARGET']

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score

gred_params = {'C' : [0.001, 0.01, 0.1, 1, 10, 100], 'solver' : ['lbfgs']}

# cvでn_splitsを指定してクロスバリデーションされる
clf = GridSearchCV(LogisticRegression(), gred_params, cv=5, scoring='roc_auc')
clf.fit(X_train, y_train)

print("Best parameters : {}".format(clf.best_params_))
print("Best cross-validation score : {:.3f}".format(clf.best_score_))
print("LogisticRegression Best AUC:{}".format(roc_auc_score(y_test, clf.decision_function(X_test))))

Best parameters : {'C': 0.001, 'solver': 'lbfgs'}
Best cross-validation score : 0.591
LogisticRegression Best AUC:0.5900027129004467


# 脱線
- ついでにパイプラインを修正してクラス化してみたくなった

In [5]:
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# TODO:看板とかコメントちゃんとつける
class CPL():
    """
    分類汎用パイプライン

    Attributes
    ----------
    X_: df
        説明変数たち
    y_: df
        目的変数
    score_ : dict
        評価値
    """
    def __init__(self, X, y):
        self.X_ = X
        self.y_ = y
    
    
    def split(self, train_size=0.8):
        self.X_train_, self.X_test_, self.y_train_, self.y_test_ = train_test_split(self.X_, self.y_, train_size=train_size)
    
            
    def cross_validate(self, model, n_splits):
        skf = StratifiedKFold(n_splits=n_splits)
        
        auc_scores = []
        
        for train_index, test_index in skf.split(self.X_, self.y_):
            self.X_train_, self.X_test_ = self.X_.iloc[train_index], self.X_.iloc[test_index]
            self.y_train_, self.y_test_ = self.y_.iloc[train_index], self.y_.iloc[test_index]
            
            model.fit(self.X_train_, self.y_train_)
            
            self._calc_auc(model)
            
            auc_scores.append(self.auc_)
            
            
        return auc_scores
    
        
    def grid_search_cv(self, model, grid_params, cv=5, scoring='roc_auc'):
        self.clf_ = GridSearchCV(
            model,
            grid_params,
            cv=cv,
            scoring='roc_auc')
        
        self.clf_.fit(self.X_, self.y_)
        
        self.best_params_ = self.clf_.best_params_
        
        self.y_pred_ = self.clf_.predict_proba(self.X_test_)[:,1]
        
        self.best_auc_ = metrics.roc_auc_score(self.y_test_, self.y_pred_)
    
    
    def generate_pred_for_submission(self, test):
        self.y_pred_ = self.clf_.predict_proba(test)[:,1]
        
        return self.y_pred_
        
    
    def _calc_auc(self, model):
        # 予測
        self.y_pred_ = model.predict_proba(self.X_test_)[:,1]
        
        # FPR, TPR(, しきい値) を算出
        self.fpr_, self.tpr_, self.thresholds_ = metrics.roc_curve(self.y_test_, self.y_pred_)
        
        # auc算出
        self.auc_ = metrics.roc_auc_score(self.y_test_, self.y_pred_)
        
        

一旦ここまでで中止、、随時拡張していきたい  
様々なモデルでクロスバリデーションを試してみる(重たいので、複数セルで実施)

In [6]:
# 訓練データ
df_train = pd.read_csv('../input/application_train.csv')

# 説明変数(week3のeda結果より抜粋)
X = df_train.loc[:, ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
                    'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'DAYS_BIRTH']]

# 欠損を平均値で埋める
X = X.fillna(X.mean())

# 目的変数
y = df_train.loc[:, 'TARGET']


# 自作クラスインスタンス化
cpl = CPL(X, y)

In [7]:
import lightgbm as lgb

model = lgb.LGBMClassifier()


# クロスバリデーション実施
auc_scores = cpl.cross_validate(model, 5)
print(auc_scores)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


[0.729342152647506, 0.7305931623421007, 0.7242335148950484, 0.7327486098855563, 0.7338617452364475]


In [8]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()


# クロスバリデーション実施
auc_scores = cpl.cross_validate(model, 5)
print(auc_scores)

[0.5217685441587523, 0.522762040475735, 0.5274385420158417, 0.5230404151557675, 0.5240136871929852]


In [9]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs')


# クロスバリデーション実施
auc_scores = cpl.cross_validate(model, 5)
print(auc_scores)

[0.5904042151226117, 0.5899166425622833, 0.5916606367857098, 0.5900143390132754, 0.5913547938849446]


In [10]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()


# クロスバリデーション実施
auc_scores = cpl.cross_validate(model, 5)
print(auc_scores)

[0.5359610182238206, 0.5402660461317554, 0.5359451477034504, 0.5344782225245075, 0.5407367624809006]


In [11]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()


# クロスバリデーション実施
auc_scores = cpl.cross_validate(model, 5)
print(auc_scores)



[0.6328369280780669, 0.6371727283335258, 0.6297689554101591, 0.6300951469882898, 0.6350958344508273]


In [12]:
# 異様に重いので不実施
#from sklearn.svm import SVC

#clf = SVC()


# クロスバリデーション実施
#auc_scores = cpl.cross_validate(clf, 5)
#print(auc_scores)

スコアが高かったlightgbmで
グリッドサーチ+クロスバリデーションを試してみる

In [13]:
import lightgbm as lgb

model = lgb.LGBMClassifier()

grid_params = {'lgb__num_leaves': range(2, 41, 5)}

cpl.split()

cpl.grid_search_cv(model, grid_params)

print(cpl.best_params_)
print(cpl.best_auc_)



{'lgb__num_leaves': 2}
0.7528421498334035


大体同じくらい  
この状態で提出してみる

In [14]:
from datetime import datetime

df_test = pd.read_csv("../input/application_test.csv")

X_test = df_test.loc[:, ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
                                         'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'DAYS_BIRTH']]

# 提出用
submission = pd.DataFrame({'SK_ID_CURR': df_test['SK_ID_CURR'], 
                           'TARGET': cpl.generate_pred_for_submission(X_test)})

# タイムスタンプ作成
time_stamp = datetime.now().strftime('%Y%m%d%H%M')
submission.to_csv('home_credit_' + time_stamp + '.csv', index=False)

Prt :0.66706 Pub0.66061とaucよりだいぶ低くなった、、

# 【問題3】Kernelからの調査
- 特徴量の選出・加工が微妙っぽいので、EDA・特徴量エンジニアリングを調査してやりたい

# 【問題4】高い汎化性能のモデル
- lgbmが良さそうなのはわかったので(アプローチは正しくないかもしれない、、)  
lgbmでグリッドサーチを引き続きやりたい  
<br><br>
- まずはEDAから。色々プロットするのも手間なので、すでにあるカーネルを参考にする
https://www.kaggle.com/codename007/home-credit-complete-eda-feature-importance
https://www.kaggle.com/gpreda/home-credit-default-risk-extensive-eda
- プロットの仕方も色々あることがわかった、みやすいのも多くあったので、今度試してみる
- PCA(主成分分析)  
→ 次元を圧縮するのに使う、処理が軽くなるのか？ 次回以降で調査してやってみる
- application_train・test以外のファイルにも有益な情報が多そうだった  
→ bureau・bureau_balanceが有益そう こちらも今回はパス
- 新たな特徴量？を作成してグラフ化してる方が多かった  
→ これはちょっと試したい
<br><br>
次に特徴量エンジニアリング、カーネルから自分になかったアイデアを列挙する。  
<br>
- 特徴量の作成
下記が参考になりそう。  
https://www.kaggle.com/jsaguiar/lightgbm-with-simple-features#L61-L68  
week3から重要度が高い特徴量でもあるので、まんま流用してみる。　　


In [29]:
# 訓練データ
df = pd.read_csv('../input/application_train.csv')

#　以下流用
# NaN values for DAYS_EMPLOYED: 365.243 -> nan
df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
# Some simple new features (percentages)
df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']


# 説明変数(EXT_SOURCEシリーズ+作成した特徴量を採用)
X = df.loc[:, ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
                    'DAYS_EMPLOYED_PERC', 'INCOME_CREDIT_PERC', 'INCOME_PER_PERSON',
                    'ANNUITY_INCOME_PERC', 'PAYMENT_RATE']]


# 目的変数
y = df.loc[:, 'TARGET']

X.head()

Unnamed: 0,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_EMPLOYED_PERC,INCOME_CREDIT_PERC,INCOME_PER_PERSON,ANNUITY_INCOME_PERC,PAYMENT_RATE
0,0.083037,0.262949,0.139376,0.067329,0.498036,202500.0,0.121978,0.060749
1,0.311267,0.622246,,0.070862,0.208736,135000.0,0.132217,0.027598
2,,0.555912,0.729567,0.011814,0.5,67500.0,0.1,0.05
3,,0.650442,,0.159905,0.431748,67500.0,0.2199,0.094941
4,,0.322738,,0.152418,0.236842,121500.0,0.179963,0.042623


おkそう  
試してみる

In [30]:
# 自作クラスインスタンス化
cpl = CPL(X, y)

import lightgbm as lgb

model = lgb.LGBMClassifier()

grid_params = {'lgb__num_leaves': range(2, 41, 5)}

cpl.split()

cpl.grid_search_cv(model, grid_params)

print(cpl.best_params_)
print(cpl.best_auc_)



{'lgb__num_leaves': 2}
0.7763226601076023


上がったので、この状態で提出

In [32]:
from datetime import datetime

df_test = pd.read_csv("../input/application_test.csv")

#　以下流用
# NaN values for DAYS_EMPLOYED: 365.243 -> nan
df_test['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
# Some simple new features (percentages)
df_test['DAYS_EMPLOYED_PERC'] = df_test['DAYS_EMPLOYED'] / df_test['DAYS_BIRTH']
df_test['INCOME_CREDIT_PERC'] = df_test['AMT_INCOME_TOTAL'] / df_test['AMT_CREDIT']
df_test['INCOME_PER_PERSON'] = df_test['AMT_INCOME_TOTAL'] / df_test['CNT_FAM_MEMBERS']
df_test['ANNUITY_INCOME_PERC'] = df_test['AMT_ANNUITY'] / df_test['AMT_INCOME_TOTAL']
df_test['PAYMENT_RATE'] = df_test['AMT_ANNUITY'] / df_test['AMT_CREDIT']


# 説明変数(EXT_SOURCEシリーズ+作成した特徴量を採用)
X_test = df_test.loc[:, ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
                    'DAYS_EMPLOYED_PERC', 'INCOME_CREDIT_PERC', 'INCOME_PER_PERSON',
                    'ANNUITY_INCOME_PERC', 'PAYMENT_RATE']]

# 提出用
submission = pd.DataFrame({'SK_ID_CURR': df_test['SK_ID_CURR'], 
                           'TARGET': cpl.generate_pred_for_submission(X_test)})

# タイムスタンプ作成
time_stamp = datetime.now().strftime('%Y%m%d%H%M')
submission.to_csv('home_credit_' + time_stamp + '.csv', index=False)

Prt :0.74124 Pub 0.75443  
とめっちゃ上がった。。

- 最後にグリッドサーチして一番スコアがよかったものを採用して提出

In [None]:
# 自作クラスインスタンス化
cpl = CPL(X, y)

grid_params = {
        'lgb__num_leaves': [2],
        'n_estimators' : range(10,210,100),
        'objective':['binary'], # 二項分類
        'random_state' :[0]}

cpl.split()

cpl.grid_search_cv(model, grid_params)

print(cpl.best_params_)
print(cpl.best_auc_)

In [None]:
from datetime import datetime

df_test = pd.read_csv("../input/application_test.csv")

#　以下流用
# NaN values for DAYS_EMPLOYED: 365.243 -> nan
df_test['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
# Some simple new features (percentages)
df_test['DAYS_EMPLOYED_PERC'] = df_test['DAYS_EMPLOYED'] / df_test['DAYS_BIRTH']
df_test['INCOME_CREDIT_PERC'] = df_test['AMT_INCOME_TOTAL'] / df_test['AMT_CREDIT']
df_test['INCOME_PER_PERSON'] = df_test['AMT_INCOME_TOTAL'] / df_test['CNT_FAM_MEMBERS']
df_test['ANNUITY_INCOME_PERC'] = df_test['AMT_ANNUITY'] / df_test['AMT_INCOME_TOTAL']
df_test['PAYMENT_RATE'] = df_test['AMT_ANNUITY'] / df_test['AMT_CREDIT']


# 説明変数(EXT_SOURCEシリーズ+作成した特徴量を採用)
X_test = df_test.loc[:, ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
                    'DAYS_EMPLOYED_PERC', 'INCOME_CREDIT_PERC', 'INCOME_PER_PERSON',
                    'ANNUITY_INCOME_PERC', 'PAYMENT_RATE']]

# 提出用
submission = pd.DataFrame({'SK_ID_CURR': df_test['SK_ID_CURR'], 
                           'TARGET': cpl.generate_pred_for_submission(X_test)})

# タイムスタンプ作成
time_stamp = datetime.now().strftime('%Y%m%d%H%M')
submission.to_csv('home_credit_' + time_stamp + '.csv', index=False)

# TODO
以降のアプローチは下記を試したい
- application_train・test以外のファイルを調査  
→ bureau・bureau_balanceが有益そう
- Null Importances なるものがあったのでこれも試してみたい 
https://www.kaggle.com/ogrellier/feature-selection-with-null-importances
- (PCA(主成分分析))