# 【問題1】クロスバリデーション
* train_test_splitではなく、クロスバリデーションを用いる
* クロスバリデーションには、scikit-learnのKFold ライブラリを使用する  

In [2]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]])
y = np.array([0, 1, 2, 3, 4, 1, 1, 1, 1, 1])

# 5分割
skf = StratifiedKFold(n_splits=5)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print("X_train:", X_train, "\nX_test:", X_test, "\ny_train:", y_train, "\ny_test:", y_test)

X_train: [[3 4]
 [3 4]
 [3 4]
 [3 4]] 
X_test: [[1 2]
 [3 4]
 [1 2]
 [3 4]
 [3 4]
 [3 4]] 
y_train: [1 1 1 1] 
y_test: [0 1 2 3 4 1]
X_train: [[1 2]
 [3 4]
 [1 2]
 [3 4]
 [3 4]
 [3 4]
 [3 4]
 [3 4]
 [3 4]] 
X_test: [[3 4]] 
y_train: [0 1 2 3 4 1 1 1 1] 
y_test: [1]
X_train: [[1 2]
 [3 4]
 [1 2]
 [3 4]
 [3 4]
 [3 4]
 [3 4]
 [3 4]
 [3 4]] 
X_test: [[3 4]] 
y_train: [0 1 2 3 4 1 1 1 1] 
y_test: [1]
X_train: [[1 2]
 [3 4]
 [1 2]
 [3 4]
 [3 4]
 [3 4]
 [3 4]
 [3 4]
 [3 4]] 
X_test: [[3 4]] 
y_train: [0 1 2 3 4 1 1 1 1] 
y_test: [1]
X_train: [[1 2]
 [3 4]
 [1 2]
 [3 4]
 [3 4]
 [3 4]
 [3 4]
 [3 4]
 [3 4]] 
X_test: [[3 4]] 
y_train: [0 1 2 3 4 1 1 1 1] 
y_test: [1]




* Home Credit Default Risk コンペティションのデータセットでやってみる
* stratifyする

In [5]:
import pandas as pd

df_train = pd.read_csv('application_train.csv')

# 説明変数
X = df_train.drop(['SK_ID_CURR', 'TARGET'], axis = 1)

# 目的変数
y = df_train['TARGET']

# 5分割
skf = StratifiedKFold(n_splits=5)

for train_index, test_index in skf.split(X, y):
    print(train_index)
    print(test_index)
    # すげー出てくるのでコメントアウト
    # X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    # y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    #print("X_train:", X_train, "\nX_test:", X_test, "\ny_train:", y_train, "\ny_test:", y_test)

<class 'pandas.core.frame.DataFrame'>
[ 61465  61466  61467 ... 307508 307509 307510]
[    0     1     2 ... 61814 61816 61828]
[     0      1      2 ... 307508 307509 307510]
[ 61465  61466  61467 ... 123093 123094 123095]
[     0      1      2 ... 307508 307509 307510]
[121993 122029 122030 ... 184624 184625 184626]
[     0      1      2 ... 307508 307509 307510]
[183271 183273 183278 ... 246095 246097 246099]
[     0      1      2 ... 246095 246097 246099]
[244997 245014 245015 ... 307508 307509 307510]


# 【問題2】グリッドサーチ
* グリッドサーチをパイプラインの中に組み込む
* ついでにパイプラインを修正してクラス化してみたくなった

In [55]:
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

class CPL():
    """
    分類汎用パイプライン

    Attributes
    ----------
    X_: df
        説明変数たち
    y_: df
        目的変数
    score_ : dict
        評価値
    """
    def __init__(self, X, y):
        self.X_ = X
        self.y_ = y
    
    
    def split(self, train_size=0.75):
        self.X_train_, self.X_test_, self.y_train_, self.y_test_ = train_test_split(self.X_, self.y_, train_size=train_size)
    
    
    def cross_validate(self, n_splits):
        self.cv_count_ = 0
        self.X_train_ = {}
        self.X_test_ = {}
        self.y_train_ = {}
        self.y_test_ = {}
        
        skf = StratifiedKFold(n_splits=n_splits)
        
        for train_index, test_index in skf.split(X, y):
            self.X_train_[self.cv_count_] = self.X_.iloc[train_index]
            self.X_test_[self.cv_count_] = self.X_.iloc[test_index]
            self.X_train_[self.cv_count_] = self.X_.iloc[train_index]
            self.X_test_[self.cv_count_] = self.X_.iloc[test_index]
            self.cv_count_ = self.cv_count_ + 1
            
            
    def grid_search(self, model, tune_params, score):
        
        scores = {}
        
        clf = GridSearchCV(
            model,
            tune_params,
            cv=self.cv_count_,
            scoring='%s_weighted' % score )
        
        for i in range(self.cv_count_):
            clf.fit(self.X_train_[i], self.y_train_[i])
            scores[i] = clf.grid_scores_
            
        return scores
        
        

In [56]:
import pandas as pd

df_train = pd.read_csv('application_train.csv')

# 説明変数
X = df_train.drop(['SK_ID_CURR', 'TARGET'], axis = 1)

# 目的変数
y = df_train['TARGET']

from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()

tune_params = [
    {'n_neighbors': [2], 'algorithm':  ['ball_tree'], 'leaf_size': [20]},
    {'n_neighbors': [3], 'algorithm':  ['kd_tree'], 'leaf_size': [25]}
]

score = 'accuracy'


cpl = CPL(X, y)

cpl.cross_validate(5)

print(cpl.y_train_)

cpl.grid_search(model, tune_params, score)

{}


KeyError: 0