# Stratefied method

In [2]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedShuffleSplit

In [3]:
data = load_breast_cancer()
X = data.data
y = data.target

In [4]:
ss = StratifiedShuffleSplit(n_splits=1,
                           train_size=0.95,
                           test_size=0.05,
                           random_state=0)

train_index, test_index = next(ss.split(X, y))

X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

In [5]:
print(np.unique(y, return_counts=True))


(array([0, 1]), array([212, 357], dtype=int64))


# 10-fold CV method

In [6]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = data.data
y = data.target

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

# KFold法を使用
from sklearn.model_selection import KFold
ss = KFold(n_splits=10, shuffle=True)

for train_index, test_index in ss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    clf.fit(X_train, y_train)
    
    # Stratifiedでないデータ[郡１，郡２]のように見る
    # => 郡１、郡２の比率がバラバラなので、学習とテストがうまくいかない
    print(np.unique(y_train, return_counts=True)[1] / y_train.size, y_train.size, 
          np.unique(y_test,  return_counts=True)[1] / y_test.size,  y_test.size)


[0.38671875 0.61328125] 512 [0.24561404 0.75438596] 57
[0.37695312 0.62304688] 512 [0.33333333 0.66666667] 57
[0.36328125 0.63671875] 512 [0.45614035 0.54385965] 57
[0.3671875 0.6328125] 512 [0.42105263 0.57894737] 57
[0.36523438 0.63476562] 512 [0.43859649 0.56140351] 57
[0.36523438 0.63476562] 512 [0.43859649 0.56140351] 57
[0.3671875 0.6328125] 512 [0.42105263 0.57894737] 57
[0.39453125 0.60546875] 512 [0.1754386 0.8245614] 57
[0.37304688 0.62695312] 512 [0.36842105 0.63157895] 57
[0.36647173 0.63352827] 513 [0.42857143 0.57142857] 56


In [7]:
## Stratified化のやり方！

# StratifiedKFold法を使用
from sklearn.model_selection import StratifiedKFold
ss = StratifiedKFold(n_splits=10, shuffle=True)

for train_index, test_index in ss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    clf.fit(X_train, y_train)
    
    # Stratifiedのデータ[郡１，郡２]のように見る  => 郡１、郡２の比率はほぼ同じ！！
    print(np.unique(y_train, return_counts=True)[1] / y_train.size, y_train.size, 
          np.unique(y_test,  return_counts=True)[1] / y_test.size,  y_test.size)


[0.37181996 0.62818004] 511 [0.37931034 0.62068966] 58
[0.37181996 0.62818004] 511 [0.37931034 0.62068966] 58
[0.37304688 0.62695312] 512 [0.36842105 0.63157895] 57
[0.37304688 0.62695312] 512 [0.36842105 0.63157895] 57
[0.37304688 0.62695312] 512 [0.36842105 0.63157895] 57
[0.37304688 0.62695312] 512 [0.36842105 0.63157895] 57
[0.37304688 0.62695312] 512 [0.36842105 0.63157895] 57
[0.37231969 0.62768031] 513 [0.375 0.625] 56
[0.37231969 0.62768031] 513 [0.375 0.625] 56
[0.37231969 0.62768031] 513 [0.375 0.625] 56


In [9]:
from sklearn.model_selection import cross_val_score
ave_score = cross_val_score(clf, 
                                           X, y,
                                           cv=10) # StratifiedKFold

In [10]:
print("{0:4.2f} +/- {1:4.2f} %".format(ave_score.mean() * 100, ave_score.std() * 100))

95.09 +/- 1.85 %


## Leave One Out

In [40]:
import numpy as np 

from sklearn.linear_model import LinearRegression
clf = LinearRegression()

from sklearn.model_selection import LeaveOneOut
loocv = LeaveOneOut()

from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = data.data
y = data.target

In [41]:
train_index, test_index = next(loocv.split(X, y))

In [42]:
y.size, train_index.size, test_index.size

(569, 568, 1)

In [43]:
## ループよりもCross Validationを使って、簡潔に記述できる
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, 
                         X, y,
                        cv=loocv) # LeaveOneOut
print("mean: {0:4.1f}%,  std: {1:4.1f}%,  size: {2:4.1f}".format(scores.mean() * 100, scores.std() * 100, scores.size))

mean:  0.0%,  std:  0.0%,  size: 569.0


## Leave P out
Leave One Outはデータから１つを選び出してテストデータとするが、Leave P Outは複数個選び出してテストデータとする

In [47]:
import numpy as np 

from sklearn.linear_model import LinearRegression
clf = LinearRegression()

from sklearn.model_selection import LeavePOut
loocv = LeavePOut(2) 
# ２つ選び出し、テストデータとする
# 注意！かなり時間がかかる。複数個から2つ抜き出してやる場合の数は膨大

from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = data.data
y = data.target

## Leave One Group Out

In [57]:
group = np.array(list(range(50)) * 12) # 擬似的なグループを作成！！ 1から49のグループ
group = np.sort(group[:y.size])
group.size

569

In [54]:
from sklearn.model_selection import LeaveOneGroupOut
loocv = LeaveOneGroupOut()

for train_index, test_index in loocv.split(X, y, group):
    
    X_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))

ValueError: Found input variables with inconsistent numbers of samples: [12, 56]

In [56]:
## ループよりもCross Validationを使って、簡潔に記述できる
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, 
                         X, y,
                         groups=group,
                        cv=loocv) # LeaveOneOut
print("mean: {0:4.1f}%,  std: {1:4.1f}%,  size: {2:4.1f}".format(scores.mean() * 100, scores.std() * 100, scores.size))

mean: 58.4%,  std: 31.1%,  size: 50.0
