# 管線測試

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_wine

In [2]:
# Kfold 不可使用 DataFrame
X, y = load_wine(return_X_y=True)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [4]:
# 管線定義
pipe_lr = make_pipeline(StandardScaler(),
                        PCA(n_components=2),
                        LogisticRegression(random_state=1))

In [5]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, KFold
    
kfold = StratifiedKFold(n_splits=5).split(X_train, y_train)


scores = []
for k, (train, test) in enumerate(kfold):
    print(train.shape, test.shape)
    pipe_lr.fit(X_train[train], y_train[train])
    score = pipe_lr.score(X_train[test], y_train[test])
    scores.append(score)
    
    # np.bincount：統計每一折的 y value_counts() 
    print('Fold: %2d, Class dist.: %s, Acc: %.3f' % (k+1,
          np.bincount(y_train[train]), score))
    
print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

(113,) (29,)
Fold:  1, Class dist.: [40 44 29], Acc: 1.000
(113,) (29,)
Fold:  2, Class dist.: [40 44 29], Acc: 1.000
(114,) (28,)
Fold:  3, Class dist.: [40 44 30], Acc: 0.929
(114,) (28,)
Fold:  4, Class dist.: [40 44 30], Acc: 0.929
(114,) (28,)
Fold:  5, Class dist.: [40 44 30], Acc: 0.964

CV accuracy: 0.964 +/- 0.032
