# 特徵轉換加入機器學習管線

將PCA與LDA加入到機器學習的管線中，觀察做完特徵轉換後的成效差異

In [1]:
#匯入資料集
from sklearn.datasets import load_iris

iris = load_iris()

iris_X, iris_y = iris.data, iris.target

In [2]:
#匯入機器學習模組
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

#匯入特徵轉換模組
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#匯入特徵選擇模組
from sklearn.feature_selection import SelectKBest

#匯入Grid Search模組
from sklearn.model_selection import GridSearchCV

#匯入標準化模組
from sklearn.preprocessing import StandardScaler

## 簡易LDA與PCA機器學習

In [3]:
#建立只有一個主成分的PCA模組
single_pca = PCA(n_components=1)

#建立只有一個判別式的LDA模組
single_lda = LinearDiscriminantAnalysis(n_components=1)

#產生實體KNN模型
knn = KNeighborsClassifier(n_neighbors=3)

In [4]:
#先不做任何轉換，觀察基準KNN準確率
knn_average = cross_val_score(knn, iris_X, iris_y).mean()
knn_average

0.9666666666666668

> 什麼都不做，knn分類就可以達到96%的accuracy

In [5]:
#利用單一判別式lda做特徵轉換
lda_pipeline = Pipeline([('lda', single_lda), ('knn', knn)])

lda_average = cross_val_score(lda_pipeline, iris_X, iris_y).mean()

lda_average

0.9666666666666666

In [6]:
#利用單一主成分pca做特徵轉換
pca_average = Pipeline([('pca', single_pca), ('knn', knn)])

pca_average = cross_val_score(pca_average, iris_X, iris_y).mean()

pca_average

0.9

In [7]:
#觀察多加一個判別式會不會比較好
lda_pipeline = Pipeline([('lda', LinearDiscriminantAnalysis(n_components=2)), ('knn', knn)])

lda_average = cross_val_score(lda_pipeline, iris_X, iris_y).mean()
lda_average

0.9733333333333334

## 利用特徵選擇方法對比特徵轉換成效

In [8]:
#我們嘗試所有k值，k代表我們選擇幾個特徵，但我們不選擇全部的特徵，目的是對比特徵轉換的結果
#假設特徵有A、B、C、D，k=2表示會嘗試A&B、A&C、A&D、B&C、B&D、C&D，六種組合

for k in [1,2,3]:
    
    #建構管線
    select_pipeline = Pipeline([('select', SelectKBest(k=k)), ('knn', knn)])
    
    #交叉驗證管線
    select_average = cross_val_score(select_pipeline, iris_X, iris_y).mean()
    
    print(k, "best feature has accuracy:", select_average)

1 best feature has accuracy: 0.9533333333333334
2 best feature has accuracy: 0.9533333333333334
3 best feature has accuracy: 0.9733333333333334


## 建立模型評估指標

In [9]:
#製作一個function觀察各個模型以及選擇參數的好壞
def get_best_model_and_accuracy(model, params, X, y):
    
    grid = GridSearchCV(model,
                       params,
                       error_score=0.)
    grid.fit(X, y)
    
    print("Best Accuracy: {}".format(grid.best_score_))
    print("Best Parameters: {}".format(grid.best_params_))
    print("Average Time to Fit (s): {}".format(round(grid.cv_results_['mean_fit_time'].mean(), 3)))
    print("Average Time to Score (s): {}".format(round(grid.cv_results_['mean_score_time'].mean(), 3)))

## 結合所有方法

1. 資料縮放
2. 聯合使用LDA與PCA
3. 機器學習分類

In [10]:
#建立pipeline參數
iris_params = {
    'preprocessing__scale__with_std':[True, False],
    'preprocessing__scale__with_mean':[True, False],
    'preprocessing__pca__n_components':[1,2,3,4],
    'preprocessing__lda__n_components':[1,2],
    'clf__n_neighbors':range(1,9)
}

In [11]:
#預處理管線
preprocessing = Pipeline([('scale', StandardScaler()),
                         ('pca', PCA()),
                         ('lda', LinearDiscriminantAnalysis())])

#建立整體管線
iris_pipeline = Pipeline(steps=[('preprocessing', preprocessing),
                               ('clf', KNeighborsClassifier())])

In [12]:
#觀察各個參數的成效，並從中得到最佳模型參數
get_best_model_and_accuracy(iris_pipeline, iris_params, iris_X, iris_y)

Traceback (most recent call last):
  File "c:\python38\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 292, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\python38\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 376, in fit_transform
    return last_step.fit_transform(Xt, y, **fit_params_last_step)
  File "c:\python38\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)


Traceback (most recent call last):
  File "c:\python38\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 292, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\python38\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 376, in fit_transform
    return last_step.fit_transform(Xt, y, **fit_params_last_step)
  File "c:\python38\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)


Traceback (most recent call last):
  File "c:\python38\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 292, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\python38\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 376, in fit_transform
    return last_step.fit_transform(Xt, y, **fit_params_last_step)
  File "c:\python38\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)


Traceback (most recent call last):
  File "c:\python38\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 292, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\python38\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 376, in fit_transform
    return last_step.fit_transform(Xt, y, **fit_params_last_step)
  File "c:\python38\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)


Traceback (most recent call last):
  File "c:\python38\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 292, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\python38\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 376, in fit_transform
    return last_step.fit_transform(Xt, y, **fit_params_last_step)
  File "c:\python38\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)


Traceback (most recent call last):
  File "c:\python38\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 292, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\python38\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 376, in fit_transform
    return last_step.fit_transform(Xt, y, **fit_params_last_step)
  File "c:\python38\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)


Traceback (most recent call last):
  File "c:\python38\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 292, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\python38\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 376, in fit_transform
    return last_step.fit_transform(Xt, y, **fit_params_last_step)
  File "c:\python38\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)


Traceback (most recent call last):
  File "c:\python38\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 292, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\python38\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\python38\lib\site-packages\sklearn\pipeline.py", line 376, in fit_transform
    return last_step.fit_transform(Xt, y, **fit_params_last_step)
  File "c:\python38\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)


Best Accuracy: 0.9933333333333334
Best Parameters: {'clf__n_neighbors': 8, 'preprocessing__lda__n_components': 1, 'preprocessing__pca__n_components': 3, 'preprocessing__scale__with_mean': True, 'preprocessing__scale__with_std': False}
Average Time to Fit (s): 0.001
Average Time to Score (s): 0.001
