# Recursive Feature Elimination (RFE)

In [16]:
from sklearn.datasets import load_wine
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Use a sans-serif font that contains CJK glyphs so Chinese text renders in
# figures. Matplotlib walks this list in order and uses the first family it can
# find, so the original macOS font stays first and the remaining entries are
# fallbacks for other platforms (adjust to whatever is actually installed).
plt.rcParams['font.sans-serif'] = [
    'Arial Unicode MS',    # macOS
    'Microsoft JhengHei',  # Windows, Traditional Chinese
    'Noto Sans CJK TC',    # common on Linux
    'Noto Sans CJK JP',    # family name some Noto CJK installs are registered under
]
# Render the minus sign as an ASCII hyphen instead of U+2212, which the
# selected CJK font may not provide.
plt.rcParams['axes.unicode_minus'] = False

## 載入資料

In [17]:
# Load the wine dataset as pandas objects: X is the feature table, y the class labels.
X, y = load_wine(
    as_frame=True,
    return_X_y=True,
)

## 資料分割

In [18]:
# Hold out half of the samples for testing; the fixed seed makes the split repeatable.
# The features are handed over as a plain NumPy array, so column names are not
# carried into the train/test matrices.
X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy(),
    y,
    test_size=0.5,
    random_state=42,
)

## 特徵縮放

In [19]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so the test data never influences them.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

## 選擇演算法

In [20]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier: logistic regression with scikit-learn's default settings.
clf = LogisticRegression()

## 模型訓練

In [21]:
# Train the baseline on every standardized feature.
clf.fit(X=X_train_std, y=y_train)

## 模型評估

In [22]:
# Accuracy of the all-features baseline on the held-out split.
clf.score(X=X_test_std, y=y_test)

0.9887640449438202

## 使用 RFE 遞迴消去特徵，保留 3 個特徵（並非測試所有組合）

In [23]:
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# The estimator wrapped by RFE must expose coef_ or feature_importances_,
# because that is what RFE uses to rank features; KNN provides neither and
# therefore cannot be used here.
clf = LogisticRegression()
# fit() returns the selector itself, so construction and fitting are chained.
model = RFE(estimator=clf, n_features_to_select=3).fit(X_train_std, y_train)
# Boolean mask over the input columns; True marks a feature that was kept.
model.get_support()

array([False, False, False, False, False, False,  True, False, False,
        True, False, False,  True])

In [24]:
# Names of the selected features, obtained by indexing the column names with
# the selector's boolean support mask.
column_list = np.array(list(X.columns))
column_list[model.support_]

array(['flavanoids', 'color_intensity', 'proline'], dtype='<U28')

In [25]:
# Same selected names, this time via the selector's own helper. The names are
# passed in explicitly because the selector was fitted on a NumPy array.
model.get_feature_names_out(input_features=column_list)

array(['flavanoids', 'color_intensity', 'proline'], dtype=object)

In [26]:
# Shape of the training matrix once only the selected columns are kept.
np.shape(model.transform(X_train_std))

(89, 3)

## 選擇演算法

In [27]:
from sklearn.linear_model import LogisticRegression
# Fresh, unfitted classifier to be trained on the selected features only.
clf = LogisticRegression()

## 模型訓練

In [28]:
# Train on the training split reduced to the columns the selector kept.
clf.fit(X=model.transform(X_train_std), y=y_train)

## 模型評估

In [29]:
# Accuracy on the held-out split, reduced with the same fitted selector.
clf.score(X=model.transform(X_test_std), y=y_test)

0.9662921348314607

## 模型簡化：特徵由 13 個減為 3 個，測試準確率僅由 0.989 降至 0.966，降低不顯著

## RFECV

In [30]:
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
# RFECV picks the number of features to keep by cross-validation (cv=5 folds)
# and never goes below min_features_to_select. fit() returns the selector, so
# construction and fitting are chained.
model = RFECV(estimator=clf, min_features_to_select=3, cv=5).fit(X_train_std, y_train)
# Boolean mask over the input columns; True marks a feature that was kept.
model.get_support()

array([ True, False,  True,  True, False, False,  True, False,  True,
        True,  True,  True,  True])

In [31]:
# Names of the selected features, obtained by indexing the column names with
# the selector's boolean support mask.
column_list = np.array(list(X.columns))
column_list[model.support_]

array(['alcohol', 'ash', 'alcalinity_of_ash', 'flavanoids',
       'proanthocyanins', 'color_intensity', 'hue',
       'od280/od315_of_diluted_wines', 'proline'], dtype='<U28')

In [32]:
# Same selected names, this time via the selector's own helper. The names are
# passed in explicitly because the selector was fitted on a NumPy array.
model.get_feature_names_out(input_features=column_list)

array(['alcohol', 'ash', 'alcalinity_of_ash', 'flavanoids',
       'proanthocyanins', 'color_intensity', 'hue',
       'od280/od315_of_diluted_wines', 'proline'], dtype=object)

In [33]:
# Shape of the training matrix once only the selected columns are kept.
np.shape(model.transform(X_train_std))

(89, 9)

## 選擇演算法

In [34]:
from sklearn.linear_model import LogisticRegression
# Fresh, unfitted classifier to be trained on the selected features only.
clf = LogisticRegression()

## 模型訓練

In [35]:
# Train on the training split reduced to the columns the selector kept.
clf.fit(X=model.transform(X_train_std), y=y_train)

## 模型評估

In [37]:
# Accuracy on the held-out split, reduced with the same fitted selector.
clf.score(X=model.transform(X_test_std), y=y_test)

0.9887640449438202