In [5]:
from sklearn.datasets import load_wine
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score

In [4]:
from itertools import combinations

# Warm-up: list every 3-element combination drawn from the integers 1..5.
value = [*range(1, 6)]
print(value)
cvalue = combinations(value, 3)
# Unpacking drains the iterator and prints one tuple per line.
print(*cvalue, sep="\n")


[1, 2, 3, 4, 5]
(1, 2, 3)
(1, 2, 4)
(1, 2, 5)
(1, 3, 4)
(1, 3, 5)
(1, 4, 5)
(2, 3, 4)
(2, 3, 5)
(2, 4, 5)
(3, 4, 5)


In [23]:
# Exhaustive search: fit a 5-NN classifier on every N-feature subset and keep
# the subset with the highest accuracy on the held-out split.
# NOTE(review): the winning subset is chosen by its score on X_test, so the
# accuracy printed below is the maximum over all candidates rather than an
# independent estimate — consider selecting with cross-validation on X_train.
X, y = load_wine(return_X_y=True, as_frame=True)
feature_name = X.columns
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

N = 5  # number of features in each candidate subset
base_acc, base_freature = 0, None
for subset in combinations(feature_name, N):
    cols = list(subset)  # DataFrame indexing needs a list, not a tuple
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train[cols], y_train)
    y_pred = model.predict(X_test[cols])
    acc = accuracy_score(y_test, y_pred)
    # Strict ">" means the first subset to reach the top score is the one kept.
    if acc > base_acc:
        base_acc, base_freature = acc, subset

print("特徵:", base_freature)
print("準確率:", f"{base_acc:.3f}")


特徵: ('alcohol', 'ash', 'total_phenols', 'flavanoids', 'hue')
準確率: 0.981


In [31]:
from sklearn.linear_model import LogisticRegression

# Same exhaustive subset search as the KNN cell, but with logistic regression
# and 3-feature subsets.
# NOTE(review): subsets are ranked by accuracy on X_test, so the printed score
# is the best of all candidates on that split, not an unbiased estimate.
X, y = load_wine(return_X_y=True, as_frame=True)
feature_name = X.columns
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

N = 3  # number of features in each candidate subset
base_acc, base_freature = 0, None
for subset in combinations(feature_name, N):
    cols = list(subset)
    # max_iter is raised above the default; presumably needed for convergence
    # on these unscaled features — verify.
    model = LogisticRegression(max_iter=2000)
    model.fit(X_train[cols], y_train)
    y_pred = model.predict(X_test[cols])
    acc = accuracy_score(y_test, y_pred)
    # Strict ">" means ties keep the earliest subset.
    if acc > base_acc:
        base_acc, base_freature = acc, subset

print("特徵:", base_freature)
print("準確率:", f"{base_acc:.3f}")

特徵: ('alcohol', 'flavanoids', 'proline')
準確率: 0.981


# Recursive Feature Elimination (RFE)

In [32]:
from sklearn.datasets import load_wine
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [None]:
# Load the wine data with as_frame=True and hold out 30% of the rows for testing.
X, y = load_wine(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,  # fixed seed so the split is the same on every run
)

## 特徵縮放

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits. X_train / X_test are overwritten with the scaled
# versions, which the following cells rely on.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## 選擇演算法

In [35]:
from sklearn.linear_model import LogisticRegression

# Baseline: default logistic regression trained on every column of X_train.
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [36]:
# Accuracy of the baseline classifier on the held-out split.
clf.score(X_test, y_test)

0.9814814814814815

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# RFE ranks features through the wrapped estimator's coef_ or
# feature_importances_ attribute, so an estimator exposing neither (e.g. KNN)
# cannot be used here.
clf = LogisticRegression()
model = RFE(estimator=clf, n_features_to_select=3)
model.fit(X_train, y_train)

# Result that was pasted into this cell as a bare string (presumably from an
# earlier run — it may not match the current output):
# array([False, False, False, False, False, False,  True, False, False,
#         True, False, False,  True])
#
# Keep get_support() as the cell's last expression so the notebook displays
# the live boolean mask (True = feature kept) rather than a string literal.
model.get_support()

array([False, False, False, False, False, False,  True, False, False,
       False,  True, False,  True])

In [39]:
# Translate the boolean support mask back into the original column names.
column_list = np.array(list(X.columns))
column_list[model.get_support()]

array(['flavanoids', 'hue', 'proline'], dtype='<U28')

In [41]:
# Shape of the training matrix after the fitted selector's transform() is applied.
model.transform(X_train).shape

(124, 3)

In [42]:
from sklearn.linear_model import LogisticRegression

# Retrain a fresh classifier on the selector-reduced training matrix.
X_train_rfe = model.transform(X_train)
clf = LogisticRegression()
clf.fit(X_train_rfe, y_train)

In [43]:
# Held-out accuracy when the test rows are reduced by the same selector.
X_test_rfe = model.transform(X_test)
clf.score(X_test_rfe, y_test)

0.9814814814814815

## RFECV

初始模型訓練：首先，RFECV使用提供的模型和所有特徵來進行訓練。

特徵重要性評估：模型訓練完成後，會根據feature_importances_屬性（對於樹模型如決策樹、隨機森林等）或係數（對於線性模型如線性回歸、邏輯回歸等）來評估每個特徵的重要性。

移除最不重要的特徵：接下來，RFECV會移除被評估為最不重要的一個或多個特徵。

重複訓練與評估：移除特徵後，RFECV會用剩下的特徵再次訓練模型，並使用交叉驗證（Cross-Validation, CV）來計算模型的性能分數。

交叉驗證確認重要性：通過交叉驗證的結果，RFECV可以驗證移除某些特徵後模型性能的變化。如果模型性能沒有顯著下降，則可以認為移除的特徵不是很重要。

選擇最佳特徵子集：RFECV會重複上述步驟，持續移除特徵直到只剩下 min_features_to_select 指定的最少特徵數量為止，並記錄每一種特徵數量所對應的交叉驗證分數。最終選擇的特徵子集是交叉驗證分數最高的那一組。

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
# cv=5: score the candidate feature sets with 5-fold cross-validation.
# min_features_to_select=3: never keep fewer than three features.
model = RFECV(estimator=clf, cv=5, min_features_to_select=3)
model.fit(X_train, y_train)

# Boolean mask over the input columns; True marks a feature that was kept.
model.get_support()

array([ True, False,  True,  True, False, False,  True, False, False,
        True,  True,  True,  True])

In [45]:
# Names of the columns flagged True in the selector's support mask.
column_list = np.array(list(X.columns))
column_list[model.get_support()]

array(['alcohol', 'ash', 'alcalinity_of_ash', 'flavanoids',
       'color_intensity', 'hue', 'od280/od315_of_diluted_wines',
       'proline'], dtype='<U28')

In [46]:
from sklearn.linear_model import LogisticRegression

# Retrain a fresh classifier on the columns the fitted selector keeps.
X_train_rfecv = model.transform(X_train)
clf = LogisticRegression()
clf.fit(X_train_rfecv, y_train)

In [47]:
# Held-out accuracy with the test rows reduced by the same selector.
X_test_rfecv = model.transform(X_test)
clf.score(X_test_rfecv, y_test)

1.0

In [48]:
from sklearn.preprocessing import StandardScaler

# Start again from the raw data. This time the standardized arrays get their
# own names (X_train_std / X_test_std) instead of overwriting the raw splits.
X, y = load_wine(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)  # statistics come from train only
X_test_std = scaler.transform(X_test)

In [49]:
from sklearn.linear_model import LogisticRegression

# Reference score: default logistic regression on all standardized features.
clf = LogisticRegression()
clf.fit(X_train_std, y_train)
clf.score(X_test_std, y_test)


0.9814814814814815

In [58]:
from sklearn.inspection import permutation_importance

# Fit a 5-NN classifier on the standardized training data.
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train_std, y_train)

# Permutation importance of each feature for the fitted classifier, measured
# on the held-out split with 10 shuffles per feature.
# random_state pins the shuffles: without it the importances, and therefore
# the columns selected from them in the next cell, change on every run.
model = permutation_importance(
    clf,
    X_test_std,
    y_test,
    n_repeats=10,
    random_state=42,
)
model.importances_mean, model.importances_std

(array([ 0.04074074,  0.01296296,  0.0037037 ,  0.00925926,  0.00740741,
        -0.01296296,  0.00555556,  0.00925926,  0.00925926,  0.05185185,
        -0.01111111, -0.00555556,  0.07407407]),
 array([0.02456759, 0.02037037, 0.01814437, 0.02378747, 0.0122838 ,
        0.01861088, 0.01666667, 0.0124226 , 0.00925926, 0.02312962,
        0.02222222, 0.01185764, 0.02618914]))

In [63]:
# Walk the features from highest to lowest mean importance and keep those
# whose mean stays above zero after subtracting two standard deviations.
meanArgSort = model.importances_mean.argsort()  # ascending order of the mean
print(meanArgSort)

column_selectd = []
for i in meanArgSort[::-1]:
    lower_bound = model.importances_mean[i] - 2 * model.importances_std[i]
    if lower_bound > 0:
        print(f"column:{X.columns[i]:20s}",f"{ model.importances_mean[i]:.3f}",f"{model.importances_std[i]:.3f}")
        column_selectd.append(X.columns[i])

print(column_selectd)
        

[ 5 10 11  2  6  4  3  7  8  1  0  9 12]
column:proline              0.074 0.026
column:color_intensity      0.052 0.023
['proline', 'color_intensity']


In [64]:
from sklearn.preprocessing import StandardScaler

# Rebuild the split and scaling using only the columns collected in
# column_selectd by the permutation-importance filter.
X, y = load_wine(return_X_y=True, as_frame=True)
X = X[column_selectd]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)




In [65]:
from sklearn.linear_model import LogisticRegression

# Logistic regression restricted to the importance-selected columns.
clf = LogisticRegression()
clf.fit(X_train_std, y_train)
clf.score(X_test_std, y_test)

0.9074074074074074

# Scikit-learn PCA 實作

In [66]:
from sklearn import datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [67]:
# Load the wine dataset and wrap its feature matrix in a DataFrame whose
# columns carry the feature names.
ds = datasets.load_wine()
df = pd.DataFrame(data=ds.data, columns=ds.feature_names)
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [70]:
from sklearn.model_selection import train_test_split

# Work on plain arrays from here on: features from the frame, labels from the
# dataset's target.
X, y = df.to_numpy(), ds.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Display the split sizes as a sanity check.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((124, 13), (54, 13), (124,), (54,))

In [71]:
from sklearn.preprocessing import StandardScaler

# Standardize with statistics learned from the training split only.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

## 特徵萃取(PCA)

In [72]:
from sklearn.decomposition import PCA

# Reduce the standardized features to two principal components. The
# projection is fitted on the training split and reused for the test split.
pca1 = PCA(n_components=2)
X_train_pca = pca1.fit_transform(X_train_std)
X_test_pca = pca1.transform(X_test_std)

X_train_pca.shape, X_test_pca.shape

((124, 2), (54, 2))

In [74]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Classify in the 2-component PCA space and report held-out accuracy as a
# percentage.
clf = LogisticRegression().fit(X_train_pca, y_train)
y_pred = clf.predict(X_test_pca)
print(f"{accuracy_score(y_test,y_pred)*100:.2f}%")

96.30%
