# Permutation Feature Importance(PFI)

In [1]:
from sklearn.datasets import load_wine
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# 設定中文字型
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']  
# 矯正負號
plt.rcParams['axes.unicode_minus'] = False

## 載入資料

In [2]:
X, y = load_wine(return_X_y=True, as_frame=True)

## 資料分割

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.5, random_state=42)

## 特徵縮放

In [4]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

## 選擇演算法

In [5]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

## 模型訓練

In [6]:
clf.fit(X_train_std, y_train)

## 模型評估

In [7]:
clf.score(X_test_std, y_test)

0.9887640449438202

## 測試選取3個特徵的所有組合

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance

clf = KNeighborsClassifier()
clf.fit(X_train_std, y_train)
model = permutation_importance(clf, X_test_std, y_test, n_repeats=10)
model.importances_mean, model.importances_std

(array([ 4.15730337e-02,  1.01123596e-02,  2.35955056e-02,  1.12359551e-03,
         1.12359551e-02, -6.74157303e-03,  1.34831461e-02, -1.11022302e-17,
        -1.12359551e-03,  4.38202247e-02,  1.01123596e-02,  1.01123596e-02,
         5.39325843e-02]),
 array([0.01511643, 0.01460674, 0.01276159, 0.00933329, 0.01230837,
        0.01348315, 0.01651341, 0.01329456, 0.01910112, 0.02038018,
        0.01842834, 0.01773004, 0.01490618]))

In [26]:
# 特徵選取名稱
column_list = np.array(X.columns.to_list())

column_selected = []
for i in model.importances_mean.argsort()[::-1]:
    if model.importances_mean[i] - 2 * model.importances_std[i] > 0:
        print(f"{column_list[i]:20s} "
              f"{model.importances_mean[i]:.3f}"
              f" +/- {model.importances_std[i]:.3f}")
        column_selected.append(column_list[i])

proline              0.054 +/- 0.015
color_intensity      0.044 +/- 0.020
alcohol              0.042 +/- 0.015


In [32]:
# 特徵選取後的 X
X[column_selected].shape

(178, 3)

In [33]:
X = X[column_selected]

## 資料分割

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.5, random_state=42)

## 特徵縮放

In [35]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

## 選擇演算法

In [36]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

## 模型訓練

In [38]:
clf.fit(X_train_std, y_train)

## 模型評估

In [39]:
clf.score(X_test_std, y_test)

0.898876404494382

## 模型簡化，準確率降低