# Permutation Importance
- 학습된 모델에서 특정 feature의 값을 랜덤하게 섞었을 때 정확도 등의 성능 지표가 얼마나 변하는지를 측정하여 구하는 feature 중요도
- 알고리즘
    1. 훈련된 모델을 가져온다.
    2. 테스트 데이터에서 하나의 열을 정해 값을 랜덤으로 섞은 뒤 모델로 예측하고, 섞기 전의 손실(또는 성능 지표) 값과 비교하여 성능이 얼마나 저하되었는지를 그 열의 중요도로 삼는다.
    3. 원래 데이터로 복구시킨 후 다른 모든 열에 대하여 반복한다.

# eli5

In [1]:
import pandas as pd

# Location of the raw "adult.data" file; it is read without a header row,
# so the column names are supplied explicitly below.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
names = [
    'age', 'workclass', 'fnlwgt', 'education',
    'education-num', 'marital-status', 'occupation',
    'relationship', 'race', 'gender', 'capital-gain',
    'capital-loss', 'hours-per-week', 'native-country',
    'salary'
]
# Columns to parse as pandas categoricals (everything else keeps the
# dtype inferred by read_csv).
dtypes = {
    'workclass': 'category',
    'education': 'category',
    'marital-status': 'category',
    'occupation': 'category',
    'relationship': 'category',
    'race': 'category',
    'gender': 'category',
    'native-country': 'category'
}

X = pd.read_csv(url, names=names, header=None, dtype=dtypes)

# Remove surrounding whitespace from every categorical column, not only
# 'gender', so all category labels are cleaned consistently.
# 'salary' is not listed in `dtypes` and is deliberately left as read.
for col in dtypes:
    X[col] = X[col].str.strip().astype('category')

In [2]:
from sklearn.preprocessing import OrdinalEncoder

# Every column currently stored as a pandas categorical gets ordinal-encoded.
categorical = X.select_dtypes(include='category').columns.tolist()

oe = OrdinalEncoder()
# Build the encoded frame on X's own index so the column assignment below
# aligns row-for-row even if X does not carry a default RangeIndex.
X_oe = pd.DataFrame(
    oe.fit_transform(X[categorical]),
    columns=categorical,
    index=X.index,
)
X[categorical] = X_oe

# Split the target off X and turn it into a 0/1 label. The raw labels are
# stripped first so the mapping does not depend on surrounding whitespace.
y = X.pop('salary').str.strip().map({'<=50K': 0, '>50K': 1})

X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,39,7.0,77516,9.0,13,4.0,1.0,1.0,4.0,1.0,2174,0,40,39.0
1,50,6.0,83311,9.0,13,2.0,4.0,0.0,4.0,1.0,0,0,13,39.0
2,38,4.0,215646,11.0,9,0.0,6.0,1.0,4.0,1.0,0,0,40,39.0
3,53,4.0,234721,1.0,7,2.0,6.0,0.0,2.0,1.0,0,0,40,39.0
4,28,4.0,338409,9.0,13,2.0,10.0,5.0,2.0,0.0,0,0,40,5.0


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the split
# repeatable between runs.
split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split

## 학습

In [4]:
from xgboost import XGBClassifier

# Fit a classifier with the library's default hyper-parameters on the
# training split.
xgb = XGBClassifier().fit(X_train, y_train)

# Predicted class labels for the held-out rows.
pred = xgb.predict(X_test)





In [5]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label.
acc = accuracy_score(y_true=y_test, y_pred=pred)
report = 'Accuracy: {:.4f}'.format(acc)
print(report)

Accuracy: 0.8649


## `PermutationImportance`
- `scoring`: 평가 지표. https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values

### 결과 해석
- Weight는 해당 feature를 랜덤으로 섞은 뒤 성능이 얼마나 떨어졌는지를 나타내며, ± 뒤의 값은 섞기를 여러 번 반복했을 때 결과가 얼마나 달라지는지(무작위성에 의한 변동성)를 나타낸다.
- 음수는 섞은 데이터에서 우연히 성능이 더 좋게 나온 경우로, 실제 중요도가 0에 가까운 feature에서 발생하며 보통 데이터셋이 작을 때 나타난다.

In [8]:
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of the fitted model, measured on the test split
# and scored by accuracy; the seed fixes the shuffling.
perm = PermutationImportance(xgb, scoring='accuracy', random_state=1)
perm.fit(X_test, y_test)

# Render the importance table, labelling rows with the DataFrame's column names.
eli5.show_weights(perm, feature_names=list(X_test.columns))

Weight,Feature
0.0443  ± 0.0019,capital-gain
0.0283  ± 0.0064,education-num
0.0222  ± 0.0046,relationship
0.0171  ± 0.0018,age
0.0140  ± 0.0020,capital-loss
0.0139  ± 0.0047,occupation
0.0089  ± 0.0028,hours-per-week
0.0051  ± 0.0025,marital-status
0.0018  ± 0.0019,gender
0.0018  ± 0.0005,race


In [9]:
# Same computation as with accuracy, but the drop is measured with the
# 'f1' scorer instead.
perm = PermutationImportance(xgb, scoring='f1', random_state=1)
perm.fit(X_test, y_test)

eli5.show_weights(perm, feature_names=list(X_test.columns))

Weight,Feature
0.0916  ± 0.0061,capital-gain
0.0906  ± 0.0101,relationship
0.0830  ± 0.0134,education-num
0.0620  ± 0.0035,age
0.0475  ± 0.0126,occupation
0.0333  ± 0.0039,capital-loss
0.0312  ± 0.0066,marital-status
0.0293  ± 0.0081,hours-per-week
0.0105  ± 0.0057,gender
0.0042  ± 0.0017,race


## 장점
- 모델을 다시 학습시킬 필요가 없어 계산이 빠르다.
- 직관적이다.

## 단점
- `random_state`마다 결과가 달라질 수 있다. (아래 예시처럼 `random_state`를 바꾸면 중요도 값과 순위가 달라진다.)

In [10]:
# Accuracy-based permutation importance again, this time with a different
# shuffling seed to show how the result depends on it.
perm = PermutationImportance(xgb, scoring='accuracy', random_state=5)
perm.fit(X_test, y_test)

eli5.show_weights(perm, feature_names=list(X_test.columns))

Weight,Feature
0.0458  ± 0.0024,capital-gain
0.0289  ± 0.0011,education-num
0.0218  ± 0.0030,relationship
0.0189  ± 0.0042,age
0.0146  ± 0.0030,occupation
0.0142  ± 0.0015,capital-loss
0.0084  ± 0.0041,hours-per-week
0.0058  ± 0.0040,marital-status
0.0030  ± 0.0018,workclass
0.0023  ± 0.0012,gender
