## 6.2　特徴選択および特徴量の重要度

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import scipy.stats as st
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier

### 6.2.1　単変量統計を用いる方法

In [2]:
ary = np.array([10, 20, 30, 0])
idx = ary.argsort()
print(idx)
print(idx[::-1])

print(ary[::-1][:3])

[3 0 1 2]
[2 1 0 3]
[ 0 30 20]


`[::-1]` で降順にしている。

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train_x =train.drop(['Survived'], axis=1)
train_y = train['Survived']
test_x = test.copy()

train_x = train_x.drop(['PassengerId'], axis=1)
test_x = test_x.drop(['PassengerId'], axis=1)

train_x = train_x.drop(['Name', 'Ticket', 'Cabin'], axis=1)
test_x = test_x.drop(['Name', 'Ticket', 'Cabin'], axis=1)

num_cols = [col for col in train_x.columns if train_x[col].dtype != 'object']
cat_cols = [col for col in train_x.columns if train_x[col].dtype == 'object']

train_x = train_x[num_cols]
test_x = test_x[num_cols]

train_x = train_x.fillna(train_x.mean())

print(train_x.columns)
train_x.head()

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,22.0,1,0,7.25
1,1,38.0,1,0,71.2833
2,3,26.0,0,0,7.925
3,1,35.0,1,0,53.1
4,3,35.0,0,0,8.05


#### 相関係数（ピアソンの積率相関係数）

In [4]:
corrs = []
for c in train_x.columns:
    corr = np.corrcoef(train_x[c], train_y)[0, 1]
    corrs.append(corr)
corrs = np.array(corrs)

idx = np.argsort(np.abs(corrs))[::-1]
top_cols, top_importances = train_x.columns.values[idx][:5], corrs[idx][:5]
print(top_cols, top_importances)

['Pclass' 'Fare' 'Parch' 'Age' 'SibSp'] [-0.33848104  0.25730652  0.08162941 -0.06980852 -0.0353225 ]


#### スピアマンの順位相関係数

In [5]:
corrs_sp = []
for c in train_x.columns:
    corr_sp = st.spearmanr(train_x[c], train_y).correlation
    corrs_sp.append(corr_sp)
corrs_sp = np.array(corrs_sp)

idx2 = np.argsort(np.abs(corrs_sp))[::-1]
top_cols2, top_importances2 = train_x.columns.values[idx2][:5], corrs_sp[idx2][:5]
print(top_cols2, top_importances2)

['Pclass' 'Fare' 'Parch' 'SibSp' 'Age'] [-0.33966794  0.32373614  0.13826563  0.08887948 -0.03910946]


#### カイ二乗統計量

In [6]:
x = MinMaxScaler().fit_transform(train_x)
c2, _ = chi2(x, train_y)

idx = np.argsort(c2)[::-1]
top_cols, top_importances = train_x.columns.values[idx][:5], corrs[idx][:5]
print(top_cols, top_importances)

['Pclass' 'Fare' 'Parch' 'SibSp' 'Age'] [-0.33848104  0.25730652  0.08162941 -0.0353225  -0.06980852]


#### 相互情報量

In [7]:
mi = mutual_info_classif(train_x, train_y)

idx = np.argsort(mi)[::-1]
top_cols, top_importances = train_x.columns.values[idx][:5], corrs[idx][:5]
print(top_cols, top_importances)

['Fare' 'SibSp' 'Pclass' 'Age' 'Parch'] [ 0.25730652 -0.0353225  -0.33848104 -0.06980852  0.08162941]


### 6.2.2　特徴量の重要度を用いる方法

#### ランダムフォレストの特徴量の重要度

In [8]:
clf = RandomForestClassifier(n_estimators=10, random_state=71)
clf.fit(train_x, train_y)
fi = clf.feature_importances_

idx = np.argsort(fi)[::-1]
top_cols, top_importances = train_x.columns.values[idx][:5], fi[idx][:5]
print(top_cols, top_importances)

['Fare' 'Age' 'Pclass' 'SibSp' 'Parch'] [0.39941057 0.39229804 0.100984   0.06360213 0.04370526]


#### GBDT の特徴量の重要度

In [9]:
dtrain = xgb.DMatrix(train_x, label=train_y)
params = {'objective': 'binary:logistic', 'random_state': 71}
num_round = 50
model = xgb.train(params, dtrain, num_round)

fscore = model.get_score(importance_type='total_gain')
fscore = sorted([(k, v) for k, v in fscore.items()], key=lambda tpl: tpl[1], reverse=True)
print('\n', fscore[:5])


 [('Fare', 505.8520609669997), ('Age', 352.15255410134), ('Pclass', 201.55440776099996), ('SibSp', 79.50124594699999), ('Parch', 47.05670062870999)]


基本的には `total_gain` を出力する。

### 6.2.3　反復して探索する方法

#### Greedy Foward Selection

In [None]:
best_score = 9999.0
selected = set([])

print('start greedy forward selection')

while True:
    
    if len(selected) == len(train_x.columns):
        break
        
    scores = []
    for feature in train_x.columns:
        if feature not in selected:
            fs = list(selected) + [feature]
            score = evaluate(fs)
            scores.append((feature, score))
            
    b_feature, b_score = sorted(scores, key=lambda tpl: tpl[1])[0]
    if b_score < best_score:
        selected.add(b_feature)
        best_score = b_score
        print(f'selected: {b_feature}')
        print(f'score: {b_score}')
    else:
        break

print(f'selected features: {selected}')

`sorted` のデフォルトは昇順である。

In [None]:
best_score = 9999.0
candidates = np.random.RandomState(71).permutation(train_x.columns)
selected = set([])

print('start simple selection')
for feature in candidates:
    fs = list(selected) * [feature]
    score = evaluate(fs)
    
    if score < best_score:
        selected.add(feature)
        best_scores = score
        print(f'selected: {feature}')
        print(f'score: {score}')
        
print(f'selected features: {selected}')

In [10]:
x, y = 2, 3
print(f'x + y = {x} + {y} = {x + y}')

x + y = 2 + 3 = 5
