# 学習済みモデルから説明変数の重要度を出力するサンプル

In [1]:

# Import data and modules
import pandas as pd
import numpy as np
from sklearn import datasets
import warnings
warnings.filterwarnings('ignore')
%pylab inline
pylab.rcParams['figure.figsize'] = (10, 6)

iris = datasets.load_iris()


Populating the interactive namespace from numpy and matplotlib


In [2]:
print ("特徴量名：",iris.feature_names)
print ("目的クラス名:", iris.target_names)

特徴量名： ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
目的クラス名: ['setosa' 'versicolor' 'virginica']


In [3]:
iris_features = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_target = pd.DataFrame(iris.target, columns=["classes"])

## 多値分類後に特徴量の重要度を抽出

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(iris_features, iris_target, test_size=.3, random_state=0)

print('訓練データ数 {}, テストデータ数 {}'.format(
X_train.shape[0], X_test.shape[0]))
print()

訓練データ数 105, テストデータ数 45



StandardScalerにより平均0, 分散1の値に変換

In [5]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

sc.fit(X_train)

X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

print('特徴量の変換を確認, 分類タスクのため目的変数は正規化しない:\n')
print(pd.DataFrame(X_train_std, columns=iris_features.columns).head())

特徴量の変換を確認, 分類タスクのため目的変数は正規化しない:

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0          -1.023664         -2.378463          -0.182950         -0.291459
1           0.695175         -0.101903           0.930661          0.737219
2           0.924353          0.581065           1.042022          1.637313
3           0.122229         -1.923151           0.652258          0.351465
4           0.924353         -1.240183           1.097702          0.737219


多値分類が可能な手法, LogisticRegressionとRandomForestClassifierを適用

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [7]:
from sklearn.model_selection import GridSearchCV
params = {'n_estimators':list(range(5,90,10))+[100] ,'min_samples_split': list(range(2,11))}
grid = GridSearchCV(RandomForestClassifier(), params, cv=5)
search_result = grid.fit(X_train_std, y_train)
best = search_result.best_estimator_
print ("ベストモデル:", best)

ベストモデル: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [8]:
from sklearn import metrics

# テストデータを予測
y_pred = best.predict(X_test_std)

# "精度"の計算
print("Random Forestでの精度:", metrics.accuracy_score(y_test,y_pred))

Random Forestでの精度: 0.9777777777777777


In [9]:
random_forest_importance=pd.DataFrame(columns=iris_features.columns)
random_forest_importance.loc[0] = best.feature_importances_
print ('Random Forestの特徴量:\n', random_forest_importance)

Random Forestの特徴量:
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0           0.088016          0.011666           0.417301          0.483017


LogisticRegressionでの特徴量抽出

In [10]:
from sklearn.model_selection import GridSearchCV
params_lg = {'C':[10**i for i in range(-3, 2)]}
grid_lg = GridSearchCV(LogisticRegression(penalty='l2'), params_lg, cv=5)
search_result_lg = grid_lg.fit(X_train_std, y_train)
best_lg = search_result_lg.best_estimator_
print ("ベストモデル:", best_lg)

ベストモデル: LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [11]:
from sklearn import metrics

# テストデータを予測
y_pred = best_lg.predict(X_test_std)
#y_pred_p = best_lg.predict_proba(X_test_std)

# "精度"の計算
print("LogisticRegressionでの精度:", metrics.accuracy_score(y_test,y_pred))

LogisticRegressionでの精度: 0.9555555555555556


In [12]:
lg_importance=pd.DataFrame(columns=iris_features.columns)
for i in range(len(best_lg.coef_)):
    lg_importance.loc[i+1] = best_lg.coef_[i]
print ('Logistic Regressionの特徴量(One vs Rest):\n', lg_importance)

Logistic Regressionの特徴量(One vs Rest):
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
1          -1.010750          1.850233          -2.685118         -2.382632
2          -0.143393         -1.129107           1.866484         -1.787549
3          -0.802724         -0.741070           4.914449          4.536193
