## Default feature importance (random forest `feature_importances_`)

In [1]:
# libraries
from sklearn.linear_model import LogisticRegression
import pandas as pd
from io import StringIO
from sklearn.impute import SimpleImputer
import numpy as np
import copy
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone
from itertools import combinations
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [2]:
# Load the UCI wine data. The file has no header row, so the 14 column names
# (class label + 13 features) are supplied while reading.
df_wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
    names=[
        'Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
        'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
        'Proanthocyanins', 'Color intensity', 'Hue',
        'OD280/OD315 of diluted wines', 'Proline',
    ],
)

# First column is the target (y); every remaining column is a feature (X)
y = df_wine.iloc[:, 0].values
X = df_wine.iloc[:, 1:].values

# Hold out 20% of the rows for testing, stratified on the class label
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# Standardization: fit the scaler on the training split only, then apply it to both splits
stdsc = StandardScaler()
stdsc.fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

### evaluate each feature

In [None]:
# Rank the wine features by the importances reported by a random forest.
feat_labels = df_wine.columns[1:]              # feature names: every column except the class label

# 500 trees; oob_score=True makes forest.oob_score_ available after fitting
forest = RandomForestClassifier(n_estimators=500, random_state=1, oob_score=True)
forest.fit(X_train, y_train)                   # fitted on the unscaled training features

importances = forest.feature_importances_      # one importance value per feature, in column order
indices = np.argsort(importances)[::-1]        # feature positions, most important first

# Print "rank) name importance", with the name left-aligned in a 30-character field
for rank, idx in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, feat_labels[idx], importances[idx]))

# Label fixed: the value printed is the out-of-bag (OOB) score
print("\noob score: ", forest.oob_score_, "\n")

# Optional bar chart of the ranked importances (left disabled)
# plt.title('Feature Importance')
# plt.bar(range(X_train.shape[1]), importances[indices], align='center')
# plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
# plt.xlim([-1, X_train.shape[1]])
# plt.tight_layout()
# plt.show()

### set threshold to select features

In [4]:
# Select features using the already-fitted forest (prefit=True, so no refit) and a 0.1 threshold
sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
X_selected = sfm.transform(X_train)
n_selected = X_selected.shape[1]
print('Number of features that meet this threshold', 'criterion:', n_selected)

# List the first `n_selected` entries of the importance ranking (`indices`) computed earlier
for rank, idx in enumerate(indices[:n_selected], start=1):
    print("%2d) %-*s %f" % (rank, 30, feat_labels[idx], importances[idx]))

Number of features that meet this threshold criterion: 5
 1) Proline                        0.188217
 2) Flavanoids                     0.151884
 3) Color intensity                0.150182
 4) Alcohol                        0.137097
 5) OD280/OD315 of diluted wines   0.126218


## Permutation feature importance

In [5]:
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import RandomForestRegressor

In [6]:
# Boston housing data: features as a DataFrame (X), target values as y.
# NOTE: this rebinds X, y, X_train and y_train, replacing the wine-data variables from the first section.
# NOTE(review): load_boston is believed to be removed in recent scikit-learn releases —
# confirm the installed version still provides it.
boston = load_boston()
X = pd.DataFrame(boston.data, columns=boston.feature_names)
y = boston.target

# Append a seeded column of uniform random numbers
# (presumably a noise baseline to compare real features against — confirm intent)
np.random.seed(1)
X['random'] = np.random.random(size=X.shape[0])

# test_size=0.8 sends 80% of the rows to the validation split
# NOTE(review): confirm that training on only 20% is intended
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, random_state=1, test_size=0.8
)

In [None]:
# Fit a 100-tree regression forest with out-of-bag scoring enabled, using all available cores
rf = RandomForestRegressor(
    n_estimators=100,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=1,
)
rf.fit(X_train, y_train)

# Report the score on the training split, the out-of-bag score, and the score on the validation split
train_r2 = rf.score(X_train, y_train)
valid_r2 = rf.score(X_valid, y_valid)
report = 'R^2 Training Score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'
print(report.format(train_r2, rf.oob_score_, valid_r2))

In [None]:
pip install rfpimp

In [None]:
from sklearn.metrics import r2_score
from rfpimp import permutation_importances

def r2(rf, X_train, y_train):
    """Return the R^2 score of a fitted model's predictions.

    Parameters
    ----------
    rf : fitted estimator exposing ``predict``
    X_train : feature data passed to ``rf.predict``
    y_train : true target values compared against the predictions

    Returns
    -------
    The value of ``r2_score(y_train, rf.predict(X_train))``.
    """
    predicted = rf.predict(X_train)
    return r2_score(y_train, predicted)


# Permutation importances of `rf`, computed on the training data with `r2` as the metric
perm_imp_rfpimp = permutation_importances(rf, X_train, y_train, r2)

In [11]:
# Display the 'Importance' column of the permutation-importance result
perm_imp_rfpimp['Importance']

Feature
RM         0.917140
LSTAT      0.160363
DIS        0.099580
CRIM       0.020346
PTRATIO    0.019622
AGE        0.015224
B          0.008256
random     0.007056
TAX        0.006548
INDUS      0.004777
NOX        0.003766
RAD        0.002489
ZN         0.001276
CHAS       0.000052
Name: Importance, dtype: float64