In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [16]:
from sklearn.datasets import load_boston

from sklearn.tree import plot_tree, DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Load data

In [7]:
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on older releases — confirm/pin the scikit-learn version.
data = load_boston()

In [11]:
# Feature matrix and regression target taken from the loaded dataset.
X, y = data['data'], data['target']

In [12]:
# Display the dataset's feature names; the importance arrays computed later are
# presumably in this same column order — confirm.
data['feature_names']

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train: Decision Tree

In [128]:
# Fit a small regression tree (capped at 7 leaves) and predict on the held-out split.
model_dt = DecisionTreeRegressor(max_leaf_nodes=7, random_state=42).fit(X_train, y_train)
pred_dt = model_dt.predict(X_test)

# Print the mean absolute error first, then R^2 — one value per line.
for metric in (mean_absolute_error, r2_score):
    print(metric(y_test, pred_dt))

2.9295239552232513
0.8097063368140078


## feature importance with sklearn instance

In [86]:
# Reference importances reported by scikit-learn for the fitted decision tree;
# the manual computation further down is checked against these values.
# NOTE(review): the saved output has 13 non-zero entries, but a tree limited to
# 7 leaves has at most 6 splits — the output appears to predate the current
# model; re-run the notebook top to bottom to refresh it.
model_dt.feature_importances_

array([5.12956739e-02, 3.35270585e-03, 5.81619171e-03, 2.27940651e-06,
       2.71483790e-02, 6.00326256e-01, 1.36170630e-02, 7.06881622e-02,
       1.94062297e-03, 1.24638653e-02, 1.10116089e-02, 9.00872742e-03,
       1.93328464e-01])

## feature importance with my own code

In [156]:
# plt.figure(figsize=(16,12))
# plot_tree(model_dt, node_ids=True)
# plt.show()

In [157]:
# Manual re-computation of the tree's feature importances from its node arrays.
tree_dt = model_dt.tree_
node_weight = tree_dt.weighted_n_node_samples
node_impurity = tree_dt.impurity
n = node_weight[0]  # weighted sample count at node 0 (the root)

# Nodes with a negative feature index do not split on any feature and are skipped.
split_nodes = np.flatnonzero(tree_dt.feature >= 0)
left_child = tree_dt.children_left[split_nodes]
right_child = tree_dt.children_right[split_nodes]

# Weighted impurity decrease of every split node t with children tl / tr:
#   (n_t / n) * (I_t - (n_tl / n_t) * I_tl - (n_tr / n_t) * I_tr)
w_t = node_weight[split_nodes]
decrease = (w_t / n) * (
    node_impurity[split_nodes]
    - (node_weight[left_child] / w_t) * node_impurity[left_child]
    - (node_weight[right_child] / w_t) * node_impurity[right_child]
)

# Add each split's decrease onto the feature it splits on; np.add.at accumulates
# correctly (and in node order) when one feature is used by several nodes.
fi = np.zeros(tree_dt.n_features)
np.add.at(fi, tree_dt.feature[split_nodes], decrease)

# Rescale so the importances sum to 1.
fi_norm = fi / fi.sum()

In [158]:
# Sanity check: the manually computed, normalized importances should match
# scikit-learn's feature_importances_ up to floating-point tolerance.
np.allclose(fi_norm, model_dt.feature_importances_)

True

# Train: Random Forest

In [144]:
# Fit a bootstrapped random forest (seeded for reproducibility) and predict on
# the held-out split.
model_rf = RandomForestRegressor(bootstrap=True, random_state=42).fit(X_train, y_train)
pred_rf = model_rf.predict(X_test)

# Print the mean absolute error first, then R^2 — one value per line.
for metric in (mean_absolute_error, r2_score):
    print(metric(y_test, pred_rf))

2.0395392156862746
0.8922527442109116


## feature importance with sklearn instance

In [47]:
# Reference importances reported by scikit-learn for the fitted forest; the
# manual computation further down is checked against these values.
model_rf.feature_importances_

array([0.03806177, 0.00175615, 0.00795268, 0.00100426, 0.01554377,
       0.50384493, 0.01383994, 0.06054907, 0.00381091, 0.01566064,
       0.01631341, 0.01215362, 0.30950883])

## feature importance with my own code

In [43]:
# plot_tree(model_dt)
# plt.show()

In [148]:
def _tree_impurity_importances(tree):
    """Return the normalized impurity-based feature importances of one tree.

    For every split node ``t`` with children ``tl`` and ``tr`` the weighted
    impurity decrease

        (n_t / n) * (I_t - (n_tl / n_t) * I_tl - (n_tr / n_t) * I_tr)

    is added to the feature that node splits on, where ``n`` is the weighted
    sample count at the root (node 0).

    Parameters
    ----------
    tree : object
        Low-level tree structure (``estimator.tree_``) exposing ``feature``,
        ``impurity``, ``weighted_n_node_samples``, ``children_left``,
        ``children_right`` and ``n_features``.

    Returns
    -------
    numpy.ndarray
        Array of length ``tree.n_features`` that sums to 1, or all zeros when
        the tree contains no split (avoids a 0/0 division producing NaN).
    """
    weight = tree.weighted_n_node_samples
    impurity = tree.impurity
    root_weight = weight[0]
    importances = np.zeros(tree.n_features)

    for node, feat in enumerate(tree.feature):
        # A negative feature index marks a node that does not split (a leaf).
        if feat < 0:
            continue
        left = tree.children_left[node]
        right = tree.children_right[node]
        importances[feat] += (weight[node] / root_weight) * (
            impurity[node]
            - (weight[left] / weight[node]) * impurity[left]
            - (weight[right] / weight[node]) * impurity[right]
        )

    total = importances.sum()
    # Only normalize when there is something to normalize; a split-free tree
    # would otherwise contribute NaN to the forest total.
    return importances / total if total > 0 else importances


# Sum of the per-tree normalized importances; divide by the number of trees to
# get the forest-level average. The feature count is read from the first fitted
# tree instead of `model_rf.n_features_`, which newer scikit-learn releases no
# longer provide.
fi_rf = np.zeros(model_rf.estimators_[0].tree_.n_features)

for estimator in model_rf.estimators_:
    fi_rf += _tree_impurity_importances(estimator.tree_)

In [159]:
# Sanity check: fi_rf accumulates one normalized importance vector per tree, so
# dividing by the number of trees gives their mean, which should match
# scikit-learn's forest-level feature_importances_ up to floating-point tolerance.
np.allclose(model_rf.feature_importances_, fi_rf/model_rf.n_estimators)

True