# Boruta SHAP: A Tool for Feature Selection

In [221]:
pip install boruta




In [222]:
from boruta import BorutaPy

###### Note: Always give X and Y in arrays only - if you are using BorutaPy

In [223]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

In [224]:
#def load_data():
    # URLS for dataset via UCI
train_data_url='https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data'
train_label_url='https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels'
X_data = pd.read_csv(train_data_url, sep=" ", header=None)
y_data = pd.read_csv(train_label_url, sep=" ", header=None)

In [225]:
print(X_data.shape)
print(y_data.shape)

(2000, 501)
(2000, 1)


In [226]:
data = X_data.loc[:, :499]
data['target'] = y_data[0]

In [227]:
X = data.iloc[:, :-1]
y = data.iloc[:,-1]
data.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,target
0,485,477,537,479,452,471,491,476,475,473,...,481,477,485,511,485,481,479,475,496,-1
1,483,458,460,487,587,475,526,479,485,469,...,478,487,338,513,486,483,492,510,517,-1


In [228]:
# Splits the dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

###### Random Forest Classifier

In [229]:
rfc = RandomForestClassifier(n_jobs=-1, class_weight=None, max_depth=7, min_samples_split=8,random_state=0)

In [230]:
rfc_model = rfc.fit(X_train, y_train)

###### Accuracy Check

In [231]:
from sklearn.metrics import accuracy_score
rfc_model_predict =rfc_model.predict(X_test)
print('Random Forest Basic Accuracy : ' , accuracy_score(y_test, rfc_model_predict))

Random Forest Basic Accuracy :  0.69


###### Top 10 Feature Importance based on Random Forest Classifer - 

In [232]:
Col = X.columns
rfc.feature_importances_
col_imp = pd.DataFrame(Col, columns = ['Column Name'])
col_imp['Importance'] = rfc.feature_importances_

In [233]:
col_imp.sort_values('Importance', ascending=False).head(30)

Unnamed: 0,Column Name,Importance
475,475,0.030027
338,338,0.024086
241,241,0.020665
105,105,0.017017
442,442,0.015837
64,64,0.015315
336,336,0.014541
128,128,0.013961
472,472,0.012751
48,48,0.01012


###### Boruta

###### Always give X and Y in Boruta model as in Array

In [234]:
# Define Boruta feature selection method
from boruta import BorutaPy
import numpy as np
boruta1 = BorutaPy(estimator = rfc, n_estimators = 'auto')
boruta1.fit(np.array(X_train), np.array(y_train))

BorutaPy(estimator=RandomForestClassifier(max_depth=7, min_samples_split=8,
                                          n_estimators=92, n_jobs=-1,
                                          random_state=RandomState(MT19937) at 0x24892928540),
         n_estimators='auto',
         random_state=RandomState(MT19937) at 0x24892928540)

###### Boruta gives you feature importance in three categories

In [235]:
print(boruta1.get_params)

<bound method BaseEstimator.get_params of BorutaPy(estimator=RandomForestClassifier(max_depth=7, min_samples_split=8,
                                          n_estimators=92, n_jobs=-1,
                                          random_state=RandomState(MT19937) at 0x24892928540),
         n_estimators='auto',
         random_state=RandomState(MT19937) at 0x24892928540)>


In [236]:
# Important columns by boturo
important_col_by_boruto = list(X_train.columns[boruta1.support_])
important_col_by_boruto

[4,
 28,
 48,
 64,
 105,
 128,
 153,
 241,
 281,
 318,
 336,
 338,
 378,
 433,
 442,
 451,
 453,
 472,
 475,
 493]

In [237]:
# tentative columns by boruto
tentative_column = list(X.columns[[boruta1.support_weak_]])
tentative_column

  result = getitem(key)


[]

In [238]:
# Adding a tentative columns to important columns
important_col_by_boruto.append(334)

In [239]:
df_imp_col_boturo = pd.DataFrame(important_col_by_boruto)
df_imp_col_boturo.shape[0]

21

In [240]:
# All columns RFC in list
rfc_imp_col = list(imp_col['Column Name'])

In [241]:
imp_col = col_imp.sort_values('Importance', ascending=False).head(df_imp_col_boturo.shape[0])

In [242]:
df_imp_columns = pd.concat([imp_col, df_imp_col_boturo.set_index(imp_col.index)], axis=1)
df_imp_columns

Unnamed: 0,Column Name,Importance,0
475,475,0.030027,4
338,338,0.024086,28
241,241,0.020665,48
105,105,0.017017,64
442,442,0.015837,105
64,64,0.015315,128
336,336,0.014541,153
128,128,0.013961,241
472,472,0.012751,281
48,48,0.01012,318


In [243]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [245]:
X_train_boruto_col = X_train[important_col_by_boruto]
X_train_RFC_col = X_train[rfc_imp_col]

In [246]:
X_train_boruto_col = X_train[important_col_by_boruto]
X_test_boruto_col = X_test[important_col_by_boruto]

X_train_rfc_col=X_train[rfc_imp_col]
X_test_rfc_col = X_test[rfc_imp_col]

###### Training Model on RFC and Boruto Important Columns

In [247]:
X_train_boruto_col.shape

(1600, 21)

In [248]:
rfc_boruto = RandomForestClassifier(n_jobs=-1, class_weight=None, max_depth=7, min_samples_split=8,random_state=0)
rfc_rfc_col = RandomForestClassifier(n_jobs=-1, class_weight=None, max_depth=7, min_samples_split=8,random_state=0)

In [254]:
rfc_rfc_model = rfc_rfc_col.fit(X_train_rfc_col,y_train)
rfc_rfc_model_predict = rfc_rfc_model.predict(X_test_rfc_col)
print('Accuracy with RFC Imp Columns  : ', accuracy_score(y_test, rfc_rfc_model_predict))

Accuracy with RFC Imp Columns  :  0.875


In [255]:
print(rfc_rfc_model.set_params)

<bound method BaseEstimator.set_params of RandomForestClassifier(max_depth=7, min_samples_split=8, n_jobs=-1,
                       random_state=0)>


In [252]:
rfc_boruto_model = rfc_boruto.fit(X_train_boruto_col,y_train)
rfc_boruto_model_predict = rfc_boruto_model.predict(X_test_boruto_col)
print('Accuracy with Boruto Imp Columns  : ', accuracy_score(y_test, rfc_boruto_model_predict))

Accuracy with Boruto Imp Columns  :  0.8775


In [257]:
print(rfc_boruto_model.set_params)

<bound method BaseEstimator.set_params of RandomForestClassifier(max_depth=7, min_samples_split=8, n_jobs=-1,
                       random_state=0)>
