In [1]:
## toy dataset for classification of patient being diabetic
## uses SelectFromModel + Random Forest

In [2]:
import numpy as np
import pandas as pd

In [3]:
orig = pd.read_csv('pima-indians-diabetes.csv')

print(orig.shape)
orig.head()

(768, 9)


Unnamed: 0,Pregnant,Glucose,BloodP,Triceps,Insulin,BMI,DPF,Age,Diabetic
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [5]:
## Note:
## Data normalisation is not required for Random Forest
## reference url: https://stackoverflow.com/questions/8961586/do-i-need-to-normalize-or-scale-data-for-randomforest-r-package#:~:text=No%2C%20scaling%20is%20not%20necessary,%2C%20aren't%20so%20important.

In [6]:
target = orig.Diabetic

features = orig.iloc[:, :-1]

x_train, x_test, y_train, y_test = train_test_split(features, target,
                                                    test_size=0.20, random_state=0)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(614, 8) (154, 8) (614,) (154,)


## sklearn SelectFromModel

In [7]:
sel = SelectFromModel(RandomForestClassifier(n_estimators = 300))
sel.fit(x_train, y_train)

sel.get_support()

array([False,  True, False, False, False,  True,  True,  True])

In [8]:
##  features whose importance is greater than the mean importance 
print(x_train.columns[(sel.get_support())])

##  features whose importance is NOT greater than the mean importance 
print(x_train.columns[~(sel.get_support())])

Index(['Glucose', 'BMI', 'DPF', 'Age'], dtype='object')
Index(['Pregnant', 'BloodP', 'Triceps', 'Insulin'], dtype='object')


In [9]:
x_train_sel = x_train.iloc[:, sel.get_support()]
x_test_sel = x_test.iloc[:, sel.get_support()]

print(x_train_sel.shape, x_test_sel.shape)

(614, 4) (154, 4)


## Random Forest

In [10]:
rfc = RandomForestClassifier(n_estimators=300)
rfc.fit(x_train_sel, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [11]:
print("Random Forest training mean accuracy: %.4f" % rfc.score(x_train_sel, y_train))
print("Random Forest test mean accuracy: %.4f" % rfc.score(x_test_sel, y_test))

Random Forest training mean accuracy: 1.0000
Random Forest test mean accuracy: 0.7792


In [None]:
## Results:
## No signigicant improvement relative to script 001
## Random forest is still overfitting on training data (accuracy difference between train and test)
## SelectFromModel is useful and is an alternative to sklearn.feature_selection.SelectKBest
## SelectFromModel does not require specifying K

In [12]:
## Referenced tutorial:
## https://towardsdatascience.com/feature-selection-using-random-forest-26d7b747597f