In [1]:
## toy dataset for classification of patient being diabetic, using Random Forest

In [2]:
import numpy as np
import pandas as pd

In [3]:
orig = pd.read_csv('pima-indians-diabetes.csv')

print(orig.shape)
orig.head()

(768, 9)


Unnamed: 0,Pregnant,Glucose,BloodP,Triceps,Insulin,BMI,DPF,Age,Diabetic
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
df = orig.copy()

## Normalise values

In [5]:
from sklearn.preprocessing import MinMaxScaler

In [6]:
target = df.Diabetic

features = df.iloc[:, :-1].values

print(target.shape)
print(features.shape)
print(features[0])

(768,)
(768, 8)
[  6.    148.     72.     35.      0.     33.6     0.627  50.   ]


In [7]:
np.set_printoptions(precision=2, suppress=True)

mms = MinMaxScaler(feature_range=(0, 1))

features_scaled = mms.fit_transform(features)

features_scaled[0]

array([0.35, 0.74, 0.59, 0.35, 0.  , 0.5 , 0.23, 0.48])

## Feature selection

In [10]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

In [56]:
# using chi square for feature selection
mkbest = SelectKBest(chi2, k='all').fit_transform(features_scaled, target)

print(mkbest.shape)
mkbest[:2]

(768, 8)


array([[0.35, 0.74, 0.59, 0.35, 0.  , 0.5 , 0.23, 0.48],
       [0.06, 0.43, 0.54, 0.29, 0.  , 0.4 , 0.12, 0.17]])

## Train Test Split

In [57]:
from sklearn.model_selection import train_test_split

In [58]:
x_train, x_test, y_train, y_test = train_test_split(mkbest, target,
                                                    test_size=0.20, random_state=0)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(614, 8) (154, 8) (614,) (154,)


## Random Forest Algo Round 1

In [59]:
from sklearn.ensemble import RandomForestClassifier

In [60]:
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [61]:
print("Random Forest training mean accuracy: %.4f" % rfc.score(x_train, y_train))
print("Random Forest test mean accuracy: %.4f" % rfc.score(x_test, y_test))

Random Forest training mean accuracy: 1.0000
Random Forest test mean accuracy: 0.8117


## Check feature importance

In [62]:
feature_importances_df = pd.DataFrame(
    {"feature": list(df.columns[:-1]), "importance": rfc.feature_importances_}
).sort_values("importance", ascending=False)

# Display
feature_importances_df

Unnamed: 0,feature,importance
1,Glucose,0.247788
5,BMI,0.17037
7,Age,0.143263
6,DPF,0.126667
2,BloodP,0.089869
0,Pregnant,0.082775
4,Insulin,0.070723
3,Triceps,0.068544


In [45]:
mkbest.shape

(768, 8)

In [48]:
df.columns[:-1]

Index(['Pregnant', 'Glucose', 'BloodP', 'Triceps', 'Insulin', 'BMI', 'DPF',
       'Age'],
      dtype='object')

## Extra: standard Scaler

In [63]:
from sklearn.preprocessing import StandardScaler

In [64]:
scaler = StandardScaler()

features_ss = df.iloc[:, :-1]

features_ss = scaler.fit_transform(features_ss)

features_ss[0]

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


array([ 0.64,  0.85,  0.15,  0.91, -0.69,  0.2 ,  0.47,  1.43])

In [66]:
## Referenced tutorial:
## https://www.freecodecamp.org/news/how-to-use-the-tree-based-algorithm-for-machine-learning/ 