# Random Forest

In [2]:
%matplotlib inline

import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

In [3]:
glass = pd.read_csv("glassClass.csv")

glass.head(7)

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1
5,1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.0,0.26,1
6,1.51743,13.3,3.6,1.14,73.09,0.58,8.17,0.0,0.0,1


In [15]:
Y = glass["Type"]  # target

X = glass.drop("Type", axis=1)  # predictors

type(X)

pandas.core.frame.DataFrame

## Training & Testing Data

We're allowed to pass a DataFrame to `train_test_split`.

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = .2, random_state=25)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((171, 9), (43, 9), (171,), (43,))

In [6]:
random_forest = RandomForestClassifier(n_estimators=100)

random_forest.fit(X_train, y_train)

RandomForestClassifier()

### 100% accuracy when we implement the RF model on the data set

In [7]:
random_forest.score(X_train, y_train)

1.0

### Predict the classification of glass based on test predictors

In [8]:
Y_pred = random_forest.predict(X_test) 

Y_pred  # predicted values of classifications

array([2, 3, 2, 1, 5, 5, 2, 1, 1, 7, 2, 6, 5, 1, 2, 6, 2, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 7, 2, 1, 3, 7, 1, 7, 1, 7, 2, 1, 1, 2, 2, 7])

###  How confident is the classifier about each glass type?


The `predict_proba` method returns the class probabilities for each data point.

The method accepts a single argument that corresponds to the data over which the probabilities will be computed.

It returns an array of lists containing the class probabilities for the input data points.

In [9]:
random_forest.predict_proba(X_test)[0:10]

array([[0.34, 0.41, 0.22, 0.01, 0.02, 0.  ],
       [0.41, 0.14, 0.45, 0.  , 0.  , 0.  ],
       [0.24, 0.42, 0.27, 0.06, 0.01, 0.  ],
       [0.9 , 0.06, 0.04, 0.  , 0.  , 0.  ],
       [0.03, 0.09, 0.01, 0.82, 0.05, 0.  ],
       [0.  , 0.27, 0.01, 0.53, 0.17, 0.02],
       [0.07, 0.89, 0.04, 0.  , 0.  , 0.  ],
       [0.38, 0.3 , 0.3 , 0.02, 0.  , 0.  ],
       [0.34, 0.32, 0.1 , 0.02, 0.19, 0.03],
       [0.01, 0.01, 0.02, 0.  , 0.08, 0.88]])

## Confusion matrix

In [10]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(y_test, Y_pred)
confusion_matrix

array([[ 9,  3,  0,  0,  0,  0],
       [ 0, 17,  1,  1,  0,  0],
       [ 0,  0,  1,  0,  0,  0],
       [ 0,  0,  0,  2,  0,  0],
       [ 1,  0,  0,  0,  2,  0],
       [ 0,  0,  0,  0,  0,  6]])

## Score!

Scores above .8 are generally considered good agreement.

In [11]:
from sklearn.metrics import accuracy_score

# compare with the actual y values, y_test (hold outs) with predicted y
accuracy_score(y_test, Y_pred)


0.8604651162790697

In [12]:
from sklearn.metrics import cohen_kappa_score

cohen_kappa_score(y_test, Y_pred)


0.8013856812933026

## Classification report

In [13]:
from sklearn.metrics import classification_report

report = classification_report(y_test, Y_pred)

print("\nClassification Report:\n", report)


Classification Report:
               precision    recall  f1-score   support

           1       0.90      0.75      0.82        12
           2       0.85      0.89      0.87        19
           3       0.50      1.00      0.67         1
           5       0.67      1.00      0.80         2
           6       1.00      0.67      0.80         3
           7       1.00      1.00      1.00         6

    accuracy                           0.86        43
   macro avg       0.82      0.89      0.83        43
weighted avg       0.88      0.86      0.86        43



## Feature selection

Recursive Feature Elimination **(RFE)** method is a feature selection approach.

It works by recursively removing attributes and building a model on those attributes that remain.

It uses the model accuracy to identify which attributes (and combination of attributes) contribute the most to predicting the target attribute.

In [16]:
from sklearn.feature_selection import RFE

# create the RFE model and select 5 attributes
rfe = RFE(random_forest)  #, 5)

fit = rfe.fit(X, Y)

## Summarize the selection of attributes

In [19]:
print(f"\nNum Features: {fit.n_features_}")

print(f"\nSelected Features: {fit.support_}")

print(f"\nFeature Ranking: {fit.ranking_}")



Num Features: 4

Selected Features: [ True False  True  True False False  True False False]

Feature Ranking: [1 2 1 1 3 4 1 5 6]


## Extra Trees Classifier

In [21]:
from sklearn.ensemble import ExtraTreesClassifier

forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)

forest.fit(X, Y)

importances = forest.feature_importances_
importances

array([0.13810183, 0.09400053, 0.17674265, 0.13515332, 0.08829422,
       0.10027392, 0.13206484, 0.08173555, 0.05363313])

In [22]:
indices = np.argsort(importances)[::-1]

indices

array([2, 0, 3, 6, 5, 1, 4, 7, 8])

In [26]:
X.shape[0], X.shape[1]

(214, 9)

In [23]:
print("\nFeature ranking:\n")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))


Feature ranking:

1. feature 2 (0.176743)
2. feature 0 (0.138102)
3. feature 3 (0.135153)
4. feature 6 (0.132065)
5. feature 5 (0.100274)
6. feature 1 (0.094001)
7. feature 4 (0.088294)
8. feature 7 (0.081736)
9. feature 8 (0.053633)


In [29]:
from sklearn.model_selection import KFold
from sklearn import model_selection

kfold = model_selection.KFold(n_splits=10)

kfold

KFold(n_splits=10, random_state=None, shuffle=False)

In [30]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(random_forest, X, Y, cv=5)

print("\nScores:\n", scores)


Scores:
 [0.74418605 0.74418605 0.62790698 0.62790698 0.78571429]
