In [1]:
import os
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import ensemble
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

%matplotlib inline

We will use the MNIST image dataset to compare two models: a decision tree and a random forest.

In [2]:
# Directory holding the MNIST CSVs (train.csv / test.csv).
# Override it with the MNIST_DATA_DIR environment variable to run the notebook
# on another machine; the default keeps the location used originally.
DATA_DIR = Path(os.environ.get('MNIST_DATA_DIR', 'C:/Users/vivek/Downloads'))

# train.csv carries a 'label' column (dropped later to build X);
# test.csv is passed to the PCA transform as-is, so it is expected to hold
# feature columns only.
df_train = pd.read_csv(DATA_DIR / 'train.csv')
df_test = pd.read_csv(DATA_DIR / 'test.csv')

## PCA

In [3]:
# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain that fraction of the variance (here 95%).
pca = PCA(n_components=0.95)

# Split features from the target. `columns=` is used instead of the positional
# axis argument (`drop('label', 1)`), which pandas deprecated in 1.3 and
# rejects with a TypeError from 2.0 onwards.
X = df_train.drop(columns='label')
Y = df_train['label']

# Learn the projection from the training features only.
pca.fit(X)
print(X.shape)
print(Y.shape)

(42000, 784)
(42000,)


In [4]:
# Project both splits with the PCA fitted on the training features:
# first the training features X, then the test frame.
df_train_pca, df_test_pca = map(pca.transform, (X, df_test))

In [5]:
# Show the shape of each projected split, one per line (train, then test).
print(df_train_pca.shape, df_test_pca.shape, sep='\n')

(42000, 154)
(28000, 154)


Here we used PCA retaining 95% of the variance, which reduced the number of features from 784 to 154. In other words, 154 principal components capture 95% of the variance contained in the original 784 pixel features.

In [6]:
# Single decision tree using information gain (entropy) as the split criterion.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split, so ties between equally good splits would otherwise be broken
# differently from run to run and the reported scores would not be reproducible.
decision_tree = tree.DecisionTreeClassifier(criterion='entropy', random_state=42)

## Decision Tree

In [7]:
# Time fit + predict on the PCA-reduced training set.
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the wall clock and can jump if the system time changes.
start = time.perf_counter()
decision_tree.fit(df_train_pca, Y)
# NOTE: predictions are made on the same rows the tree was fitted on, so the
# classification report and confusion matrix below are training-set metrics.
Y_pred = decision_tree.predict(df_train_pca)
print("--- %s seconds ---" % (time.perf_counter() - start))
print("Classification report: ")
print(classification_report(Y, Y_pred))
# 10-fold cross-validation gives one held-out score per fold; this is the
# figure to use when comparing models. It is not included in the timing above.
print("Cross Validation: ")
print(cross_val_score(decision_tree, df_train_pca, Y, cv=10))
print("Confusion Matrix: ")
print(confusion_matrix(Y, Y_pred))

--- 60.99534726142883 seconds ---
Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4132
           1       1.00      1.00      1.00      4684
           2       1.00      1.00      1.00      4177
           3       1.00      1.00      1.00      4351
           4       1.00      1.00      1.00      4072
           5       1.00      1.00      1.00      3795
           6       1.00      1.00      1.00      4137
           7       1.00      1.00      1.00      4401
           8       1.00      1.00      1.00      4063
           9       1.00      1.00      1.00      4188

   micro avg       1.00      1.00      1.00     42000
   macro avg       1.00      1.00      1.00     42000
weighted avg       1.00      1.00      1.00     42000

Cross Validation: 
[0.83238231 0.8320647  0.82746311 0.827184   0.81642857 0.82614908
 0.82019528 0.8208244  0.82411821 0.83031459]
Confusion Matrix: 
[[4132    0    0    0    0    0 

## Random Forest

In [8]:
# Random forest baseline with a single tree; time model creation + fit + predict.
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the wall clock and can jump if the system time changes.
start = time.perf_counter()
# Fixed seed: the forest draws random bootstrap samples and feature subsets, so
# without it the scores below would change on every run.
rfc = ensemble.RandomForestClassifier(n_estimators=1, random_state=42)
rfc.fit(df_train_pca, Y)
# NOTE: predictions are made on the same rows the forest was fitted on, so the
# classification report and confusion matrix below are training-set metrics.
Y_pred = rfc.predict(df_train_pca)
print("--- %s seconds ---" % (time.perf_counter() - start))
print("Classification report: ")
print(classification_report(Y, Y_pred))
# 10-fold cross-validation gives one held-out score per fold; this is the
# figure to use when comparing models. It is not included in the timing above.
print("Cross Validation: ")
print(cross_val_score(rfc, df_train_pca, Y, cv=10))
print("Confusion Matrix: ")
print(confusion_matrix(Y, Y_pred))

--- 1.598703145980835 seconds ---
Classification report: 
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      4132
           1       0.96      0.97      0.97      4684
           2       0.85      0.84      0.84      4177
           3       0.85      0.84      0.85      4351
           4       0.84      0.84      0.84      4072
           5       0.81      0.82      0.81      3795
           6       0.88      0.89      0.89      4137
           7       0.88      0.89      0.89      4401
           8       0.83      0.83      0.83      4063
           9       0.85      0.84      0.84      4188

   micro avg       0.87      0.87      0.87     42000
   macro avg       0.86      0.86      0.86     42000
weighted avg       0.87      0.87      0.87     42000

Cross Validation: 
[0.63623395 0.61988582 0.66039981 0.61318734 0.64261905 0.61205049
 0.63276971 0.58494162 0.59961868 0.62583413]
Confusion Matrix: 
[[3645   17   67   70   27   75 

A random forest with n_estimators=1 is not as accurate as the decision tree (cross-validated accuracy of roughly 0.62 versus 0.83), so let's see what happens as we increase n_estimators.

In [9]:
# Random forest with 10 trees; time model creation + fit + predict.
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the wall clock and can jump if the system time changes.
start = time.perf_counter()
# Fixed seed: the forest draws random bootstrap samples and feature subsets, so
# without it the scores below would change on every run.
rfc = ensemble.RandomForestClassifier(n_estimators=10, random_state=42)
rfc.fit(df_train_pca, Y)
# NOTE: predictions are made on the same rows the forest was fitted on, so the
# classification report and confusion matrix below are training-set metrics.
Y_pred = rfc.predict(df_train_pca)
print("--- %s seconds ---" % (time.perf_counter() - start))
print("Classification report: ")
print(classification_report(Y, Y_pred))
# 10-fold cross-validation gives one held-out score per fold; this is the
# figure to use when comparing models. It is not included in the timing above.
print("Cross Validation: ")
print(cross_val_score(rfc, df_train_pca, Y, cv=10))
print("Confusion Matrix: ")
print(confusion_matrix(Y, Y_pred))

--- 13.518974304199219 seconds ---
Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4132
           1       1.00      1.00      1.00      4684
           2       1.00      1.00      1.00      4177
           3       1.00      1.00      1.00      4351
           4       1.00      1.00      1.00      4072
           5       1.00      1.00      1.00      3795
           6       1.00      1.00      1.00      4137
           7       1.00      1.00      1.00      4401
           8       1.00      1.00      1.00      4063
           9       1.00      1.00      1.00      4188

   micro avg       1.00      1.00      1.00     42000
   macro avg       1.00      1.00      1.00     42000
weighted avg       1.00      1.00      1.00     42000

Cross Validation: 
[0.87779363 0.88510942 0.88505474 0.87360152 0.88309524 0.87878066
 0.87854251 0.87705504 0.88250715 0.88036225]
Confusion Matrix: 
[[4132    0    0    0    0    0

We see a major improvement in cross-validated accuracy (from roughly 0.62 to 0.88) when the number of estimators increases from 1 to 10.

In [10]:
# Random forest with 20 trees; time model creation + fit + predict.
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the wall clock and can jump if the system time changes.
start = time.perf_counter()
# Fixed seed: the forest draws random bootstrap samples and feature subsets, so
# without it the scores below would change on every run.
rfc = ensemble.RandomForestClassifier(n_estimators=20, random_state=42)
rfc.fit(df_train_pca, Y)
# NOTE: predictions are made on the same rows the forest was fitted on, so the
# classification report and confusion matrix below are training-set metrics.
Y_pred = rfc.predict(df_train_pca)
print("--- %s seconds ---" % (time.perf_counter() - start))
print("Classification report: ")
print(classification_report(Y, Y_pred))
# 10-fold cross-validation gives one held-out score per fold; this is the
# figure to use when comparing models. It is not included in the timing above.
print("Cross Validation: ")
print(cross_val_score(rfc, df_train_pca, Y, cv=10))
print("Confusion Matrix: ")
print(confusion_matrix(Y, Y_pred))

--- 10.456046104431152 seconds ---
Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4132
           1       1.00      1.00      1.00      4684
           2       1.00      1.00      1.00      4177
           3       1.00      1.00      1.00      4351
           4       1.00      1.00      1.00      4072
           5       1.00      1.00      1.00      3795
           6       1.00      1.00      1.00      4137
           7       1.00      1.00      1.00      4401
           8       1.00      1.00      1.00      4063
           9       1.00      1.00      1.00      4188

   micro avg       1.00      1.00      1.00     42000
   macro avg       1.00      1.00      1.00     42000
weighted avg       1.00      1.00      1.00     42000

Cross Validation: 
[0.91464574 0.91222645 0.91932413 0.92073316 0.91404762 0.91259824
 0.91021672 0.91398618 0.91825548 0.91301239]
Confusion Matrix: 
[[4132    0    0    0    0    0

In [11]:
# Random forest with 30 trees; time model creation + fit + predict.
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the wall clock and can jump if the system time changes.
start = time.perf_counter()
# Fixed seed: the forest draws random bootstrap samples and feature subsets, so
# without it the scores below would change on every run.
rfc = ensemble.RandomForestClassifier(n_estimators=30, random_state=42)
rfc.fit(df_train_pca, Y)
# NOTE: predictions are made on the same rows the forest was fitted on, so the
# classification report and confusion matrix below are training-set metrics.
Y_pred = rfc.predict(df_train_pca)
print("--- %s seconds ---" % (time.perf_counter() - start))
print("Classification report: ")
print(classification_report(Y, Y_pred))
# 10-fold cross-validation gives one held-out score per fold; this is the
# figure to use when comparing models. It is not included in the timing above.
print("Cross Validation: ")
print(cross_val_score(rfc, df_train_pca, Y, cv=10))
print("Confusion Matrix: ")
print(confusion_matrix(Y, Y_pred))

--- 15.481579303741455 seconds ---
Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4132
           1       1.00      1.00      1.00      4684
           2       1.00      1.00      1.00      4177
           3       1.00      1.00      1.00      4351
           4       1.00      1.00      1.00      4072
           5       1.00      1.00      1.00      3795
           6       1.00      1.00      1.00      4137
           7       1.00      1.00      1.00      4401
           8       1.00      1.00      1.00      4063
           9       1.00      1.00      1.00      4188

   micro avg       1.00      1.00      1.00     42000
   macro avg       1.00      1.00      1.00     42000
weighted avg       1.00      1.00      1.00     42000

Cross Validation: 
[0.92724679 0.92602284 0.92717753 0.92549393 0.92738095 0.92283877
 0.92783996 0.92041935 0.93231649 0.9292183 ]
Confusion Matrix: 
[[4132    0    0    0    0    0

In [12]:
# Random forest with 60 trees; time model creation + fit + predict.
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the wall clock and can jump if the system time changes.
start = time.perf_counter()
# Fixed seed: the forest draws random bootstrap samples and feature subsets, so
# without it the scores below would change on every run.
rfc = ensemble.RandomForestClassifier(n_estimators=60, random_state=42)
rfc.fit(df_train_pca, Y)
# NOTE: predictions are made on the same rows the forest was fitted on, so the
# classification report and confusion matrix below are training-set metrics.
Y_pred = rfc.predict(df_train_pca)
print("--- %s seconds ---" % (time.perf_counter() - start))
print("Classification report: ")
print(classification_report(Y, Y_pred))
# 10-fold cross-validation gives one held-out score per fold; this is the
# figure to use when comparing models. It is not included in the timing above.
print("Cross Validation: ")
print(cross_val_score(rfc, df_train_pca, Y, cv=10))
print("Confusion Matrix: ")
print(confusion_matrix(Y, Y_pred))

--- 30.78776717185974 seconds ---
Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4132
           1       1.00      1.00      1.00      4684
           2       1.00      1.00      1.00      4177
           3       1.00      1.00      1.00      4351
           4       1.00      1.00      1.00      4072
           5       1.00      1.00      1.00      3795
           6       1.00      1.00      1.00      4137
           7       1.00      1.00      1.00      4401
           8       1.00      1.00      1.00      4063
           9       1.00      1.00      1.00      4188

   micro avg       1.00      1.00      1.00     42000
   macro avg       1.00      1.00      1.00     42000
weighted avg       1.00      1.00      1.00     42000

Cross Validation: 
[0.93794579 0.93101808 0.93955259 0.93834801 0.94428571 0.93188854
 0.93117409 0.93614487 0.94518589 0.94327931]
Confusion Matrix: 
[[4132    0    0    0    0    0 

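Increasing the number of estimators makes the random forest more accurate than the decision tree: with 60 trees the cross-validated accuracy is about 0.94, compared with about 0.83 for the single decision tree.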

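In conclusion, on this dataset the random forest is both faster than the decision tree (about 31 seconds to fit and predict with 60 trees, versus about 61 seconds for the single tree) and more accurate under 10-fold cross-validation.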