# MNIST Random Forest Classification
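We use a patch-based random forest (RF) approach to classify the MNIST database. This notebook loads the MNIST data, extracts eigen-patch features from each image, trains an RF classifier on them, and reports the classification performance.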

First, import the relevant libraries and functions.

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.utils import check_random_state
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import image
from sklearn.decomposition import PCA
import numpy as np

Load the MNIST dataset and randomly permute it. Note that older versions of scikit-learn can fail with JSON errors when loading this dataset; the workaround is to pass `cache=False` to `fetch_openml()`, see [issue 382](https://github.com/skorch-dev/skorch/issues/382).

In [None]:
# Load data from https://www.openml.org/d/554
# as_frame=False forces NumPy arrays: newer scikit-learn versions return a
# pandas DataFrame by default, which breaks the positional indexing
# (X[permutation]) and the reshape below.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
X = X / 255.  # normalise grayscale values
print(X.shape)

# Shuffle images and labels with the same seeded permutation.
random_state = check_random_state(0)
permutation = random_state.permutation(X.shape[0])
X = X[permutation]
y = y[permutation]
# Restore the 2-D image layout so patches can be extracted later.
X = X.reshape((X.shape[0], 28, 28))
print(X.shape)

train_samples = 50000
test_samples = 10000
# Reuse the seeded generator so the train/test split is reproducible on a
# fresh "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=train_samples, test_size=test_samples,
    random_state=random_state)

(70000, 784)
(70000, 28, 28)


Extract patches from each training image and apply PCA to them, so that each image is summarised by the leading principal components (eigen-patches) of its own patches.

In [None]:
# Feature-extraction settings: number of principal axes kept per image and the
# size of the patches extracted from each image.
n_components = 12
patch_size = (7, 7)

# For every training image: extract all patches, flatten each patch to a
# vector, fit a PCA on those vectors and keep the principal axes
# (eigen-patches), flattened into a single feature row.
feature_rows = []
label_list = []
for sample_idx in range(train_samples):
    sample_patches = image.extract_patches_2d(X_train[sample_idx], patch_size)
    sample_patches = sample_patches.reshape((len(sample_patches), -1))
    basis = PCA(n_components=n_components).fit(sample_patches).components_
    feature_rows.append(basis.ravel())
    label_list.append(y_train[sample_idx])

# One row of concatenated eigen-patches per training image.
training_eigenpatches = np.array(feature_rows)
training_labels = np.array(label_list)
print("Training patches shape:", training_eigenpatches.shape)
print("training_labels:", training_labels)

Training patches shape: (50000, 588)
training_labels: ['8' '9' '6' ... '9' '0' '9']


Then train a random forest (RF) classifier on the eigen-patch features.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest hyper-parameters: 150 trees with unlimited depth; the fixed
# random_state seeds the forest's randomness.
forest_settings = {"n_estimators": 150, "max_depth": None, "random_state": 0}
clf = RandomForestClassifier(**forest_settings)
# Fit on the per-image eigen-patch features; the fitted estimator is the cell output.
clf.fit(training_eigenpatches, training_labels)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

Finally, classify the test images using the same kind of eigen-patch features, computed on the test images themselves.

In [None]:
# Build the test feature matrix with the same per-image procedure used for
# training: extract patches, flatten them, fit a PCA and keep its principal
# axes (eigen-patches) as that image's features.
testing_eigenpatches = []
actual_labels = []
for index in range(test_samples):
    patches = image.extract_patches_2d(X_test[index], patch_size)
    # Flatten each 2-D patch into a vector so PCA sees (n_patches, n_pixels).
    patches = patches.reshape((patches.shape[0], patches.shape[1]*patches.shape[2]))
    pca = PCA(n_components=n_components)
    pca.fit(patches)
    eigenpatches = pca.components_
    testing_eigenpatches.append(eigenpatches)
    actual_labels.append(y_test[index])
testing_eigenpatches = np.array(testing_eigenpatches)
# Concatenate the eigen-patches of each image into one feature row.
testing_eigenpatches = testing_eigenpatches.reshape((testing_eigenpatches.shape[0], testing_eigenpatches.shape[1]*testing_eigenpatches.shape[2]))
actual_labels = np.array(actual_labels)
# The label previously read "Training patches shape:", a copy-paste slip;
# this matrix holds the test features.
print("Testing patches shape:", testing_eigenpatches.shape)
print("actual_labels:", actual_labels)

Training patches shape: (10000, 588)
actual_labels: ['9' '0' '8' ... '0' '4' '9']


Report the classification performance

In [None]:
from sklearn.metrics import classification_report

# Predict a digit label for every test image from its eigen-patch features.
predictions = clf.predict(testing_eigenpatches)

# Per-class precision / recall / F1 of the predictions against the true labels.
report_text = classification_report(actual_labels, predictions)
print(report_text)

              precision    recall  f1-score   support

           0       0.86      0.89      0.88       947
           1       0.98      0.97      0.98      1066
           2       0.78      0.78      0.78      1030
           3       0.77      0.81      0.79       990
           4       0.82      0.79      0.81       938
           5       0.82      0.79      0.80       954
           6       0.89      0.88      0.89       999
           7       0.89      0.85      0.87      1044
           8       0.80      0.84      0.82      1009
           9       0.84      0.83      0.84      1023

    accuracy                           0.85     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.85      0.85      0.85     10000

