In [18]:
from mnist import MNIST
import numpy as np

# Read all EMNIST test and train data
mndata = MNIST('data')
X_train, y_train = mndata.load('data/emnist-byclass-train-images-idx3-ubyte', 
                               'data/emnist-byclass-train-labels-idx1-ubyte')
X_test, y_test = mndata.load('data/emnist-byclass-test-images-idx3-ubyte', 
                             'data/emnist-byclass-test-labels-idx1-ubyte')
# Read mapping of the labels and convert ASCII values to chars
mapping = []
with open('data/emnist-byclass-mapping.txt') as f:
    for line in f:
        mapping.append(chr(int(line.split()[1])))

# Convert data to numpy arrays and normalize images to the interval [0, 1]
X_train = np.array(X_train) / 255
y_train = np.array(y_train)
X_test = np.array(X_test) / 255
y_test = np.array(y_test)

## Let's start with Machine Learning!
### Random Forests are cool, let's use them

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
cv_scores = cross_val_score(clf, X_train, y_train, cv=10)
print('Mean accuracy: ', cv_scores.mean())
print('      Std dev: ', cv_scores.std())

Mean accuracy:  0.788464974142
      Std dev:  0.00166719277345


Now, if we have enough memory, we can use scikit-learn's GridSearchCV to optimize the RandomForestClassifier parameters (n_estimators, in this case). Since this will use A LOT of memory (and likely raise a MemoryError), we can try a few more examples by hand.

This is done below, with 25 and 50 estimators respectively.

In [None]:
clf = RandomForestClassifier(n_estimators=25, n_jobs=-1)
cv_scores = cross_val_score(clf, X_train, y_train, cv=10)
print('Mean accuracy: ', cv_scores.mean())
print('      Std dev: ', cv_scores.std())

In [19]:
clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
cv_scores = cross_val_score(clf, X_train, y_train, cv=10)
print('Mean accuracy: ', cv_scores.mean())
print('      Std dev: ', cv_scores.std())

Mean accuracy:  0.822484214604
      Std dev:  0.00147729065302


A Random Forest with 50 estimators seems to do a good job, so let's confirm it by evaluating the accuracy of this model when using the EMNIST test set.

In [20]:
from sklearn.metrics import accuracy_score

clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy on test set: ', test_accuracy)

Accuracy on test set:  0.825804011245


## Now let's do some Deep Learning!
### Convolution Neural Networks are the way to go here

First, let's set up our CNN model.

In [None]:
from keras.models import Sequential
from keras.layers import Convolution2D, MaxPooling2D, Dropout, Flatten, Dense

def build_model(nb_classes, nb_filters, kernel_size, pool_size, input_shape):
    model = Sequential()
    model.add(Convolution2D(int(nb_filters / 2), kernel_size, padding='valid',
                            input_shape=input_shape, activation='relu',
                            kernel_initializer='he_normal', data_format = 'channels_first'))
    model.add(MaxPooling2D(pool_size=pool_size))
    model.add(Convolution2D(nb_filters, kernel_size, activation='relu', 
                            kernel_initializer='he_normal', data_format = 'channels_first'))
    model.add(MaxPooling2D(pool_size=pool_size))
    
    model.add(Flatten())
    model.add(Dense(250, activation='relu', kernel_initializer='he_normal'))
    model.add(Dropout(0.5))
    model.add(Dense(125, activation='relu', kernel_initializer='he_normal'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes, activation='softmax', kernel_initializer='he_normal'))

    model.compile(loss='categorical_crossentropy', optimizer='adamax', metrics=['accuracy'])
    print(model.summary())
    
    return model

Then we should set the parameters to be used with our CNN and preprocess the data a bit further in order for it to be in the necessary shape.

In [7]:
from keras.utils import np_utils

# Number of classes in the train set
nb_classes = len(mapping)
# Number of convolutional filters
nb_filters = 32
# Convolutional kernel size
kernel_size = (5, 5) # convolution kernel size
# Size of pooling area
pool_size = (2, 2)
# Shape of the images (color channels, width, height)
input_shape = (1, 28, 28)

# Reshape data to be used in a Convolutional Neural Network
X_train = X_train.reshape(X_train.shape[0], 1, 28, 28)
X_test = X_test.reshape(X_test.shape[0], 1, 28, 28)

# One-hot encoding of the label arrays
y_train = np_utils.to_categorical(y_train, nb_classes)
y_test = np_utils.to_categorical(y_test, nb_classes)

Using TensorFlow backend.


Now we can finally build our CNN model and fit it with the EMNIST training data.

In [11]:
model = build_model(nb_classes, nb_filters, kernel_size, pool_size, input_shape)
model.fit(X_train, y_train, batch_size=128, epochs=20)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 16, 24, 24)        416       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 8, 12, 24)         0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 32, 8, 20)         6432      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 16, 4, 20)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 1280)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               320250    
_________________________________________________________________
dropout_1 (Dropout)          (None, 250)               0         
__________

After trained, we can evaluate our CNN with the test data.

In [16]:
test_eval = model.evaluate(X_test, y_test)
print()
print('    Test set loss:', test_eval[0])
print('Test set accuracy:', test_eval[1])

    Test set loss: 0.431493847698
Test set accuracy: 0.846651135202
