In [3]:
import sklearn as skl
from sklearn import cross_validation, svm, metrics

In [4]:
# THE MNIST DATABASE of handwritten digits: http://yann.lecun.com/exdb/mnist/

In [5]:
import struct
import os
import gzip  

def load_mnist(datapath, dataset):
    """Load one MNIST split from its gzip-compressed IDX files.

    Reads ``<dataset>-labels-idx1-ubyte.gz`` and
    ``<dataset>-images-idx3-ubyte.gz`` from ``datapath``.

    Args:
        datapath: directory containing the two ``.gz`` files.
        dataset: file-name prefix of the split, e.g. ``'train'`` or ``'t10k'``.

    Returns:
        dict with two parallel lists:
        ``'labels'`` - one int per sample;
        ``'images'`` - one flat list of ``rows * cols`` floats per sample,
        each raw byte divided by 256.

    Raises:
        ValueError: if a magic number is wrong, the two files disagree on the
            number of samples, or the image data ends early.
        struct.error: if a header or the label data is truncated.
    """
    # IDX magic numbers: 0x0801 = unsigned-byte data with 1 dimension (labels),
    # 0x0803 = unsigned-byte data with 3 dimensions (images).
    expected_label_magic = 2049
    expected_image_magic = 2051

    label_filename = os.path.join(datapath, dataset + "-labels-idx1-ubyte.gz")
    image_filename = os.path.join(datapath, dataset + "-images-idx3-ubyte.gz")

    data = {
        'labels': [],
        'images': []
    }

    # Context managers close both handles even when a header check or a read
    # raises part-way through.
    with gzip.open(label_filename, 'rb') as label_file, \
            gzip.open(image_filename, 'rb') as image_file:
        label_magic, label_count = struct.unpack(">II", label_file.read(8))
        image_magic, image_count = struct.unpack(">II", image_file.read(8))
        row_count, col_count = struct.unpack(">II", image_file.read(8))

        if label_magic != expected_label_magic:
            raise ValueError(
                "%s: bad label magic number %d" % (label_filename, label_magic))
        if image_magic != expected_image_magic:
            raise ValueError(
                "%s: bad image magic number %d" % (image_filename, image_magic))
        # The loop below pairs label k with image k, so the counts must agree.
        if label_count != image_count:
            raise ValueError(
                "label count %d does not match image count %d"
                % (label_count, image_count))

        pixel_count = row_count * col_count

        for idx in range(label_count):
            label = struct.unpack("B", label_file.read(1))[0]
            raw_pixels = image_file.read(pixel_count)
            # A short read means the image file is truncated; fail loudly
            # instead of storing a partial feature row.
            if len(raw_pixels) != pixel_count:
                raise ValueError(
                    "%s: image %d is truncated" % (image_filename, idx))
            data['labels'].append(label)
            # Iterating bytes yields ints in 0..255; scale them by 1/256.
            data['images'].append([pixel / 256 for pixel in raw_pixels])

    return data

In [6]:
# Load the training split and print how many labels and images were read,
# one count per line.
train_data = load_mnist('./mnist/', 'train')
print(len(train_data['labels']), len(train_data['images']), sep='\n')

60000
60000


In [7]:
# Load the 't10k' (test) split and print how many labels and images were read,
# one count per line.
test_data = load_mnist('./mnist/', 't10k')
print(len(test_data['labels']), len(test_data['images']), sep='\n')

10000
10000


In [8]:
# Try feature engineering
def _append_column_means(image, width, height):
    """Append ``width`` extra features to the flat pixel list ``image`` in place.

    Feature ``col`` is the mean of ``image[col + row * width]`` over
    ``row in range(height)``; for a row-major image that is the mean of
    pixel column ``col``.
    """
    for col in range(width):
        # Plain running total (not sum()) so float results stay bit-identical
        # to the original hand-written loop.
        total = 0
        for row in range(height):
            total += image[col + row * width]
        # Indices read above never exceed width * height - 1, so the values
        # appended here are not picked up by later columns.
        image.append(total / height)


def compose_features(train_x, test_x, width=28, height=28):
    """Extend every image in both splits with its per-column mean intensity.

    Args:
        train_x: list of flat pixel lists; each list is mutated in place.
        test_x: list of flat pixel lists; each list is mutated in place.
        width: pixels per image row (stride of the flat list). Defaults to 28.
        height: number of image rows averaged per column. Defaults to 28.

    Returns:
        None. Each image gains ``width`` trailing features. Calling this twice
        on the same lists appends the features twice.

    Raises:
        IndexError: if an image holds fewer than ``width * height`` values.
    """
    for images in (train_x, test_x):
        for image in images:
            _append_column_means(image, width, height)

In [9]:
def test_classifier(clf, func_compose_feature=None, datapath='./mnist/'):
    """Fit ``clf`` on the MNIST training split and print test-set metrics.

    Args:
        clf: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        func_compose_feature: optional callable ``f(train_x, test_x)`` that
            extends the feature rows of both splits in place.
        datapath: directory passed to ``load_mnist`` for both splits.
            Defaults to ``'./mnist/'``, the path this notebook has always used.

    Returns:
        None. Prints the accuracy score, then the classification report.
    """
    train_data = load_mnist(datapath, 'train')
    test_data = load_mnist(datapath, 't10k')
    # Work on per-row copies so an in-place feature function never alters the
    # lists held in the loaded dictionaries.
    train_x = [image[:] for image in train_data['images']]
    test_x = [image[:] for image in test_data['images']]

    # Feature engineering is optional.
    if func_compose_feature:
        func_compose_feature(train_x, test_x)

    # Train the classifier.
    clf.fit(train_x, train_data['labels'])

    # Evaluate on the held-out split.
    predicts = clf.predict(test_x)
    score = skl.metrics.accuracy_score(test_data["labels"], predicts)
    report = skl.metrics.classification_report(test_data["labels"], predicts)
    print(score)
    print(report)

In [60]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 8 trees on the raw pixel features.
# random_state pins the forest's internal randomness so that re-running this
# cell reproduces the same score.
rfc = RandomForestClassifier(n_estimators=8, random_state=0)
test_classifier(rfc)

0.9428
             precision    recall  f1-score   support

          0       0.94      0.99      0.96       980
          1       0.98      0.99      0.98      1135
          2       0.92      0.95      0.93      1032
          3       0.91      0.93      0.92      1010
          4       0.94      0.94      0.94       982
          5       0.92      0.92      0.92       892
          6       0.96      0.94      0.95       958
          7       0.96      0.95      0.95      1028
          8       0.95      0.90      0.92       974
          9       0.95      0.91      0.93      1009

avg / total       0.94      0.94      0.94     10000



In [68]:
# Random forest with 8 trees, trained on the pixels plus the extra features
# appended by compose_features.
# random_state pins the forest's internal randomness so a change in score
# reflects the features rather than run-to-run sampling noise.
rfc = RandomForestClassifier(n_estimators=8, random_state=0)
test_classifier(rfc, compose_features)

0.9427
             precision    recall  f1-score   support

          0       0.96      0.99      0.97       980
          1       0.98      0.99      0.99      1135
          2       0.92      0.95      0.94      1032
          3       0.91      0.93      0.92      1010
          4       0.93      0.95      0.94       982
          5       0.93      0.93      0.93       892
          6       0.96      0.96      0.96       958
          7       0.96      0.93      0.94      1028
          8       0.93      0.90      0.91       974
          9       0.94      0.91      0.93      1009

avg / total       0.94      0.94      0.94     10000



In [None]:
# Support-vector classifier constructed with no arguments, i.e. with the
# library's default hyper-parameters, evaluated on the raw pixel features.
clf_svm = skl.svm.SVC()
test_classifier(clf_svm)

0.9443
             precision    recall  f1-score   support

          0       0.96      0.99      0.97       980
          1       0.97      0.99      0.98      1135
          2       0.94      0.93      0.93      1032
          3       0.93      0.94      0.93      1010
          4       0.93      0.96      0.94       982
          5       0.93      0.91      0.92       892
          6       0.95      0.97      0.96       958
          7       0.96      0.93      0.94      1028
          8       0.94      0.92      0.93       974
          9       0.94      0.92      0.93      1009

avg / total       0.94      0.94      0.94     10000



In [None]:
from sklearn.linear_model import LogisticRegression  
# L2-penalised logistic regression; every other hyper-parameter is left at the
# library default. Evaluated on the raw pixel features.
# NOTE(review): no output is recorded for this cell in the notebook.
lr = LogisticRegression(penalty='l2')
test_classifier(lr)

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient-boosted trees with 200 boosting stages on the raw pixel features.
# random_state pins the estimator's internal randomness so that re-running
# this cell reproduces the same score.
gbdt = GradientBoostingClassifier(n_estimators=200, random_state=0)
test_classifier(gbdt)

0.9595
             precision    recall  f1-score   support

          0       0.97      0.99      0.98       980
          1       0.98      0.99      0.98      1135
          2       0.96      0.95      0.95      1032
          3       0.95      0.95      0.95      1010
          4       0.96      0.97      0.97       982
          5       0.95      0.95      0.95       892
          6       0.97      0.96      0.96       958
          7       0.97      0.93      0.95      1028
          8       0.94      0.96      0.95       974
          9       0.94      0.95      0.94      1009

avg / total       0.96      0.96      0.96     10000

