# Kaggle Digit Recognition

This is one of the problems proposed on Kaggle: handwritten digit recognition using the MNIST data. We use Scikit Flow (`skflow`), which provides scikit-learn-style estimators on top of TensorFlow.

In [None]:
try:
    # time.clock was removed in Python 3.8; perf_counter is its replacement.
    from time import perf_counter as clock
except ImportError:
    # Interpreters that predate perf_counter still provide clock.
    from time import clock

import pandas as pd

We begin by loading both the training and the testing data.

In [None]:
# --- Training set -----------------------------------------------------------
start = clock()

train_frame = pd.read_csv('data/train.csv')
# The 'label' column is the target; every column after the first is used as
# a feature (this assumes 'label' is the first column of train.csv).
label = train_frame['label'].values
train = train_frame.iloc[:, 1:].values
# Optional: view each row as an image.
# train = train.reshape(-1,28,28)

elapsed = clock() - start
print('Loaded {:d} train entries in {:.0f} seconds.'.format(len(train), elapsed))

# Optional: keep only every tenth entry for quicker experiments.
# label = label[0::10]
# train = train[0::10]

# --- Test set ---------------------------------------------------------------
start = clock()

test_frame = pd.read_csv('data/test.csv')
# All columns of the test file are used as features.
test = test_frame.values
# Optional: view each row as an image.
# test = test.reshape(-1,28,28)

elapsed = clock() - start
print('Loaded {:d} test entries in {:.0f} seconds.'.format(len(test), elapsed))

# Select Classifier

There are a few ready-to-use classifiers in Scikit Flow.

In [None]:
# Optimizers offered by Scikit Flow: SGD, Adam, Adagrad.

# Alternative: linear classifier.
# from skflow import TensorFlowLinearClassifier
# clf = TensorFlowLinearClassifier(n_classes=10, batch_size=256, steps=1400,
#                                  learning_rate=0.01, optimizer='Adagrad')

# Alternative: linear regressor.
# from skflow import TensorFlowLinearRegressor
# clf = TensorFlowLinearRegressor(n_classes=10, batch_size=256, steps=1400,
#                                 learning_rate=0.01, optimizer='Adagrad')

# Active choice: DNN classifier with five hidden layers
# (100, 200, 200, 200 and 100 units) and ten output classes.
from skflow import TensorFlowDNNClassifier

clf = TensorFlowDNNClassifier(
    hidden_units=[100, 200, 200, 200, 100],
    n_classes=10,
    batch_size=256,
    steps=1000,
    learning_rate=0.01,
    optimizer='Adagrad',
)

# Train, Extrapolate
We are now ready to fit the classifier to the training data, predict/extrapolate to the test data, and save the results.

In [None]:
# Fit the classifier to the training data and report how long it took.

start = clock()
clf.fit(train, label)
print("Fitted training data in {:.0f} seconds.".format(clock() - start))

# Predict a label for every test entry and report how long it took.

start = clock()
predict = clf.predict(test)
print("Extrapolated to test data in {:.0f} seconds.".format(clock() - start))

# Save results: attach a 1-based ImageId and the predicted Label to the test
# frame, then write only those two columns to predict.csv.

test_frame['ImageId'] = range(1, len(test) + 1)
test_frame['Label'] = predict
# `columns` selects which columns are written; the former `cols` keyword is
# not accepted by current pandas and raises TypeError. index=False leaves the
# DataFrame's row index out of the file.
test_frame.to_csv('predict.csv', columns=['ImageId', 'Label'], index=False)