In [None]:
import numpy as np
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.contrib.layers import xavier_initializer

%matplotlib inline
import matplotlib.pyplot as plt

import shutil
import os
import time

from sklearn.model_selection import train_test_split

In [None]:
# Open the training and test archives (train first, then test); the arrays
# stored inside them are pulled out by key in the next cell.
train_data, test_data = map(
    np.load,
    ('datasets/data_classifier_train.npz', 'datasets/data_classifier_test.npz'),
)

In [None]:
# Images are cast to float32 and divided by 255 (presumably 8-bit pixel
# values -- confirm against the dataset); labels are cast to int32.
# Images and labels are shuffled together with a fixed seed so the pairing
# is preserved and the order is reproducible.
X_train, y_train = shuffle(
    np.asarray(train_data['x_train'], dtype=np.float32) / 255,
    np.asarray(train_data['y_train'], dtype=np.int32),
    random_state=0,
)

X_test, y_test = shuffle(
    np.asarray(test_data['x_test'], dtype=np.float32) / 255,
    np.asarray(test_data['y_test'], dtype=np.int32),
    random_state=1,
)
# Number of distinct label values seen in the training set; used as the
# width of the classifier's output layer.
NUM_LABELS = np.unique(y_train).size

In [None]:
def _momentum_train_op(loss, learning_rate, momentum):
    """Build a train op that applies classical momentum by hand.

    For every trainable variable ``x`` with a velocity ``v`` (initialised to
    zeros, not trainable) the op performs::

        v <- momentum * v - learning_rate * d(loss)/dx
        x <- x + v

    and increments the global step by one.

    Built-in alternative taking the same hyper-parameters:
    ``tf.train.MomentumOptimizer(learning_rate=..., momentum=...).minimize(
    loss, global_step=tf.train.get_global_step())``.

    Args:
        loss: scalar loss tensor to minimise.
        learning_rate: step size applied to the gradient.
        momentum: fraction of the previous update carried over.

    Returns:
        A single grouped op that runs all updates.
    """
    xs = tf.trainable_variables()
    prev_deltas = [
        tf.Variable(initial_value=tf.zeros(shape=x.shape, dtype=x.dtype), trainable=False)
        for x in xs
    ]
    grads = tf.gradients(loss, xs)
    train_ops = []
    # Gate the updates on *all* gradients: without this, the weights of one
    # layer could be overwritten while the backward pass of another layer
    # still needs their old value, making the step non-deterministic.
    with tf.control_dependencies(grads):
        for x, grad, prev_delta in zip(xs, grads, prev_deltas):
            # Both assignments consume the same tensor, so the variable and
            # its velocity always receive an identical delta.
            curr_delta = momentum * prev_delta - learning_rate * grad
            train_ops.append(tf.assign_add(x, curr_delta))
            train_ops.append(tf.assign(prev_delta, curr_delta))
    global_step = tf.train.get_global_step()
    train_ops.append(tf.assign_add(global_step, tf.constant(1, dtype=global_step.dtype)))
    return tf.group(train_ops)


def cnn_model_fn(features, labels, mode, params):
    """Estimator ``model_fn`` for a small convolutional classifier.

    Architecture: four ReLU convolutions with 'same' padding
    (32@3x3/1, 32@5x5/2, 64@3x3/1, 64@5x5/2), a 1024-unit ReLU dense layer
    and a linear output layer with ``NUM_LABELS`` units.

    Args:
        features: dict whose ``'x'`` entry is reshaped to ``[-1, 28, 28, 1]``.
        labels: integer class ids, consumed by the sparse softmax
            cross-entropy loss (unused in PREDICT mode).
        mode: one of ``tf.estimator.ModeKeys``.
        params: optional dict with ``'learning_rate'`` (default 0.001) and
            ``'momentum'`` (default 0.9). ``None``, an empty dict, or a dict
            missing either key falls back to the defaults.

    Returns:
        A ``tf.estimator.EstimatorSpec``: predictions only for PREDICT,
        loss + train op for TRAIN, loss + accuracy metric otherwise.
    """
    # Tolerate a missing / empty / partial hyper-parameter dict instead of
    # raising KeyError.
    hyper = params or {}
    learning_rate = hyper.get('learning_rate', 0.001)
    momentum = hyper.get('momentum', 0.9)

    input_layer = tf.reshape(features['x'], [-1, 28, 28, 1])
    # Each entry is [filters, kernel_size, stride].
    conv_layers = [
        [32, 3, 1],
        [32, 5, 2],
        [64, 3, 1],
        [64, 5, 2]
    ]
    curr_layer = input_layer
    for i, [f, k, s] in enumerate(conv_layers, start=1):
        curr_layer = tf.layers.conv2d(
            inputs=curr_layer,
            filters=f,
            kernel_size=k,
            strides=s,
            padding='same',
            activation=tf.nn.relu,
            # A distinct fixed seed per layer keeps the conv initialisation
            # reproducible.
            kernel_initializer=xavier_initializer(seed=i),
            name=f'conv{i}'
        )
    # Flatten everything except the batch axis; use the static shape as plain
    # ints so the target shape does not depend on Dimension-object arithmetic.
    flat_dim = int(np.prod(curr_layer.shape.as_list()[1:]))
    conv_flat = tf.reshape(curr_layer, [-1, flat_dim])
    dense = tf.layers.dense(
        inputs=conv_flat,
        units=1024,
        activation=tf.nn.relu
    )
    logits = tf.layers.dense(
        inputs=dense,
        units=NUM_LABELS
    )
    predictions = {
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = _momentum_train_op(loss, learning_rate, momentum)
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    eval_metric_ops = {
        'accuracy': tf.metrics.accuracy(
            labels=labels,
            predictions=predictions['classes'])
    }
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

In [None]:
def train_eval_cnn(X_train, y_train, X_eval, y_eval,
                   learning_rate, momentum, batch_size=64, steps=10000):
    """Train a fresh CNN estimator, then score it on the training and eval sets.

    Any existing checkpoint directory for this (momentum, learning_rate)
    pair is deleted first, so training always starts from scratch.

    Args:
        X_train, y_train: training inputs and labels.
        X_eval, y_eval: held-out inputs and labels.
        learning_rate, momentum: hyper-parameters forwarded to ``cnn_model_fn``.
        batch_size: training mini-batch size.
        steps: number of training steps to run.

    Returns:
        ``(estimator, accuracy)`` where ``accuracy`` is the ``'accuracy'``
        entry of the evaluation result on ``(X_eval, y_eval)``.
    """
    def one_pass_input_fn(images, targets):
        # A single, unshuffled sweep over the data -- used for evaluation.
        return tf.estimator.inputs.numpy_input_fn(
            x={'x': images},
            y=targets,
            num_epochs=1,
            shuffle=False)

    checkpoint_dir = f'cnn_model_{momentum}_{learning_rate}'
    # Discard checkpoints left over from an earlier run with the same settings.
    if os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)

    estimator = tf.estimator.Estimator(
        model_fn=cnn_model_fn,
        model_dir=checkpoint_dir,
        params={'learning_rate': learning_rate, 'momentum': momentum})

    # num_epochs=None repeats the data indefinitely; `steps` bounds training.
    repeating_train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={'x': X_train},
        y=y_train,
        batch_size=batch_size,
        num_epochs=None,
        shuffle=False)
    heldout_input_fn = one_pass_input_fn(X_eval, y_eval)
    trainset_input_fn = one_pass_input_fn(X_train, y_train)

    # Only the call to train() is timed.
    started = time.time()
    estimator.train(input_fn=repeating_train_input_fn, steps=steps)
    elapsed = time.time() - started

    trainset_score = estimator.evaluate(input_fn=trainset_input_fn)
    eval_score = estimator.evaluate(input_fn=heldout_input_fn)

    print(f'Training time in seconds: {elapsed}')
    print('Performance on training set: ')
    print(trainset_score)
    print('Performance on validation/test set: ')
    print(eval_score)
    return estimator, eval_score['accuracy']

In [None]:
def cnn_cv(X, y, n_splits, momentums, learning_rates):
    """Grid-search momentum and learning rate with repeated hold-out validation.

    For every (momentum, learning_rate) pair the model is trained
    ``n_splits`` times, each time on a random 80/20 train/validation split
    of ``(X, y)``, and the mean validation accuracy is printed.

    Note: the splits are independent random hold-outs, not disjoint k-folds.

    Args:
        X, y: full training inputs and labels.
        n_splits: number of random splits evaluated per hyper-parameter pair.
        momentums: iterable of momentum values to try.
        learning_rates: iterable of learning rates to try.
    """
    for momentum in momentums:
        for learning_rate in learning_rates:
            accuracy = []
            for split_idx in range(n_splits):
                # Seed each split with its index: results are reproducible and
                # every hyper-parameter pair is scored on the same n_splits
                # partitions, so the averages are directly comparable.
                X_train, X_eval, y_train, y_eval = train_test_split(
                    X, y, test_size=0.2, random_state=split_idx)
                # Only the accuracy is needed here; the estimator is discarded.
                _, accu = train_eval_cnn(X_train, y_train, X_eval, y_eval, learning_rate, momentum)
                accuracy.append(accu)
            print(f'Momentum = {momentum}, Learning rate = {learning_rate}, Average accuracy: {np.mean(accuracy)}')

In [None]:
# Search for the best momentum at a fixed learning rate.
# The grid is written out explicitly: building it with a 0.1 float step
# produces values such as 0.7999999999999999, which then leak into the
# momentum, the model directory name and the printed report.
momentums = [0.5, 0.6, 0.7, 0.8, 0.9]
learning_rates = [0.001]
cnn_cv(X_train, y_train, 5, momentums, learning_rates)

In [None]:
# Search for the best learning rate (one value per decade, 1e-4 up to 1)
# while momentum is held fixed at 0.9.
learning_rates = [0.0001, 0.001, 0.01, 0.1, 1]
momentums = [0.9]
cnn_cv(
    X_train,
    y_train,
    n_splits=5,
    momentums=momentums,
    learning_rates=learning_rates,
)

In [None]:
# Final run: fit on the full training set with the selected momentum and
# learning rate, and evaluate on the test set.
# train_eval_cnn prints the loss and accuracy for both the training and the
# test data.
train_eval_cnn(
    X_train,
    y_train,
    X_test,
    y_test,
    learning_rate=0.01,
    momentum=0.9,
)