### Language: Python 3

## Downloading MNIST

Reference:  
https://github.com/Lasagne/Lasagne/blob/master/examples/mnist.py  
https://github.com/oreilly-japan/deep-learning-from-scratch/blob/master/dataset/mnist.py 

In [1]:
import numpy as np
import os
import urllib.request
import gzip

img_size = 784  # values per flattened image (28 x 28 for MNIST)


def load_dataset():
    """Load MNIST from the working directory, downloading any missing file.

    The four gzip archives are looked up by name in the current working
    directory and fetched only when absent.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
        Each ``X_*`` has shape ``(n, img_size)`` with pixel bytes divided by
        256; each ``y_*`` has shape ``(n, 10)`` and is one-hot encoded.
        The validation split is the last 10000 examples of the training
        archive; the training split is everything before them.
    """

    def download(filename, source='http://yann.lecun.com/exdb/mnist/'):
        """Fetch ``source + filename`` and store it as ``filename``."""
        print("Downloading %s" % filename)
        # Download under a temporary name and rename only on success: the
        # loaders below trust any existing file, so a truncated archive left
        # behind by an interrupted transfer would break every later run.
        partial = filename + '.part'
        try:
            urllib.request.urlretrieve(source + filename, partial)
            os.replace(partial, filename)
        finally:
            if os.path.exists(partial):
                os.remove(partial)

    def load_mnist_images(filename):
        """Return the images in ``filename`` as an ``(n, img_size)`` float array."""
        if not os.path.exists(filename):
            download(filename)

        # Skip the 16-byte header; the rest is one unsigned byte per pixel.
        with gzip.open(filename, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=16)

        data = data.reshape(-1, img_size)

        # Scale the bytes into [0, 1).
        return data / np.float32(256)

    def load_mnist_labels(filename):
        """Return the labels in ``filename`` as a 1-D uint8 array."""
        if not os.path.exists(filename):
            download(filename)
        # Read the labels in Yann LeCun's binary format (8-byte header).
        with gzip.open(filename, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=8)

        return data

    def _change_one_hot_label(X):
        """Convert a 1-D array of class indices (0-9) to an ``(n, 10)`` one-hot matrix."""
        T = np.zeros((X.size, 10))
        # One indexed assignment sets column X[i] of row i for every i.
        T[np.arange(X.size), X] = 1

        return T

    X_train = load_mnist_images('train-images-idx3-ubyte.gz')
    y_train = load_mnist_labels('train-labels-idx1-ubyte.gz')
    X_test = load_mnist_images('t10k-images-idx3-ubyte.gz')
    y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz')

    y_train = _change_one_hot_label(y_train)
    y_test = _change_one_hot_label(y_test)

    # Hold out the last 10000 training examples for validation.
    X_train, X_val = X_train[:-10000], X_train[-10000:]
    y_train, y_val = y_train[:-10000], y_train[-10000:]

    return X_train, y_train, X_val, y_val, X_test, y_test


In [2]:
# Load MNIST (missing archives are downloaded first) and unpack the
# train / validation / test splits returned by load_dataset().
X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()

Downloading train-images-idx3-ubyte.gz
Downloading train-labels-idx1-ubyte.gz
Downloading t10k-images-idx3-ubyte.gz
Downloading t10k-labels-idx1-ubyte.gz


## Implementing a 3-layer neural network  
Reference:  
https://github.com/oreilly-japan/deep-learning-from-scratch/blob/master/ch04/two_layer_net.py  
https://github.com/oreilly-japan/deep-learning-from-scratch/blob/master/ch04/train_neuralnet.py  


In [3]:
# Helper functions used inside the class
def sigmoid(x):
    """Logistic function 1 / (1 + e^-x), applied element-wise to `x`."""
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

def sigmoid_diff(x):
    """Derivative of the sigmoid at `x`, computed as (1 - s) * s with s = sigmoid(x)."""
    activation = sigmoid(x)
    return (1.0 - activation) * activation

def softmax(x):
    """Softmax of `x`.

    A 2-D input is normalised row by row; any other input is normalised
    over all of its elements. The maximum is subtracted before
    exponentiating so that large scores do not overflow.
    """
    if x.ndim != 2:
        shifted = x - np.max(x)
        exps = np.exp(shifted)
        return exps / np.sum(exps)

    # Work on the transpose so each original row becomes a column and the
    # per-row max / sum broadcast along axis 0.
    columns = x.T
    columns = columns - np.max(columns, axis=0)
    exps = np.exp(columns)
    normalised = exps / np.sum(exps, axis=0)
    return normalised.T
    
def cross_entropy_error(y, t):
    """Mean cross-entropy between predicted probabilities and target labels.

    Parameters
    ----------
    y : numpy.ndarray
        Predicted probabilities, shape ``(batch, classes)`` or ``(classes,)``
        for a single sample.
    t : numpy.ndarray
        Targets, either one-hot with as many elements as `y` or integer
        class indices (one per sample).

    Returns
    -------
    float
        Negative log of the probability assigned to the true class, averaged
        over the batch. Always finite: probabilities are floored at 1e-12
        before the logarithm.
    """
    if y.ndim == 1:
        # Promote a single sample to a batch of one.
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    # Equal element counts mean `t` is one-hot; reduce it to class indices.
    if t.size == y.size:
        t = t.argmax(axis=1)

    batch_size = y.shape[0]
    true_class_prob = y[np.arange(batch_size), t]
    # log(0) is -inf, so floor the probabilities; values above the floor are
    # left unchanged and give exactly the same loss as without it.
    return -np.sum(np.log(np.maximum(true_class_prob, 1e-12))) / batch_size

class Three_Layer_NN:
    """Fully connected classifier: two sigmoid hidden layers and a softmax output.

    All trainable parameters live in ``self.params`` under the keys
    'W1', 'b1', 'W2', 'b2', 'W3', 'b3'.
    """

    def __init__(self, input_size, hidden_1_size, hidden_2_size, output_size, std=0.01):
        """Create the parameters.

        Weights are drawn from a standard normal distribution scaled by
        `std`; biases start at zero.
        """
        self.params = {}
        self.params['W1'] = std * np.random.randn(input_size, hidden_1_size)
        self.params['b1'] = np.zeros(hidden_1_size)
        self.params['W2'] = std * np.random.randn(hidden_1_size, hidden_2_size)
        self.params['b2'] = np.zeros(hidden_2_size)
        self.params['W3'] = std * np.random.randn(hidden_2_size, output_size)
        self.params['b3'] = np.zeros(output_size)

    def loss(self, X, t=None):
        """Run the forward pass and, when targets are given, backpropagation.

        Parameters
        ----------
        X : numpy.ndarray
            Input batch, one sample per row.
        t : numpy.ndarray, optional
            One-hot targets with the same shape as the network output
            (the output gradient is computed as ``y - t``).

        Returns
        -------
        numpy.ndarray
            The softmax output when `t` is None.
        tuple
            ``(loss, grads)`` otherwise, where `grads` has the same keys
            as ``self.params``.
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        W3, b3 = self.params['W3'], self.params['b3']

        # forward
        U1 = np.dot(X, W1) + b1
        Z1 = sigmoid(U1)
        U2 = np.dot(Z1, W2) + b2
        Z2 = sigmoid(U2)
        U3 = np.dot(Z2, W3) + b3
        y = softmax(U3)

        if t is None:
            return y

        # loss function
        loss = cross_entropy_error(y, t)

        # back propagation
        grads = {}
        batch_num = X.shape[0]

        # Gradient of the mean cross-entropy with respect to the pre-softmax scores.
        delta_3 = (y - t) / batch_num
        grads['W3'] = np.dot(Z2.T, delta_3)
        grads['b3'] = np.sum(delta_3, axis=0)

        # The sigmoid derivative is (1 - s) * s; reuse the activations from
        # the forward pass instead of evaluating the sigmoid again.
        delta_2 = np.dot(delta_3, W3.T) * ((1.0 - Z2) * Z2)
        grads['W2'] = np.dot(Z1.T, delta_2)
        grads['b2'] = np.sum(delta_2, axis=0)

        delta_1 = np.dot(delta_2, W2.T) * ((1.0 - Z1) * Z1)
        grads['W1'] = np.dot(X.T, delta_1)
        grads['b1'] = np.sum(delta_1, axis=0)

        return loss, grads

    def train(self, X, t, X_val, y_val, learning_rate=0.1, num_iters=10000, batch_size=100):
        """Train with mini-batch gradient descent, updating ``self.params`` in place.

        Each iteration draws `batch_size` row indices with
        ``np.random.choice`` and takes one gradient step. Training and
        validation accuracy are printed at iteration 0 and then once per
        epoch, where an epoch is ``X.shape[0] // batch_size`` iterations
        (at least 1).
        """
        # Floor division keeps this an integer. With true division a dataset
        # size that is not a multiple of batch_size gives a fractional epoch
        # length and the modulo test below would almost never be zero.
        iter_per_epoch = max(X.shape[0] // batch_size, 1)

        for i in range(num_iters):
            batch = np.random.choice(X.shape[0], batch_size)
            X_batch = X[batch]
            t_batch = t[batch]

            _, grads = self.loss(X_batch, t_batch)

            for key in ('W1', 'b1', 'W2', 'b2', 'W3', 'b3'):
                self.params[key] -= learning_rate * grads[key]

            if i % iter_per_epoch == 0:
                train_acc = self.accuracy(X, t)
                val_acc = self.accuracy(X_val, y_val)
                print("train acc, val acc | " + str(train_acc) + ", " + str(val_acc))

        print("Finish!")

    def predict(self, X):
        """Return the softmax class probabilities for each row of `X`."""
        return self.loss(X)

    def accuracy(self, X, t):
        """Return the fraction of rows of `X` whose predicted class matches one-hot `t`."""
        y = self.predict(X)
        y = np.argmax(y, axis=1)
        t = np.argmax(t, axis=1)

        accuracy = np.sum(y == t) / float(X.shape[0])
        return accuracy
        

## Feeding the data and checking the accuracy

In [5]:
# Baseline run with a freshly initialised network.
# Learning rate : 0.1
# Iterations : 10000
# Batch size : 100
network = Three_Layer_NN(input_size=784, hidden_1_size=255, hidden_2_size=75, 
                         output_size=10, std=0.01)

network.train(X_train, y_train, X_val, y_val, learning_rate=0.1, num_iters=10000, batch_size=100)

train acc, val acc | 0.09864, 0.0991
train acc, val acc | 0.11356, 0.1064
train acc, val acc | 0.10202, 0.103
train acc, val acc | 0.09976, 0.0961
train acc, val acc | 0.11356, 0.1064
train acc, val acc | 0.20476, 0.1989
train acc, val acc | 0.31916, 0.3199
train acc, val acc | 0.48556, 0.4974
train acc, val acc | 0.62078, 0.6502
train acc, val acc | 0.73884, 0.7664
train acc, val acc | 0.78704, 0.8054
train acc, val acc | 0.81898, 0.8339
train acc, val acc | 0.84254, 0.8523
train acc, val acc | 0.85814, 0.8669
train acc, val acc | 0.86812, 0.8756
train acc, val acc | 0.8738, 0.8787
train acc, val acc | 0.88472, 0.8903
train acc, val acc | 0.89238, 0.8968
train acc, val acc | 0.89856, 0.904
train acc, val acc | 0.90402, 0.9073
Finish!


In [6]:
# Fraction of test images the network trained in the previous cell classifies correctly.
network.accuracy(X_test, y_test)

0.90869999999999995

In [7]:
# Second run: rebinds `network` to a freshly initialised model and trains it
# with a larger learning rate for fewer iterations.
# Learning rate : 1.1
# Iterations : 5000
# Batch size : 100
network = Three_Layer_NN(input_size=784, hidden_1_size=255, hidden_2_size=75, 
                         output_size=10, std=0.01)

network.train(X_train, y_train, X_val, y_val, learning_rate=1.1, num_iters=5000, batch_size=100)

train acc, val acc | 0.09684, 0.1009
train acc, val acc | 0.39732, 0.405
train acc, val acc | 0.82864, 0.8452
train acc, val acc | 0.91614, 0.9182
train acc, val acc | 0.9413, 0.9464
train acc, val acc | 0.95174, 0.9565
train acc, val acc | 0.96304, 0.9614
train acc, val acc | 0.9688, 0.9652
train acc, val acc | 0.96764, 0.9642
train acc, val acc | 0.97062, 0.9639
Finish!


In [8]:
# Fraction of test images the network trained in the previous cell classifies correctly.
network.accuracy(X_test, y_test)

0.96909999999999996

## Option 