# Dropout
Reference :
http://cs231n.github.io/neural-networks-2/  
  

In [1]:
import numpy as np
import os
import urllib.request
import gzip

In [2]:

img_size = 784

def load_dataset():

    def download(filename, source='http://yann.lecun.com/exdb/mnist/'):
        print("Downloading %s" % filename)
        urllib.request.urlretrieve(source + filename, filename)

    def load_mnist_images(filename):
        if not os.path.exists(filename):
            download(filename)
        
        with gzip.open(filename, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=16)
        
        data = data.reshape(-1, img_size)
        
        return data / np.float32(256)

    def load_mnist_labels(filename):
        if not os.path.exists(filename):
            download(filename)
        
        with gzip.open(filename, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=8)
            
        return data
    
    # change labels into one hot vectors
    def _change_one_hot_label(X):
        T = np.zeros((X.size, 10))
        for idx, row in enumerate(T):
            row[X[idx]] = 1
        
        return T

    
    X_train = load_mnist_images('train-images-idx3-ubyte.gz')
    y_train = load_mnist_labels('train-labels-idx1-ubyte.gz')
    X_test = load_mnist_images('t10k-images-idx3-ubyte.gz')
    y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz')
    
    y_train = _change_one_hot_label(y_train)
    y_test =  _change_one_hot_label(y_test)

    X_train, X_val = X_train[:-10000], X_train[-10000:]
    y_train, y_val = y_train[:-10000], y_train[-10000:]

    return X_train, y_train, X_val, y_val, X_test, y_test



In [3]:
X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()
X = np.concatenate([X_train, X_val], axis=0)
y = np.concatenate([y_train, y_val], axis=0)

In [5]:
print(X.shape)
print(y.shape)

(60000, 784)
(60000, 10)


In [4]:
def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T 
    
    x = x - np.max(x) 
    return np.exp(x) / np.sum(np.exp(x))
    
def cross_entropy_error(y, t):
    if y.ndim == 1:   
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
        
    if t.size == y.size:
        t = t.argmax(axis=1)
             
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t])) / batch_size


class Three_Layer_NN_dropout:
    
    def __init__(self, input_size, hidden_1_size, hidden_2_size, output_size, std=0.01):
        
        self.params = {}
        self.params['W1'] = std * np.random.randn(input_size, hidden_1_size)
        self.params['b1'] = np.zeros(hidden_1_size)
        self.params['W2'] = std * np.random.randn(hidden_1_size, hidden_2_size)
        self.params['b2'] = np.zeros(hidden_2_size)
        self.params['W3'] = std * np.random.randn(hidden_2_size, output_size)
        self.params['b3'] = np.zeros(output_size)
    
    
    def loss(self, X, t=None):
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        W3, b3 = self.params['W3'], self.params['b3']
        
        p = 0.5
        
        # forward
        H1 = np.dot(X, W1) + b1    
        H1 = np.maximum(0, H1)    # ReLu
        U1 = (np.random.rand(*H1.shape) < p) / p    
        H1 *= U1                                    
        
        H2 = np.dot(H1, W2) + b2
        H2 = np.maximum(0, H2)    # ReLu
        U2 = (np.random.rand(*H2.shape) < p) / p
        H2 *= U2                                    
        
        H3 = np.dot(H2, W3) + b3
        y = softmax(H3)     
                
        if t is None:
            return y
        
        # loss function
        loss = cross_entropy_error(y, t)    
        
        # back propagation
        grads = {}
        batch_num = X.shape[0]
        
        delta_3 = (y - t) / batch_num        
        grads['W3'] = np.dot(H2.T, delta_3)
        grads['b3'] = np.sum(delta_3, axis=0)
        
        relu_diff_2 = (H2 > 0) * np.ones([*H2.shape])
        delta_2 = (np.dot(delta_3, W3.T)) * relu_diff_2
        grads['W2'] = np.dot(H1.T, delta_2)
        grads['b2'] = np.sum(delta_2, axis=0)
        
        relu_diff_1 = (H1 > 0) * np.ones([*H1.shape])
        delta_1 = (np.dot(delta_2, W2.T)) * relu_diff_1
        grads['W1'] = np.dot(X.T, delta_1)
        grads['b1'] = np.sum(delta_1, axis=0)
        
        return loss, grads
    
    # Training by SGD with dropout
    def train(self, X, t, X_val, y_val, learning_rate=0.1, num_iters=10000, batch_size=100):
        
        train_acc_list = []
        val_acc_list = []
        
        iter_per_epoch = 500
        
        for i in range(num_iters):
            batch = np.random.choice(X.shape[0], batch_size)
            X_batch = X[batch]
            t_batch = t[batch]
            
            loss, grads = self.loss(X_batch, t_batch)
            
            for key in ('W1', 'b1', 'W2', 'b2', 'W3', 'b3'):
                self.params[key] -= learning_rate * grads[key] 
            
            if i % iter_per_epoch == 0:
                train_acc = self.accuracy(X, t)
                val_acc = self.accuracy(X_val, y_val)
                train_acc_list.append(train_acc)
                val_acc_list.append(val_acc)
                print(i, ": train acc, val acc | " + str(train_acc) + ", " + str(val_acc))
                
        print("Finish!")
        return train_acc_list, val_acc_list
            
    
    def predict(self, X):
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        W3, b3 = self.params['W3'], self.params['b3']
        
        H1 = np.dot(X, W1) + b1    
        H1 = np.maximum(0, H1)    # ReLu
        
        H2 = np.dot(H1, W2) + b2
        H2 = np.maximum(0, H2)    # ReLu
        
        H3 = np.dot(H2, W3) + b3
        y = softmax(H3)     

  
        return y
    
    def accuracy(self, X, t):
        y = self.predict(X)
        y = np.argmax(y, axis=1)
        t = np.argmax(t, axis=1)
        
        accuracy = np.sum(y == t) / float(X.shape[0])
        return accuracy
        

In [6]:
network = Three_Layer_NN_dropout(input_size=784, hidden_1_size=500, hidden_2_size=255, 
                         output_size=10, std=0.01)

_, _ = network.train(X_train, y_train, X_val, y_val, learning_rate=0.4, num_iters=7000, batch_size=100)

0 : train acc, val acc | 0.09718, 0.0983
500 : train acc, val acc | 0.91838, 0.9245
1000 : train acc, val acc | 0.95238, 0.9543
1500 : train acc, val acc | 0.9655, 0.9644
2000 : train acc, val acc | 0.97116, 0.9668
2500 : train acc, val acc | 0.9775, 0.9717
3000 : train acc, val acc | 0.98032, 0.9728
3500 : train acc, val acc | 0.98312, 0.9759
4000 : train acc, val acc | 0.98558, 0.9782
4500 : train acc, val acc | 0.9868, 0.9786
5000 : train acc, val acc | 0.98868, 0.9781
5500 : train acc, val acc | 0.98908, 0.9787
6000 : train acc, val acc | 0.99098, 0.9801
6500 : train acc, val acc | 0.99172, 0.9798
Finish!


In [7]:
network.accuracy(X_test, y_test)

0.98089999999999999

# train by (train + val) 

In [8]:
network = Three_Layer_NN_dropout(input_size=784, hidden_1_size=500, hidden_2_size=255, 
                         output_size=10, std=0.01)

_, _ = network.train(X, y, X_val, y_val, learning_rate=0.4, num_iters=7000, batch_size=100)

0 : train acc, val acc | 0.112366666667, 0.1064
500 : train acc, val acc | 0.9014, 0.9108
1000 : train acc, val acc | 0.953166666667, 0.9584
1500 : train acc, val acc | 0.963783333333, 0.9689
2000 : train acc, val acc | 0.965483333333, 0.97
2500 : train acc, val acc | 0.973016666667, 0.9758
3000 : train acc, val acc | 0.97905, 0.9786
3500 : train acc, val acc | 0.9821, 0.9826
4000 : train acc, val acc | 0.983766666667, 0.9853
4500 : train acc, val acc | 0.98515, 0.9855
5000 : train acc, val acc | 0.986133333333, 0.9868
5500 : train acc, val acc | 0.987483333333, 0.989
6000 : train acc, val acc | 0.988883333333, 0.9888
6500 : train acc, val acc | 0.99, 0.9909
Finish!


In [9]:
network.accuracy(X_test, y_test)

0.98140000000000005