In [637]:
import numpy as np
import pdb
import math

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

from sklearn.utils import shuffle

## 0. A neural network class (feedforward, fully connected)

Architectures are configurable. However, it only supports Stochastic Gradient Descent training. 

In [113]:
# sigmoid activation function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [99]:
def sigmoid_deriv(x):
    return x*(1 - x)

In [114]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [209]:
def mse(y_pred, y_true):
    return np.sum((y_pred - y_true)**2)

In [239]:
math.inf


inf

In [435]:
a = [1,2,3]

In [436]:
a[0:-1]

[1, 2]

In [675]:
np.power(2,2)

4

In [None]:
class NNet(object):
    
    def __init__(self, input_dim, layers_dim, gamma = .9):
        # store classifier metadata
        self.layers_dim = layers_dim
        self.input_dim = input_dim
        self.gamma = gamma
        # keep track of errors - to make sure they're goig down
        self.errors = [math.inf]
        self.predictions = []
        """
        Initialize weights
        """
        self.weights = []
        # get number of neurons in first hidden layer
        k = layers_dim[0]
        # initialize first weight matrix
        W1 = np.random.random((k, input_dim))
        self.weights.append(W1)
        # Add the rest of the dimensions
        for i in range(len(layers_dim) - 1):
            # previous and post dimension
            prev_dim, next_dim = layers_dim[i], layers_dim[i+1]
            self.weights.append(np.random.random((next_dim, prev_dim)))
        
        """
        
        """
        self.weight_change = []
        for W in self.weights:
            self.weight_change.append(np.zeros(W.shape))
        
        """
        initialize the biases
        Only the hidden layers get biases
        """
        self.biases = {}
        hidden_dimensions = layers_dim[:-1]
        for l in range(len(hidden_dimensions)):
            self.biases[l + 1] = np.random.random(hidden_dimensions[l])
        """
        Keep track of the partial derivatives with regards to each weight. 
        We will build them up using batch gradient descent
        """
        self.derivatives = []
        for weight in self.weights:
            self.derivatives.append(np.zeros(weight.shape))
        
        """
        Keep track of the activations. 
        """
        self.activations = []
        """
        Keep track of the errors (delta)
        """
        self.deltas = []
    
    def _forward_prop(self, x):
        # set the first activation to be the data point itself
        self.activations.append(x)
        # update the rest of the activations
        for l in range(len(self.weights)):
            # Get the pre-combination of the next layer
            z_plus1 = np.dot(self.weights[l], self.activations[l])
            """
            Apply activation.
            If the layer is a hidden layer, add the biases. 
            """
            if l+1 in self.biases:
                a_plus1 = sigmoid(z_plus1 + self.biases[l+1])
            else:
                a_plus1 = sigmoid(z_plus1)
            # add to list of activations
            self.activations.append(a_plus1)
    
    def _back_prop(self, y):
        
        # compute the prediction error - difference between prediction and truth
        e = self.activations[-1] - y # vector subtraction
        self.deltas = [e]
        # compute the rest of the errors
        for l in reversed(range(1,len(self.weights))):
            # get the next error
            delta_plus1 = self.deltas[0]
            # get the current weight and activatios
            W_l = self.weights[l]
            a_l = self.activations[l]
            # compute the current derivative with respect to the activation (sigmoid)
            g_l = sigmoid_deriv(a_l)
            # compute the current error
            delta_l = np.multiply(np.dot(W_l.T, delta_plus1),g_l)
            # add the error to the front of the list
            self.deltas.insert(0, delta_l)
        """
        Update the partial derivatives of the weights
        """
        for k in range(len(self.weights)):
            self.derivatives[k] += np.outer(self.deltas[k], self.activations[k].T)
    
    def train(self,X, y, eta = .001, epochs = 10, print_every = 10, scaling = 1.0):
        """
        response y is expected to be in one-hot encoding. 
        convert it to be as such
        """
        y_onehot = np.zeros((X.shape[0], self.layers_dim[-1]))
        for i in range(len(y)):
            tmp = np.zeros(self.layers_dim[-1])
            tmp[int(y[i])] = 1
            y_onehot[i] = tmp
        
        """
        Store the number of data points
        """
        m = X.shape[0]
        """
        perform stochastic gradient descent
        """
        for e in range(epochs):
            # reset the predictions from previous epoch
            self._reset_predictions()
            if e % print_every == 0:
                print("Epoch %d: MSE = %f" %(e, self.errors[-1]))
            """
            SGD: Update for every training example
            """
            for i in range(X.shape[0]):
                # current example and respunse
                x, _y = X[i], y_onehot[i]
                
                # reset previous acivations and errors
                self._reset_activations()
                self._reset_deltas()
                self._reset_derivatives()
                # forward propogate
                self._forward_prop(x)
                # backwards propogate
                self._back_prop(_y)
                # update the weights using the derivatives
                for l in range(len(self.weights)):
                    if gamma < 1:
                        # Calculate the change in the weights that needs to be made
                        weight_change = self.gamma*self.weight_change[l] + eta*self.derivatives[l]
                        # update the weights
                        self.weights[l] -= weight_change
                        # store the weight change for next time
                        self.weight_change[l] = weight_change
                    else:
                        self.weights[l] -= eta*self.derivatives[l]
                    
                # update the biases using the errors
                for k in self.biases:
                    self.biases[k] -= eta*self.deltas[k - 1]
                
                # add the prediction of that example to the list
                self.predictions.append(np.argmax(self.activations[-1]))
            
            """
            End of epoch. 
            Calculate error, and store it.
            
            Also rescale learning rate
            """
            # current prediction
            pred = self.predictions
            self.errors.append(mse(pred, y))
            # rescale learning rate
            eta = eta/(np.power(e + 1,scaling))
    
    def predict(self, X_new):
        # store the predictions in a list for now
        predictions = []
        # for each of the training examples, forward propogate
        for x in X_new:
            self._reset_activations()
            self._forward_prop(x)
            # add the prediction to the predictions list
            predictions.append(np.argmax(self.activations[-1]))
            
        return(np.array(predictions))
        
                    
    def _reset_activations(self):
        self.activations = []
    
    def _reset_deltas(self):
        self.deltas = []
    
    def _reset_predictions(self):
        self.predictions = []
    
    def _reset_derivatives(self):
        self.derivatives = []
        for weight in self.weights:
            self.derivatives.append(np.zeros(weight.shape))

## 1. A dummy example - model validation

I'll apply the network on the classic Iris dataset to verify that it works. 

In [658]:
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [631]:
X.shape

(150, 4)

In [659]:
# num classes in y
np.unique(y)

array([0, 1, 2])

In [662]:
net = NNet(input_dim = 4, layers_dim = (20, 3))

In [663]:
net.train(X,y, epochs= 1000, print_every=100, eta = .001)

Epoch 0: MSE = inf
Epoch 100: MSE = 11.000000
Epoch 200: MSE = 6.000000
Epoch 300: MSE = 9.000000
Epoch 400: MSE = 11.000000
Epoch 500: MSE = 11.000000
Epoch 600: MSE = 8.000000
Epoch 700: MSE = 9.000000
Epoch 800: MSE = 8.000000
Epoch 900: MSE = 7.000000


In [664]:
accuracy_score(net.predict(X), y)

0.96

### IT WORKS :) :) 

## 1. Load Data - Circle preprocessing

Recall - there are four pre-processing schemes. Based on the logistic regression results, the dataset with the "circle heuristic" works best. I'll only use this dataset. 

In [665]:
X = np.load("../data/preproccessed/circle/X_trainnorm.npy")
y = np.load("../data/preproccessed/circle/y_trainnorm.npy")

In [640]:
X.shape, y.shape

((50000, 28, 28), (50000, 1))

We'll have to unroll the data:

In [666]:
X = X.reshape(50000, 28*28)
y = y.reshape(50000,)

#### Train/validation splits

In [667]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=1)

## 3. A first architecture

Here, we'll have one hidden layer of 64 nodes.

In [691]:
net1 = NNet(input_dim = 28*28, layers_dim=(128,10), gamma = .99)

Train!

In [692]:
net1.train(X_train, y_train, print_every=1, eta = .01, epochs=100, scaling=.5)

Epoch 0: MSE = inf
Epoch 1: MSE = 655114.000000
Epoch 2: MSE = 663528.000000
Epoch 3: MSE = 662018.000000
Epoch 4: MSE = 659532.000000
Epoch 5: MSE = 666435.000000
Epoch 6: MSE = 665328.000000
Epoch 7: MSE = 668678.000000
Epoch 8: MSE = 664328.000000


KeyboardInterrupt: 

Too slow... I'll train on AWS so that I can at least turn of my computer..