# Implémentation du réseau de neurones et expérimentation

In [7]:
import numpy as np
from sklearn import datasets

## Calcul numériquement stable du softmax

In [8]:
def softmax_vector(x):
    '''
    Numerically stable softmax of a single data vector.

    x: data vector.
    returns: the softmax of x (non-negative entries that sum to 1).
    '''
    # Subtracting the largest component does not change the result but makes
    # every exponent <= 0, so np.exp cannot overflow.
    shifted = x - np.amax(x)
    weights = np.exp(shifted)
    total = np.sum(weights)

    return weights / total

In [27]:
def softmax(X):
    '''
    Numerically stable softmax applied independently to every row of X.

    X: matrix that holds the data, one data vector per row.
    returns: matrix of the same shape whose i-th row is the softmax of the
             i-th row of X.
    '''
    n_rows = X.shape[0]

    # Shift each row by its own maximum so that np.exp never overflows.
    row_max = np.amax(X, axis=1).reshape(n_rows, 1)
    weights = np.exp(X - row_max)

    # Column vector of per-row normalisers, broadcast across the columns.
    row_sums = np.sum(weights, axis=1).reshape(n_rows, 1)

    return weights / row_sums

Fonction utilitaire pour calculer relu($x$)

In [45]:
def relu(x):
    '''
    Rectified linear unit, applied element-wise.

    x: numpy array (its shape attribute is read).
    returns: array of the same shape holding max(x, 0) for every entry.
    '''
    floor = np.zeros(x.shape)
    return np.maximum(x, floor)

def onehot(m, y):
    '''
    One-hot encoding of a class index.

    m: number of classes.
    y: class index used to select a row of the m x m identity matrix.
    returns: row y of the m x m identity matrix.
    '''
    identity = np.eye(m)
    return identity[y]

def onehot_matrix(m, targets):
    """
    Build the one-hot encoding of a sequence of class indices.

    m: number of classes.
    targets: sequence of class indices. Integer-valued floats (for instance
             labels read from a text file) are accepted.
    Returns: m x len(targets) matrix where every column is the one-hot vector
             of the corresponding target.
    Raises: ValueError if a target is not integer-valued; IndexError if a
            target is out of range for m classes.
    """
    labels = np.asarray(targets).ravel()
    indices = labels.astype(int)
    if not np.array_equal(indices, labels):
        raise ValueError("targets must be integer class indices")

    encoded = np.zeros((m, indices.size))
    # Column i receives a 1 on row targets[i]; one vectorized assignment
    # replaces the per-target Python loop.
    encoded[indices, np.arange(indices.size)] = 1.0

    return encoded

## Questions 1 et 2

Implémentation de fprop et bprop pour calculer le gradient sur un exemple

In [38]:
class NeuralNet:
    '''
    Network with one ReLU hidden layer and a softmax output layer.

    Gradients are computed for one example at a time: call fprop(x) and then
    bprop(x, y) for the same example.
    '''

    def __init__(self, n_input, n_hidden, n_out):
        '''
        n_input: input dimension d.
        n_hidden: number of hidden units d_h.
        n_out: number of output classes m.
        '''
        self.n_in = n_input
        self.n_h = n_hidden
        self.n_o = n_out

        # Weights are drawn uniformly in [-1/sqrt(n), 1/sqrt(n)] where n is
        # the number of inputs of the layer; biases start at zero.
        bound_1 = 1 / np.sqrt(self.n_in)
        bound_2 = 1 / np.sqrt(self.n_h)

        self.W1 = np.random.uniform(-bound_1, bound_1, size=(self.n_h, self.n_in))  # d_h x d
        self.W2 = np.random.uniform(-bound_2, bound_2, size=(self.n_o, self.n_h))  # m x d_h
        self.b1 = np.zeros(self.n_h)  # dimension d_h
        self.b2 = np.zeros(self.n_o)  # dimension m

    def fprop(self, x):
        '''
        Computes and stores the activations of every layer for one example.

        x: input vector of dimension d.
        returns: the output probabilities, also stored in self.os.
        '''
        self.ha = self.W1.dot(x) + self.b1
        self.hs = relu(self.ha)
        self.oa = self.W2.dot(self.hs) + self.b2
        self.os = softmax_vector(self.oa)
        return self.os

    def bprop(self, x, y):
        '''
        Computes the gradients of the loss -log(os[y]) with respect to the
        parameters. Must be executed after fprop(x) on the same example.

        x: input vector of dimension d.
        y: class index of the example (an integer-valued float is accepted).
        returns: (grad_W1, grad_W2, grad_b1, grad_b2).
        '''
        grad_oa = self.os - onehot(self.n_o, int(y))
        grad_b2 = grad_oa
        grad_W2 = np.outer(grad_oa, self.hs)
        grad_hs = self.W2.T.dot(grad_oa)
        # The ReLU derivative is 1 where the pre-activation is positive, else 0.
        grad_ha = grad_hs * (self.ha > 0)
        grad_W1 = np.outer(grad_ha, x)
        grad_b1 = grad_ha

        return grad_W1, grad_W2, grad_b1, grad_b2

Fonction pour calculer les différences finies

In [2]:
def compute_loss(x, y, W1, W2, b1, b2):
    '''
    Loss of the one-hidden-layer network on a single example, computed from
    explicitly supplied parameters.

    x: input vector.
    y: index of the target class.
    W1, b1: weights and biases of the hidden layer.
    W2, b2: weights and biases of the output layer.
    returns: -log of the softmax output at index y.
    '''
    hidden = relu(W1.dot(x) + b1)
    probabilities = softmax_vector(W2.dot(hidden) + b2)

    return -np.log(probabilities[y])

In [40]:
def finite_diff(x, neural_net, eps=1e-5, y=None):
    '''
    Estimates the gradient of the loss on one example by forward finite
    differences, for comparison with the gradients returned by bprop.

    Every parameter entry is perturbed by eps in turn and the loss is
    re-evaluated with the module-level compute_loss function.

    x: input vector.
    neural_net: object exposing the parameters W1, W2, b1 and b2.
    eps: size of the perturbation applied to each parameter entry.
    y: class index of the example; required because the loss depends on it.
    returns: (grad_W1, grad_W2, grad_b1, grad_b2), each with the shape of the
             corresponding parameter.
    raises: ValueError if y is not provided.
    '''
    if y is None:
        raise ValueError("finite_diff requires the target class index y")
    y = int(y)

    # Work on float copies so that the network parameters are never modified.
    params = [
        np.array(p, dtype=float)
        for p in (neural_net.W1, neural_net.W2, neural_net.b1, neural_net.b2)
    ]
    base_loss = compute_loss(x, y, *params)

    estimates = []
    for param in params:
        estimate = np.zeros_like(param)
        for idx in np.ndindex(param.shape):
            saved = param[idx]
            param[idx] = saved + eps
            estimate[idx] = (compute_loss(x, y, *params) - base_loss) / eps
            param[idx] = saved  # restore before perturbing the next entry
        estimates.append(estimate)

    return tuple(estimates)

In [41]:
# Pass the path directly: np.loadtxt then opens and closes the file itself,
# whereas a handle created with open() here was never closed.
data = np.loadtxt('2moons.txt')

In [42]:
# Small network: 2 inputs, 2 hidden units, 2 output classes.
nn = NeuralNet(n_input=2, n_hidden=2, n_out=2)

## Questions 3 et 4

In [44]:
class NeuralNet:
    '''
    Network with one ReLU hidden layer and a softmax output layer, trained by
    mini-batch gradient descent with L1 and L2 penalties on the weights.

    Gradients are computed one example at a time and averaged over the batch.
    '''

    def __init__(self, n_input, n_hidden, n_out, lambdas):
        '''
        n_input: input dimension d.
        n_hidden: number of hidden units d_h.
        n_out: number of output classes m.
        lambdas: regularization coefficients; lambdas[0] applies to W1 and
                 lambdas[1] to W2, each holding (L1 coefficient, L2 coefficient).
        '''
        self.n_in = n_input
        self.n_h = n_hidden
        self.n_o = n_out
        self.lambdas = lambdas

        # Weights are drawn uniformly in [-1/sqrt(n), 1/sqrt(n)] where n is
        # the number of inputs of the layer; biases start at zero.
        bound_1 = 1 / np.sqrt(self.n_in)
        bound_2 = 1 / np.sqrt(self.n_h)

        self.W1 = np.random.uniform(-bound_1, bound_1, size=(self.n_h, self.n_in))  # d_h x d
        self.W2 = np.random.uniform(-bound_2, bound_2, size=(self.n_o, self.n_h))  # m x d_h
        self.b1 = np.zeros(self.n_h)  # dimension d_h
        self.b2 = np.zeros(self.n_o)  # dimension m

    def fprop(self, x):
        '''
        Computes and stores the activations of every layer for one example.

        x: input vector of dimension d.
        returns: the output probabilities, also stored in self.os.
        '''
        self.ha = self.W1.dot(x) + self.b1
        self.hs = relu(self.ha)
        self.oa = self.W2.dot(self.hs) + self.b2
        self.os = softmax_vector(self.oa)
        return self.os

    def bprop(self, x, y):
        '''
        Computes the gradients of the loss -log(os[y]) with respect to the
        parameters. Must be executed after fprop(x) on the same example.

        x: input vector of dimension d.
        y: class index of the example (an integer-valued float is accepted).
        returns: (grad_W1, grad_W2, grad_b1, grad_b2).
        '''
        grad_oa = self.os - onehot(self.n_o, int(y))
        grad_b2 = grad_oa
        grad_W2 = np.outer(grad_oa, self.hs)
        grad_hs = self.W2.T.dot(grad_oa)
        # The ReLU derivative is 1 where the pre-activation is positive, else 0.
        grad_ha = grad_hs * (self.ha > 0)
        grad_W1 = np.outer(grad_ha, x)
        grad_b1 = grad_ha

        return grad_W1, grad_W2, grad_b1, grad_b2

    def compute_loss(self, y):
        '''
        Loss of the example last passed to fprop.

        y: class index of that example (an integer-valued float is accepted).
        returns: -log of the output probability stored for class y.
        '''
        return -np.log(self.os[int(y)])

    def train(self, train_data, max_iter, batch_size, eta=0.05):
        '''
        Trains the network by mini-batch gradient descent.

        train_data: matrix with one example per row; the last column holds
                    the class index and the other columns the features.
        max_iter: number of passes over train_data.
        batch_size: number of examples per mini-batch (the last batch of a
                    pass may be smaller).
        eta: learning rate.

        The mean unregularized loss of every pass is appended to self.losses.
        raises: ValueError if batch_size is smaller than 1.
        '''
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        n_examples = train_data.shape[0]
        self.losses = []

        for _ in range(max_iter):
            epoch_loss = 0.0

            # Restart from the first example at every pass; slicing clips the
            # last batch to the end of the data.
            for batch_start in range(0, n_examples, batch_size):
                batch = train_data[batch_start:batch_start + batch_size]

                grad_W1_sum = np.zeros((self.n_h, self.n_in))
                grad_W2_sum = np.zeros((self.n_o, self.n_h))
                grad_b1_sum = np.zeros(self.n_h)
                grad_b2_sum = np.zeros(self.n_o)

                for elem in batch:
                    x, y = elem[:-1], int(elem[-1])

                    self.fprop(x)
                    epoch_loss += self.compute_loss(y)

                    grad_W1, grad_W2, grad_b1, grad_b2 = self.bprop(x, y)

                    grad_W1_sum += grad_W1
                    grad_W2_sum += grad_W2
                    grad_b1_sum += grad_b1
                    grad_b2_sum += grad_b2

                # Average the gradients over the examples of the batch.
                n = len(batch)
                grad_W1_mean = grad_W1_sum / n
                grad_W2_mean = grad_W2_sum / n
                grad_b1_mean = grad_b1_sum / n
                grad_b2_mean = grad_b2_sum / n

                # Gradient of the L1 and L2 penalties (weights only, no biases).
                penality_grad_W1 = self.lambdas[0][0] * np.sign(self.W1) + 2 * self.lambdas[0][1] * self.W1
                penality_grad_W2 = self.lambdas[1][0] * np.sign(self.W2) + 2 * self.lambdas[1][1] * self.W2

                self.W1 = self.W1 - eta * (grad_W1_mean + penality_grad_W1)
                self.W2 = self.W2 - eta * (grad_W2_mean + penality_grad_W2)
                self.b1 = self.b1 - eta * grad_b1_mean
                self.b2 = self.b2 - eta * grad_b2_mean

            if n_examples:
                self.losses.append(epoch_loss / n_examples)

## Question 5: Entraînement sur 2 moons et visualisation des régions de décision

## Question 6: Optimisation du calcul du gradient pour mini-batch

$\mathbf{W}^{(1)} \in \mathbb{R}^{d_h \times d}$, $\mathbf{X} \in \mathbb{R}^{n \times d}$ et $\mathbf{B}^{(1)} \in \mathbb{R}^{d_h \times n}$

$$\mathbf{h}^{a} = \mathbf{W}^{(1)}\mathbf{X}^{\top} + \mathbf{B}^{(1)} \in \mathbb{R}^{d_h \times n}$$


In [1]:
class NeuralNetVectorized:
    '''
    Network with one ReLU hidden layer and a softmax output layer, trained by
    mini-batch gradient descent with L1 and L2 penalties on the weights.

    Forward and backward passes process a whole mini-batch with matrix
    operations; activations are stored with one column per example.
    '''

    def __init__(self, n_input, n_hidden, n_out, lambdas):
        '''
        n_input: input dimension d.
        n_hidden: number of hidden units d_h.
        n_out: number of output classes m.
        lambdas: regularization coefficients; lambdas[0] applies to W1 and
                 lambdas[1] to W2, each holding (L1 coefficient, L2 coefficient).
        '''
        self.n_in = n_input
        self.n_h = n_hidden
        self.n_o = n_out
        self.lambdas = lambdas

        # Weights are drawn uniformly in [-1/sqrt(n), 1/sqrt(n)] where n is
        # the number of inputs of the layer; biases start at zero.
        bound_1 = 1 / np.sqrt(self.n_in)
        bound_2 = 1 / np.sqrt(self.n_h)

        self.W1 = np.random.uniform(-bound_1, bound_1, size=(self.n_h, self.n_in))  # d_h x d
        self.W2 = np.random.uniform(-bound_2, bound_2, size=(self.n_o, self.n_h))  # m x d_h
        self.b1 = np.zeros(self.n_h)  # dimension d_h
        self.b2 = np.zeros(self.n_o)  # dimension m

    def fprop(self, X):
        '''
        Computes and stores the activations of every layer for a batch.

        X: input data set, n x d (one example per row).
        returns: the m x n matrix of output probabilities, also stored in
                 self.os (column i belongs to example i).
        '''
        self.ha = self.W1.dot(X.T) + self.b1.reshape(self.n_h, 1)  # d_h x n
        self.hs = relu(self.ha)  # d_h x n
        self.oa = self.W2.dot(self.hs) + self.b2.reshape(self.n_o, 1)  # m x n
        # softmax normalizes each row, but here the examples are the columns:
        # transpose before and after so each example is normalized on its own.
        self.os = softmax(self.oa.T).T  # m x n
        return self.os

    def bprop(self, X, Y):
        '''
        Computes the gradients, must be executed after fprop(X).

        X: input data set, n x d (one example per row).
        Y: the n class indices (integer-valued floats are accepted).
        returns: (grad_W1, grad_W2, grad_b1, grad_b2). grad_W1 and grad_W2 are
                 summed over the examples; grad_b1 and grad_b2 keep one column
                 per example.
        '''
        labels = np.asarray(Y).ravel().astype(int)

        grad_oa = self.os - onehot_matrix(self.n_o, labels)  # m x n
        grad_b2 = grad_oa  # m x n
        grad_W2 = np.dot(grad_oa, self.hs.T)  # m x d_h, sum over the examples
        grad_hs = self.W2.T.dot(grad_oa)  # d_h x n
        # The ReLU derivative is 1 where the pre-activation is positive, else 0.
        grad_ha = grad_hs * (self.ha > 0)  # d_h x n
        grad_W1 = np.dot(grad_ha, X)  # d_h x d, sum over the examples
        grad_b1 = grad_ha  # d_h x n

        return grad_W1, grad_W2, grad_b1, grad_b2

    def compute_loss(self, y):
        '''
        Losses of the batch last passed to fprop.

        y: the n class indices of that batch (integer-valued floats are
           accepted).
        returns: vector of n losses, -log of the probability stored for the
                 target class of each example.
        '''
        labels = np.asarray(y).ravel().astype(int)
        # Pick, in each column (example), the probability of its target class.
        return -np.log(self.os[labels, np.arange(labels.size)])

    def train(self, train_data, max_iter, batch_size, eta=0.05):
        '''
        Trains the network by mini-batch gradient descent.

        train_data: matrix with one example per row; the last column holds
                    the class index and the other columns the features.
        max_iter: number of passes over train_data.
        batch_size: number of examples per mini-batch (the last batch of a
                    pass may be smaller).
        eta: learning rate.

        The mean unregularized loss of every pass is appended to self.losses.
        raises: ValueError if batch_size is smaller than 1.
        '''
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        n_examples = train_data.shape[0]
        self.losses = []

        for _ in range(max_iter):
            epoch_loss = 0.0

            # Restart from the first example at every pass; slicing clips the
            # last batch to the end of the data.
            for batch_start in range(0, n_examples, batch_size):
                batch = train_data[batch_start:batch_start + batch_size]
                features = batch[:, :-1]
                labels = batch[:, -1].astype(int)

                self.fprop(features)
                epoch_loss += np.sum(self.compute_loss(labels))
                grad_W1, grad_W2, grad_b1, grad_b2 = self.bprop(features, labels)

                n = len(batch)

                # Gradient of the L1 and L2 penalties (weights only, no biases).
                penality_grad_W1 = self.lambdas[0][0] * np.sign(self.W1) + 2 * self.lambdas[0][1] * self.W1
                penality_grad_W2 = self.lambdas[1][0] * np.sign(self.W2) + 2 * self.lambdas[1][1] * self.W2

                # Weight gradients are sums over the batch, hence the division
                # by n; bias gradients are averaged over their example columns.
                self.W1 = self.W1 - eta * ((grad_W1 / n) + penality_grad_W1)
                self.W2 = self.W2 - eta * ((grad_W2 / n) + penality_grad_W2)
                self.b1 = self.b1 - eta * np.mean(grad_b1, axis=1)
                self.b2 = self.b2 - eta * np.mean(grad_b2, axis=1)

            if n_examples:
                self.losses.append(epoch_loss / n_examples)