# Multi Layer Perceptron

In [1]:
import numpy as np

In [2]:
with open('data/heart.data') as inF:
    data = inF.readlines()


In [3]:
X = []
y = []
for line in data:
    line = line.split(',')
    
    features = line[:13]
    label = line[13]
    
    clean_features = []
    for x in features:
        try:
            x = float(x)
            clean_features.append(x)
        except:
            clean_features.append(0)
    assert len(clean_features) == len(features)
    X.append(clean_features)
    y.append(label)

X = np.array(X)

# 0 = No heart Disease, 1 = Heart Disease
y = np.array([0 if float(x) == float(0) else 1 for x in y])


X.shape, y.shape


((303, 13), (303,))

In [4]:
y

array([0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,

### Zero Based Rule

- Baseline Accuracy: always pick the most common class

### Gradient Descent Based Algorithms

Machine Learning Algorithms like: 
- Linear Regression
- Logistic Regression
- Neural Networks

use gradient descent as an optimization technique and require input data to be scaled

In [5]:
def scale(x, normal=True, min=0, max=0):
    """
        Normalizes a numpy array
        This can be useful in algorithms
        that do not assume any distribution 
        of the data like Neural Networks.
    """
    min= x.min()
    max = x.max()
    range = x.max() - x.min()
    scaled = (x - min) / range
    return scaled

X_prime = scale(X)

In [6]:
X_prime

array([[0.11170213, 0.00177305, 0.00177305, ..., 0.00531915, 0.        ,
        0.0106383 ],
       [0.11879433, 0.00177305, 0.0070922 , ..., 0.0035461 , 0.00531915,
        0.00531915],
       [0.11879433, 0.00177305, 0.0070922 , ..., 0.0035461 , 0.0035461 ,
        0.01241135],
       ...,
       [0.10106383, 0.00177305, 0.0070922 , ..., 0.0035461 , 0.00177305,
        0.01241135],
       [0.10106383, 0.        , 0.0035461 , ..., 0.0035461 , 0.00177305,
        0.00531915],
       [0.06737589, 0.00177305, 0.00531915, ..., 0.00177305, 0.        ,
        0.00531915]])

In [7]:
X_prime.shape

(303, 13)


The shape of W depends on whether or not X's columns are features or Examples.\
We dereive the deimensions for the Bias and Weight matrix based on the input dimensions and required output dimensions. 

`W` must have the same number as rows, as input (`X`) has features (columns) \
Matrix Addition doesn't change the shape of matrices, \
Therefore `b` must have the same dimensions as the output Matrix `y` 


In [8]:

# Multi Layer Perceptron
EXAMPLES = X.shape[0]
FEATURES = X.shape[1]
CLASSES = 1

NODES = 16
LAYERS = 1

INITWEIGHT = 1

In [9]:
X_xor = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.int64)
X_xor

array([[0, 0],
       [0, 1],
       [1, 0],
       [1, 1]])

In [10]:
biases = np.ones((4, 1), dtype=np.int64)
biases

array([[1],
       [1],
       [1],
       [1]])

In [11]:
X_xor = np.append(X_xor, biases, axis=1)

In [12]:
X_xor

array([[0, 0, 1],
       [0, 1, 1],
       [1, 0, 1],
       [1, 1, 1]])

In [13]:
weights = np.array([[.351, -.097, .457],[1.076, -.165, -.165], [1.16, .542, -.331]])

In [14]:
affine = np.matmul(X_xor, weights)

In [15]:
affine

array([[ 1.16 ,  0.542, -0.331],
       [ 2.236,  0.377, -0.496],
       [ 1.511,  0.445,  0.126],
       [ 2.587,  0.28 , -0.039]])

In [16]:
np.random.seed(5)
b1 = np.random.uniform(low=0, high=INITWEIGHT, size=NODES)
print(f"b Shape: {b1.shape}")
print(f"b:\n{b1}")

b Shape: (16,)
b:
[0.22199317 0.87073231 0.20671916 0.91861091 0.48841119 0.61174386
 0.76590786 0.51841799 0.2968005  0.18772123 0.08074127 0.7384403
 0.44130922 0.15830987 0.87993703 0.27408646]


Multiplying `X` and `W` must be equal to `b` 

In [17]:
np.random.seed(10)
W1 = np.random.uniform(low=0, high=INITWEIGHT, size=(FEATURES, NODES)).round(2) 
print(f"W shape: {W1.shape}")
print(f"W\n{W1}")

W shape: (13, 16)
W
[[0.77 0.02 0.63 0.75 0.5  0.22 0.2  0.76 0.17 0.09 0.69 0.95 0.   0.51
  0.81 0.61]
 [0.72 0.29 0.92 0.71 0.54 0.14 0.37 0.67 0.44 0.43 0.62 0.51 0.65 0.6
  0.81 0.52]
 [0.91 0.32 0.09 0.3  0.11 0.83 0.05 0.63 0.55 0.82 0.2  0.86 0.35 0.75
  0.3  0.88]
 [0.33 0.17 0.39 0.09 0.82 0.15 0.38 0.94 0.99 0.46 0.83 0.25 0.6  0.9
  0.53 0.59]
 [0.04 0.36 0.08 0.31 0.33 0.77 0.04 0.43 0.31 0.64 0.35 0.04 0.88 0.76
  0.88 0.42]
 [0.61 0.51 0.6  0.26 0.3  0.03 0.3  0.24 0.56 0.57 0.48 0.29 0.06 0.98
  0.34 0.5 ]
 [0.98 0.44 0.32 0.52 0.58 0.85 0.07 0.46 0.78 0.72 0.59 0.04 0.35 0.56
  0.3  0.51]
 [0.67 0.16 0.05 0.34 0.11 0.18 0.89 0.37 0.22 0.75 0.11 0.74 0.47 0.6
  0.15 0.18]
 [0.65 0.05 0.25 0.54 0.23 0.38 0.92 0.93 0.57 0.53 0.01 0.98 0.57 0.79
  0.56 0.88]
 [0.58 0.71 0.15 0.43 0.69 0.1  0.44 0.17 0.51 0.82 0.09 0.8  0.57 0.59
  0.2  0.44]
 [0.3  0.04 0.03 0.45 0.74 0.56 0.39 0.17 0.84 0.6  0.78 0.85 0.6  0.78
  0.62 0.02]
 [0.75 0.18 0.46 0.51 0.48 0.84 0.17 0.01 0.85 0

We multiply input with its corresponding weight and sum all of them. \
Because we have arranged X as examples by features, we multiply W by X. \
This Neuron Activation results in a linear Affine Transform of X

In [18]:
X_prime.shape

(303, 13)

In [19]:
affine = np.matmul(X_prime, W1)+b1
affine.shape

(303, 16)

An activation function in a neural network defines how the weighted sum of the input is transformed into an output from a node. 
<br> This is known as <i>Neuron Transfer</i>

In [20]:
y1 = np.maximum(affine, 0) #ReLU Activation

In [21]:
#y1

y1 now becomes the input for the next layer

For the Hidden Layers, the shape of W depends on the number of nodes in layer n-1, and layer n

In [22]:
y1.shape

(303, 16)

In [23]:
np.random.seed(10)
W2 = np.random.uniform(low=0, high=INITWEIGHT, size=(y1.shape[1], NODES)).round(2) 
b2 = np.random.uniform(low=0, high=INITWEIGHT, size=NODES).round(2)
y2 = np.maximum(np.matmul(y1, W2)+b2, 0)
y2.shape

(303, 16)

In [24]:
np.random.seed(45)
W3 = np.random.uniform(low=0, high=INITWEIGHT,size=(y2.shape[1], CLASSES)).round(2) 
b3 = np.random.uniform(low=0, high=INITWEIGHT, size=CLASSES).round(2) 
W3.shape


(16, 1)

In [25]:
def sigmoid(x):
    """
        converts hidden units into confidence indices
    """
    return 1/(1 + np.exp(-x))

In [26]:
sigmoid(-.496)

0.378481143072389

In [27]:
sigmoid(14.934)

0.9999996732270753

In [28]:
#np.matmul(y2, W3)+b3

In [29]:
output = sigmoid(np.matmul(y2, W3)+b3)
output.shape

(303, 1)

In [30]:
#output

# Calculate Loss

### **Maximum Likelihood** 

<p>To estimate the error of a set weights in a neural network, we prefer functions that optimization algorithms like SGD can easily descend. </p>

<p><i>Maximum Likelihood</i> seeks to find the optimum values for parameters by maximizing  a likelihood function derived from the training data. <br>We are minimizing the dissimilarity between the empirical distribution found in the training set and the model distribution, as defined by KL Divergence. <br>Minimizing KL Divergence corresponds exactly to minimizing the cross-entropy between the distributions. </p>

### Loss Function: Binary Cross Entropy

$$
H_p(q) = -\frac{1}{N} \sum_{i=1}^{N} y_i * \log(p(y_i)) + (1-yi) * \log(1-p(y_i))
$$

<p> We seek a set of model weights that minimize the difference between the model's predicted probability distribution and the distribution of probabilities in the training dataset.<br><br><i> This is called cross-entropy.</i>

In [31]:
def binary_cross_entropy(y_logit, y):
    """
        the choice of loss is directly related to activation function of a neural network
    """
    
    bce = y * np.log(y_logit) + (1 - y) * np.log(1 - y_logit)
    return np.mean(-bce)

In [32]:
loss = binary_cross_entropy(output, y)

  bce = y * np.log(y_logit) + (1 - y) * np.log(1 - y_logit)
  bce = y * np.log(y_logit) + (1 - y) * np.log(1 - y_logit)


In [33]:
X_prime[:16].shape

(16, 13)

### Back Prop

In [35]:
np.random.seed(45)

class FeedForwardMultiLayerPerceptron(object):
    """
        Multi Layer Perceptron for classification
    """

    def __init__(
        self, 
        x, 
        y,
        hidden_layers:int, 
        hidden_units:int, 
        output_nodes:int, 
        init_weight=.3, 
        round_n=2):
        """
            initializes an MLP
            x is an numpy array of training data
            output_nodes is the number of classes
        """
        #self.optimizer = 
        #self.network = list()


        self.inputs = x
        self.labels = y
        self.examples = self.inputs.shape[0]
        self.features = self.inputs.shape[1]

        self.init_weight = init_weight
        
        self.hidden_nodes = hidden_units
        self.output_nodes = output_nodes
        
        self.input_layer = (
            np.random.uniform(low=0, high=self.init_weight, size=(self.features, self.hidden_nodes)).round(round_n),
            np.random.uniform(low=0, high=self.init_weight, size=self.hidden_nodes).round(round_n)
        )

        self.hidden_layers = [
            (
                np.random.uniform(low=0, high=self.init_weight, size=(self.hidden_nodes, self.hidden_nodes)).round(round_n),
                np.random.uniform(low=0, high=self.init_weight, size=self.hidden_nodes).round(round_n)
            ) for layer in range(hidden_layers)
        ]

        self.output_layer = (
            np.random.uniform(low=0, high=self.init_weight,size=(self.hidden_nodes, self.output_nodes)).round(round_n),
            np.random.uniform(low=0, high=self.init_weight, size=self.output_nodes).round(round_n) 

        )


        self.network = list()
        self.network.append(self.input_layer)
        for layer in self.hidden_layers:
            self.network.append(layer)
        self.network.append(self.output_layer)


    def __str__(self):
        return f'"input layer shape":\n{self.input_layer.shape},\n"hidden layers":\n{self.hidden_layers},\n"output layer":{self.output_layer}"' 
        

    def ReLU_transfer(self, x):
        return np.maximum(x, 0)


    def sigmoid(self, x):
        """
        """
        return 1/(1 + np.exp(-x))


    # def sigmoid_derivative(self, x):
    #     """
    #     """
	#     return x * (1.0 - x)


    def loss(self, y_logit, y):
        """
            the choice of loss is directly related to activation function of a neural network
            this loss function is binary cross entropy, as our activation function is sigmoid
        """
        bce = y * np.log(y_logit) + (1 - y) * np.log(1 - y_logit)
        return np.mean(-bce)


    def forward(self):
        """
        """

        forward_pass = np.matmul(self.inputs, self.input_layer[0]) + self.input_layer[1] 
        forward_pass = self.sigmoid(forward_pass)


        for weight, bias in self.hidden_layers:
            forward_pass = np.matmul(forward_pass, weight) + bias
            forward_pass = self.sigmoid(forward_pass)


        forward_pass = np.matmul(forward_pass, self.output_layer[0]) + self.output_layer[1]

        self.pass_probs = self.sigmoid(forward_pass)
        self.pass_loss = self.loss(forward_pass, self.labels)
        print(f"Pass Loss: {self.pass_loss}")
        return self.pass_loss, self.pass_probs

    # # Backpropagate error and store in neurons
    # def backward(self):

    #     for i in reversed(range(len(network))):
    #         layer = network[i]
    #         errors = list()
    #         if i != len(network)-1:
    #             for j in range(len(layer)):
    #                 error = 0.0
    #                 for neuron in network[i + 1]:
    #                     error += (neuron['weights'][j] * neuron['delta'])
    #                 errors.append(error)
    #         else:
    #             for j in range(len(layer)):
    #                 neuron = layer[j]
    #                 errors.append(neuron['output'] - expected[j])
    #         for j in range(len(layer)):
    #             neuron = layer[j]
    #             neuron['delta'] = errors[j] * transfer_derivative(neuron['output'])
    #     #def accuracy(self):




network = FeedForwardMultiLayerPerceptron(
    x = X_prime,
    y = y,
    hidden_layers = 1,
    hidden_units = 8,
    output_nodes = CLASSES,
)

loss, probs = network.forward()

AttributeError: 'FeedForwardMultiLayerPerceptron' object has no attribute 'sigmoid'