# Vanilla Neural Network from scratch

## Packages


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image # To grab the images and extract useful information

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

np.random.seed(42) # Set random seed

## Data


Read in the labels and file information

In [None]:
# The images are expected in a "train_selected" folder under the working directory
dataset_dir = f"{os.getcwd()}/train_selected"

# The labels CSV sits inside that same folder
labels_file = f"{dataset_dir}/train_selected.csv"
data_labels = pd.read_csv(labels_file)

# Show (rows, columns) as a quick sanity check of what was read
data_labels.shape

In [None]:
# One image path per row of the labels table: <dataset_dir>/<id>.png
file_list = [f"{dataset_dir}/{image_id}.png" for image_id in data_labels["id"]]

In [None]:
# Binary target column: 1 where the label is 'automobile', 0 for anything else
data_labels["class"] = np.where(data_labels["label"].eq('automobile'), 1, 0)

# Show how many examples fall into each class
data_labels["class"].value_counts()

In [None]:
# Scale the dataset by a constant so that pixel values
# (assumed to be in the 0-255 range) end up between 0 and 1.

def standarise_data(dataset):
    """
    Input:
    dataset     -- values to scale (anything that supports division by a float)

    Output:
    new_dataset -- `dataset` divided element-wise by 255
    """

    max_pixel_value = 255.

    return dataset / max_pixel_value

In [None]:
def load_data():
    """
    Load the images and labels, split them and lay them out for the network.

    Reads every path in the module-level `file_list` and the "class" column of
    the module-level `data_labels`, then makes a 70/30 train/test split.

    Output (all written to module-level globals, nothing is returned):
    X, y             -- the raw image array and the label vector before splitting
    X_train, X_test  -- flattened and scaled images, one column per example
    y_train, y_test  -- labels as row vectors of shape (1, number of examples)
    """
    global X_train, X_test, y_train, y_test, X, y

    def _read_image(fname):
        # Copy the pixels out and close the file straight away; a bare
        # Image.open() leaves the handle open until garbage collection.
        with Image.open(fname) as img:
            return np.array(img)

    X = np.array([_read_image(fname) for fname in file_list])
    y = np.array(data_labels["class"])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Labels become row vectors: (m,) -> (1, m)
    y_train = y_train.reshape(1, y_train.shape[0])
    y_test = y_test.reshape(1, y_test.shape[0])

    # Flatten every image into a single column: (m, ...) -> (features, m)
    X_train_f = X_train.reshape(X_train.shape[0], -1).T
    X_test_f = X_test.reshape(X_test.shape[0], -1).T

    # Standardize data to have feature values between 0 and 1.
    X_train = standarise_data(X_train_f)
    X_test = standarise_data(X_test_f)

    print ("Flatten X_train: " + str(X_train.shape))
    print ("Flatten X_test: " + str(X_test.shape))

    print ("y_train: " + str(y_train.shape))
    print ("y_test: " + str(y_test.shape))

    return

In [None]:
# Populate the module-level X_train, X_test, y_train, y_test, X and y
load_data()

## Quick 'normal' ml

You will find the following resources useful:

* https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
# Here is some quick code to explain the way sklearn does things
# sklearn estimators expect X as (n_samples, n_features) and y as a 1-D vector.
# load_data() stores the transposed layout used by the network (one column per
# example, labels as a (1, m) row), so convert back before fitting.
# You may find use out of the datetime.datetime module or the time module

from sklearn.linear_model import LogisticRegressionCV 
log_reg = LogisticRegressionCV()

# Fit the model to the training data
log_reg.fit(X_train.T, y_train.ravel())

In [None]:
# Repeat the same for a tree-based model

## Initialization

In [None]:
## TO DO ##

# Correctly create the layer dimensions as per the brief
# Replace False
# NOTE(review): while this is still [False] the list has a single entry, so the
# initialisation loop (range(1, len(layer_dimensions))) creates no parameters.
# NOTE(review): the first entry presumably has to equal X_train.shape[0] (the
# flattened size of one image) and the last entry 1 for the binary label --
# confirm against the brief.

# For example layer_dimensions of [5,7,2,1] would be input 5, two hidden layers (7,2) and 1 in output

layer_dimensions = [False]
layer_dimensions

In [1]:
# Parameter matrix sizes for layer l:
#   W_l has shape (units in layer l, units in layer l-1)
#   b_l has shape (units in layer l, 1) and is broadcast across the examples

def initialise_parameters(layer_dimensions):
    """
    Input:
    layer_dimensions -- python (list), one item per layer, number representing size of layer
    
    Output:
    parameters -- python dictionary containing your weight and bias parameters "W1", "b1", ..., "WL", "bL" 
                  with appropriate sizes.
    """
    
    # Kept for backward compatibility: the result is also published as a
    # module-level global, as well as being returned.
    global parameters
    
    np.random.seed(42)  # reproducible initial weights
    parameters = {}
    L = len(layer_dimensions)         

    for l in range(1, L):
        n_current = layer_dimensions[l]
        n_previous = layer_dimensions[l - 1]
        # Small random weights break the symmetry between units; biases can start at zero
        parameters['W' + str(l)] = np.random.randn(n_current, n_previous) * 0.01 
        parameters['b' + str(l)] = np.zeros((n_current, 1))
    
    return parameters

In [None]:
# It would be a good idea to test your initialisation function here. These matrix sizes are important!

## Forward prop

Create a series of functions that will:

* Undertake the linear multiplication
* Undertake the activation of the layer
* Store this somewhere for efficient computation of backprop

### Activations

We will need activation functions: ReLU for the hidden layers and a sigmoid for the output layer.

In [None]:
# Activation functions for forward prop: sigmoid and relu.
# Each one returns the activation together with Z, because backprop needs Z later.

def sigmoid(Z):
    """    
    Input:
    Z     -- numpy array of any shape
    
    Output:
    A     -- output of sigmoid(z), (should be same shape as Z!)
    cache -- returns Z as well, useful during backpropagation
    """
    
    Z_arr = np.asarray(Z, dtype=float)
    
    # Only ever exponentiate a non-positive number so np.exp cannot overflow:
    #   Z >= 0 : 1 / (1 + e^-Z)
    #   Z <  0 : e^Z / (1 + e^Z)
    exp_neg_abs = np.exp(-np.abs(Z_arr))
    A = np.where(Z_arr >= 0, 1. / (1. + exp_neg_abs), exp_neg_abs / (1. + exp_neg_abs))
    cache = Z
    
    return A, cache

def relu(Z):
    """    
    Input:
    Z     -- numpy array of any shape
    
    Output:
    A     -- output of relu(z), (should be same shape as Z!)
    cache -- returns Z as well, useful during backpropagation
    """
    
    # Element-wise max(0, z): negative values are zeroed, the rest pass through
    A = np.maximum(0, Z)
    
    cache = Z 
    
    return A, cache

### Forward Prop

In [None]:
# Linear component of forward prop for one layer: Z = W . A + b

def linear_forward(A, W, b):
    """
    Input:
    A     -- activations from previous layer
    W     -- weights matrix
    b     -- bias vector

    Output:
    Z     -- the input to activation function 
    cache -- a python tuple (A, W, b) kept for backprop
    """
    
    # b has one entry per unit and is broadcast across the columns (examples)
    Z = np.dot(W, A) + b
    
    cache = (A, W, b)
    
    return Z, cache

In [None]:
# One full layer of forward prop: the linear step followed by the
# activation function selected by name.

def activation_forward(A_prev, W, b, activation):
    """

    Input:
    A_prev     -- activations from previous layer
    W          -- weights matrix
    b          -- bias vector
    activation -- the activation type to be used ("sigmoid" or "relu")

    Output:
    A          -- the output of the activation function, also called the post-activation value 
    cache      -- a python tuple (linear_cache, activation_cache) kept for backprop

    Raises:
    ValueError -- if `activation` is not a supported name
    """

    if activation == "sigmoid":
        activation_function = sigmoid
    
    elif activation == "relu":
        activation_function = relu
    
    ###NOTE###
    # This is where you can put more activation functions for the extension tasks
    
    else:
        # Fail with a clear message rather than on an unbound variable below
        raise ValueError("Unknown activation: " + str(activation))
    
    # The linear step is identical for every activation
    Z, linear_cache = linear_forward(A_prev, W, b)
    A, activation_cache = activation_function(Z)
    
    cache = (linear_cache, activation_cache)

    return A, cache
    

In [None]:
# The forward pass through the whole network.
# The number of layers is read from the parameters dictionary (one W and one b
# per layer). Every layer except the last uses relu; the last uses sigmoid so
# that the output can be read as a probability.

def total_forward(X, parameters):
    """
    
    Input:
    X            -- raw data
    parameters   -- dictionary of initialised parameters ("W1", "b1", ..., "WL", "bL")
    
    Returns:
    AL           -- last post-activation value
    caches       -- list of caches from forward activations, one per layer in order
    """

    caches = []
    A = X
    L = len(parameters) // 2   # each layer contributes one "W" and one "b" entry
    
    
    # All the layers up until the last (sigmoid) layer
    for l in range(1, L):
        A_prev = A 
        A, cache = activation_forward(A_prev, 
                                      parameters['W' + str(l)], 
                                      parameters['b' + str(l)], 
                                      activation = "relu")
        caches.append(cache)
    
    # The last layer uses the sigmoid activation
    
    AL, cache = activation_forward(A, 
                                      parameters['W' + str(L)], 
                                      parameters['b' + str(L)], 
                                      activation = "sigmoid")
    caches.append(cache)
            
    return AL, caches

## Back prop

### Backwards activations

In [None]:
# Derivatives of the activation functions (relu and sigmoid), used to turn
# the gradient w.r.t. an activation (dA) into the gradient w.r.t. Z (dZ).

def relu_backward(dA, cache):
    """
    Input:
    dA      -- post-activation gradient
    cache   -- 'Z' that is used in the backwards prop here.

    Output:
    dZ      -- Gradient of the cost with respect to Z
    """
    
    Z = cache
    dZ = np.array(dA, copy=True) # Copy so the caller's dA is not modified
    
    # relu has slope 1 where Z > 0 and slope 0 elsewhere, so the gradient
    # passes through unchanged for positive Z and is blocked otherwise.
    dZ[Z <= 0] = 0
        
    return dZ

def sigmoid_backward(dA, cache):
    
    """
    Input:
    dA      -- post-activation gradient
    cache   -- 'Z' that is used in backprop here

    Returns:
    dZ      -- Gradient of the cost with respect to Z
    """
    
    Z = np.asarray(cache, dtype=float)
    
    # Recompute sigmoid(Z) in an overflow-safe way (only non-positive exponents)
    exp_neg_abs = np.exp(-np.abs(Z))
    s = np.where(Z >= 0, 1. / (1. + exp_neg_abs), exp_neg_abs / (1. + exp_neg_abs))
    
    # d(sigmoid)/dZ = s * (1 - s); chain rule with the incoming gradient
    dZ = dA * s * (1 - s)
        
    return dZ

### Linear Backwards

In [None]:
# You do not need to do anything here, but notes are included for your interest

def linear_backward(dZ, cache):
    """
    Backward pass through the linear part (Z = W . A_prev + b) of one layer.

    Input:
    dZ        -- Gradient of the cost with respect to 'Z' of current layer
    cache     -- (A_prev, W, b) stored by the forward pass for this layer

    Output:
    dA_prev   -- Gradient of the cost w.r.t activation of previous layer
    dW        -- Gradient of the cost w.r.t W of current layer
    db        -- Gradient of the cost w.r.t b of current layer
    """

    A_prev, W, _ = cache            # the bias value itself is not needed here
    n_examples = A_prev.shape[1]    # examples are stored one per column
    averaging = 1. / n_examples

    # Gradient flowing back to the previous layer's activations
    dA_prev = np.dot(W.T, dZ)

    # Parameter gradients, averaged over the examples
    dW = averaging * np.dot(dZ, A_prev.T)
    db = averaging * np.sum(dZ, axis=1, keepdims=True)

    return dA_prev, dW, db

In [None]:
# One full layer of backprop: undo the activation to obtain dZ (this needs
# the activation cache, i.e. Z), then undo the linear step (this needs the
# linear cache, i.e. A_prev, W and b).

def activation_backward(dA, cache, activation):
    """
    Input:
    dA         -- post-activation gradient for current layer
    cache      -- (linear_cache, activation_cache) stored previously for backprop
    activation -- activation for this layer ("sigmoid" or "relu")
    
    Output:
    dA_prev   -- Gradient of the cost w.r.t activation of previous layer
    dW        -- Gradient of the cost w.r.t W of current layer
    db        -- Gradient of the cost w.r.t b of current layer l

    Raises:
    ValueError -- if `activation` is not a supported name
    """
    
    linear_cache, activation_cache = cache

    
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
        
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
    
    else:
        # Fail with a clear message rather than on an unbound variable below
        raise ValueError("Unknown activation: " + str(activation))
    
    # The linear step is identical for every activation
    dA_prev, dW, db = linear_backward(dZ, linear_cache)
    
    return dA_prev, dW, db


In [2]:
# Backprop through the whole network.
# It starts from the derivative of the cross entropy loss with respect to the
# last activation, then walks the layers in reverse order.

def total_backward(AL, Y, caches):
    
    """
    
    Input:
    AL        -- probability vector, output of the forward propagation
    Y         -- true "label" vector (0 or 1 per example)
    caches    -- list of caches from relu and sigmoid we kept from forward prop
    
    output:
    grads     -- A dictionary with the gradients named dA+l,dW+l, db+l for each layer
    """
    
    grads = {}
    # One cache per layer, so the layer count comes from the caches themselves
    # rather than from a module-level global that may describe another network.
    L = len(caches)
    Y = Y.reshape(AL.shape)
    
    # Keep AL strictly inside (0, 1) so the divisions below stay finite
    eps = 1e-15
    AL_safe = np.clip(AL, eps, 1 - eps)
    
    # Initializing the backpropagation: d(cross entropy)/dAL
    dAL = -(np.divide(Y, AL_safe) - np.divide(1 - Y, 1 - AL_safe))

    # Last layer (sigmoid)
    current_cache = caches[L-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = activation_backward(dAL, current_cache, activation = "sigmoid")
    
    # Remaining layers (relu), from layer L-1 down to layer 1
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = activation_backward(grads["dA" + str(l + 1)], current_cache, activation = "relu")
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return grads

## Cost function

In [None]:
# Binary logistic cost function ('cross entropy loss'):
#   cost = -(1/m) * sum( y * log(a) + (1 - y) * log(1 - a) )
# This is on page 51 of the slides from block_1. 

def compute_cost(AL, Y):
    """
    Input:
    AL    -- probability vector for label predictions, shape (1, m)
    Y     -- truth vector, shape (1, m)

    Output:
    cost  -- cross entropy cost averaged over the m examples
    """
    
    m = Y.shape[1]
    
    # Keep probabilities strictly inside (0, 1) so log() never sees 0
    eps = 1e-15
    AL_safe = np.clip(AL, eps, 1 - eps)
    
    # Compute loss from aL and y.
    cost_total = -np.sum(Y * np.log(AL_safe) + (1 - Y) * np.log(1 - AL_safe))
    cost = (1./m) * cost_total
    
    cost = np.squeeze(cost) # Help with the shape
    
    return cost

## Update Parameters

In [None]:
# Gradient descent step: move every parameter against its gradient,
# scaled by the learning rate.

def update_parameters(parameters, grads, learning_rate):
    """
    
    Input:
    parameters    -- dictionary with parameters "W1", "b1", ..., "WL", "bL"
    grads         -- dictionary with gradients "dW1", "db1", ... (output of backprop)
    learning_rate -- step size to adjust parameters by
    
    Returns:
    parameters    -- dictionary containing your updated parameters , same structure as original parameters dict
                     (the dictionary passed in is updated and returned)
    """
    
    L = len(parameters) // 2   # each layer contributes one "W" and one "b" entry

    for l in range(1, L + 1):
        layer = str(l)
        parameters["W" + layer] = parameters["W" + layer] - learning_rate * grads["dW" + layer]
        parameters["b" + layer] = parameters["b" + layer] - learning_rate * grads["db" + layer]
        
    return parameters

## Put it all together

Create a function to knit together everything you have done so far and allow for different layer sizes and lengths to be used.

In [None]:
# The full training loop. Each iteration:
#   1. forward prop with the current parameters
#   2. compute the cost
#   3. backward prop to get the gradients
#   4. update the parameters with a gradient descent step

def total_backward_forward(X, Y, layers_dimensions, 
                           learning_rate = 0.0075, 
                           num_iterations = 3000, 
                           print_cost = False):
    
    """
    Input:
    X                 -- data
    Y                 -- truth vector (1,0)'s
    layers_dimensions -- list of dimensions for each layer of network
    learning_rate     -- step size for gradient descent
    num_iterations    -- number of training iterations to undertake
    print_cost        -- if True, it prints the cost every 100 steps
    
    output:
    parameters        -- parameters learnt by the model. Used to predict
    """

    np.random.seed(42)
    costs = []
    
    # Parameters initialization
    parameters = initialise_parameters(layers_dimensions)
    
    # Loop (gradient descent)
    for i in range(0, num_iterations):

        # Forward propagation:
        AL, caches = total_forward(X, parameters)
        
        # Compute cost
        cost = compute_cost(AL, Y)
    
        # Backward propagation.
        grads = total_backward(AL, Y, caches)
 
        # Update parameters.
        parameters = update_parameters(parameters, grads, learning_rate)
                
        # Record the cost every 100 iterations so the plot below always has
        # data, and print it only when asked to
        if i % 100 == 0:
            costs.append(cost)
            if print_cost:
                print ("Cost after iteration %i: %f" %(i, cost))
            
    # plot the cost
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per hundreds)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()
    
    return parameters

In [None]:
# Train the network. learning_rate is given explicitly; 0.0075 is only a
# starting value, so tune it if the cost diverges or falls too slowly.
parameters = total_backward_forward(X_train, 
                                    y_train, 
                                    layer_dimensions, 
                                    learning_rate = 0.0075, 
                                    num_iterations = 1500, 
                                    print_cost = True)

## Predict (Hold out)

In [None]:
# Predict with a trained network: run forward prop and turn the output
# probabilities into 0-1 predictions with a 0.5 threshold.

def predict(X, y, parameters):
    """ 
    Input:
    X           -- data (test set), one column per example
    y           -- true labels; not used here, kept so existing calls still work
    parameters  -- parameters of the trained model
    
    Output:
    p      -- 0/1 predictions for the given dataset X, same shape as probas
    probas -- the probabilities from the forward pass
    """
    
    # Forward propagation
    probas, caches = total_forward(X, parameters)
    
    # convert probas to 0/1 predictions: 1 where the probability exceeds 0.5
    p = (probas > 0.5).astype(int)
        
    return p, probas

In [None]:
# Create some predictions on the held-out test set.
# predict returns a pair: the predictions and the probabilities from the forward pass.
predictions, probas = predict(X_test, y_test, parameters)

In [None]:
# Plot every predicted probability against its example index.
# A quick visual check that something has not gone wrong.
plt.scatter(np.arange(probas.shape[1]), probas)

In [None]:
# Check your prediction value counts
# predictions is expected to be a (1, m) row vector; flatten it so that it
# becomes one column of m rows instead of one row of m columns.
pred_df = pd.DataFrame({"prediction": np.ravel(predictions)})
pred_df.prediction.value_counts()

In [None]:
# Do a bit of reshaping: sklearn metrics want one row per example.
# reshape(-1, 1) infers the number of examples; len(predictions) would be 1
# for a (1, m) row vector and make the reshape fail.
predictions_sk = predictions.reshape(-1, 1)
print(predictions_sk.shape)

y_test_sk = y_test.T
print(y_test_sk.shape)

In [None]:
# Build some sklearn scores

# Get confusion matrix (the arrays can be passed directly, no list() needed)
print("Confusion Matrix \n", confusion_matrix(y_test_sk, predictions_sk))

# Get classification report
print(classification_report(y_test_sk, predictions_sk))

# Accuracy score
print("Accuracy: ", accuracy_score(y_test_sk, predictions_sk))

# ROC_AUC score (uses the probabilities, not the thresholded predictions)
print("ROC_AUC: ", roc_auc_score(y_test_sk, probas.T))