# Vanilla Neural Network from scratch

## Packages


In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image # To grab the images and extract useful information

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

np.random.seed(42) # Set random seed

## Data


Read in the labels and file information

In [2]:
# Set the dataset directory
dataset_dir = os.getcwd() + "/train_selected"

# Get the data labels
labels_file = dataset_dir + "/train_selected.csv"
data_labels = pd.read_csv(labels_file)

data_labels.shape

(4870, 2)

In [3]:
# Get X files
file_list = [dataset_dir + "/" + str(x) + ".png" for x in list(data_labels["id"])]

In [4]:
# Get the labels
data_labels["class"] = np.where(data_labels['label']=='automobile', 1, 0)
data_labels["class"].value_counts()

0    4370
1     500
Name: class, dtype: int64

In [5]:
## TO DO ##

# Create a function that will standardise the dataset
# Replace False

def standarise_data(dataset):
    
    new_dataset = dataset/255.
    
    return new_dataset

In [6]:
def load_data():
    global X_train, X_test, y_train, y_test, X, y
    
    X = np.array([np.array(Image.open(fname)) for fname in file_list])
    y = np.array(data_labels["class"])
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    y_train = y_train.reshape(1, y_train.shape[0])
    y_test = y_test.reshape(1, y_test.shape[0])
    
    # Reshape the training and test examples 
    X_train_f = X_train.reshape(X_train.shape[0], -1).T
    X_test_f = X_test.reshape(X_test.shape[0], -1).T
    
    # Standardize data to have feature values between 0 and 1.
    X_train = standarise_data(X_train_f)
    X_test = standarise_data(X_test_f)
    

    print ("Flatten X_train: " + str(X_train.shape))
    print ("Flatten X_test: " + str(X_test.shape))
    
    print ("y_train: " + str(y_train.shape))
    print ("y_test: " + str(y_test.shape))
    
    return

In [7]:
load_data()

Flatten X_train: (3072, 3409)
Flatten X_test: (3072, 1461)
y_train: (1, 3409)
y_test: (1, 1461)


## Quick 'normal' ml

You will find the following resources useful:

* https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [8]:
X_train_clf = X_train.T
X_test_clf = X_test.T

y_train_clf = y_train.T.ravel()
y_test_clf = y_test.T.ravel()

print(X_train_clf.shape, X_test_clf.shape, y_train_clf.shape, y_test_clf.shape)

(3409, 3072) (1461, 3072) (3409,) (1461,)


In [9]:
from sklearn.linear_model import LogisticRegressionCV 
import datetime

C_list = np.linspace(0.001, 0.5, 20)
log_reg = LogisticRegressionCV(
    Cs=C_list, cv=10, penalty='l2', scoring='roc_auc', solver='liblinear', tol =1e-4, max_iter=1000, 
    class_weight='balanced', n_jobs=7, verbose=2, refit=True, multi_class='ovr', random_state=42
)

#Fit to our model
start = datetime.datetime.now()
log_reg.fit(X_train_clf, y_train_clf)
end = datetime.datetime.now()
print("Total time taken: {}".format(end - start))

KeyboardInterrupt: 

In [11]:
#Predict the class
y_test_clf = pd.DataFrame(y_test_clf, columns=["actual"])
y_test_clf["predictions_lr"] = log_reg.predict(X_test_clf)

In [12]:
# Get confusion matrix 
print("Confustion Matrix \n", confusion_matrix(y_test_clf.actual, y_test_clf.predictions_lr))

# Get classification report
print(classification_report(y_test_clf.actual, y_test_clf.predictions_lr))

# Get ROC-AUC
print("ROC-AUC Score \n", roc_auc_score(y_test_clf.actual, y_test_clf.predictions_lr))

# Get accuracy
print("Accuracy Score \n", accuracy_score(y_test_clf.actual, y_test_clf.predictions_lr))

Confustion Matrix 
 [[1154  143]
 [  53  111]]
             precision    recall  f1-score   support

          0       0.96      0.89      0.92      1297
          1       0.44      0.68      0.53       164

avg / total       0.90      0.87      0.88      1461

ROC-AUC Score 
 0.7832874174925251
Accuracy Score 
 0.865845311430527


In [48]:
# A tree based example

from sklearn.ensemble import RandomForestClassifier
import datetime

#Create the model object
rf_class = RandomForestClassifier(
    n_estimators=1000, criterion='entropy', 
    max_depth=15, min_samples_split=3, bootstrap=True, oob_score=True, 
    n_jobs=7, random_state=42, verbose=1, class_weight='balanced' 
)

#Fit to our model
start = datetime.datetime.now()
rf_class.fit(X_train_clf, y_train_clf)
end = datetime.datetime.now()
print("Total time taken: {}".format(end - start))

[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.6s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    2.8s
[Parallel(n_jobs=7)]: Done 436 tasks      | elapsed:    6.3s
[Parallel(n_jobs=7)]: Done 786 tasks      | elapsed:   11.0s
[Parallel(n_jobs=7)]: Done 1000 out of 1000 | elapsed:   13.8s finished


Total time taken: 0:00:18.865685


In [14]:
#Predict the class
y_test_clf["predictions_rf"] = rf_class.predict(X_test_clf)

[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 436 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 786 tasks      | elapsed:    0.2s
[Parallel(n_jobs=7)]: Done 1000 out of 1000 | elapsed:    0.3s finished


In [15]:
# Get confusion matrix 
print("Confustion Matrix \n", confusion_matrix(y_test_clf.actual, y_test_clf.predictions_rf))

# Get classification report
print(classification_report(y_test_clf.actual, y_test_clf.predictions_rf))

# Get ROC-AUC
print("ROC-AUC Score \n", roc_auc_score(y_test_clf.actual, y_test_clf.predictions_rf))

# Get accuracy
print("Accuracy Score \n", accuracy_score(y_test_clf.actual, y_test_clf.predictions_rf))

Confustion Matrix 
 [[1297    0]
 [ 151   13]]
             precision    recall  f1-score   support

          0       0.90      1.00      0.94      1297
          1       1.00      0.08      0.15       164

avg / total       0.91      0.90      0.86      1461

ROC-AUC Score 
 0.5396341463414634
Accuracy Score 
 0.8966461327857632


## Initialization

In [9]:
## TO DO ##

# Correctly create the layer dimensions as per the brief
# Replace False

# For example layer_dimensions of [5,7,2,1] would be input 5, two hidden layers (7,2) and 1 in output

layer_dimensions = [3072,10,25,10,1]
layer_dimensions

[3072, 10, 25, 10, 1]

In [10]:
## TO DO ##

# Use your knowledge of parameter matrix size to edit the code. 
# Replace False

def initialise_parameters(layer_dimensions):
    """
    Input:
    layer_dimensions -- python (list), one item per layer, number representing size of layer
    
    Output:
    parameters -- python dictionary containing your weight and bias parameters "W1", "b1", ..., "WL", "bL" 
                  with appropriate sizes.
    """
    
    global parameters
    
    np.random.seed(42)
    parameters = {}
    L = len(layer_dimensions)         

    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layer_dimensions[l], layer_dimensions[l-1]) * 0.01 
        parameters['b' + str(l)] = np.zeros((layer_dimensions[l],1))
    
    return parameters

In [11]:
# It would be a good idea to test your initialisation function here. These matrix sizes are important!

parameters = initialise_parameters((layer_dimensions))
print("W1 = " + str(parameters["W1"]))
print("b1 = " + str(parameters["b1"]))
print("W2 = " + str(parameters["W2"]))
print("b2 = " + str(parameters["b2"]))

W1 = [[ 0.00496714 -0.00138264  0.00647689 ...  0.00692723 -0.0126933
   0.01702515]
 [ 0.00202329  0.01631857 -0.00733033 ... -0.00045929 -0.00016568
   0.00683325]
 [ 0.00590744 -0.00220114  0.00063649 ...  0.00532892 -0.00617286
   0.01202249]
 ...
 [-0.00046009  0.00611291  0.0023457  ...  0.00043588  0.0051953
  -0.00264349]
 [ 0.00516406 -0.00085402  0.01011368 ...  0.00567622 -0.01582958
  -0.00766962]
 [ 0.00289715 -0.01204151 -0.0066031  ... -0.01609538  0.01883204
  -0.00683668]]
b1 = [[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]
W2 = [[ 1.41266780e-02 -2.21370910e-03 -1.13080420e-02  5.09068390e-03
  -3.91056857e-03 -2.07101106e-03  1.44905934e-03 -8.12212836e-03
  -1.62113635e-02 -5.12696147e-04]
 [ 9.63917703e-03 -9.01591259e-03  1.53962266e-03  1.50550715e-02
  -8.97648873e-04  8.60943718e-04  5.52123536e-04 -6.82344426e-03
  -9.93354518e-03 -9.89262369e-03]
 [ 3.68183197e-03  7.57872362e-03  8.05082238e-03 -3.78542854e-03
   1.09330854e-02  3.07363820e-03 

## Forward prop

Create a series of functions that will:

* Undertake the linear multiplication
* Underake the activation of the layer
* Store this somewhere for efficient computation of backprop

### Activations

We will need activations 

In [12]:
## TO DO ##

# Create a function that will undertake sigmoid activaiton
# Create another function that will undertake relu activation
# Replace False

def sigmoid(Z):
    """    
    Input:
    Z     -- numpy array of any shape
    
    Output:
    A     -- output of sigmoid(z), (should be same shape as Z!)
    cache -- returns Z as well, useful during backpropagation
    """
        
    A = 1/(1+np.exp(-Z)) #Added Sigmoid
    cache = Z
    
    return A, cache

def relu(Z):
    """    
    Input:
    Z     -- numpy array of any shape
    
    Output:
    A     -- output of relu(z), (should be same shape as Z!)
    cache -- returns Z as well, useful during backpropagation
    """
    
    A = np.maximum(0,Z)
    
    cache = Z 
    
    return A, cache

### Forward Prop

In [13]:
## TO DO ##

# Create a function that will undertake the linear component of forward prop
# Replace False

def linear_forward(A, W, b):
    """
    Input:
    A     -- activations from previous layer
    W     -- weights matrix
    b     -- bias vector

    Output:
    Z     -- the input to activation function 
    cache -- a python dictionary with "A", "W" and "b" for backprop
    """
    
    Z = np.dot(W, A) + b #multply the activations by the weights and add the bias
    
    cache = (A, W, b)
    
    return Z, cache

In [14]:
## TO DO ##

# This function conditionally calls an activation function. 
# Call the correction function above with the correct if statement
# Replace False

def activation_forward(A_prev, W, b, activation):
    """

    Input:
    A_prev     -- activations from previous layer
    W          -- weights matrix
    b          -- bias vector
    activation -- the activation type to be used ("sigmoid" or "relu")

    Output:
    A          -- the output of the activation function, also called the post-activation value 
    cache      -- a python dictionary with two two caches "linear_cache" and "activation_cache" for backprop
    """

    
    ###NOTE###
    # This is where you can put more activation functions for the extension tasks
    
    cache = (linear_cache, activation_cache)


    return A, cache
    

In [15]:
## TO DO ##

# Architect the forward pass. 
# You will need to firstly determine how many layers there are
# You will then need to pull out the correct parameters we initalised
# Ensure you use the appropriate activation for the middle layers
# Pay special attention to the last layer
# It may help to print out parameters
# Replace False

def total_forward(X, parameters):
    """
    
    Input:
    X            -- raw data
    parameters   -- dictionary of initialised parameters, output from a particular function above.
    
    Returns:
    AL           -- last post-activation value
    caches       -- list of caches from forward activations
    """

    caches = []
    A = X
    L = len(layer_dimensions) - 1
    
    # All the layers up until the last (sigmoid) layer
    for l in range(1, L):
        A_prev = A 
        A, cache = activation_forward(A_prev, 
                                      parameters['W' + str(l)], 
                                      parameters['b' + str(l)],
                                       activation='relu')
        caches.append(cache)
    
    # The last layer - how do we use the sigmoid function?
    
    AL, cache = activation_forward(A, 
                                   parameters['W' + str(l+1)], 
                                   parameters['b' + str(l+1)],  
                                   activation='sigmoid')
    caches.append(cache)
            
    return AL, caches

## Back prop

### Backwards activations

In [16]:
## TO DO ##

# Differentiate the relu and the sigmoid functions
# Replace False

def relu_backward(dA, cache):
    """
    Input:
    dA      -- post-activation gradient
    cache   -- 'Z' that is used in the backwards prop here.

    Output:
    dZ      -- Gradient of the cost with respect to Z
    """
    
    Z = cache
    dZ = np.copy(dA) # Copying dA first
    
    # What do you set dZ to when Z is what values? 
    dZ[Z <= 0] = 0;
    
    assert (dZ.shape == Z.shape);
        
    return dZ

def sigmoid_backward(dA, cache):
    
    """
    Input:
    dA      -- post-activation gradient
    cache   -- 'Z' that is used in backprop here

    Returns:
    dZ      -- Gradient of the cost with respect to Z
    """
    
    Z = cache
 
    dZ = dA * Z * (1 - Z)
    
    assert (dZ.shape == Z.shape)
    
        
    return dZ

### Linear Backwards

In [17]:
# You do not need to do anything here, but notes are included for your interest

def linear_backward(dZ, cache):
    """
    Input:
    dZ        -- Gradient of the cost with respect to 'Z' of current layer
    cache     -- (A_prev, W, b) from forward propag in the current layer, we stored this previously

    Output:
    dA_prev   -- Gradient of the cost w.r.t activation of previous layer
    dW        -- Gradient of the cost w.r.t W of current layer
    db        -- Gradient of the cost w.r.t b of current layer l
    """
    
    A_prev, W, b = cache
    m = A_prev.shape[1]

    dW = 1./m * np.dot(dZ,A_prev.T)
    db = 1./m * np.sum(dZ, axis = 1, keepdims = True)
    dA_prev = np.dot(W.T,dZ)
    
    return dA_prev, dW, db

In [18]:
## TO DO ##

# Use the activation differentiation functions you created above
# Ensure you are putting the right arguments (hint: caches) into the functions
# For the first false, consider what function give back dZ? (what does it require?)
# Replace False

def activation_backward(dA, cache, activation):
    """
    Input:
    dA         -- post-activation gradient for current layer
    cache      -- (linear_cache, activation_cache) stored previously for backprop
    activation -- activation for this layer ("sigmoid" or "relu")
    
    Output:
    dA_prev   -- Gradient of the cost w.r.t activation of previous layer
    dW        -- Gradient of the cost w.r.t W of current layer
    db        -- Gradient of the cost w.r.t b of current layer l
    """
    
    linear_cache, activation_cache = cache

    
    if activation == "relu":
        dZ = relu_backward(dA,activation_cache)
        dA_prev, dW, db = linear_backward(dZ,linear_cache)
        
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA,activation_cache)
        dA_prev, dW, db = linear_backward(dZ,linear_cache)
    
    return dA_prev, dW, db


In [19]:
## TO DO ##

# Differentiate the loss function with respect to the last activation layer
# Replace False

def total_backward(AL, Y, caches):
    
    """
    
    Input:
    AL        -- probability vector, output of the forward propagation (L_model_forward())
    Y         -- true "label" vector (containing 0 if non-cat, 1 if cat)
    caches    -- list of caches from relu and sigmoid we kept from forward prop
    
    output:
    grads     -- A dictionary with the gradients named dA+l,dW+l, db+l for each layer
    """
    
    grads = {}
    L = len(layer_dimensions) - 1 
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)
    
    # Initializing the backpropagation
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

    current_cache = caches[L-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = activation_backward(dAL, current_cache, activation = "sigmoid")

    
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = activation_backward(grads["dA" + str(l + 1)], current_cache, activation = "relu")
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return grads

## Cost function

In [20]:
## TO DO ##

# Write a function to compute the binary logistic cost function ('cross entropy loss')
# This is on page 51 of the slides from block_1. 
# You may need to transpose elements to make the matrix calculations work
# Replace False

def compute_cost(AL, Y):
    """
    Input:
    AL    -- probability vector for label predictions
    Y     -- truth vector vector

    Output:
    cost  -- cost
    """
    
    m = Y.shape[1]
    
    # Compute loss from aL and y.
    cost_total =  (np.dot(Y, np.log(AL).T) + np.dot(1 - Y, np.log(1 - AL).T))
                     
    cost = (1./m) * cost_total 
    
    cost = np.squeeze(cost) # Help with the shape
    
    return cost

## Update Parameters

In [21]:
## TO DO ##

# Update each parameter
# Remember what hyperparameter is important for this step?
# You will also find a useful, indexed value in the 'grads' dictionary created in backprop above
# Replace False

def update_parameters(parameters, grads, learning_rate):
    """
    
    Input:
    parameters    -- dictionary with parameters 
    grads         -- dictionary with gradients (which function outputs this?)
    learning_date -- step size to adjust parameters by
    
    Returns:
    parameters    -- dictionary containing your updated parameters , same structure as original parameters dict
    """
    
    L = len(parameters) // 2
    learning_rate = 0.1

    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW"+ str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db"+ str(l+1)]
        
    return parameters, learning_rate

## Put it all together

Create a function to knit together everything you have done so far and allow for different layer sizes and lengths to be used.

In [22]:
## TO DO ##

# Now stitch it all together. Essentially you will need to call all your functions in turn with the right arguments.
# Initialise parameters
# Undertake forward prop. What is our master function? Consider what we got from initialisation?
# Undertake backwards prop. Again consider our master function for back prop.
# Update parameters.
# Replace False

def total_backward_forward(X, Y, layers_dimensions,  
                           num_iterations, 
                           print_cost):
    
    """
    Input:
    X                 -- data
    Y                 -- truth vector (1,0)'s
    layers_dimensions -- list of dimensions for each layer of network
    learning_rate     -- step size for gradient descent
    num_iterations    -- number of training iterations to undertake
    print_cost        -- if True, it prints the cost every 100 steps
    
    output:
    parameters        -- parameters learnt by the model. Used to predict
    """

    np.random.seed(42)
    costs = []
    
    # Parameters initialization
    parameters = initialise_parameters(layer_dimensions)
    
    # Loop (gradient descent)
    for i in range(0, num_iterations):

        # Forward propagation:
        AL, caches = total_forward(X, parameters)
        
        # Compute cost
        cost = compute_cost(AL, Y)
    
        # Backward propagation.
        grads = total_backward(AL, Y, caches)
 
        # Update parameters.
        parameters = update_parameters(parameters, grads, 0.1)
                
        # Print the cost every 100 training example
        if print_cost and i % 100 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))
        if print_cost and i % 100 == 0:
            costs.append(cost)
            
    # plot the cost
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per tens)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()
    
    return parameters

In [23]:
parameters = total_backward_forward(X_train, 
                                    y_train, 
                                    layer_dimensions, 
                                    num_iterations = 1500, 
                                    print_cost = True)

Cost after iteration 0: -0.693151


TypeError: tuple indices must be integers or slices, not str

## Predict (Hold out)

In [24]:
## TO DO ##

# Create your own predict function.
# Note the number of training examples
# Turn the probabilities into 0-1 predictions
# Replace False

def predict(X, y, parameters):
    """ 
    Input:
    X           -- data (test set)
    parameters  -- parameters of the trained model
    
    Output:
    p -- predictions for the given dataset X
    """
    
    m = Y.shape[1] # How many training examples?
    n = len(parameters) // 2
    p = np.zeros((1,m)) # Initialise probabilities to zero
    
    # Forward propagation
    probas, caches = total_forward(X, parameters)
    
    # convert probas to 0/1 predictions. 
    p = (probas < 0.5).astype(np.int)
        
    return p, probas

In [25]:
# Create some predictions
predictions, probas = predict(X_test, y_test, parameters)

NameError: name 'Y' is not defined

In [None]:
# Make a scatter plot of probabilities. Good check if something is wrong
plt.scatter(range(len(probas[0])), probas)

In [None]:
# Check your prediction value counts
pred_df = pd.DataFrame(predictions, columns=["prediction"])
pred_df.prediction.value_counts()

In [None]:
# Do a bit of reshaping
predictions_sk = predictions.reshape(len(predictions), 1)
print(predictions_sk.shape)

y_test_sk = y_test.T
print(y_test_sk.shape)

In [None]:
# Build some sklearn scores

#Get confusion matrix 
print("Confustion Matrix \n", confusion_matrix(list(y_test_sk), list(predictions_sk)))

#Get classification report
print(classification_report(y_test_sk, predictions_sk))

# Accuracy score
print("Accuracy: ", accuracy_score(y_test_sk, predictions_sk))

# ROC_AUC score
print("ROC_AUC: ", roc_auc_score(y_test_sk, probas.T))