# Practical Part: Neural Network Implementation & Experiments

Team:
* Jonathan Bhimani-Burrows (20178260)
* Arlie Coles (20121051)
* Yue (Violet) Guo (20120727)

In [65]:
import utils.mnist_reader as mnist_reader
import numpy as np
import math
import copy 
import matplotlib.pyplot as plt
import time

In [66]:
X_train, y_train = mnist_reader.load_mnist('data/mnist', kind='train')
X_test, y_test = mnist_reader.load_mnist('data/mnist', kind='t10k')
X_valid = X_test[0:5000]
y_valid = y_test[0:5000]
X_test = X_test[5000:10000]
y_test = y_test[5000:10000]
print(y_test.shape)

(5000,)


In [78]:
def convertTarget(targetValues):
    # Convert to one-hot encoding
    numClasses = np.max(targetValues) + 1
    return np.eye(numClasses)[targetValues]

In [79]:
# convert the targets to one hot
y_valid = convertTarget(y_valid)
y_test = convertTarget(y_test)
y_train = convertTarget(y_train)

In [67]:
class BatchSampler(object):
    '''
    randomly sample batches without replacement.
    '''
    
    def __init__(self, data, targets, batch_size):
        self.num_points = data.shape[0]
        self.features = data.shape[1]
        self.data = data
        self.targets = targets
        self.batch_size = batch_size
        self.indices = np.arange(self.num_points)

    def get_batch(self, K = None):
        '''
        Get a random batch without replacement 
        '''
        
        if not K:
            indices = np.random.choice(self.indices, self.batch_size, replace=False)
        else:
            indices = np.arange(K)
        X_batch = np.take(self.data, indices, 0)
        y_batch = self.targets[indices]
        return X_batch, y_batch
    
        

In [68]:
# Our own activation functions

def relu(pre_activation):
    '''
    preactivation is a vector
    '''
    relu_output = np.zeros(pre_activation.shape)
    relu_flat = relu_output.flatten()
    for i, neuron in enumerate(pre_activation.flatten()):
        if neuron > 0:
            relu_flat[i] = neuron
    relu_output = relu_flat.reshape(pre_activation.shape)
    return relu_output

def softmax_single(pre_activation):
    '''
    Numerically stable because subtracting the max value makes bit overflow impossible,
    we will only have non-positive values in the vector
    '''
    exps = np.exp(pre_activation - np.max(pre_activation))
    return exps / np.sum(exps)

def softmax_multiple(pre_activation):
    '''
    Numerically stable because subtracting the max value makes bit overflow impossible,
    we will only have non-positive values in the vector
    '''
    exps = np.exp(pre_activation - np.max(pre_activation, axis = 0))
    return exps / np.sum(exps, axis = 0)

In [69]:
w1_fixed = np.array([[-0.39751495,  0.00628629],
                        [-0.0205684,   0.26683984],
                        [0.59675625 , 0.13841242],
                        [-0.25439437 , 0.17629986],
                        [0.49765604 , 0.6328421 ]])
w2_fixed = np.array( [[-0.41283729,  0.30420197, -0.26925985 , 0.1972228 , -0.23960543],
                        [-0.07794628,  0.05534963,  0.39403587 , 0.02447081, -0.20876543],
                        [ 0.12031748, -0.15724041,  0.07004474 , 0.01330072,  0.37325964],
                        [-0.13685212,  0.2450681 , -0.1039663 , -0.43493921,  0.18092754],
                        [-0.29733443,  0.39217373,  0.21700504, -0.20839457,  0.08478064],
                        [-0.05650999, -0.21730141, -0.20041823, -0.03229149,  0.41680238]])
w3_fixed = np.array( [[ 0.18862318, -0.26977775 , 0.10358298,  0.04400272,  0.30614878, -0.00911728],
                        [ 0.17972289,  0.19698613,  0.22084821, -0.07767053, -0.32146514, -0.19307932]])


In [70]:
def classErr(target, predicted):
    '''
    not class dependent
    target must NOT be in one hot
    '''
    cnt = 0
    #print("in class Err")
    #print("target \n", target.shape[0])
    #print("predicted \n", predicted.shape[0])
    #print("target range \n", np.max(target))
    #print("predicted range \n", np.max(predicted))
    for i in range(target.shape[0]):
        if target[i] != predicted [i]:
            cnt +=1
    return float(cnt) / target.shape[0]

In [71]:
def zero_init(d, dh1, dh2, m):
    W_1 = np.zeros((dh1*d))
    W_2 = np.zeros((dh2*dh1))
    W_3 = np.zeros((dh2*m))
    return W_1, W_2, W_3

def glorot_init(d, dh1, dh2, m):
    dl_1 = np.sqrt((6/(d + dh1)))
    W_1 = np.random.uniform((-1)*dh1, dh1, (dh1, d))

    dl_2 = np.sqrt(6/(dh1 + dh2))
    W_2 = np.random.uniform((-1)*dh2, dh2, (dh2, dh1))

    dl_3 = np.sqrt(6/(dh2 + m))
    W_3 = np.random.uniform((-1)*dh3, dh3, (m, dh2))

    return W_1, W_2, W_3

def normal_init(d, dh1, dh2, m):
    W_1 = np.random.normal(0, 1, (dh1, d))
    W_2 = np.random.normal(0, 1, (dh2, dh1))
    W_3 = np.random.normal(0, 1, (m, dh2))
    print((W_1).shape)
    return W_1, W_2, W_3


In [72]:
class neuralNet():
    def __init__(self, d, hidden_dims, m, n, init_mode='uniform', eta=3e-4, regularize=None, fixed=False):
        self.inputDim = d #inputDim
        self.hiddenDim = hidden_dims
        self.outputDim = m #outputDim
        self.regularize = regularize # lambda value
        self.learningRate = eta
        self.numData = n
        self.batchErrorGradients = []
        #may use xavier init - maybe explore this later.
        
        # Initial weights and biases
        if fixed:
            self.W_1 = w1_fixed
            self.W_2 = w2_fixed
            self.W_3 = w3_fixed
        elif init_mode == 'old':
            # from last year
            self.W_1 = np.random.uniform(-1/np.sqrt(d), 1/np.sqrt(d), 
                                         self.hiddenDim[0]*d).reshape(self.hiddenDim[0], d)
            self.W_2 = np.random.uniform(-1/np.sqrt(self.hiddenDim[0]), 1/np.sqrt(self.hiddenDim[0]), 
                                         self.hiddenDim[1]*self.hiddenDim[0]).reshape(self.hiddenDim[1],
                                                                                      self.hiddenDim[0]) 
            self.W_3 = np.random.uniform(-1/np.sqrt(self.hiddenDim[1]), 1/np.sqrt(self.hiddenDim[1]),
                                         self.hiddenDim[1]*m).reshape(m, self.hiddenDim[1]) 
        
        elif init_mode == 'normal_init':
            self.W_1, self.W_2, self.W_3 = normal_init(self.inputDim, self.hiddenDim[0], 
                                                       self.hiddenDim[1], self.outputDim)
        elif init_mode == 'glorot_init':
            self.W_1, self.W_2, self.W_3 = glorot_init(self.inputDim, self.hiddenDim[0], 
                                                       self.hiddenDim[1], self.outputDim)
        elif init_mode == 'zero_init':
            self.W_1, self.W_2, self.W_3 = zero_init(self.inputDim, self.hiddenDim[0], 
                                                       self.hiddenDim[1], self.outputDim)
        
        self.b_1 = np.zeros(self.hiddenDim[0]).reshape(self.hiddenDim[0],)
        self.b_2 = np.zeros(self.hiddenDim[1]).reshape(self.hiddenDim[1],)
        self.b_3 = np.zeros(m).reshape(m,)


    def fprop(self, batchData, mode='matrix'):
        '''
        a switch to work for both matrix and loop
        '''
        
        # hidden layer 1
        
        if mode == 'matrix':
            #print('self.b1', self.b_1.shape)
            #print('self.W_1', self.W_1.shape)
            #print('batchData.T', batchData.T.shape)
            stack_b1 = np.array([self.b_1,] * self.numData).T
            #print('stack_b1', stack_b1.shape)
            self.h_a1 = np.dot(self.W_1, batchData.T) + stack_b1
        elif mode == 'loop':
            self.h_a1 = np.dot(self.W_1, batchData.T) + self.b_1
            

        self.h_s1 = relu(self.h_a1)
        
        
        
        # hidden layer 2
        if mode == 'matrix':
            stack_b2 = np.array([self.b_2,] * self.numData).T
            self.h_a2 = np.dot(self.W_2, self.h_s1) + stack_b2
        elif mode == 'loop':
            self.h_a2 = np.dot(self.W_2, self.h_s1) + self.b_2

        self.h_s2 = relu(self.h_a2)
    
        #output layer weights    
        if mode == 'matrix':
            stack_b3 = np.array([self.b_3,] * self.numData).T
            self.o_a = np.dot(self.W_3, self.h_s2) + stack_b3
        elif mode == 'loop':
            self.o_a = np.dot(self.W_3, self.h_s2) + self.b_3           
        
        # softmax of weights
        if batchData.shape[0] == 1:
            print('using single softmax')
            self.o_s = softmax_single(self.o_a)
        else:
            self.o_s = softmax_multiple(self.o_a)
        
        # make predication 
        if mode == 'loop':
            self.prediction = np.argmax(self.o_s,axis = 0)
        elif mode == 'matrix':
            self.prediction = np.argmax(self.o_s,axis = 0)
        
    def errorRate(self, y, mode='matrix'):
        '''
        negative log
        -logO_s(x)
        HAD the indexing problem for matrix mode
        '''        
        
        if mode == 'loop':
            negLog = -self.o_a[np.argmax(y)] + np.log(np.sum(np.exp(self.o_a), axis=0))
            
        elif mode == 'matrix':
            negLog = []
            print("y.shape in error rate" , y.shape)
            for i in range(y.shape[1]):
                error_at_point = -self.o_a[np.argmax(y[:,i])][i] + np.log(np.sum(np.exp(self.o_a), axis=0))[i]
                negLog.append(error_at_point)
            negLog = np.array(negLog)
            negLog = np.mean(negLog)

        return negLog
          
    def bpropLoop(self, batchData, batchTarget):
        '''
        dimensions: 
        o_s: m x1
        grad_oa : m x 1
        hs: dh x 1
        grad_w2: m x dh
        grad_oa: m x n
        grad_b2: m x n
        grad_oa: m x n
        W(2): m x dh
        grad_hs: dh x n
        grad_oa: m x n
        grad_ha: dh x n
        x : n x d
        grad_W1: dh x d
        grad_ha: dh x n
        grad_b1: dh x n
        '''

        self.grad_oa = self.o_s - batchTarget
        # hidden layer 3
        self.grad_W3 = np.outer(self.grad_oa, self.h_s2.T)
        self.grad_b3 = self.grad_oa
        self.grad_hs2 = np.dot(self.W_3.T , self.grad_oa)
        h_a_stack2 = np.where(self.h_a2 > 0, 1, 0)
        self.grad_ha2 = np.multiply(self.grad_hs2, h_a_stack2)   

        # hidden layer 2
        self.grad_W2 = np.outer(self.grad_ha2, self.h_s1.T)
        self.grad_b2 = self.grad_ha2
        self.grad_hs1 = np.dot(self.W_2.T , self.grad_ha2)
        h_a_stack1 = np.where(self.h_a1 > 0, 1, 0)
        self.grad_ha1 = np.multiply(self.grad_hs1, h_a_stack1)
        
        # hidden layer 1        
        self.grad_W1 = np.outer(self.grad_ha1, batchData)
        self.grad_b1 = self.grad_ha1
        
        
    def bprop_matrix(self, batchData, batchTarget):
        '''
        backprop using matrix only
        '''
        
        self.grad_oa = self.o_s - batchTarget
        
        self.grad_W3 = np.matmul(self.grad_oa, self.h_s2.T)/batchData.shape[0] #!
        self.grad_b3 = np.sum(self.grad_oa, axis=1)/batchData.shape[0] #!
        self.grad_hs2 = np.matmul(self.W_3.T , self.grad_oa)
        self.grad_ha2 = np.multiply(self.grad_hs2, np.where(self.h_a2 > 0, 1.0, 0.0))

        
        self.grad_W2 = np.matmul(self.grad_ha2, self.h_s1.T)/batchData.shape[0] #!
        self.grad_b2 = np.sum(self.grad_ha2, axis =1)/batchData.shape[0]
        self.grad_hs1 = np.matmul(self.W_2.T, self.grad_ha2)
        self.grad_ha1 = np.multiply(self.grad_hs1, np.where(self.h_a1 > 0, 1.0, 0.0))
        
        self.grad_W1 = np.matmul(self.grad_ha1, batchData)/batchData.shape[0] #!
        self.grad_b1 = np.sum(self.grad_ha1, axis=1)/batchData.shape[0] #!
        
    def bprop(self, batchData, batchTarget, mode='matrix'):
        '''
        batchTarget already in one-hot format
        
        NOT working for a single point
        
        '''
        
        #batch target must be m by n
        self.grad_oa = self.o_s - batchTarget
        i = 0
        self.grad_W2 = [np.outer(self.grad_oa[:,i], self.h_s[:,i].T) for i in range(batchData.shape[0])]
        self.grad_b2 = self.grad_oa 
        self.grad_hs = np.dot(self.W_2.T , self.grad_oa)
        # Check this (dim mismatch maybe)
        h_a_stack = np.where(self.h_a > 0, 1, 0)
        self.grad_ha = np.multiply(self.grad_hs, h_a_stack)
        #self.grad_W1 = [np.outer(self.grad_ha[:,i], batchData[i]) for i in range(self.numData)]
        self.grad_W1 = [np.outer(self.grad_ha[:,i], batchData[i]) for i in range(batchData.shape[0])]
        # temporary hack for grad_W
        self.grad_b1 = self.grad_ha

        
        if mode == 'matrix':
            '''
            must avg, 
            1 pt would return a list of MAT/np array, not a NP array
            '''
            self.grad_W2 = np.average(np.array(self.grad_W2), axis=0)
            self.grad_b2 = np.average(np.array(self.grad_b2), axis=1)
            
            self.grad_W1 = np.average(np.array(self.grad_W1), axis=0)
            self.grad_b1 = np.average(np.array(self.grad_b1), axis=1)


        
    def updateParams(self):
        if self.regularize:
            self.W_1 -= (self.regularize[0] * np.sign(self.W_1) + 2 * self.regularize[1] * self.W_1) * self.learningRate
            self.W_2 -= (self.regularize[2] * np.sign(self.W_2) + 2 * self.regularize[3] * self.W_2) * self.learningRate
            self.W_3 -= (self.regularize[4] * np.sign(self.W_3) + 2 * self.regularize[5] * self.W_3) * self.learningRate

    
        self.W_1 -= self.grad_W1 * self.learningRate
        self.W_2 -= self.grad_W2 * self.learningRate
        self.W_3 -= self.grad_W3 * self.learningRate

        self.b_1 -= self.grad_b1 * self.learningRate
        self.b_2 -= self.grad_b2 * self.learningRate
        self.b_3 -= self.grad_b3 * self.learningRate

    def calculParam(self):
        """
        calculates the total amount of parameters
        """
        return (self.inputDim * self.hiddenDim[0] + self.hiddenDim[1] * self.hiddenDim[0]
                + self.outputDim * self.hiddenDim[1] + self.inputDim + self.hiddenDim[0] 
                + self.hiddenDim[1] + self.outputDim)
    
    
    def gradDescentLoop(self, batchData, batchTarget, K):
        # Call each example in the data (over the minibatches) in a loop
        grad_W3, grad_b3, grad_W2, grad_b2, grad_W1, grad_b1 = [], [], [], [], [], []
        predBatch = []
        for i in range(K):
            self.fprop(batchData[i], mode='loop') #batchTarget[:,i]
            self.bpropLoop(batchData[i],np.array(batchTarget[:,i]))
            predBatch.append(self.prediction)
            grad_W3.append(self.grad_W3)
            grad_b3.append(self.grad_b3)            
            grad_W2.append(self.grad_W2)
            grad_b2.append(self.grad_b2)
            grad_W1.append(self.grad_W1)
            grad_b1.append(self.grad_b1)

        self.grad_W3 = np.mean(np.array(grad_W3), axis=0) #! array
        self.grad_b3 = np.mean(np.array(grad_b3), axis=0) 
        self.grad_W2 = np.mean(np.array(grad_W2), axis=0) #! array
        self.grad_b2 = np.mean(np.array(grad_b2), axis=0) 
        self.grad_W1 = np.mean(np.array(grad_W1), axis=0) #! array
        self.grad_b1 = np.mean(np.array(grad_b1), axis=0)
        
        # Update params
        #self.updateParams()
    
    def fpropLoop(self, batchData, K):
        '''
        unlike the above def gradDescentLoop(self, batchData, batchTarget, K)
        this function only runs batchData (this is usually in test phase)
        through the forward prop, without calculating any gradient update rule.
        
        Use to get predictions
        
        batchData: more like test/val data
        K: ALWAYS == batchData.shape[0]
        
        '''
        predBatch = []
        for i in range(K):
            self.fprop(batchData[i], mode='loop') #batchTarget[:,i]
            predBatch.append(self.prediction)
        self.predBatch = np.array(predBatch)    
        
    def gradDescentMat(self, batchData, batchTarget):
        # Feed the entire data matrix in as input
        self.fprop(batchData)
        self.bprop_matrix(batchData, batchTarget)




### Part 6

Our training function(s):

In [73]:
def show_error(nn, epoch, train, valid, test):
    '''
    calculattes error in matrix mode
    '''
    # Train
    nn.numData = train[0].shape[0]

    nn.fprop(train[0], mode='matrix')
    training_loss = nn.errorRate(train[1].T, mode='matrix')
    training_err = classErr(np.argmax(train[1], axis = 1), nn.prediction)
    
    # Valid
    nn.numData = valid[0].shape[0]
    nn.fprop(valid[0], mode='matrix') 

    valid_loss = nn.errorRate(valid[1].T, mode='matrix')
    valid_err = classErr(np.argmax(valid[1], axis  =1 ), nn.prediction)
    
    # Test
    nn.numData = test[0].shape[0]

    nn.fprop(test[0], mode='matrix') 
    test_loss = nn.errorRate(test[1].T, mode='matrix')
    test_err = classErr(np.argmax(test[1], axis = 1), nn.prediction)
    
    # Write to log file
    with open('errors.txt', 'a+') as fp:
        line = '{},{},{},{},{},{},{}\n'.format(epoch, training_loss, training_err, 
                                             valid_loss, valid_err, test_loss, test_err)
        fp.write(line)


In [74]:
def train_loop(nn, data, target,  K, num_epoch, fixed = False): 
    '''
    train minibtaches over num_epoch epochs (in a loop)
    also does prediction and error calcualation
    '''
    # Get minibatch
    batchSampler = BatchSampler(data, target, K)
    numBatch = data.shape[0] // K 
    print("num batch in train loop ", numBatch)
    # training loop
    for n in range(num_epoch):
        # Do descent and update params - this is one epoch
        for i in range(numBatch):
            if fixed:
                batchData, batchTarget = batchSampler.get_batch(K)

            elif not fixed:
                batchData, batchTarget = batchSampler.get_batch()
            #difference: another loop here
            nn.gradDescentLoop(batchData, batchTarget.T, K)
            nn.updateParams()
        if n % 100 == 0:
            nn.fpropLoop(data, data.shape[0]) 
            print("Cross-entropy loss at the end of epoch {}: {}".format(n, nn.errorRate(target.T, mode = 'loop')))
            print("classification error at the end of epoch {}: {}".format(n,
                                                    classErr(np.argmax(target, axis = 1), nn.predBatch)))        
    
    # finalized weights, need to fprop and get the error rate 
    # a for loop inside the prop for each elem
    nn.fpropLoop(data, data.shape[0]) 
    print("End of train loop process.")


def train_matrix(nn, data, target, K, num_epoch, fixed=False, valid=None, test=None):
    # Get minibatch
    batchSampler = BatchSampler(data, target, K)
    numBatch = data.shape[0] // K 
    
    print("number of batch in train matrix", numBatch)
    for n in range(num_epoch):
        for i in range(numBatch):
            # Do descent and update params - this is one epoch

            if fixed:
                batchData, batchTarget = batchSampler.get_batch(K)
            elif not fixed:
                batchData, batchTarget = batchSampler.get_batch()
            nn.numData = K
            nn.gradDescentMat(batchData, batchTarget.T)
            
            nn.updateParams()
        if n % 100 == 0:
            print(':)')
            #nn.fprop(batchData, mode = 'matrix') 
            #pred = np.argmax(nn.o_s, axis = 0)
            #print("Cross-entropy loss at the end of epoch {}: {}".format(n, nn.errorRate(batchTarget.T, mode = 'matrix')))
            #print("classification error at the end of epoch {}: {}".format(n,
            #                                        classErr(np.argmax(batchTarget, axis = 1), pred ))) 
        if valid:
            nn.numData = valid[0].shape[0]
            show_error(nn, n, [data, target], valid, test)
    print("End of train matrix process.")

    
    


# random sample NN struct but also adhere to the constraints

In [75]:
class ParamGenerator():
    """
    Generate parameters like number of hidden units, learning rate, N for finite diff, etc
    """
    def __init__(self, seed):
        # for reproducibility
        self.seed = seed
    
    def countParam(self, hiddenDim):
        inputDim = 784
        outputDim = 10
        return (inputDim * hiddenDim[0] + hiddenDim[1] * hiddenDim[0]
                + outputDim * hiddenDim[1] +inputDim + hiddenDim[0] 
                + hiddenDim[1] + outputDim)
    
    def hiddenUnit(self):
        """
        return number of hidden units in range (0.5M, 1M)
        """
        
        constraint = False
        
        # keep generating until something in between 0.5 to 1 M
        while not constraint:

            h1 = np.random.randint(100, 2000)
            h2 = np.random.randint(100, 2000)    
            total_param = self.countParam([h1, h2])
            constraint = (0.5 * 10e5) < plot_test.calculParam() and (10e5) > plot_test.calculParam()
        return (h1, h2)    

    def learningRate(self):
        """
        sample a learning rate
        """
        logLearningRate = np.random.uniform(-7.5, -4.5)
        learningRate = np.exp(logLearningRate)
        return learningRate

In [None]:
# Test to see if logging/plotting works (very simple)
train_data = X_train#[:10000]
train_target = y_train#[:10000]
valid_data = X_valid#[:100]
valid_target = y_valid#[:100]
test_data = X_test#[:100]
test_target = y_test#[:100]
print(train_data.shape)
K = 200
num_epochs = 10
plot_test = neuralNet(train_data.shape[1], (512, 512) , 10, K, 'normal_init')
print("total number of param in plot_test", plot_test.calculParam())

train_matrix(plot_test, train_data, train_target, K, num_epochs, 
             valid=[valid_data, valid_target], test=[test_data, test_target])

(60000, 784)
(512, 784)
total number of param in plot_test 670490
number of batch in train matrix 300


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


:)
y.shape in error rate (10, 60000)


In [None]:
# Plot the learning curves
def plot_learning_curves(log_file):
    with open(log_file, 'r') as fp:
        info = fp.readlines()
        
    epochs = np.arange(len(info))
    
    training_loss, valid_loss, test_loss = [], [], []
    training_err, valid_err, test_err = [], [], []
    
    for line in info:
        split_line = line.split(',')
        training_loss.append(float(split_line[1]))
        training_err.append(float(split_line[2]))
        valid_loss.append(float(split_line[3]))
        valid_err.append(float(split_line[4]))
        test_loss.append(float(split_line[5]))
        test_err.append(float(split_line[6]))
    
    # Plot
    plt.title("average cross entropy loss")
    plt.plot(epochs, training_loss, c='blue', linestyle='solid', label = 'train loss')
    plt.plot(epochs, valid_loss, c='green', linestyle='solid', label = 'valid loss')
    plt.plot(epochs, test_loss, c='orange', linestyle='solid', label = 'test loss')
    plt.xlabel('number of epoch')
    plt.ylabel('loss')
    plt.legend(loc='best')

    plt.show()
    
    plt.title("classification errors")
    plt.plot(epochs, training_err, c='blue', linestyle='dashed', label = 'train error')
    plt.plot(epochs, valid_err, c='green', linestyle='dashed', label = 'valid error')
    plt.plot(epochs, test_err, c='orange', linestyle='dashed', label = 'test error')
    plt.xlabel('number of epoch')
    plt.ylabel('error')
    plt.legend(loc='best')

    plt.show()

### Part 10

> Train your network on the Fashion MNIST dataset. Plot the training/valid/test curves (error and loss as a function of the epoch number, corresponding to what you wrote in a file in the last question). Add to your report the curves obtained using your best hyperparameters, i.e. for which you obtained your best error on the validations et. We suggest 2 plots: the first one will plot the error rate (train/valid/test with different colors, show which color in a legend) and the other one for the averaged loss (on train/valid/test). You should be able to get less than 20% test error.

In [None]:
plot_learning_curves('errors.txt')