In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [3]:
root = "C:/Users/Zachary/Desktop/coding/NN_from_scratch/"
train_df = pd.read_csv(root + "train.csv")

data = np.array(train_df)
np.random.shuffle(data)
data = data.T
X = data[1:]
X = X/255.
Y = data[0]
print(X.shape)
print(Y.shape)

val_X = X[:,:800]
val_Y = Y[:800]
train_X = X[:,800:]
train_Y = Y[800:]
k = 41200
print(train_X.shape, train_Y)

(784, 42000)
(42000,)
(784, 41200) [8 5 4 ... 2 1 3]


In [4]:
def OHE(Y):
    Y_OHE = np.zeros((Y.size, 10)) #k rows, 10 columns
    for i in range (Y.size):
        dist = np.zeros(10)
        dist[Y[i]] = 1
        Y_OHE[i] = dist
    Y_OHE = Y_OHE.T #k columns(data), 10 rows(labels)
    return Y_OHE
#IE: 3 ==> [0, 0, 0, 1, 0, 0, 0, 0, 0, 0].


def init():
    w1 = np.random.rand(10, 784) - 0.5 #-0.5 to create random value from -0.5 to 0.5
    b1 = np.random.rand(10, 1) -0.5
    #result of dot pdt: (10, 1)
    w2 = np.random.rand(10, 10) - 0.5
    b2 = np.random.rand(10, 1) - 0.5
    return(w1, b1, w2, b2)

def reLU(Z): #(10, k)
    return Z * (Z>0)

def softmax(Z):
    return np.exp(Z)/sum(np.exp(Z))

def fwd_prop(X, w1, b1, w2, b2): 
    Z1 = w1.dot(X) + b1 #(10, k)
    A1 = reLU(Z1) #(10, k)
    Z2 = w2.dot(A1) + b2 #(10, k)
    Out = softmax(Z2) #(10, k)
    return Z1, A1, Z2, Out

def deriv_reLU(Z): #deriv wrt Z = 0 when Z <= 0, 1 when Z > 0
    return Z>0

def back_prop(Z1, A1, Z2, Out, w1, w2, X, Y): 
    Y_OHE = OHE(Y)
    #main goal: differentiate cost with respect to weights
    #chain rule: dC/dw2 = dZ2/dw2 * dCdZ2

    dZ2 = Out - Y_OHE
    dw2 = 1 / k * np.dot(dZ2, A1.T)# (10, 10)
    db2 = 1 / k * np.sum(dZ2)

    dZ1 = np.dot(w2.T, dZ2) * deriv_reLU(Z1)
    dw1 = 1 / k * np.dot(dZ1, X.T) #(10, 784)
    db1 = 1 / k * np.sum(dZ1)

    return dw1, db1, dw2, db2

def update_weights(w1, b1, w2, b2, dw1, db1, dw2, db2, lr):
    w1 = w1 - lr*dw1
    w2 = w2 - lr*dw2
    b1 = b1 - lr*db1
    b2 = b2 - lr*db2
    return w1, b1, w2, b2

In [5]:
def get_accuracy(predictions, Y):
    return np.sum(predictions == Y) / Y.size

def predict(Out):
    return np.argmax(Out, 0)

def train(X, Y, lr, epochs):
    W1, b1, W2, b2 = init()
    for i in range(epochs):
        Z1, A1, Z2, Out = fwd_prop(X, W1, b1, W2, b2)
        dW1, db1, dW2, db2 = back_prop(Z1, A1, Z2, Out, W1, W2, X, Y)
        W1, b1, W2, b2 = update_weights(W1, b1, W2, b2, dW1, db1, dW2, db2, lr)
        if i % 10 == 0:
            print("Epoch: ", i)
            predictions = predict(Out)
            print(get_accuracy(predictions, Y))
    return W1, b1, W2, b2

In [6]:
print(k)
w1, b1, w2, b2 = train(train_X, train_Y, 0.1, 500)

41200
Epoch:  0
0.15487864077669902
Epoch:  10
0.23116504854368933
Epoch:  20
0.31679611650485434
Epoch:  30
0.3944660194174757
Epoch:  40
0.4563349514563107
Epoch:  50
0.5062378640776699
Epoch:  60
0.5448300970873786
Epoch:  70
0.5763106796116505
Epoch:  80
0.6028155339805825
Epoch:  90
0.6255825242718447
Epoch:  100
0.6448543689320388
Epoch:  110
0.660242718446602
Epoch:  120
0.6747330097087378
Epoch:  130
0.6870388349514563
Epoch:  140
0.698252427184466
Epoch:  150
0.7086407766990291
Epoch:  160
0.717378640776699
Epoch:  170
0.7265048543689321
Epoch:  180
0.7338834951456311
Epoch:  190
0.7409466019417476
Epoch:  200
0.7472087378640777
Epoch:  210
0.7529611650485437
Epoch:  220
0.7585679611650485
Epoch:  230
0.763883495145631
Epoch:  240
0.7687864077669903
Epoch:  250
0.7734466019417475
Epoch:  260
0.7774757281553398
Epoch:  270
0.781626213592233
Epoch:  280
0.7856553398058252
Epoch:  290
0.7893203883495146
Epoch:  300
0.7929611650485436
Epoch:  310
0.7959223300970873
Epoch:  320
0.7

In [7]:
def predict_unlabelled(X, w1, b1, w2, b2):
    _, _, _, Out = fwd_prop(X, w1, b1, w2, b2) #(10, k)
    preds = np.argmax(Out, 0)
    return preds

In [8]:
test_df = pd.read_csv(root + "test.csv")
test_X = np.array(test_df).T
test_X = test_X/255.
print(test_X.shape)

(784, 28000)


In [9]:
test_preds = predict_unlabelled(test_X, w1, b1, w2, b2)
print(test_preds)

[2 0 4 ... 3 9 2]
