# Numpy Implementation 

In [53]:
import numpy as np

 
# sigmoid function

def nonlin(x,deriv=False):

    if(deriv==True):

        return x*(1-x)

    return 1/(1+np.exp(-x))

 

# input dataset

X = np.array([ [0,0,1],[0,1,1],[1,0,1],[1,1,1] ])
print(f'Shape of Input([No,of observations,Number of features]): {X.shape}')
# output dataset           
y = np.array([[0,0,1,1]]).T
print(f'Shape of Target Output ([No,of observations,Number of neuron in output layer]):{y.shape}')
 
# seed random numbers to make calculation
# deterministic (just a good practice)
np.random.seed(1)
 
# initialize weights randomly 
syn0 = np.random.random((3,1))
print(f'Shape of Weight Matrix for Input([No,of input features,Number of output features]): {syn0.shape}')

Shape of Input([No,of observations,Number of features]): (4, 3)
Shape of Target Output ([No,of observations,Number of neuron in output layer]):(4, 1)
Shape of Weight Matrix for Input([No,of input features,Number of output features]): (3, 1)


In [30]:
for i in np.arange(5):

    # forward propagation
    l0 = X

    l1 = nonlin(np.dot(l0,syn0))
    #print(f'Output after {i+1} iteration(l1): {l1.shape}')
    # how much did we miss?

    l1_error = y - l1
    #print(f'Error in each record(l1_error): {l1_error}')

    # multiply how much we missed by the

    # slope of the sigmoid at the values in l1
    #print(f'Derivate after {i+1} iteration: {nonlin(l1,True)}')
    l1_delta = l1_error * nonlin(l1,True)
    #print(f'Nudge factor to each record(l1_delta): {l1_delta}')

    # update weights
    #print(l0.T.shape , l1_delta.shape)
    syn0 += np.dot(l0.T,l1_delta)
    #print(f'Weights after {i+1} iterations(syn0) :{syn0}')

print("Output After Training:")

print(l1)

print(f'Actual Output:\n {y}')

Output After Training:
[[0.06867521]
 [0.05555872]
 [0.95508347]
 [0.94433134]]
Actual Output:
 [[0]
 [0]
 [1]
 [1]]


In [None]:
# 3 features,4 record
# 1 output layer


In [56]:
x = np.array([[0,0,1],[0,1,1],[1,0,1],[1,1,1]])
y = np.array([[0,0,1,1]]).T
w = np.random.random((3,1))
b = np.random.random(1)
x.shape,y.shape,w.shape

((4, 3), (4, 1), (3, 1))

In [71]:
epochs = 100
for i in range(epochs):
    h1 = np.dot(x,w)
    output = nonlin(h1)
    #print(output.shape)
    loss = y - output
    if i // 10 ==0:
        print(f'Loss{i}: {np.sum(np.squeeze(loss))}')
    loss_delta = loss * nonlin(output,deriv=True)
    #print(loss_delta.shape)
    w+= np.dot(x.T,loss_delta)

print(output)
print(y)

Loss0: -0.013743207833137316
Loss1: -0.013731495158920998
Loss2: -0.01371981138260113
Loss3: -0.013708156387188364
Loss4: -0.013696530056349965
Loss5: -0.013684932274405315
Loss6: -0.013673362926320931
Loss7: -0.013661821897706095
Loss8: -0.013650309074807215
Loss9: -0.013638824344505474
[[0.03734394]
 [0.0300676 ]
 [0.9756196 ]
 [0.96967653]]
[[0]
 [0]
 [1]
 [1]]


# Pytorch Implementation

In [200]:
from sklearn import datasets
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale

In [198]:
dataset = datasets.load_iris()

In [199]:
X = dataset.data
y = dataset.target

In [202]:
X = scale(X) 

In [203]:
X.shape,y.shape

((150, 4), (150,))

In [208]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(f'X train size: {X_train.shape}')
print(f'X test size: {X_test.shape}')
print(f'y train size: {y_train.shape}')
print(f'y test size: {y_test.shape}')

# Distribution of both classes are roughly equal using train_test_split function
print(Counter(y_train))

X train size: (120, 4)
X test size: (30, 4)
y train size: (120,)
y test size: (30,)
Counter({0: 42, 2: 41, 1: 37})


In [209]:
num_epochs = 10
learning_rate = 0.001
num_features = X.shape[1]
weights = torch.zeros(num_features, 1, dtype=torch.float32)
bias = torch.zeros(1, dtype=torch.float32)

X_train = torch.from_numpy(X_train).type(torch.float32)
y_train = torch.from_numpy(y_train).type(torch.float32)

for epoch in range(num_epochs):        
    # 1. Forwardpropagation:
    # 1a. Affine Transformation: z = \theta x + b
    z = torch.add(torch.mm(X_train, weights), bias)
    # 2a. Sigmoid/Logistic Function: y_hat = 1 / (1 + e^{-z})
    y_hat = 1. / (1. + torch.exp(-z))

    # Backpropagation:
    # 1. Calculate binary cross entropy 
    l = torch.mm(-y_train.view(1, -1), torch.log(y_hat)) - torch.mm((1 - y_train).view(1, -1), torch.log(1 - y_hat))

    # 2. Calculate dl/dz
    dl_dz = y_train - y_hat.view(-1)

    # 2. Calculate partial derivative of cost w.r.t weights (gradients)
    # dl_dw = dl_dz dz_dw = (y_hat - y)(x^T)
    grad = torch.mm(X_train.transpose(0, 1), dl_dz.view(-1, 1))

    # Gradient descent:
    # update our weights and bias with our gradients
    weights += learning_rate * grad
    bias += learning_rate * torch.sum(dl_dz)

    # Accuracy
    total = y_hat.shape[0]
    predicted = (y_hat > 0.5).float().squeeze()
    correct = (predicted == y_train).sum()
    acc = 100 * correct / total 

    # Print accuracy and cost
    print(f'Epoch: {epoch} | Accuracy: {acc.item() :.4f} | Cost: {l.item() :.4f}')

print(f'Weights \n {weights.data}')
print(f'Bias \n {bias.data}')

Epoch: 0 | Accuracy: 35.0000 | Cost: 83.1777
Epoch: 1 | Accuracy: 64.1667 | Cost: 53.8651
Epoch: 2 | Accuracy: 64.1667 | Cost: 29.1477
Epoch: 3 | Accuracy: 65.0000 | Cost: 8.0210
Epoch: 4 | Accuracy: 65.0000 | Cost: -10.3593
Epoch: 5 | Accuracy: 65.0000 | Cost: -26.6443
Epoch: 6 | Accuracy: 65.0000 | Cost: -41.3176
Epoch: 7 | Accuracy: 65.0000 | Cost: -54.7359
Epoch: 8 | Accuracy: 65.0000 | Cost: -67.1630
Epoch: 9 | Accuracy: 65.0000 | Cost: -78.7964
Weights 
 tensor([[ 0.5947],
        [-0.2931],
        [ 0.7068],
        [ 0.7100]])
Bias 
 tensor([0.5349])


In [100]:
features = torch.randn(3,3)
print(f'Number of Inpout features:{features.shape[1]}')
y = torch.tensor([[0,0,1],[1,0,0],[0,1,0]])

Number of Inpout features:3


In [101]:
y

tensor([[0, 0, 1],
        [1, 0, 0],
        [0, 1, 0]])

In [94]:
y.shape[0]

3

In [103]:
n_inputs = features.shape[1]
n_hidden = 2
n_output = 1
w1 = torch.randn(n_inputs,n_hidden)
b1 = torch.randn(n_hidden)
w2 = torch.randn(n_hidden,n_output)
b2 = torch.randn(n_output)

In [90]:
def sigmoid(x):
    return 1/1+ torch.exp(-x)

In [104]:
h1 = sigmoid(torch.matmul(features,w1)+b1)
print(f'Hidden Layer activation {h1}')
out = sigmoid(torch.matmul(h1,w2)+b2)
print(f'Output {out}')

Hidden Layer activation tensor([[ 1.1420, 40.9133],
        [ 1.0873,  1.1440],
        [ 1.4069,  1.0214]])
Output tensor([[8.9033e+23],
        [2.5676e+01],
        [2.9438e+01]])


In [99]:
out[2]

tensor([1.1660])

In [102]:
out[range(y.shape[0]),y]

IndexError: index 1 is out of bounds for dimension 1 with size 1

In [72]:
def model():
    h1 = sigmoid(torch.matmul(features,w1)+b1)
    h2 = torch.matmul(h1,w2)+b2
    return h2

In [None]:
def loss_fnc():
    

In [18]:
lr = 0.5  # learning rate hyperparameter
epochs = 2  # how many epochs to train for

for epoch in range(epochs):  # loop over the data repeatedly
    for ii in range((n - 1) // bs + 1):  # in batches of size bs, so roughly n / bs of them
        start_idx = ii * bs  # we are ii batches in, each of size bs
        end_idx = start_idx + bs  # and we want the next bs entires

        # pull batches from x and from y
        xb = x_train[start_idx:end_idx]
        yb = y_train[start_idx:end_idx]

        # run model
        pred = model(xb)

        # get loss
        loss = loss_func(pred, yb)

        # calculate the gradients with a backwards pass
        loss.backward()

        # update the parameters
        with torch.no_grad():  # we don't want to track gradients through this part!
            # SGD learning rule: update with negative gradient scaled by lr
            weights -= weights.grad * lr
            bias -= bias.grad * lr

            # ACHTUNG: PyTorch doesn't assume you're done with gradients
            #          until you say so -- by explicitly "deleting" them,
            #          i.e. setting the gradients to 0.
            weights.grad.zero_()
            bias.grad.zero_()

TypeError: 'NoneType' object is not callable