In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision

import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import numpy as np
import torch.nn.functional as F
from tqdm import tqdm

t = transforms.Compose([transforms.ToTensor(), 
                        transforms.Lambda(lambda img: img.squeeze().reshape(784))])

t_target = lambda label: F.one_hot(torch.tensor(label), num_classes=10)
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=t, target_transform=t_target)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=t, target_transform=t_target)

In [2]:
# img = torch.squeeze(train_dataset[0][0])
# fig, ax = plt.subplots(nrows=1, ncols=1)
# ax.imshow(img, cmap=plt.cm.gray)
%config Completer.use_jedi = False


In [3]:
#Dataloader 
batch_size = 20
train_loader = DataLoader(
                dataset= train_dataset,
                batch_size=batch_size,
                shuffle=False) 
test_loader = DataLoader(
                dataset = test_dataset,
                batch_size=batch_size,
                shuffle=True)


In [15]:
class NeuralNet():
    def __init__(self, LR, batch_size):
        np.random.seed(0)

        self.layer1 = np.random.uniform(-0.5, 0.5, (784, 64))
        self.layer2 = np.random.uniform(-0.5, 0.5, (64, 10))
        self.bias1 = np.zeros((1, 64))
        self.bias2 = np.zeros((1,10))
        self.batch_size = batch_size
        self.LR = LR
        
    def sigmoid(self, x):
        return 1/(1+np.exp(-x))

    def softmax(self, z):
        ez = np.exp(z)
        sum_ez = np.sum(ez, axis=0)
        return ez / (sum_ez)    
    
    def d_sigmoid(self, x):
        return self.sigmoid(x) * (1 - self.sigmoid(x))
    
    def d_softmax(self, a, y):
        result = np.sum((a-y).T * (np.diag(a.squeeze()) - np.multiply(a, a.T)), axis=1)
        return result
    
    def forward(self, x):
        self.x = x
        self.m1 = np.dot(x, self.layer1) + self.bias1
        self.z1 = self.sigmoid(self.m1)
        self.z2 = np.dot(self.z1, self.layer2) + self.bias2
        self.y_pred = self.softmax(self.z2)
        return self.y_pred
    
    def backpass(self, y):
        d_pred = np.empty((self.batch_size, 10))
        for i in range(self.batch_size):
            d_pred[i] = self.d_softmax(self.y_pred[i], y[i])
        
        d_m1 = self.d_sigmoid(self.m1) * (np.dot( d_pred, self.layer2.T))
        d_layer1 = 1/self.batch_size * np.dot(self.x.T, d_m1)
        d_layer2 = 1/self.batch_size * np.dot(self.z1.T, d_pred)
        
        self.layer1 = self.layer1 - self.LR * d_layer1
        self.layer2 = self.layer2 - self.LR * d_layer2
#         self.bias1 = self.bias1 - self.LR * d_bias1
#         self.bias2 = self.bias2 - self.LR * d_bias2
        
    
    def mse(self, y, pred):
        return np.sum(np.sum((pred - y)**2, axis=1))/self.batch_size 

In [18]:
# Training
def training(EPOCHS = 100, print_every=10):
    for idx , epoch in enumerate(range(EPOCHS)):
        total_loss = 0.0
        for x,y in tqdm(train_loader):
            x,y = x.numpy(), y.numpy()
            pred = net.forward(x)

            loss = net.mse(y, pred)
            
#             print(net.layer1, net.layer2)
            net.backpass(y)
            total_loss += loss
        
        print(f"Epoch {epoch} has loss {total_loss}")
        
LR = 0.1
net = NeuralNet(LR, batch_size)
training()

100%|██████████████████████████████████████| 3000/3000 [00:07<00:00, 419.46it/s]


Epoch 0 has loss 2667.640291920504


100%|██████████████████████████████████████| 3000/3000 [00:07<00:00, 415.67it/s]


Epoch 1 has loss 2659.8201384806894


100%|██████████████████████████████████████| 3000/3000 [00:07<00:00, 420.64it/s]


Epoch 2 has loss 2720.450979182522


100%|██████████████████████████████████████| 3000/3000 [00:07<00:00, 425.63it/s]


Epoch 3 has loss 2595.448659116362


  ez = np.exp(z)
100%|██████████████████████████████████████| 3000/3000 [00:07<00:00, 414.47it/s]


Epoch 4 has loss nan


100%|██████████████████████████████████████| 3000/3000 [00:07<00:00, 419.14it/s]


Epoch 5 has loss nan


100%|██████████████████████████████████████| 3000/3000 [00:07<00:00, 424.96it/s]


Epoch 6 has loss nan


100%|██████████████████████████████████████| 3000/3000 [00:07<00:00, 421.18it/s]


Epoch 7 has loss nan


100%|██████████████████████████████████████| 3000/3000 [00:07<00:00, 423.05it/s]


Epoch 8 has loss nan


100%|██████████████████████████████████████| 3000/3000 [00:07<00:00, 414.66it/s]


Epoch 9 has loss nan


100%|██████████████████████████████████████| 3000/3000 [00:07<00:00, 420.50it/s]


Epoch 10 has loss nan


100%|██████████████████████████████████████| 3000/3000 [00:07<00:00, 424.45it/s]


Epoch 11 has loss nan


100%|██████████████████████████████████████| 3000/3000 [00:06<00:00, 429.97it/s]


Epoch 12 has loss nan


100%|██████████████████████████████████████| 3000/3000 [00:07<00:00, 413.21it/s]


Epoch 13 has loss nan


100%|██████████████████████████████████████| 3000/3000 [00:07<00:00, 413.44it/s]


Epoch 14 has loss nan


100%|██████████████████████████████████████| 3000/3000 [00:07<00:00, 414.10it/s]


Epoch 15 has loss nan


100%|██████████████████████████████████████| 3000/3000 [00:07<00:00, 412.14it/s]


Epoch 16 has loss nan


100%|██████████████████████████████████████| 3000/3000 [00:07<00:00, 418.39it/s]


Epoch 17 has loss nan


100%|██████████████████████████████████████| 3000/3000 [00:07<00:00, 417.44it/s]


Epoch 18 has loss nan


100%|██████████████████████████████████████| 3000/3000 [00:07<00:00, 416.77it/s]


Epoch 19 has loss nan


 74%|███████████████████████████▉          | 2209/3000 [00:05<00:01, 401.97it/s]


KeyboardInterrupt: 

In [122]:
# Testing
def testing(test_loader, net):
    acc = 0
    total_correct = 0
    for x, y in test_loader:
        output = net.forward(x)
        prediction = torch.argmax(output)
        correct_predictions_batch = torch.count_nonzero(torch.eq(prediction, y))
        total_correct = total_correct + correct_predictions_batch
    return total_correct / len(test_loader)

In [23]:
testing(test_loader, net)

  return 1/(1 + np.exp(-x))


tensor(2.5600)

In [None]:
#     def __init__(self, LR, batch_size):
#         np.random.seed(0)
#         self.layer1 = np.random.uniform(-0.5, 0.5, (784, 64)).astype(np.float64)
#         self.layer2 = np.random.uniform(-0.5, 0.5, (64, 10)).astype(np.float64)
#         self.LR = LR
#         self.batch_size = batch_size

#     def softmax(self, x):
#         denominators = torch.sum(np.exp(x), 1)
#         result = torch.div(np.exp(x.T), denominators)
#         return result.T
    
#     def sigmoid(self, x):
#         return 1/(1 + np.exp(-x))
        
#     def der_softmax(self):
#         softmax_d = np.empty((self.batch_size, 10))
        
#         for elem in range(self.batch_size):
#             SM = self.softmax_result.T[elem].reshape((-1,1))
#             jac = np.diagflat(self.softmax_result.T[elem]) - np.dot(SM, SM.T)
#             softmax_d[elem] = np.sum(jac.numpy(), axis=1)
# #         result = torch.sum(softmax_d, 0).float()
#         print(softmax_d)
#         return torch.Tensor(softmax_d).float()
    
#     def der_sigmoid(self, x):
#         xx = (x * (1 - x)).float()
#         return xx.float()
      
#     def forward(self, x):
#         self.input = x.clone()
        
#         x = x @ self.layer1
        
#         x = self.sigmoid(x)
        
#         self.z1 = x.clone()
#         x = x @ self.layer2
        
#         x = self.softmax(x)
        
#         self.softmax_result = x.clone().float().T
#         return x
    
#     def der_loss_wrt_preds(self, gt, outputs):
#         result = (-2/torch.numel(outputs)) * (gt - outputs)
#         return result.float()
    
#     def backpass(self, loss, outputs, gt):

# #         delta_w1 = np.dot(self.input.T.float() , (self.der_loss_wrt_preds(gt, outputs) * (self.der_softmax() @ self.layer2.T.astype(np.float64))) * self.der_sigmoid(self.z1) 
#         inter = self.der_loss_wrt_preds(gt, outputs) * self.der_softmax()
# #         print( self.der_loss_wrt_preds(gt, outputs))
#         delta_zz = torch.Tensor(np.dot(inter, self.layer2.T))  * self.der_sigmoid(self.z1)
#         delta_w1 = np.dot(self.input.T, delta_zz)
#         delta_w2 = self.z1.T.float() @ (self.der_loss_wrt_preds(gt, outputs) * self.der_softmax().float()) 
#         print(delta_w1)
#         print(delta_w2)
#         self.layer1 = torch.Tensor(self.layer1) - self.LR * delta_w1
#         self.layer2 = torch.Tensor(self.layer2) - self.LR * delta_w2


#     def mean_sq_error(self, true, prediction):
#         error = np.square(np.subtract(true, prediction)).mean()
#         return error 