In [1]:
#!pip install datasets

In [2]:
## Please install torch and datasets
import torch
from torchvision.transforms import functional as t
import torch.nn.functional as f
from datasets import load_dataset
import matplotlib.pyplot as plt

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
## Loading our dataset
ds = load_dataset("ylecun/mnist")

In [4]:
## Data splits

X_train_p = ds["train"]["image"]
Y_train = ds["train"]["label"]
X_test_p = ds["test"]["image"]
Y_test = ds["test"]["label"]

In [5]:
## PIL to Tensors

X_train = [t.pil_to_tensor(x) for x in X_train_p]
X_test = [t.pil_to_tensor(x) for x in X_test_p]
X_train = torch.stack(X_train).to(device)
X_test = torch.stack(X_test).to(device)
print(X_train.shape, X_test.shape)

torch.Size([60000, 1, 28, 28]) torch.Size([10000, 1, 28, 28])


In [6]:
## Fixing the shape

X_train = X_train.view(-1, 28, 28)
X_test = X_test.view(-1, 28, 28)
print(X_train.shape, X_test.shape)

torch.Size([60000, 28, 28]) torch.Size([10000, 28, 28])


In [7]:
## Making labels into tensors

Y_train = torch.tensor(Y_train).to(device)
Y_test = torch.tensor(Y_test).to(device)

In [8]:
## Flattening the image as DNN takes flat tensor as input

X_train = X_train.view(-1, 784).float() / 255.0
X_test = X_test.view(-1, 784).float() / 255.0
print(X_train.shape, X_test.shape)

torch.Size([60000, 784]) torch.Size([10000, 784])


In [9]:

import numpy as np
from numpy.linalg import svd



class Linear():
  def __init__(self, input_dims, output_dims, B=True, last=False):
    self.training = True
    self.W = (torch.randn(input_dims, output_dims) * (5/3) / (input_dims**0.5)).to(device) if not last else (torch.randn(input_dims, output_dims) * (5/3) / (input_dims**0.5) * 0.1).to(device)
    if B: self.B = torch.randn(output_dims).to(device) if not last else (torch.randn(output_dims) * 0.1).to(device)
    else: self.B = torch.tensor([]).to(device)

  def __call__(self, x):
    if not torch.equal(self.B, torch.tensor([]).to(device)): self.result = x@self.W + self.B
    else: self.result = x@self.W
    return self.result

  def parameters(self):
    return [self.W] + [self.B]


class Tanh():
  def __init__(self):
    self.training = True
    return None

  def __call__(self, x):
    self.result = torch.tanh(x)
    return self.result

  def parameters(self):
    return []

class Dropout():
  def __init__(self, batch_size, output_dims, rate=0.9):
    self.training = True
    self.rate = rate
    self.factor = (torch.rand(batch_size, output_dims) < self.rate).int().to(device)
    return None

  def __call__(self, x):
    if self.training: self.result = x * self.factor
    else: self.result = x
    return self.result

  def parameters(self):
    return []


def decomposition(A, k=1):

    # SVD
    U, S, VT = svd(A, full_matrices=False)


    # Truncate to rank-k
    U_k = U[:, :k]                # (784 x k)
    S_k = np.diag(S[:k])          # (k x k)
    VT_k = VT[:k, :]              # (k x 10)

    # Factor A_k = L @ R, where L and R are low-rank factors
    sqrt_S_k = np.sqrt(S_k)       # (k x k)
    L = U_k @ sqrt_S_k            # (784 x k)
    R = sqrt_S_k @ VT_k           # (k x 10)

    L_flat = L.flatten()
    R_flat = R.flatten()
    LR_concat = np.concatenate([L_flat, R_flat])

    return LR_concat


In [10]:
n1 = 512
n2 = 256
n3 = 512
n4 = 794
batch_size = 1

updater = [
    Linear(7840, n1), Tanh(), Dropout(batch_size, n1),
    #Linear(n1, n2), Tanh(), Dropout(batch_size, n2),
    #Linear(n2, n3), Tanh(), Dropout(batch_size, n3),
   Linear(n1, 7840, last=True),
    
]

predictor = [
    Linear(784, 10, last=True, B=False)
]

updater_params = [p for layer in updater for p in layer.parameters()]
numparams = 0
for p in updater_params:
    p.requires_grad = True
    numparams += p.numel()


predictor_params = [p for layer in predictor for p in layer.parameters()]
for p in predictor_params:
    p.requires_grad = True
print(numparams)

8036512


In [None]:
## Training loop for updater and predictor
iters = 4000  
alpha = 0.01

for c in range(iters):
    # Using no grad as grads of P at t-1 essentially serve as an input that don't depend on any weight
    with torch.no_grad():
        if predictor[0].W.grad is None: 
            P_grad_t_1 = torch.zeros((1, 7840)).to(device)
        else: 
            P_grad_t_1 = predictor[0].W.grad.detach().view(1, 7840)
    
    for p in predictor_params:
        if p.grad is not None:
            p.grad.zero_()
        else:
            print("None Predictor Model Grads")


    x_p = P_grad_t_1
    for layer in updater:
        x_p = layer(x_p)
    P_w_update = x_p.view(784, 10)

    P_w_t_1 = predictor[0].W.detach()
    predictor[0].W = P_w_t_1 + P_w_update

    print(predictor[0].W.requires_grad)

    predictions = X_train @ predictor[0].W
    loss = f.cross_entropy(predictions, Y_train)
    loss.backward()

    print(f"Grad for predictor[0].W: {'None' if predictor[0].W.grad is None else 'Exists'}")

    for p in updater_params:
        print(f"Grad for updater param: {'None' if p.grad is None else 'Exists'}")
        p.data -= alpha * p.grad
        p.grad.zero_()

    if c % 20 == 0:
        print(f"Iteration {c:4d}, Loss: {loss.item():.6f}")


  if predictor[0].W.grad is None:
  if p.grad is not None:
  print(f"Grad for predictor[0].W: {'None' if predictor[0].W.grad is None else 'Exists'}")


None Predictor Model Grads
None Predictor Model Grads
True
Grad for predictor[0].W: None
Grad for updater param: Exists
Grad for updater param: Exists
Grad for updater param: Exists
Grad for updater param: Exists
Iteration    0, Loss: 4.306601
None Predictor Model Grads
None Predictor Model Grads
True
Grad for predictor[0].W: None
Grad for updater param: Exists
Grad for updater param: Exists
Grad for updater param: Exists
Grad for updater param: Exists
None Predictor Model Grads
None Predictor Model Grads
True
Grad for predictor[0].W: None
Grad for updater param: Exists
Grad for updater param: Exists
Grad for updater param: Exists
Grad for updater param: Exists
None Predictor Model Grads
None Predictor Model Grads
True
Grad for predictor[0].W: None
Grad for updater param: Exists
Grad for updater param: Exists
Grad for updater param: Exists
Grad for updater param: Exists
None Predictor Model Grads
None Predictor Model Grads
True
Grad for predictor[0].W: None
Grad for updater param: Exis

KeyboardInterrupt: 

In [None]:
## Training loop for updater and predictor
iters = 4000  
alpha = 0.01

for c in range(iters):
    # Using no grad as grads of P at t-1 essentially serve as an input that don't depend on any weight
    with torch.no_grad():
        if predictor[0].W.grad is None: 
            print("None grad")
            P_grad_t_1 = torch.zeros((1, 7840)).to(device)
        else: 
            print("Normal grad")
            P_grad_t_1 = predictor[0].W.grad.detach().view(1, 7840)
    
    for p in predictor_params:
        if p.grad is not None:
            p.grad.zero_()
        else:
            print("None Loss")


    x_p = P_grad_t_1
    for layer in updater:
        x_p = layer(x_p)
    P_w_update = x_p.view(784, 10)


    # Predictor weights determine final loss. 
    # Predictor weight updates shouldn't depend on weight updates, like in any other neural network?
    P_w_t_1 = predictor[0].W.detach()
    predictor[0].W = P_w_t_1 + P_w_update

    predictions = X_train @ predictor[0].W
    loss = f.cross_entropy(predictions, Y_train)
    loss.backward()

    for p in updater_params:
        p.data -= alpha * p.grad
        p.grad.zero_()


    
    if c % 20 == 0:
        print(f"Iteration {c:4d}, Loss: {loss.item():.6f}")


  if predictor[0].W.grad is None:


None grad
None Loss
None Loss
Iteration    0, Loss: 4.753555
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
Iteration   20, Loss: 7.446398
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None grad
None Loss
None Loss
None gra

KeyboardInterrupt: 

In [None]:
# ## Training loop for updater and predictor
# iters = 4000  
# alpha = 0.01

# for c in range(iters):
#     # Zero updater gradients
#     for p in updater_params:
#         if p.grad is not None:
#             p.grad.zero_()

#     # Get predictor grads
#     with torch.no_grad():
#         if predictor[0].W.grad != None: 
#             print("Normal update")
#             with torch.no_grad():
#                 P_grad_copy = predictor[0].W.grad.detach().flatten().unsqueeze(0)
#         else:
#             print("None update")
#             P_grad_copy = torch.zeros((1, 7840)).to(device)
    

#     # Zero predictor gradients
#     for p in predictor_params:
#         if p.grad is not None:
#             p.grad.zero_()


#     updater_output = P_grad_copy
#     for layer in updater:
#         updater_output = layer(updater_output)
    
#     W_update = updater_output.view((784, 10)) * 0.1
    
#     predictor[0].W.data =  predictor[0].W.data + W_update.data
#     predictions = X_train @ predictor[0].W
#     loss = f.cross_entropy(predictions, Y_train)
#     torch.autograd.set_detect_anomaly(True)
#     # Backward and update
#     loss.backward()
#     for p in updater_params:
#         if p.grad is not None:
#             p.data -= alpha * p.grad

#     if c % 20 == 0:
#         print(f"Iteration {c:4d}, Loss: {loss.item():.6f}")


None update
Iteration    0, Loss: 2.301098
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Iteration   20, Loss: 5.443576
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Iteration   40, Loss: 9.989386
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Normal update
Iteration   60, Loss: 14.686119
Normal update
Normal up

KeyboardInterrupt: 

In [None]:

# ## Training
# iters = 4000
# alpha = 0.01

# for c in range(iters):
#     ## Forward Pass through predictor
#     for layer in predictor:
#       layer.training = False
#     x = X_train
#     print(x.shape)
#     for layer in predictor:
#       x = layer(x)
#     # Loss
#     Loss = f.cross_entropy(x, Y_train)


#     ## Forward Pass through updater
#     for layer in layers:
#       layer.training = True

#     # Full SVD
#     A = predictor[0].W.detach().cpu().numpy()  # 784 x 10
#     LR_concat = decomposition(A, k=1) # 794
#     print("LR_concat shape:", LR_concat.shape)

#     i = torch.stack([torch.tensor(LR_concat).to(device)])
#     print(i.shape)

#     for layer in layers:
#       i = layer(i)
#     print(i.shape)
    
#     L_update = i[:, :784].reshape(784, 1)
#     R_update = i[:, 784:].reshape(1, 10)
#     predictor_W_update = L_update @ R_update


#     ## Weight update for predicter
#     predictor[0].W+=predictor_W_update


#     # Calculating Gradient for updater model
#     for layer in layers:
#       layer.result.retain_grad() # This stores grad of layers like Tanh that have no params to update

#     for p in params:
#         p.grad = None

#     Loss.backward()

#     # Weight Update for updater model
#     for p in params:
#         p.data += -alpha * p.grad



#     if c % (iters/20) == 0:
#         print(Loss)


In [None]:
def accuracy(X, Y, layersv=predictor):
    for layer in layersv:
      layer.training=False
    # Forward
    x = X
    for layer in layersv:
      x = layer(x)
    probs = f.softmax(x, 1)
    answers = x.argmax(1)
    c = 0
    for a, y in zip(answers, Y):
        if a==y: c+=1
    return c / answers.shape[0] * 100

def loss(X, Y, layersv = predictor):
    x = X
    for layer in layersv:
      x = layer(x)
    return f.cross_entropy(x, Y)

print(f"train accuracy: {accuracy(X_train, Y_train)} | test accuracy: {accuracy(X_test, Y_test)}")
print(f"train loss: {loss(X_train, Y_train)} | test loss: {loss(X_test, Y_test)}")

train accuracy: 76.36833333333334 | test accuracy: 77.13
train loss: 1.5593186616897583 | test loss: 1.4697450399398804
