### Write code to implement backward propagation for a network trained with cross-entropy loss. Only the following layer types need to be supported:
- Linear layer with in_dim and out_dim
- ReLU layer
- Softmax layer

Example: 
``` Python
Input:
    n_labels = 10
    layers = ['relu', 'relu', 'softmax']
    weights = [np.random.uniform(-0.01, 0.01, (128,64)), np.random.uniform(-0.01, 0.01, (64,16)), np.random.uniform(-0.01, 0.01, (16,10))]
    
    batch_size = 256
    X_train = np.random.rand(batch_size,128)
    y_train = np.random.randint(low=0,high=n_labels,size=batch_size)

Output:
    w_grad
```

In [44]:
import numpy as np
class DNN:
    """Small fully connected network with hand-written backpropagation.

    Layer ``i`` multiplies its input by ``weights[i]`` (no bias term) and then
    applies the activation named by ``layers[i]``: ``'relu'`` or ``'softmax'``.
    Any other name leaves the linear output unchanged, in both the forward and
    the backward pass.

    Attributes set by the methods:
        forward_values: one dict per layer (in layer order) with the keys
            ``'input'``, ``'hidden'`` (pre-activation) and ``'output'``;
            rebuilt by every call to ``forward``.
        grads: one dict per layer with the keys ``'grad_o'``, ``'grad_h'`` and
            ``'grad_w'``, stored from the LAST layer to the first; rebuilt by
            every call to ``backward``.
    """

    def __init__(self, n_labels, layers, weights):
        """Store the architecture.

        Args:
            n_labels: number of classes; used as the width of the one-hot
                targets in ``backward``.
            layers: activation name for each layer.
            weights: one 2-D array per layer, shaped (in_dim, out_dim). The
                list is stored by reference, not copied.
        """
        assert len(layers) == len(weights), "Every linear layer must followed by a non-linear activation!"
        self.n_layers = len(layers)
        self.n_labels = n_labels
        self.layers = layers
        self.weights = weights
        # Floor used to keep log() and the division in backward() finite.
        self.eps = 1e-100

    def forward(self, x):
        """Run the network on a batch and cache every intermediate value.

        Args:
            x: 2-D array with one sample per row.

        Returns:
            The output of the last layer.
        """
        self.forward_values = []
        for i in range(self.n_layers):
            cache = {'input': x}
            x = x @ self.weights[i]
            cache['hidden'] = x
            if self.layers[i] == 'relu':
                x = np.maximum(x, 0)
            elif self.layers[i] == 'softmax':
                # Subtracting the row maximum avoids overflow in exp() and
                # does not change the softmax value.
                exp_shifted = np.exp(x - x.max(axis=1, keepdims=True))
                x = exp_shifted / exp_shifted.sum(axis=1, keepdims=True)
            cache['output'] = x
            self.forward_values.append(cache)
        return x

    def CrossEntropyLoss(self, y_pred, y_true):
        """Mean negative log-probability assigned to the true class.

        Args:
            y_pred: (B, n_classes) array of predicted probabilities.
            y_true: length-B sequence of integer class indices.

        Returns:
            The batch-averaged loss; probabilities are floored at ``self.eps``
            before taking the logarithm.
        """
        y_true = np.asarray(y_true)
        true_class_prob = y_pred[np.arange(len(y_true)), y_true]
        return -np.log(np.maximum(true_class_prob, self.eps)).mean()

    def backward(self, y_true):
        """Backpropagate the mean cross-entropy loss through the cached pass.

        ``forward`` must have been called on the matching batch first, since
        this method reads ``self.forward_values``.

        Args:
            y_true: length-B sequence of integer class indices for the batch
                that was last passed to ``forward``.

        Returns:
            A list with the gradient of the loss with respect to each weight
            matrix, in layer order (same shapes as ``self.weights``).
        """
        # Use the labels that were passed in, not a notebook-level variable.
        y_true = np.asarray(y_true)
        batch = y_true.shape[0]
        onehot_y = np.zeros((batch, self.n_labels))
        onehot_y[np.arange(batch), y_true] = 1.0

        self.grads = []
        for m in range(self.n_layers - 1, -1, -1):
            cache = self.forward_values[m]
            d = {}
            if m == self.n_layers - 1:
                # d(loss)/d(output) of the mean cross-entropy: (B, n_labels)
                d['grad_o'] = -onehot_y / (cache['output'] + self.eps) / batch
            else:
                # Push the next layer's pre-activation gradient back through
                # its weight matrix: (B, d_h^m)
                d['grad_o'] = self.grads[-1]['grad_h'] @ self.weights[m + 1].transpose()

            if self.layers[m] == 'relu':
                d['grad_h'] = (cache['hidden'] > 0) * d['grad_o']
            elif self.layers[m] == 'softmax':
                # Jacobian-vector product of softmax without building the
                # (B, d, d) Jacobian: s * (g - sum_j g_j * s_j).
                s = cache['output']
                weighted = (d['grad_o'] * s).sum(axis=1, keepdims=True)
                d['grad_h'] = s * (d['grad_o'] - weighted)
            else:
                # forward() applies no activation for other names, so the
                # gradient passes through unchanged.
                d['grad_h'] = d['grad_o']

            # Sum over the batch of outer products input_b x grad_h_b,
            # written as a single matrix product: (d_h^(m-1), d_h^m)
            d['grad_w'] = cache['input'].transpose() @ d['grad_h']
            self.grads.append(d)
        return [self.grads[i]['grad_w'] for i in range(len(self.grads) - 1, -1, -1)]

    def predict(self, x):
        """Return the network output for ``x`` (this also refreshes the forward cache)."""
        return self.forward(x)

In [45]:
# Three-layer network (128 -> 64 -> 16 -> 10) with small uniform random weights.
n_labels = 10
layers = ['relu', 'relu', 'softmax']
weights = [
    np.random.uniform(-0.01, 0.01, shape)
    for shape in ((128, 64), (64, 16), (16, 10))
]

model = DNN(n_labels=n_labels, layers=layers, weights=weights)

In [46]:
# Synthetic batch: uniform features in [0, 1) and integer labels drawn from {0, 1}.
batch_size = 256
X_train = np.random.rand(batch_size, 128)
y_train = np.random.randint(0, 2, size=batch_size)

In [48]:
# Sanity check: compare the analytic gradient of a single weight with a
# forward finite difference of the loss.
delta = 1e-7
l, d1, d2 = 0, 10, 5  # layer index, row and column of the weight under test

x = model.forward(X_train)
loss = model.CrossEntropyLoss(x, y_train)
w_grad = model.backward(y_train)

# Nudge the weight, re-evaluate the loss, then put the weight back so the
# check leaves the model exactly as it found it.
original_weight = model.weights[l][d1][d2]
model.weights[l][d1][d2] = original_weight + delta
x_new = model.forward(X_train)
loss_new = model.CrossEntropyLoss(x_new, y_train)
model.weights[l][d1][d2] = original_weight
delta_w = (loss_new - loss)/delta

assert delta_w!=0, "pick another weight!"
# Relative error; absolute values in the denominator keep it from cancelling
# to zero when the two estimates disagree in sign.
print(np.abs(w_grad[l][d1][d2] - delta_w) / (np.abs(w_grad[l][d1][d2]) + np.abs(delta_w)))

8.397387882778907e-05


In [49]:
# Training loop
def train_model(model, X_train, y_train, n_iter:int=100, lr: float=1e-2):
    """Run full-batch gradient descent on ``model`` and return it.

    Each iteration does one forward pass, evaluates the loss, backpropagates,
    and subtracts ``lr`` times the gradient from every weight matrix in place.
    The loss printed for an iteration is the one measured before that
    iteration's update.

    Args:
        model: object exposing ``forward``, ``CrossEntropyLoss``, ``backward``,
            ``n_layers`` and ``weights``.
        X_train: inputs handed to ``model.forward``.
        y_train: targets handed to the loss and to ``model.backward``.
        n_iter: number of update steps.
        lr: step size.

    Returns:
        The same ``model`` object that was passed in.
    """
    for i in range(n_iter):
        predictions = model.forward(X_train)
        loss = model.CrossEntropyLoss(predictions, y_train)
        gradients = model.backward(y_train)
        for k in range(model.n_layers):
            step = lr * gradients[k]
            model.weights[k] -= step
        print(f'Iteration {i+1}: loss={loss}')
    return model

In [53]:
# Train on the synthetic batch with step size 0.1; n_iter is left at train_model's default.
model = train_model(model, X_train, y_train, lr=.1)

Iteration 1: loss=2.3020734511656165
Iteration 2: loss=2.3020424474294456
Iteration 3: loss=2.3020090054559335
Iteration 4: loss=2.301972875590674
Iteration 5: loss=2.301933783964392
Iteration 6: loss=2.3018914225619413
Iteration 7: loss=2.301845434720002
Iteration 8: loss=2.301795424763227
Iteration 9: loss=2.3017409570387874
Iteration 10: loss=2.3016815158269557
Iteration 11: loss=2.3016165106567663
Iteration 12: loss=2.3015452919335244
Iteration 13: loss=2.3014670854718147
Iteration 14: loss=2.301381024268615
Iteration 15: loss=2.30128609583826
Iteration 16: loss=2.301181144675395
Iteration 17: loss=2.3010648484307934
Iteration 18: loss=2.3009356168727986
Iteration 19: loss=2.3007915922327524
Iteration 20: loss=2.300630650776509
Iteration 21: loss=2.300450226039728
Iteration 22: loss=2.300247307733741
Iteration 23: loss=2.3000182917656335
Iteration 24: loss=2.299758926760436
Iteration 25: loss=2.299464023071657
Iteration 26: loss=2.299127291719946
Iteration 27: loss=2.29874119080294

In [54]:
# Display the network's final-layer output for every row of X_train
# (softmax probabilities for the layer configuration used in this notebook).
model.predict(X_train)

array([[4.78292938e-01, 5.13304768e-01, 1.05892020e-03, ...,
        1.03853368e-03, 1.02085644e-03, 1.05151952e-03],
       [4.78206084e-01, 5.12804964e-01, 1.13276386e-03, ...,
        1.11115383e-03, 1.09249050e-03, 1.12491555e-03],
       [4.78219733e-01, 5.18028951e-01, 4.73275410e-04, ...,
        4.62969289e-04, 4.54055328e-04, 4.69522772e-04],
       ...,
       [4.78486358e-01, 5.15509945e-01, 7.56977951e-04, ...,
        7.41584078e-04, 7.28287373e-04, 7.51375433e-04],
       [4.78426097e-01, 5.16748756e-01, 6.08554429e-04, ...,
        5.95770410e-04, 5.84721668e-04, 6.03898738e-04],
       [4.78434699e-01, 5.16652219e-01, 6.19630627e-04, ...,
        6.06651783e-04, 5.95425872e-04, 6.14902989e-04]])

In [55]:
# Relative frequency of each distinct label in y_train, for comparison with
# the predicted probabilities displayed above.
unique, counts = np.unique(y_train, return_counts=True)
counts/counts.sum()

array([0.453125, 0.546875])

# Test on a real dataset

In [18]:
# Load MNIST dataset from torchvision
import torchvision
from torchvision import transforms

# Both splits read from the same root directory; only the training split
# asks torchvision to download the files.
train_dataset = torchvision.datasets.MNIST(
    root='../../data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)
test_dataset = torchvision.datasets.MNIST(
    root='../../data',
    train=False,
    transform=transforms.ToTensor(),
)


In [19]:
# Convert the raw image tensors to NumPy and flatten each image into one row.
# Note: this rebinds y_train, replacing the synthetic labels defined earlier.
x_train = train_dataset.data.numpy()
x_train = x_train.reshape(x_train.shape[0], -1)
y_train = train_dataset.targets.numpy()

x_test = test_dataset.data.numpy()
x_test = x_test.reshape(x_test.shape[0], -1)
y_test = test_dataset.targets.numpy()
print(x_train.shape, x_test.shape)

(60000, 784) (10000, 784)


In [20]:
# Keep only the digits 0, 1 and 2; the training split is additionally capped
# at its first 10000 matching rows.
idx = np.isin(y_train, [0, 1, 2])
x_train, y_train = x_train[idx][:10000], y_train[idx][:10000]

idx = np.isin(y_test, [0, 1, 2])
x_test, y_test = x_test[idx], y_test[idx]

In [21]:
# Fresh network for the flattened 784-pixel inputs (784 -> 64 -> 16 -> 10),
# again with small uniform random weights.
n_labels = 10
layers = ['relu', 'relu', 'softmax']
weights = [
    np.random.uniform(-0.01, 0.01, shape)
    for shape in ((784, 64), (64, 16), (16, 10))
]

model = DNN(n_labels=n_labels, layers=layers, weights=weights)

In [22]:
# Relative frequency of each distinct label in the filtered y_train.
unique, counts = np.unique(y_train, return_counts=True)
counts/counts.sum()

array([0.3162, 0.3677, 0.3161])

In [26]:
# Train on the filtered training arrays for 50 iterations with step size 0.01.
model = train_model(model, x_train, y_train, lr=1e-2, n_iter=50)

Iteration 1: loss=2.2828654064214384
Iteration 2: loss=2.266753305613649
Iteration 3: loss=2.235204747052911
Iteration 4: loss=2.164515117377543
Iteration 5: loss=1.9833774765770236
Iteration 6: loss=1.5382367749367698
Iteration 7: loss=1.0890469132298424
Iteration 8: loss=1.0696233318949682
Iteration 9: loss=2.2422211241312193
Iteration 10: loss=1.5807183686429112
Iteration 11: loss=1.227329649688043
Iteration 12: loss=1.0345105793229221
Iteration 13: loss=1.021437762125589
Iteration 14: loss=0.8858952033305468
Iteration 15: loss=1.0148063734581358
Iteration 16: loss=0.6835801545047403
Iteration 17: loss=0.7000682874515882
Iteration 18: loss=0.7316884581180361
Iteration 19: loss=0.8088265573893504
Iteration 20: loss=0.5922964525410027
Iteration 21: loss=0.5660081459195099
Iteration 22: loss=0.5741177365559067
Iteration 23: loss=0.6969263019849808
Iteration 24: loss=0.7014367701590752
Iteration 25: loss=0.6605993298558682
Iteration 26: loss=0.5046746125192446
Iteration 27: loss=0.52941

In [27]:
# Accuracy: fraction of test rows whose highest-scoring class equals the true label.
(model.predict(x_test).argmax(axis=1) == y_test).mean()

0.984429615506832

In [28]:
# Index of the highest-scoring class for each test row.
model.predict(x_test).argmax(axis=1)

array([2, 1, 0, ..., 0, 1, 2])