### Write code to implement backpropagation for the cross-entropy loss. Only the following layer types need to be supported:
- Linear layer with in_dim and out_dim
- ReLU layer
- Softmax layer

Example: 
``` Python
Input:
    n_labels = 10
    layers = ['relu', 'relu', 'softmax']
    weights = [np.random.uniform(-0.01, 0.01, (128,64)), np.random.uniform(-0.01, 0.01, (64,16)), np.random.uniform(-0.01, 0.01, (16,10))]
    
    batch_size = 256
    X_train = np.random.rand(batch_size,128)
    y_train = np.random.randint(low=0,high=n_labels,size=batch_size)

Output:
    w_grad
```

In [48]:
import numpy as np
class DNN:
    """Minimal fully connected network with hand-written forward/backward passes.

    Each layer is a bias-free linear map ``x @ W`` followed by the activation
    named in ``layers``: ``'relu'`` or ``'softmax'``. Any other name leaves the
    linear output unchanged (identity activation) in both passes.

    Attributes populated by the methods:
        forward_values: list (first layer to last) of dicts with keys
            ``'input'``, ``'hidden'`` (pre-activation) and ``'output'``;
            rebuilt on every ``forward`` call.
        grads: list (LAST layer to first) of dicts with keys ``'grad_o'``,
            ``'grad_h'`` and ``'grad_w'``; rebuilt on every ``backward`` call.
    """

    def __init__(self, n_labels, layers, weights):
        """Store the network definition.

        Args:
            n_labels: number of output classes (width of the one-hot targets).
            layers: list of activation names, one per weight matrix.
            weights: list of 2-D arrays; ``weights[i]`` has shape
                (in_dim_i, out_dim_i). The list is stored by reference.

        Raises:
            AssertionError: if ``layers`` and ``weights`` differ in length.
        """
        assert len(layers) == len(weights), "Every linear layer must followed by a non-linear activation!"
        self.n_layers = len(layers)
        self.n_labels = n_labels
        self.layers = layers
        self.weights = weights
        # Floor used to avoid log(0) and division by zero.
        self.eps = 1e-100

    def forward(self, x):
        """Run the network on a batch and cache per-layer values for backward().

        Args:
            x: array of shape (B, in_dim_0).

        Returns:
            Output of the last layer, shape (B, out_dim_last).
        """
        self.forward_values = []
        for i in range(self.n_layers):
            cache = {'input': x}
            x = x @ self.weights[i]
            cache['hidden'] = x
            if self.layers[i] == 'relu':
                x = np.maximum(x, 0)
            elif self.layers[i] == 'softmax':
                # Subtract the row-wise max before exponentiating so large
                # logits cannot overflow; the softmax value is unchanged.
                shifted = np.exp(x - x.max(axis=1, keepdims=True))
                x = shifted / shifted.sum(axis=1, keepdims=True)
            cache['output'] = x
            self.forward_values.append(cache)
        return x

    def CrossEntropyLoss(self, y_pred, y_true):
        """Mean negative log-likelihood of the true classes.

        Args:
            y_pred: (B, n_classes) array of predicted probabilities.
            y_true: length-B sequence of integer class indices.

        Returns:
            Scalar mean of ``-log(max(y_pred[i, y_true[i]], eps))``.
        """
        y_true = np.asarray(y_true)
        picked = y_pred[np.arange(len(y_true)), y_true]
        return -np.log(np.maximum(picked, self.eps)).mean()

    def backward(self, y_true):
        """Back-propagate the cross-entropy loss through the cached forward pass.

        Must be called after ``forward``; uses ``self.forward_values``.

        Note: the returned gradients are those of the SUM of per-sample losses
        (there is no division by the batch size), whereas ``CrossEntropyLoss``
        reports the mean.

        Args:
            y_true: length-B sequence of integer class indices matching the
                batch most recently passed to ``forward``.

        Returns:
            List of weight gradients ordered first layer to last;
            entry ``i`` has the same shape as ``self.weights[i]``.
        """
        y_true = np.asarray(y_true)
        self.grads = []
        for m in range(self.n_layers - 1, -1, -1):
            cache = self.forward_values[m]
            d = {}
            if not self.grads:
                # Last layer: dL/d(output) for L = -sum(log p[true]).
                # Uses the y_true argument (not a notebook global).
                onehot_y = np.eye(self.n_labels)[y_true]  # (B, n_labels)
                d['grad_o'] = -onehot_y / (self.forward_values[-1]['output'] + self.eps)
            else:
                # Pull the gradient back through the next layer's linear map.
                d['grad_o'] = self.grads[-1]['grad_h'] @ self.weights[m + 1].transpose()  # (B, d_h^m)

            if self.layers[m] == 'relu':
                d['grad_h'] = (cache['hidden'] > 0) * d['grad_o']
            elif self.layers[m] == 'softmax':
                # Jacobian-vector product of softmax, J = diag(p) - p p^T,
                # written in closed form to avoid a (B, d, d) tensor:
                #   (J g)_i = p_i * (g_i - sum_j p_j g_j)
                probs = cache['output']
                weighted = (d['grad_o'] * probs).sum(axis=1, keepdims=True)
                d['grad_h'] = probs * (d['grad_o'] - weighted)
            else:
                # forward() treats unknown names as identity; mirror that here.
                d['grad_h'] = d['grad_o']

            # Sum over the batch of outer(input_b, grad_h_b), as one matmul
            # instead of a (B, in, out) intermediate.
            d['grad_w'] = cache['input'].T @ d['grad_h']  # (d_h^(m-1), d_h^m)
            self.grads.append(d)
        return [g['grad_w'] for g in reversed(self.grads)]

    def predict(self, x):
        """Return the network output for ``x``; identical to ``forward(x)``."""
        return self.forward(x)

In [50]:
# Three-layer network (128 -> 64 -> 16 -> 10) with small uniform initial weights.
n_labels = 10
layers = ['relu', 'relu', 'softmax']
weights = [
    np.random.uniform(-0.01, 0.01, shape)
    for shape in ((128, 64), (64, 16), (16, 10))
]

model = DNN(n_labels=n_labels, layers=layers, weights=weights)

In [51]:
# Synthetic batch: uniform [0, 1) features with 128 columns.
# Labels are drawn from {0, 1} only, so just two classes ever appear.
batch_size = 256
X_train = np.random.random_sample((batch_size, 128))
y_train = np.random.randint(0, 2, size=batch_size)

In [52]:
# Training loop
def train_model(model, X_train, y_train, n_iter:int=100, lr: float=1e-3):
    """Full-batch gradient descent on ``model``, printing the loss each step.

    Args:
        model: object exposing ``forward``, ``CrossEntropyLoss``, ``backward``,
            ``n_layers`` and a ``weights`` list; its weights are updated in place.
        X_train: inputs passed to ``model.forward`` on every iteration.
        y_train: targets passed to the loss and to ``model.backward``.
        n_iter: number of update steps.
        lr: step size multiplied into each gradient.

    Returns:
        The same ``model`` object that was passed in.
    """
    def apply_update(layer_grads):
        # One descent step per layer, written back into model.weights.
        for layer_idx in range(model.n_layers):
            model.weights[layer_idx] -= lr * layer_grads[layer_idx]

    for i in range(n_iter):
        predictions = model.forward(X_train)
        # The loss is measured before this iteration's update is applied.
        loss = model.CrossEntropyLoss(predictions, y_train)
        apply_update(model.backward(y_train))
        print(f'Iteration {i+1}: loss={loss}')
    return model

In [53]:
# Train on the synthetic batch with train_model's defaults (n_iter=100, lr=1e-3).
model = train_model(model, X_train, y_train)

Iteration 1: loss=2.302581840761669
Iteration 2: loss=2.3025787061484313
Iteration 3: loss=2.3025754974747974
Iteration 4: loss=2.3025721906269268
Iteration 5: loss=2.3025687467767293
Iteration 6: loss=2.3025651240496594
Iteration 7: loss=2.3025612786530134
Iteration 8: loss=2.302557176646829
Iteration 9: loss=2.3025527854932877
Iteration 10: loss=2.3025480291738605
Iteration 11: loss=2.3025428421467056
Iteration 12: loss=2.302537157954361
Iteration 13: loss=2.3025309004751886
Iteration 14: loss=2.3025239853953927
Iteration 15: loss=2.3025163277446827
Iteration 16: loss=2.302507832384738
Iteration 17: loss=2.302498360992307
Iteration 18: loss=2.3024877817475535
Iteration 19: loss=2.3024759172859692
Iteration 20: loss=2.3024625643592636
Iteration 21: loss=2.3024474539352866
Iteration 22: loss=2.302430328591459
Iteration 23: loss=2.3024108275054878
Iteration 24: loss=2.3023885425380937
Iteration 25: loss=2.3023629628858897
Iteration 26: loss=2.3023334339547197
Iteration 27: loss=2.302299

In [54]:
# Class probabilities for the training batch, shape (batch_size, n_labels).
model.predict(X_train)

array([[0.54589656, 0.44249833, 0.00145189, ..., 0.00143438, 0.00144069,
        0.00146423],
       [0.53285652, 0.44339755, 0.00297046, ..., 0.00293908, 0.00295051,
        0.00299249],
       [0.53848808, 0.44352941, 0.0022496 , ..., 0.00222453, 0.00223358,
        0.00226723],
       ...,
       [0.53349522, 0.443448  , 0.00288425, ..., 0.00285364, 0.00286474,
        0.00290583],
       [0.53970658, 0.44345963, 0.00210591, ..., 0.00208212, 0.00209078,
        0.00212262],
       [0.54325247, 0.44303766, 0.00171515, ..., 0.00169506, 0.00170236,
        0.0017293 ]])

In [55]:
# Empirical class frequencies of the labels, for comparison with the predicted probabilities.
unique, counts = np.unique(y_train, return_counts=True)
counts/counts.sum()

array([0.49609375, 0.50390625])

# Test on a real dataset (MNIST, digits 0–2)

In [102]:
# Load MNIST dataset from torchvision
import torchvision
from torchvision import transforms

# Training split; download=True is passed so the data can be fetched into root.
train_dataset = torchvision.datasets.MNIST(root='../../data',
                                           train = True,
                                           transform=transforms.ToTensor(),
                                           download=True)
# Test split from the same root; no download argument is passed here.
test_dataset = torchvision.datasets.MNIST(root='../../data',
                                          train = False,
                                          transform=transforms.ToTensor())


In [103]:
# Flatten each image into one row: (N, 28, 28) -> (N, 784), per the printed shapes.
# NOTE(review): reading `.data` directly presumably bypasses the ToTensor
# transform, leaving pixels as raw integer intensities — confirm.
x_train = train_dataset.data.numpy().reshape(train_dataset.data.shape[0],-1)
# Rebinds y_train, replacing the synthetic labels defined earlier in the notebook.
y_train = train_dataset.targets.numpy()

x_test = test_dataset.data.numpy().reshape(test_dataset.data.shape[0],-1)
y_test = test_dataset.targets.numpy()
print(x_train.shape, x_test.shape)

(60000, 784) (10000, 784)


In [104]:
# Keep only digits 0, 1 and 2, and cap the training set at the first 10000 such samples.
# The mask is computed from y_train before either array is overwritten.
idx = np.isin(y_train, [0,1,2])
x_train = x_train[idx][:10000]
y_train = y_train[idx][:10000]

# Same class filter for the test set, without a size cap.
idx = np.isin(y_test, [0,1,2])
x_test = x_test[idx]
y_test = y_test[idx]

In [112]:
# Fresh network for flattened MNIST images (784 -> 64 -> 16 -> 10).
# The output layer keeps 10 units although only labels 0-2 are present in the data.
n_labels = 10
layers = ['relu', 'relu', 'softmax']
weights = [
    np.random.uniform(-0.01, 0.01, shape)
    for shape in ((784, 64), (64, 16), (16, 10))
]

model = DNN(n_labels=n_labels, layers=layers, weights=weights)

In [113]:
# Class balance of the filtered MNIST training labels.
unique, counts = np.unique(y_train, return_counts=True)
counts/counts.sum()

array([0.3162, 0.3677, 0.3161])

In [114]:
# Full-batch training on the MNIST subset.
# NOTE(review): lr is presumably this small because gradients are summed over
# the whole batch and the inputs are unscaled pixel values — confirm.
model = train_model(model, x_train, y_train, lr=1e-6, n_iter=50)

Iteration 1: loss=2.3081007216388674
Iteration 2: loss=2.304129284785334
Iteration 3: loss=2.30225942529753
Iteration 4: loss=2.3006393262618205
Iteration 5: loss=2.2985152648523117
Iteration 6: loss=2.2951000184156043
Iteration 7: loss=2.289144780193859
Iteration 8: loss=2.278128286986896
Iteration 9: loss=2.2561096159487106
Iteration 10: loss=2.2075770729605986
Iteration 11: loss=2.0863484133715007
Iteration 12: loss=1.7616432684962071
Iteration 13: loss=1.2766035064456356
Iteration 14: loss=1.0882893412952794
Iteration 15: loss=1.642467203049152
Iteration 16: loss=3.2795592205798
Iteration 17: loss=2.1605288901326456
Iteration 18: loss=1.8394422767011216
Iteration 19: loss=1.4205901399187493
Iteration 20: loss=1.32398129036869
Iteration 21: loss=1.269218126053006
Iteration 22: loss=1.270931587123667
Iteration 23: loss=1.3558513595382726
Iteration 24: loss=1.5775408947311302
Iteration 25: loss=1.2980087176647788
Iteration 26: loss=1.111015955779886
Iteration 27: loss=0.68277169322981

In [115]:
# Test accuracy: fraction of samples whose highest-probability class equals the label.
(model.predict(x_test).argmax(axis=1) == y_test).mean()

0.9825230378137909

In [116]:
# Predicted class index for each test sample.
model.predict(x_test).argmax(axis=1)

array([2, 1, 0, ..., 0, 1, 2])