In [7]:
import torch
from torch import nn
from d2l import torch as d2l

In [20]:
class AlexNet(nn.Module):
    """AlexNet-style convolutional classifier for single-channel images.

    The feature extractor is five convolutions interleaved with ReLU and
    max-pooling; the classifier head is three fully connected layers with
    dropout between them. The network returns raw, unnormalised class scores
    (logits).

    Args:
        num_classes: Number of output classes, i.e. the width of the final
            linear layer. Defaults to 10.

    Note:
        The first linear layer is fixed at 6400 input features
        (256 channels x 5 x 5), which is what the feature extractor produces
        for 224x224 inputs. Other input resolutions need a different value.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.net = nn.Sequential(
            # Feature extractor. Only the first convolution states its input
            # channels (1); the lazy convolutions infer theirs on first use.
            nn.Conv2d(1, 96, kernel_size=11, stride=4),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.LazyConv2d(256, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.LazyConv2d(384, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.LazyConv2d(384, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.LazyConv2d(256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Flatten(),
            # Classifier head.
            nn.Linear(6400, 4096), nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(4096, 4096), nn.ReLU(),
            nn.Dropout(p=0.5),
            # Honour `num_classes` instead of a hard-coded 10.
            nn.Linear(4096, num_classes))
        # No nn.Softmax at the end on purpose: nn.CrossEntropyLoss applies
        # log-softmax itself and expects raw logits. Feeding it probabilities
        # squashes the scores into [0, 1] before a second softmax, which
        # flattens the loss surface and makes the gradients tiny.

    def forward(self, X):
        """Return the class logits for a batch of images ``X``."""
        return self.net(X)

    def layer_summary(self, X_shape):
        """Print the output shape of every layer for a random input.

        Args:
            X_shape: Shape of the dummy input tensor, e.g. ``(1, 1, 224, 224)``.
        """
        X = torch.randn(*X_shape)
        # Shape inspection only, so skip building the autograd graph.
        with torch.no_grad():
            for layer in self.net:
                X = layer(X)
                print(layer.__class__.__name__, 'output shape:\t', X.shape)

In [22]:
# Dry-run a random 1x1x224x224 input through the network and print every
# layer's output shape. The forward pass also lets the LazyConv2d layers infer
# their input channels, so the model's parameters are materialised afterwards.
# Uses the class's own helper instead of repeating its loop and overwriting a
# global `X` on every iteration.
myAlexNet = AlexNet()
myAlexNet.layer_summary((1, 1, 224, 224))

Conv2d output shape:	 torch.Size([1, 96, 54, 54])
ReLU output shape:	 torch.Size([1, 96, 54, 54])
MaxPool2d output shape:	 torch.Size([1, 96, 26, 26])
Conv2d output shape:	 torch.Size([1, 256, 26, 26])
ReLU output shape:	 torch.Size([1, 256, 26, 26])
MaxPool2d output shape:	 torch.Size([1, 256, 12, 12])
Conv2d output shape:	 torch.Size([1, 384, 12, 12])
ReLU output shape:	 torch.Size([1, 384, 12, 12])
Conv2d output shape:	 torch.Size([1, 384, 12, 12])
ReLU output shape:	 torch.Size([1, 384, 12, 12])
Conv2d output shape:	 torch.Size([1, 256, 12, 12])
ReLU output shape:	 torch.Size([1, 256, 12, 12])
MaxPool2d output shape:	 torch.Size([1, 256, 5, 5])
Flatten output shape:	 torch.Size([1, 6400])
Linear output shape:	 torch.Size([1, 4096])
ReLU output shape:	 torch.Size([1, 4096])
Dropout output shape:	 torch.Size([1, 4096])
Linear output shape:	 torch.Size([1, 4096])
ReLU output shape:	 torch.Size([1, 4096])
Dropout output shape:	 torch.Size([1, 4096])
Linear output shape:	 torch.Size([1,

In [23]:
# Mini-batch iterators over Fashion-MNIST. `resize=224` asks the d2l loader
# for 224x224 images, the resolution the AlexNet head above is sized for.
batch_size = 128
train_iter, test_iter = d2l.load_data_fashion_mnist(
    batch_size=batch_size,
    resize=224,
)

In [25]:
# Training objective and optimizer for `myAlexNet`.
# NOTE(review): lr=0.01 is ten times Adam's default step size (1e-3) - confirm
# this is intentional if the loss fails to decrease.
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=myAlexNet.parameters(), lr=0.01)

In [27]:
def myTrain(model, loss, optimizer, epochs=10, data_iter=None):
    """Train ``model`` and print the mean batch loss after every epoch.

    Args:
        model: Module to train; called as ``model(X)`` on every batch.
        loss: Callable ``loss(y_hat, y_true)`` returning a scalar tensor.
            ``y_true`` is a one-hot encoding of the labels with the same
            shape, dtype and device as ``y_hat``.
        optimizer: Optimizer already constructed over ``model``'s parameters.
        epochs: Number of passes over the data. Defaults to 10.
        data_iter: Iterable of ``(X, y)`` batches, where ``y`` holds integer
            class indices. Defaults to the notebook-level ``train_iter``.

    Returns:
        None. Progress is reported on stdout, one line per epoch.
    """
    if data_iter is None:
        # Backward-compatible default: the loader defined earlier in the notebook.
        data_iter = train_iter

    # Batches are moved to wherever the model lives, so the same loop works
    # unchanged after `model.to("cuda")`.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0
        for X, y in data_iter:
            if device is not None:
                X, y = X.to(device), y.to(device)
            y_hat = model(X)

            # One-hot targets, allocated with y_hat's dtype and device.
            y_true = torch.zeros_like(y_hat)
            y_true[torch.arange(len(y_hat), device=y_hat.device), y] = 1

            ls = loss(y_hat, y_true)
            optimizer.zero_grad()
            ls.backward()
            optimizer.step()

            running_loss += ls.item()
            num_batches += 1

        # Report the mean over all batches of the epoch (the accumulated
        # loss), not the last batch's loss divided by the batch count.
        mean_loss = running_loss / num_batches if num_batches else float("nan")
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {mean_loss:.4f}")

In [28]:
# The training process is too slow on CPU; try running it on a GPU.
myTrain(model=myAlexNet, loss=loss, optimizer=optimizer)

KeyboardInterrupt: 