# (Reference) Backpropagation on the MNIST Dataset Using Fully Connected Layers

### Import Libraries

In [1]:
import torch 
import torch.nn as nn  
from torchvision.datasets import MNIST 
from torchvision import datasets, transforms
from torchvision.transforms import Compose, ToTensor, Normalize, Lambda 
from torch.utils.data import DataLoader 
from torch.optim import Adam 

### Model Architecture, Loss and Optimization

In [2]:
# Seed the global RNG before any layer is constructed so the initial weights
# are identical on every fresh run of the notebook.
SEED = 0
torch.manual_seed(SEED)

# Hyperparameters a reader is likely to tune, gathered in one place.
INPUT_DIM = 28 * 28      # size of one image after nn.Flatten()
HIDDEN_DIM = 2000        # width of every hidden layer
NUM_CLASSES = 10         # number of output logits
LEARNING_RATE = 0.001    # Adam step size

# Define the model architecture: flatten the image, pass it through four
# ReLU-activated hidden layers of equal width, and finish with a bare Linear
# layer that emits one raw logit per class.
model = nn.Sequential(
    nn.Flatten(),
    nn.Linear(INPUT_DIM, HIDDEN_DIM),
    nn.ReLU(),
    nn.Linear(HIDDEN_DIM, HIDDEN_DIM),
    nn.ReLU(),
    nn.Linear(HIDDEN_DIM, HIDDEN_DIM),
    nn.ReLU(),
    nn.Linear(HIDDEN_DIM, HIDDEN_DIM),
    nn.ReLU(),
    nn.Linear(HIDDEN_DIM, NUM_CLASSES)
)

# Loss and optimizer
# CrossEntropyLoss consumes raw logits, which is why the network has no
# softmax after its last layer.
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

### Device Setup

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

# Move the model to the chosen device. The call is left as the cell's last
# expression so the notebook echoes the layer summary below.
# NOTE(review): the Adam optimizer was built before this move; this relies on
# Module.to() updating the existing parameters in place — confirm for the
# installed PyTorch version.
model.to(device)

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=2000, bias=True)
  (2): ReLU()
  (3): Linear(in_features=2000, out_features=2000, bias=True)
  (4): ReLU()
  (5): Linear(in_features=2000, out_features=2000, bias=True)
  (6): ReLU()
  (7): Linear(in_features=2000, out_features=2000, bias=True)
  (8): ReLU()
  (9): Linear(in_features=2000, out_features=10, bias=True)
)

### Load Data

In [4]:
# Data loading + transform
DATA_ROOT = "./data/"       # where the MNIST files are downloaded / cached
TRAIN_BATCH_SIZE = 50000    # very large batches: only a few optimizer steps per epoch
TEST_BATCH_SIZE = 10000

# Convert each image to a tensor, then standardise it with a fixed mean/std
# (0.1307 / 0.3081 are the values commonly quoted for MNIST).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Build each split once; download=True fetches the files only if they are
# not already present under DATA_ROOT.
train_dataset = MNIST(DATA_ROOT, train=True, download=True, transform=transform)
test_dataset = MNIST(DATA_ROOT, train=False, download=True, transform=transform)

# Shuffle the training split so each epoch draws a different partition into
# batches; without it the same samples always land in the same (unequally
# sized) batches and are visited in the same order. The test split is only
# scored, so its order does not matter.
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE)

### Train and Test

In [5]:
# Training loop
NUM_EPOCHS = 15


def _count_correct(logits, labels):
    """Return how many rows of `logits` have their largest entry at the index given by `labels`."""
    # argmax yields integer class indices, so no autograd bookkeeping is
    # involved and the legacy `.data` attribute is not needed.
    return (logits.argmax(dim=1) == labels).sum().item()


def train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over `loader`.

    Returns:
        (mean_loss, accuracy_percent) over every sample seen in this pass.
        Both are accumulated from the forward pass made *before* each
        optimizer step, so they describe the weights as they were at the
        start of each batch rather than at the end of the epoch.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_size = labels.size(0)
        # criterion returns the batch mean; weight it by the batch size so
        # unequal batches contribute proportionally to the epoch average.
        loss_sum += loss.item() * batch_size
        n_correct += _count_correct(outputs, labels)
        n_seen += batch_size

    return loss_sum / n_seen, 100 * n_correct / n_seen


def evaluate(model, loader, device):
    """Return the accuracy (in percent) of `model` on `loader`, without tracking gradients."""
    model.eval()
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            n_correct += _count_correct(model(inputs), labels)
            n_seen += labels.size(0)
    return 100 * n_correct / n_seen


for epoch in range(NUM_EPOCHS):
    epoch_loss, epoch_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
    print(f'Epoch {epoch+1}, Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_acc:.2f}%')

    # Score the held-out split after every epoch to track generalisation.
    test_acc = evaluate(model, test_loader, device)
    print(f'Test Acc: {test_acc:.2f}%\n')

Epoch 1, Train Loss: 2.2661, Train Acc: 15.07%
Test Acc: 58.33%

Epoch 2, Train Loss: 2.0292, Train Acc: 51.27%
Test Acc: 28.86%

Epoch 3, Train Loss: 2.2187, Train Acc: 32.84%
Test Acc: 51.82%

Epoch 4, Train Loss: 1.4772, Train Acc: 49.63%
Test Acc: 52.42%

Epoch 5, Train Loss: 1.3186, Train Acc: 54.69%
Test Acc: 70.87%

Epoch 6, Train Loss: 0.9296, Train Acc: 71.08%
Test Acc: 82.21%

Epoch 7, Train Loss: 0.6182, Train Acc: 81.56%
Test Acc: 83.35%

Epoch 8, Train Loss: 0.5702, Train Acc: 83.40%
Test Acc: 86.98%

Epoch 9, Train Loss: 0.4851, Train Acc: 86.80%
Test Acc: 89.08%

Epoch 10, Train Loss: 0.4100, Train Acc: 88.79%
Test Acc: 90.93%

Epoch 11, Train Loss: 0.3247, Train Acc: 90.78%
Test Acc: 91.22%

Epoch 12, Train Loss: 0.2993, Train Acc: 91.35%
Test Acc: 92.15%

Epoch 13, Train Loss: 0.2607, Train Acc: 92.43%
Test Acc: 92.81%

Epoch 14, Train Loss: 0.2332, Train Acc: 93.12%
Test Acc: 93.61%

Epoch 15, Train Loss: 0.2131, Train Acc: 93.73%
Test Acc: 94.36%

