### Implementation of AlexNet

- Architecture: 8 layers, (5 conv then 3 fully connected)
- ReLU activations (found to be superior to tanh)
- local response normalization:
    - normalize against activation of same pixel region from adjacent kernels in kernel list. A bit hacky?
    - Superseded by batch norm, which is used more commonly now
- overlapping pooling: slight performance advantages and less overfitting with overlapping pooling
    - traditional pooling: stride = pooling window side length
    - overlapping pooling: stride < pooling window side length
    - "dilated pooling"? (not a given term, just my term), stride > pooling window side length
- 0.5 dropout for regularization

Training procedure
- batch size 128
- momentum of 0.9
- L2 reg with lambda of 0.0005
- weight init with zero-mean gaussian with std 0.01, all biases init to 1

#### Model Training

In [1]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchvision

if torch.xpu.is_available():
    device = torch.device('xpu')
else:
    device = torch.device('cpu')

In [2]:
def unpickle(file):
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

cifar_data_1 = unpickle('cifar-10-python/data_batch_1')
cifar_data_2 = unpickle('cifar-10-python/data_batch_2')
cifar_data_3 = unpickle('cifar-10-python/data_batch_3')
cifar_data_4 = unpickle('cifar-10-python/data_batch_4')
cifar_data_5 = unpickle('cifar-10-python/data_batch_5')

Xtr = np.concatenate([cifar_data_1[b'data'], cifar_data_2[b'data'], cifar_data_3[b'data'], cifar_data_4[b'data']])
ytr = np.concatenate([cifar_data_1[b'labels'], cifar_data_2[b'labels'], cifar_data_3[b'labels'], cifar_data_4[b'labels']])
Xtst = cifar_data_5[b'data']
ytst = cifar_data_5[b'labels']

Xtr = torch.tensor(Xtr.reshape(-1, 3, 32, 32), dtype=torch.float32) / 255.0 #normalize between 0 and 1
Xtst = torch.tensor(Xtst.reshape(-1, 3, 32, 32), dtype=torch.float32) / 255.0

In [4]:
Xtr.mean(dim=(0, 2, 3))

tensor([0.4915, 0.4821, 0.4462])

In [None]:
img = np.transpose(np.reshape(Xtr[22],(3, 32,32)), (1,2,0))
print(ytr[4])

plt.imshow(img)
plt.show()

In [7]:
#create pytorch dataset and dataloader objects
class Cifar10Dataset(Dataset):
    def __init__(self, input_tensor, labels, transform=None):
        if input_tensor.shape[0] != len(labels):
            raise ValueError("Input tensor and labels must have the same number of samples.")
        self.input_tensor = input_tensor
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return self.input_tensor.shape[0]

    def __getitem__(self, idx):
        sample_input = self.input_tensor[idx]
        sample_label = self.labels[idx]

        if self.transform:
            sample_input = self.transform(sample_input)

        return sample_input, sample_label

train_mean = Xtr.mean(dim=(0, 2, 3))
train_std = Xtr.mean(dim=(0, 2, 3))
tr_transform = torchvision.transforms.Compose([
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.RandomRotation(10),
    torchvision.transforms.Normalize(mean=train_mean, std=train_std),
])
tst_transform = torchvision.transforms.Compose([
    torchvision.transforms.Normalize(mean=train_mean, std=train_std),
])
cifar10_tr = Cifar10Dataset(Xtr, ytr, transform=tr_transform)
cifar10_tst = Cifar10Dataset(Xtst, ytst, transform=tst_transform)

# Create DataLoaders for training and validation
tr_dataloader = DataLoader(cifar10_tr, batch_size=32, shuffle=True, num_workers=0)
tst_dataloader = DataLoader(cifar10_tst, batch_size=32, shuffle=False, num_workers=0)

In [11]:
#implementing alexnet, with a few modifications

class AlexNet(nn.Module):
    def __init__(self, p=0.5):
        super(AlexNet, self).__init__()
        n_kernels = 32
        self.l1 = nn.Sequential(
            nn.Conv2d(3, n_kernels, kernel_size=3, padding=1),
            nn.BatchNorm2d(n_kernels),
            nn.Dropout(p),
            nn.ReLU(),
        )
        self.l2 = nn.Sequential(
            nn.Conv2d(n_kernels, n_kernels*2, kernel_size=3, padding=1),
            nn.BatchNorm2d(n_kernels*2),
            nn.Dropout(p),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        n_kernels *= 2
        self.l3 = nn.Sequential(
            nn.Conv2d(n_kernels, n_kernels*2, kernel_size=3, padding=1),
            nn.BatchNorm2d(n_kernels*2),
            nn.Dropout(p),
            nn.ReLU(),
        )
        n_kernels *= 2
        self.l4 = nn.Sequential(
            nn.Conv2d(n_kernels, n_kernels*2, kernel_size=3, padding=0),
            nn.BatchNorm2d(n_kernels*2),
            nn.Dropout(p),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        n_kernels *= 2
        self.l5 = nn.Sequential(
            nn.Conv2d(n_kernels, n_kernels*2, kernel_size=3, padding=0),
            nn.BatchNorm2d(n_kernels*2),
            nn.Dropout(p),
            nn.ReLU(),
        )
        self.l6 = nn.Sequential(
            nn.Conv2d(n_kernels*2, n_kernels*2, kernel_size=2, padding=0),
            nn.Dropout(p),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        n_kernels *= 2
        self.l7 = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(p),
            nn.Linear(n_kernels * 2 * 2, n_kernels, bias=True),
            nn.ReLU(),
        )
        self.l8 = nn.Linear(n_kernels, 10, bias=False)

    def forward(self, X):
        out = self.l1(X)
        out = self.l2(out)
        out = self.l3(out)
        out = self.l4(out)
        out = self.l5(out)
        out = self.l6(out)
        out = self.l7(out)
        out = self.l8(out)

        return out

model = AlexNet(p=0.6)
model.to(device) #move to xpu

print('number of parameters: ', sum([p.numel() for p in model.parameters()]))

number of parameters:  3673856


In [12]:
#hyperparams
num_classes = 10
num_epochs = 5
batch_size = 64
learning_rate = 0.005
l2_lambda = 0.05

criterion = nn.CrossEntropyLoss()
optim = torch.optim.Adam(params=model.parameters(), lr=learning_rate, weight_decay=l2_lambda)

In [13]:
#training loop

train_loss = 0.0
train_correct = 0
losses = []

print('starting training')
for epoch in range(num_epochs):
    train_loss = 0.0
    train_correct = 0
    model.train()

    for i, (data, target) in enumerate(tr_dataloader):
        #move to XPU
        data = data.to(dtype=torch.float32, device='xpu')
        target = target.to(device='xpu')

        #run forward and backprop
        optim.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optim.step()
        losses.append(loss.item())
    
        train_loss += loss.item() * data.size(0)
        _, pred = torch.max(output, 1)
        train_correct += (pred == target).sum().item()
    
    train_loss /= len(tr_dataloader.dataset)
    train_acc = 100.0 * train_correct / len(tr_dataloader.dataset)
    
    test_loss = 0.0
    test_correct = 0
    model.eval()
    
    with torch.no_grad():
        for data, target in tst_dataloader:
            data = data.to(dtype=torch.float32, device='xpu')
            target = target.to(device='xpu')
            output = model(data)
            loss = criterion(output, target)
    
            test_loss += loss.item() * data.size(0)
            _, pred = torch.max(output, 1)
            test_correct += (pred == target).sum().item()
    
    test_loss /= len(tst_dataloader.dataset)
    test_acc = 100.0 * test_correct / len(tst_dataloader.dataset)

    print(f'Epoch {epoch}: \n train loss: {train_loss}, train_acc: {train_acc} \n test_loss: {test_loss}, test_acc: {test_acc}')

starting training
Epoch 0: 
 train loss: 2.129885676383972, train_acc: 17.0475 
 test_loss: 2.2406866771698, test_acc: 19.15
Epoch 1: 
 train loss: 2.04389357881546, train_acc: 17.6925 
 test_loss: 2.2220453674316407, test_acc: 15.63
Epoch 2: 
 train loss: 2.0500806181907656, train_acc: 17.515 
 test_loss: 2.2444885353088377, test_acc: 18.61
Epoch 3: 
 train loss: 2.0393212312698363, train_acc: 17.8325 
 test_loss: 2.2434445457458496, test_acc: 12.09


KeyboardInterrupt: 

In [None]:
plt.plot(np.arange(len(losses)), losses)

### Take 2: Less Pooling and Dropouts
- modeled after: https://nvsyashwanth.github.io/machinelearningmaster/cifar-10/

#### Experiment tracking
- hugely overfitting: final train acc of 25.45, test acc of 16.99. Also, still very low acc overall. Trying: increasing dropout rate, increasing l2_lambda. Also reducing lr from 0.005 to 0.003, due to non-convergence. Model seems to be too small, also, given the low acc, but how to prevent overfitting?
- still hugely overfitting, increasing l2_lambda to 0.1

In [8]:
#https://www.kaggle.com/code/shadabhussain/cifar-10-cnn-using-pytorch

class KaggleCifar10(nn.Module):
    def __init__(self):
        super().__init__()
        self.network = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2), # output: 64 x 16 x 16
    
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2), # output: 128 x 8 x 8
    
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2), # output: 256 x 4 x 4
    
            nn.Flatten(), 
            nn.Linear(256*4*4, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )
    
    def forward(self, xb):
        return self.network(xb)

knet = KaggleCifar10()
knet.to(device)
print('number of parameters: ', sum([p.numel() for p in knet.parameters()]))

number of parameters:  5851338


In [9]:
#hyperparams
num_classes = 10
num_epochs = 2
batch_size = 64
learning_rate = 0.001
l2_lambda = 0.05

criterion = nn.CrossEntropyLoss()
optim = torch.optim.Adam(params=knet.parameters(), lr=learning_rate, weight_decay=l2_lambda)

In [10]:
#training loop

train_loss = 0.0
train_correct = 0
losses = []

print('starting training')
for epoch in range(num_epochs):
    train_loss = 0.0
    train_correct = 0
    knet.train()

    for i, (data, target) in enumerate(tr_dataloader):
        #move to XPU
        data = data.to(dtype=torch.float32, device='xpu')
        target = target.to(device='xpu')

        #run forward and backprop
        optim.zero_grad()
        output = knet(data)
        loss = criterion(output, target)
        loss.backward()
        optim.step()
        losses.append(loss.item())
    
        train_loss += loss.item() * data.size(0)
        _, pred = torch.max(output, 1)
        train_correct += (pred == target).sum().item()
    
    train_loss /= len(tr_dataloader.dataset)
    train_acc = 100.0 * train_correct / len(tr_dataloader.dataset)
    
    test_loss = 0.0
    test_correct = 0
    knet.eval()
    
    with torch.no_grad():
        for data, target in tst_dataloader:
            data = data.to(dtype=torch.float32, device='xpu')
            target = target.to(device='xpu')
            output = knet(data)
            loss = criterion(output, target)
    
            test_loss += loss.item() * data.size(0)
            _, pred = torch.max(output, 1)
            test_correct += (pred == target).sum().item()
    
    test_loss /= len(tst_dataloader.dataset)
    test_acc = 100.0 * test_correct / len(tst_dataloader.dataset)

    print(f'Epoch {epoch}: \n train loss: {train_loss}, train_acc: {train_acc} \n test_loss: {test_loss}, test_acc: {test_acc}')

starting training
Epoch 0: 
 train loss: 2.3028072072982786, train_acc: 9.8775 
 test_loss: 2.3026850143432616, test_acc: 9.97
Epoch 1: 
 train loss: 2.3027570476531984, train_acc: 9.9175 
 test_loss: 2.3026883331298826, test_acc: 10.03


In [None]:
plt.plot(np.arange(len(losses)), losses)