# HWK7 PROBLEM 7

### Your goal is to train a convnet with multiple layers on Fashion-MNIST and to obtain the lowest possible error rate on the test set. Try various hyperparameters (number of layers, hidden sizes, etc.). Good luck!

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from random import randint
import utils
import time

In [2]:
# Load the Fashion-MNIST tensors from .pt files located relative to this notebook.
# NOTE(review): torch.load unpickles the file contents -- only use trusted files.
train_data, train_label = (
    torch.load('../../data/fashion-mnist/train_data.pt'),
    torch.load('../../data/fashion-mnist/train_label.pt'),
)
test_data, test_label = (
    torch.load('../../data/fashion-mnist/test_data.pt'),
    torch.load('../../data/fashion-mnist/test_label.pt'),
)

# Show the shape of the training and test image tensors, one per line.
print(train_data.shape, test_data.shape, sep='\n')

torch.Size([60000, 28, 28])
torch.Size([10000, 28, 28])


In [3]:
class convnet(nn.Module):
    """Three-block convolutional classifier for 1x28x28 images.

    Blocks 1 and 2 each apply two 3x3 convolutions (padding 1, so the spatial
    size is preserved) followed by a 2x2 max-pool that halves height and width.
    Block 3 applies one 3x3 convolution followed by a 7x7 max-pool that
    collapses the remaining 7x7 map to 1x1.  Two linear layers then map the
    896 pooled features to 10 class scores.

    Input:  tensor of shape (batch, 1, 28, 28).
    Output: tensor of shape (batch, 10) holding raw (un-normalised) scores.
    """

    def __init__(self):
        super().__init__()
        # block 1 - 1x28x28 to 56x14x14
        self.conv1a = nn.Conv2d(1, 56, kernel_size = 3, padding = 1)
        self.conv1b = nn.Conv2d(56, 56, kernel_size = 3, padding = 1)
        self.pool1 = nn.MaxPool2d(2,2)

        # block 2 - 56x14x14 to 224x7x7
        self.conv2a = nn.Conv2d(56, 224, kernel_size = 3, padding = 1)
        self.conv2b = nn.Conv2d(224, 224, kernel_size = 3, padding = 1)
        self.pool2 = nn.MaxPool2d(2,2)

        # block 3 - 224x7x7 to 896x1x1
        self.conv3a = nn.Conv2d(224, 896, kernel_size = 3, padding = 1)
        self.pool3 = nn.MaxPool2d(7,7)

        # linear layers - 896 to 2000 to 10
        self.linear1 = nn.Linear(896, 2000)
        self.linear2 = nn.Linear(2000, 10)

    def forward(self, x):
        """Return class scores of shape (batch, 10) for x of shape (batch, 1, 28, 28).

        Raises RuntimeError (from the first linear layer) when the spatial size
        of x does not reduce to 1x1 after the three pooling stages.
        """
        # block 1
        x = F.relu(self.conv1a(x))
        x = F.relu(self.conv1b(x))
        x = self.pool1(x)

        # block 2
        x = F.relu(self.conv2a(x))
        x = F.relu(self.conv2b(x))
        x = self.pool2(x)

        # block 3
        x = F.relu(self.conv3a(x))
        x = self.pool3(x)

        # linear layers
        # Flatten per sample rather than view(-1, 896): with a wrong input size
        # view(-1, 896) would silently fold extra spatial positions into the
        # batch dimension; flattening keeps the batch size fixed so linear1
        # reports the shape mismatch instead.
        x = x.flatten(start_dim=1)
        x = F.relu(self.linear1(x))
        scores = self.linear2(x)

        return scores

In [4]:
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs (more slowly) on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = convnet()

utils.display_num_param(net)
net = net.to(device)

# Scalar mean / standard deviation over the whole training tensor; they are
# moved to `device` so mini-batches can be normalised there.
mean = train_data.mean().to(device)
std = train_data.std().to(device)

There are 4215010 (4.22 million) parameters in this neural network


In [5]:
# Loss applied to the raw scores produced by the network.
criterion = nn.CrossEntropyLoss()

# Initial SGD learning rate and mini-batch size.
my_lr, bs = 0.28, 100

In [6]:
def eval_on_test_set():
    """Print the error of `net` over the whole of `test_data`.

    Reads the notebook globals `net`, `test_data`, `test_label`, `bs`,
    `device`, `mean` and `std`.  The test set is processed in mini-batches of
    at most `bs` samples with gradients disabled, each batch normalised with
    `mean` / `std`.  Returns nothing; the result is printed.
    """
    num_samples = test_data.size(0)
    running_error = 0.0

    # Evaluate in eval mode and restore whatever mode the network was in, so
    # calling this in the middle of training does not change training behaviour.
    was_training = net.training
    net.eval()

    try:
        with torch.no_grad():

            for i in range(0, num_samples, bs):

                minibatch_data = test_data[i:i+bs].to(device)
                minibatch_label = test_label[i:i+bs].to(device)

                # The last batch may be shorter than bs when bs does not
                # divide the number of test samples.
                batch_size = minibatch_data.size(0)

                inputs = (minibatch_data - mean)/std

                # Insert the channel dimension: (batch, 28, 28) -> (batch, 1, 28, 28).
                scores = net(inputs.unsqueeze(1))

                # NOTE(review): assumes utils.get_error returns the batch's
                # error *rate* (as the percent print below implies) -- confirm.
                error = utils.get_error(scores , minibatch_label)

                # Weight by batch size so a short final batch is not over-counted.
                running_error += error.item() * batch_size
    finally:
        net.train(was_training)

    total_error = running_error/num_samples
    print( 'test error  = ', total_error*100,'percent')

In [7]:
start = time.time()

# Number of training samples, taken from the data rather than hard-coded.
num_train = train_data.size(0)

# Build the optimizer once; the learning rate is changed in place through
# optimizer.param_groups when it decays, instead of re-creating the optimizer
# at every epoch.
optimizer = torch.optim.SGD(net.parameters(), lr=my_lr)

for epoch in range(20):

    # Step decay: multiply the learning rate by 0.65 at epochs 7, 10 and 12.
    if epoch in (7, 10, 12):
        my_lr = my_lr * 0.65
        for param_group in optimizer.param_groups:
            param_group['lr'] = my_lr

    running_loss = 0
    running_error = 0
    num_batches = 0

    # Visit the training samples in a fresh random order each epoch.
    shuffled_indices = torch.randperm(num_train)

    for count in range(0, num_train, bs):

        optimizer.zero_grad()

        indices = shuffled_indices[count:count+bs]
        minibatch_data = train_data[indices].to(device)
        minibatch_label = train_label[indices].to(device)

        # Normalise with the training-set statistics.  Gradients with respect
        # to the inputs are never used, so they are not requested.
        inputs = (minibatch_data - mean)/std

        # Insert the channel dimension: (batch, 28, 28) -> (batch, 1, 28, 28).
        # Unlike view(bs, 1, 28, 28) this also works for a short final batch.
        scores = net(inputs.unsqueeze(1))

        loss = criterion(scores, minibatch_label)

        loss.backward()

        optimizer.step()

        # compute some stats (detached: no graph is needed for bookkeeping)
        num_batches += 1
        running_loss += loss.item()

        error = utils.get_error(scores.detach(), minibatch_label)
        running_error += error.item()

    # once the epoch is finished we divide the "running quantities"
    # by the number of batches (every batch counts equally)
    total_loss = running_loss/num_batches
    total_error = running_error/num_batches
    elapsed_time = time.time() - start

    # after every epoch we display the stats
    # and compute the error rate on the test set
    print(' ')

    print('epoch=',epoch, '\t time=', elapsed_time,
          '\t loss=', total_loss , '\t error=', total_error*100 ,'percent')

    eval_on_test_set()

 
epoch= 0 	 time= 21.10438346862793 	 loss= 1.050669418896238 	 error= 39.070001532634095 percent
test error  =  17.210001826286316 percent
 
epoch= 1 	 time= 43.23625946044922 	 loss= 0.3580342998852332 	 error= 13.446668714284895 percent
test error  =  14.27000218629837 percent
 
epoch= 2 	 time= 65.55840349197388 	 loss= 0.27619029181698956 	 error= 10.450002074241638 percent
test error  =  10.45000171661377 percent
 
epoch= 3 	 time= 87.81878185272217 	 loss= 0.23054040390998126 	 error= 8.731668422619501 percent
test error  =  8.840002059936523 percent
 
epoch= 4 	 time= 110.02020215988159 	 loss= 0.19973710803935926 	 error= 7.493335028489431 percent
test error  =  9.080002129077911 percent
 
epoch= 5 	 time= 132.31347465515137 	 loss= 0.17296793062239885 	 error= 6.5500016907850895 percent
test error  =  9.160001814365387 percent
 
epoch= 6 	 time= 154.6114354133606 	 loss= 0.15360305507977803 	 error= 5.750001887480418 percent
test error  =  7.65000206232071 percent
 
epoch= 7