# ****Step 1: Import Modules****
To begin, we need to import all of the necessary modules for our project. Run the code cell below to import the various pytorch modules, pretrained models, etc. that we will need.

In addition, we will also attempt to use a GPU device if we can get access, as that will significantly speed up the runtime of training our models.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import PIL

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Models to import
from torchvision.models import resnet18, ResNet18_Weights
from torchvision.models import resnet152, ResNet152_Weights
from torchvision.models import efficientnet_v2_s, EfficientNet_V2_S_Weights
from torchvision.models import densenet201, DenseNet201_Weights

# Pick the compute device. GPU index 1 is still preferred when a second GPU
# exists; otherwise fall back to the first GPU, and finally to the CPU, so the
# notebook also runs on single-GPU and CPU-only machines instead of crashing.
if torch.cuda.is_available():
    device = torch.device("cuda", 1 if torch.cuda.device_count() > 1 else 0)
    print(torch.cuda.get_device_name(device))
else:
    device = torch.device("cpu")
    print("CUDA is not available; running on the CPU.")

# ****Step 2: Get Data****
The next step is to get the data needed for training and testing. We define a function called `getData` below, which creates DataLoaders for the training and testing datasets and returns them. We split our training directory 80/20: 80% of the data will be used for training, and 20% of the data will be used for testing. 

By default, we will use a batch size of 64 and an image size of 3 x 224 x 224 for training set, although this may be changed as we fine-tune our models.

In [None]:
def getData(batch = 64, shuffle = True, transform_train = None, transform_test = None, size = 224):
    """Build the DataLoaders and label lookup tables for the birds dataset.

    The labelled ``train`` directory is split 80/20 with a fixed seed. The 80%
    part feeds the training loader and the 20% part is held out for testing.

    Args:
        batch: Batch size of the training loader.
        shuffle: Whether the training loader reshuffles its samples.
        transform_train: Transform applied to training images. Defaults to a
            resize to ``size`` x ``size`` followed by ``ToTensor``.
        transform_test: Transform applied to held-out and final images. Same
            default as ``transform_train``.
        size: Side length used by the default transforms.

    Returns:
        A dict with the keys ``'train'``, ``'test'`` and ``'final'``
        (DataLoaders), ``'to_class'`` (dataset label index -> integer folder
        name) and ``'to_name'`` (dataset label index -> entry of names.txt).
    """
    if transform_train is None:
        transform_train = transforms.Compose([
            transforms.Resize((size, size)),
            transforms.ToTensor(),
        ])

    if transform_test is None:
        transform_test = transforms.Compose([
            transforms.Resize((size, size)),
            transforms.ToTensor(),
        ])

    # The same directory is opened twice so that each split gets its own transform.
    trainset = torchvision.datasets.ImageFolder(root='/kaggle/input/birds23wi/birds/train', transform = transform_train)
    testset = torchvision.datasets.ImageFolder(root='/kaggle/input/birds23wi/birds/train', transform = transform_test)

    # Both splits use the same seed, so the two index partitions are identical:
    # the 80% part is taken from `trainset` and the 20% part from `testset`.
    trainingSet, _ = torch.utils.data.random_split(trainset, [0.8, 0.2], generator=torch.Generator().manual_seed(42))
    _, testingSet = torch.utils.data.random_split(testset, [0.8, 0.2], generator=torch.Generator().manual_seed(42))

    # Train on the 80% subset only; loading the full `trainset` here would leak
    # the held-out 20% into training and inflate the test accuracy.
    trainLoader = torch.utils.data.DataLoader(trainingSet, batch_size=batch, shuffle=shuffle, num_workers=2)
    testLoader = torch.utils.data.DataLoader(testingSet, batch_size=1, shuffle=False, num_workers=2)

    finalSet = torchvision.datasets.ImageFolder(root='/kaggle/input/birds23wi/birds/test', transform=transform_test)
    finalLoader = torch.utils.data.DataLoader(finalSet, batch_size=1, shuffle=False, num_workers=2)

    with open("/kaggle/input/birds23wi/birds/names.txt") as names_file:
        classes = names_file.read().strip().split("\n")
    class_to_idx = trainset.class_to_idx
    idx_to_class = {int(v): int(k) for k, v in class_to_idx.items()}
    idx_to_name = {k: classes[v] for k, v in idx_to_class.items()}

    return {'train': trainLoader, 'test': testLoader, 'final': finalLoader, 'to_class': idx_to_class, 'to_name': idx_to_name}

# Build the loaders with getData's defaults (batch size 64, 224 x 224 images);
# later cells read them from this module-level dict.
data = getData()

# ****Step 3: Visualize Data****
We can visualize a subset of our images by calling the iterator on the training set's DataLoader. We can also see the size of the images, which should be 3 x 224 x 224 at this point. 

In [None]:
# Pull one batch from the training loader and keep only its first 8 images.
dataiter = iter(data['train'])
images, labels = next(dataiter)
images = images[:8]
print(images.size())


def imshow(img):
    """Display an image tensor with matplotlib."""
    # Move the first axis to the end before handing the array to matplotlib.
    channels_last = img.numpy().transpose((1, 2, 0))
    plt.imshow(channels_last)
    plt.show()


# Tile the selected images into a single grid image and display it.
imshow(torchvision.utils.make_grid(images))

# ****Step 4: Training and Prediction****
The following `train` function defines a training procedure for the neural network. The training function takes in an optimizer, which determines the decay, momentum, and learning rate of the model. Note that the default optimizer is stochastic gradient descent with parameters lr=0.01, momentum=0.9, decay=0.0005. It also takes in the network to train and the DataLoader with the prepared data. This function will also save checkpoints to the file path given by checkpoint_path, so that the model can be reloaded after training easily. 

In [None]:
def train(net, dataloader, epochs=1, start_epoch=0, optimizer = None,
          verbose=1, print_every=10, state=None, schedule=None, checkpoint_path=None):
    """Train ``net`` on ``dataloader`` with cross-entropy loss.

    Args:
        net: Model to train; it is moved to the module-level ``device``.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.
        epochs: Epoch index at which training stops (exclusive).
        start_epoch: First epoch to run; replaced by ``state['epoch']`` when
            ``state`` is given.
        optimizer: Optimizer to use. Defaults to SGD with lr=0.01,
            momentum=0.9 and weight_decay=0.0005.
        verbose: When truthy, print the running loss.
        print_every: Number of mini-batches between two loss printouts.
        state: Optional checkpoint dict with the keys ``'net'``,
            ``'optimizer'``, ``'epoch'`` and ``'losses'`` to resume from.
        schedule: Optional dict mapping an epoch index to the learning rate
            that is set at the start of that epoch.
        checkpoint_path: Directory that receives ``checkpoint-<epoch>.pkl``
            after every epoch; nothing is saved when it is falsy.

    Returns:
        The list of per-mini-batch losses, including any restored from ``state``.
    """
    # None is used as the default instead of a shared mutable dict.
    if schedule is None:
        schedule = {}

    net.to(device)
    net.train()
    losses = []
    criterion = nn.CrossEntropyLoss()
    # We will use SGD for the default optimizer
    if optimizer is None:
        optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=0.0005)

    # Load previous training state
    if state:
        net.load_state_dict(state['net'])
        optimizer.load_state_dict(state['optimizer'])
        start_epoch = state['epoch']
        losses = state['losses']

    def apply_scheduled_lr(epoch):
        """Set the learning rate of every parameter group if ``epoch`` is scheduled."""
        if epoch in schedule:
            print ("Learning rate: %f"% schedule[epoch])
            for g in optimizer.param_groups:
                g['lr'] = schedule[epoch]

    # Fast forward lr schedule through already trained epochs
    for epoch in range(start_epoch):
        apply_scheduled_lr(epoch)

    for epoch in range(start_epoch, epochs):
        sum_loss = 0.0

        # Update learning rate when scheduled
        apply_scheduled_lr(epoch)

        for i, batch in enumerate(dataloader, 0):
            inputs, labels = batch[0].to(device), batch[1].to(device)

            optimizer.zero_grad()

            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()  # computes the gradients of the loss
            optimizer.step() # takes a step in gradient direction

            # Read the scalar once and reuse it for both bookkeeping values.
            loss_value = loss.item()
            losses.append(loss_value)
            sum_loss += loss_value

            if i % print_every == print_every-1:    # report every `print_every` mini-batches
                if verbose:
                    print('[%d, %5d] loss: %.3f' % (epoch, i + 1, sum_loss / print_every))
                sum_loss = 0.0
        if checkpoint_path:
            checkpoint = {'epoch': epoch+1, 'net': net.state_dict(), 'optimizer': optimizer.state_dict(), 'losses': losses}
            # os.path.join also works when checkpoint_path has no trailing slash.
            torch.save(checkpoint, os.path.join(checkpoint_path, 'checkpoint-%d.pkl' % (epoch+1)))
    return losses

Next, we define some auxiliary functions to help analyze our models. 

The ```predict``` function will be used to create a csv file
with the predictions of the given network on the given dataset. This will be used to create the final ```submissions.csv``` file, which will
hold the predictions for the testing set.

The ```accuracy``` function will return the accuracy of the given network on the given dataloader. It will return a float
of the proportion of correct predictions.

The ```smooth``` function will simply smooth out the given list of losses using convolution. It is a utility function to help
visualize graphs more smoothly.

In [None]:
def predict(net, dataloader, ofname):
    """Write the network's predicted class for every sample to a CSV file.

    The file starts with the header ``path,class`` and then holds one
    ``test/<file name>,<class>`` row per sample. Predicted label indices are
    translated with the module-level ``data['to_class']`` table.

    Args:
        net: Model used for prediction; it is moved to the module-level
            ``device`` and switched to eval mode.
        dataloader: Loader whose dataset exposes ``samples`` as a list of
            ``(path, label)`` pairs. Rows are matched to ``samples`` by
            position, so the loader must not shuffle.
        ofname: Path of the CSV file to write.
    """
    net.to(device)
    net.eval()
    samples = dataloader.dataset.samples
    to_class = data['to_class']
    sample_index = 0  # position in `samples` of the next prediction
    # The context manager closes the file even if inference raises.
    with open(ofname, 'w') as out, torch.no_grad():
        out.write("path,class\n")
        for i, (images, _) in enumerate(dataloader, 0):
            if i%100 == 0:
                print(i)
            outputs = net(images.to(device))
            _, predicted = torch.max(outputs, 1)
            # Walk through the whole batch so any batch size works, not only 1.
            for class_index in predicted.tolist():
                fname, _ = samples[sample_index]
                out.write("test/{},{}\n".format(os.path.basename(fname), to_class[class_index]))
                sample_index += 1

def accuracy(net, dataloader):
    """Return the fraction of samples in ``dataloader`` that ``net`` classifies correctly.

    The network is moved to the module-level ``device``, evaluated in eval
    mode without gradient tracking, and switched back to train mode before
    returning.
    """
    net.to(device)
    net.eval()
    hits = 0
    seen = 0
    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(device)
            labels = labels.to(device)
            # Index of the largest score along dimension 1 is the predicted label.
            predicted = torch.max(net(images).data, 1)[1]
            seen += labels.size(0)
            hits += (predicted == labels).sum().item()
    net.train()
    return hits / seen

def smooth(x, size):
    """Return the moving average of ``x`` over windows of ``size`` values.

    Only windows that fit entirely inside ``x`` are kept (``mode='valid'``).
    """
    # Uniform averaging kernel: `size` weights that sum to one.
    window = np.ones(size)
    window /= size
    return np.convolve(x, window, mode='valid')

# ****Step 5: Determine Architecture****

<font size="4"><b>Baseline: ResNet18</b></font>

Finally, after all the setup, we can begin training our models. The first step we will take is to determine the best architecture for our model. First, let's have a baseline model that we can compare our other models against.

We can use ResNet18 as our baseline, which is a convolutional network with residual connections. We will use the weights that have already been pretrained on the ImageNet dataset, so that we just need to finetune on our birds dataset.

To save GPU compute, we will train for just 3 epochs. 

The following code will train the model and output the losses, as well as graph the losses and output the predictions on the testing dataset.

In [None]:
# Save checkpoints to directory
checkpoint_path = '/kaggle/working/resnet18_1/'
os.makedirs(checkpoint_path, exist_ok=True)

# Train ResNet18 model.
# The factory is called through torchvision.models because this cell rebinds
# the name `resnet18` to the model instance; calling the bare name would fail
# if the cell were run a second time.
resnet18 = torchvision.models.resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
# Replace the pretrained head with a fresh 555-output linear layer.
resnet18.fc = nn.Linear(resnet18.fc.in_features, 555)
losses = train(resnet18, data['train'], epochs=3, print_every=10, checkpoint_path=checkpoint_path)

# Plot the smoothed loss curve
plt.plot(smooth(losses,50))

# Print the accuracy of the model on the testing set
print(accuracy(resnet18, data['test']))

On our first run through, we got a loss of 1.208 after 3 epochs!

In addition, our testing accuracy is 0.50900. That's a pretty good score, but we could probably do better with a different architecture. For this project, we will test three different neural network architectures pretrained on ImageNet: DenseNet, ResNet152, and EfficientNet.

<font size="4"><b>Testing: ResNet152</b></font>

Next, we will try the ResNet152 architecture, which is a residual CNN with 152 layers! 

In [None]:
# Save checkpoints to directory
checkpoint_path = '/kaggle/working/resnet152_1/'
os.makedirs(checkpoint_path, exist_ok=True)

# Train ResNet152 model.
# The factory is called through torchvision.models because this cell rebinds
# the name `resnet152` to the model instance; calling the bare name would fail
# if the cell were run a second time.
resnet152 = torchvision.models.resnet152(weights=ResNet152_Weights.IMAGENET1K_V1)
# Replace the pretrained head with a fresh 555-output linear layer.
resnet152.fc = nn.Linear(resnet152.fc.in_features, 555)
losses = train(resnet152, data['train'], epochs=3, print_every=10, checkpoint_path=checkpoint_path)

# Plot the smoothed loss curve
plt.plot(smooth(losses,50))

# Print the accuracy of the model on the testing set
accuracy(resnet152, data['test'])

We had a loss of 1.074 and a testing accuracy of 0.4542271784232365! Interestingly, our testing accuracy decreased compared to ResNet18, even though ResNet152 is far more powerful. This may be because ResNet152 has far more layers, so it requires more training time to converge to the optimum. Unfortunately, our GPU compute time is limited, so we will leave ResNet152 as it is.

<font size="4"><b>Testing: DenseNet</b></font>

Let's try the DenseNet architecture this time.

In [None]:
# Save checkpoints to directory
checkpoint_path = '/kaggle/working/densenet_1/'
os.makedirs(checkpoint_path, exist_ok=True)

# Train DenseNet201 model.
# torchvision's DenseNet stores its final linear layer in `classifier`; it has
# no `fc` attribute, so the head must be replaced through `classifier`.
densenet = densenet201(weights=DenseNet201_Weights.IMAGENET1K_V1)
densenet.classifier = nn.Linear(densenet.classifier.in_features, 555)
losses = train(densenet, data['train'], epochs=3, print_every=10, checkpoint_path=checkpoint_path)

# Plot the smoothed loss curve
plt.plot(smooth(losses,50))

# Print the accuracy of the model on the testing set
accuracy(densenet, data['test'])

On our runthrough, DenseNet had a loss of 1.032 after 3 epochs and a testing accuracy of 0.5272302904564315! That's a huge improvement over both ResNet18 and ResNet152. However, we should also try EfficientNet next, since this particular model is both small and has a very high Top 1 accuracy on ImageNet.

<font size="4"><b>Testing: EfficientNet</b></font>

Finally, let's try the EfficientNet architecture.

In [None]:
# Directory that receives one checkpoint file per epoch
checkpoint_path = '/kaggle/working/efficentnet_1/'
if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path)

# Load EfficientNetV2-S with ImageNet weights and swap the linear layer at
# classifier[1] for a 555-output one
effnet = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
effnet.classifier[1] = nn.Linear(
    in_features=effnet.classifier[1].in_features,
    out_features=555,
)
losses = train(
    effnet,
    data['train'],
    epochs=3,
    print_every=10,
    checkpoint_path=checkpoint_path,
)

# Plot the smoothed loss curve
plt.plot(smooth(losses, 50))

# Accuracy on the testing loader
accuracy(effnet, data['test'])

Wow! We had a loss of 0.838 and a testing accuracy of 0.649896265560166! That's a massive improvement over DenseNet, and clearly the best model out of all the models we tested. We will choose the EfficientNet model for the next portion of experiments, and it will be the final model that we use to train the classifier.

# ****Step 6: Data Augmentation****

<font size="4"><b>Introduction</b></font>

Next up! Data augmentation. Our dataset is fairly small, so we can artificially increase the data size by using data augmentation. There are many different techniques possible, and in this step we will consider a few popular techniques that have good success on image classification tasks. 

To help visualize the augmentation, we will define some utility functions, ```VisualizeTransform``` and ```DefaultTransform```. ```VisualizeTransform``` will be used to visualize the transformed image. The function will take in a transform and graph a random transformed image from the given dataloader. The image will be sampled from the given rootDir, which must be a directory with at least 1 training image. 

Next, ```DefaultTransform``` will return a default transform that simply resizes the image to 224 x 224. This is simply to help visualize the transformations and compare against the default.

In [None]:
def DefaultTransform(size = 224):
    """Return the baseline pipeline: resize to ``size`` x ``size``, then convert to a tensor."""
    steps = [transforms.Resize((size, size)), transforms.ToTensor()]
    return transforms.Compose(steps)

def VisualizeTransform(transform = DefaultTransform(), rootDir = '/kaggle/working/visualize_img'):
    """Show one random image from ``rootDir`` before and after ``transform``.

    Args:
        transform: Pipeline to visualize. It must return a tensor image.
        rootDir: ImageFolder-style directory holding at least one image.
    """
    beforeImage = torchvision.datasets.ImageFolder(root=rootDir, transform=DefaultTransform())
    afterImage = torchvision.datasets.ImageFolder(root=rootDir, transform=transform)

    # Both datasets are built from the same root, so one shared random index
    # selects the same file in each. Two independently shuffled loaders could
    # show different pictures for "before" and "after".
    index = np.random.randint(len(beforeImage))
    before, _ = beforeImage[index]
    after, _ = afterImage[index]

    def imshow(img):
        """Display an image tensor after moving its first axis to the end."""
        npimg = img.numpy()
        plt.imshow(np.transpose(npimg, (1, 2, 0)))
        plt.show()

    # unsqueeze(0) turns each image into a batch of one for make_grid.
    imshow(torchvision.utils.make_grid(before.unsqueeze(0)))
    imshow(torchvision.utils.make_grid(after.unsqueeze(0)))
    

<font size="4"><b>Random Flip and Rotation</b></font>

First, we will consider a simple technique, the linear transformation. In particular, we will consider the horizontal flip, the vertical flip, and the rotation as our primary transformations. 

The structure of the dataset matters here. Specifically, we may not want a vertical flip, since birds are generally not viewed upside down. However, a horizontal flip and some rotation may be appropriate to make our model more robust against noise.

To implement flipping and rotation, we define the following function, ```RandomFlip```. This function will take in four parameters, horP, verP, rotDeg, and rotP. The horP and verP will determine the probability of horizontal and vertical flips, respectively. Then, rotP will determine the probability of rotation, and rotDeg will determine by how much the image is rotated. 

In [None]:
def RandomFlip(horP = 1, verP = 0, rotDeg = 3, rotP = 0.5):
    """Return a transform that randomly flips and rotates each image.

    Args:
        horP: Probability of a horizontal flip. With the default of 1 every
            image is mirrored.
        verP: Probability of a vertical flip.
        rotDeg: Rotation range passed to ``RandomRotation`` as ``degrees``.
        rotP: Probability, drawn separately for every image, that the
            rotation is applied.

    Returns:
        A ``transforms.Compose`` that applies the three steps in that order.
    """
    return transforms.Compose([
        transforms.RandomHorizontalFlip(p=horP),
        transforms.RandomVerticalFlip(p=verP),
        # RandomApply draws the rotation decision for each image. Drawing it
        # once here would make the transform rotate every image or none.
        transforms.RandomApply([transforms.RandomRotation(degrees=rotDeg)], p=rotP),
    ])

# Preview pipeline: resize, apply the flip/rotation augmentation, convert to a tensor.
flip = transforms.Compose([transforms.Resize((224, 224)), RandomFlip(), transforms.ToTensor()])

VisualizeTransform(flip)

<font size="4"><b>Random Occlusion</b></font>

According to this paper: https://arxiv.org/pdf/1708.04896.pdf, another technique that works well with image classification is Random Occlusion. This is a technique that randomly chooses a box in the image to cover with noisy pixels. The idea is that it will make the network more robust to occlusion in the testing dataset, as it will have been trained on images that already have various components of the object covered.

To implement random occlusion, we define the following function, ```RandomOcclusion```. This function takes 5 parameters: p, r1, r2, sl, sh. The p determines the probability for each image that occlusion is applied at all. Then, r1 and r2 bound the range from which a random aspect ratio R is chosen, where R is the ratio between the height and width of the occlusion. Finally, sl and sh bound the range from which a random value S is chosen, where S is the proportion of the total area of the image that is covered.


In [None]:
def RandomOcclusion(p = 0.5, r1 =0.3, r2 = 3.33, sl = 0.02, sh = 0.3):
    """Return a transform that covers a random box of a PIL image with noise.

    Args:
        p: Probability, drawn for every image, that the occlusion is applied.
        r1: Lower bound of the box aspect ratio (height / width).
        r2: Upper bound of the box aspect ratio.
        sl: Lower bound of the box area as a fraction of the image area.
        sh: Upper bound of the box area as a fraction of the image area.

    Returns:
        A ``transforms.Compose`` wrapping the occlusion. The image is modified
        in place and returned.
    """
    from PIL import Image

    def image_occlude(image):
        # Occlude with probability p; otherwise pass the image through untouched.
        if np.random.ranf() >= p:
            return image

        w = image.width
        h = image.height

        # Choose a random area Se and aspect ratio Re for the box.
        randS = np.random.ranf() * (sh - sl) + sl
        Se = h * w * randS
        Re = np.random.ranf() * (r2 - r1) + r1

        # He * We is about Se and He / We is about Re.
        He = int(np.sqrt(Se * Re))
        We = int(np.sqrt(Se / Re))

        # Skip empty boxes and boxes that cover more than 40% of the image.
        if He < 1 or We < 1 or We * He > w * h * 0.4:
            return image

        # Select a random top-left corner and write the box if it fits.
        x = np.random.randint(0, max(w - We, 2))
        y = np.random.randint(0, max(h - He, 2))
        if x + We <= w and y + He <= h:
            # One array draw plus one paste replaces a per-pixel putpixel loop.
            noise = np.random.randint(0, 256, size=(He, We, 3), dtype=np.uint8)
            image.paste(Image.fromarray(noise), (x, y))

        return image

    totalTransform = transforms.Compose([
        transforms.Lambda(image_occlude),
    ])

    return totalTransform

# Preview pipeline: resize, apply the random occlusion, convert to a tensor.
occlusion = transforms.Compose([transforms.Resize((224, 224)), RandomOcclusion(), transforms.ToTensor()])

VisualizeTransform(occlusion)

<font size="4"><b>Random Noise</b></font>

Another technique that might help our model become more robust is adding some random noise. This means that we will randomly change each pixel by some slight offset. This might make our model overfit less on noisy pixels found in the training set.

To implement random noise, we define the following function, ```RandomNoise```. This function will take in 3 parameters: p, r1, r2. The p determines the probability for each pixel that noise will be added. Then, a value R will be chosen uniformly from the range r1 to r2. This R value will then be added to each R,G,B value of the pixel as the random offset. 

In [None]:
def RandomNoise(r1 = -30, r2 = 30, p = 0.3):
    """Return a transform that adds uniform random noise to a PIL image.

    Args:
        r1: Smallest integer offset that can be added to a channel value.
        r2: Largest integer offset that can be added to a channel value.
        p: Probability that a given pixel has noise added.

    Returns:
        A ``transforms.Compose`` wrapping the noise function. The function
        converts the image to RGB and returns a new RGB image.
    """
    from PIL import Image

    def image_noise(image):
        # The array is indexed (row, column, channel) = (y, x, channel), so
        # non-square images are handled. int32 leaves headroom so that adding
        # an offset cannot wrap around before the values are clipped.
        pixels = np.array(image.convert('RGB'), dtype=np.int32)

        # Pixels that receive noise: each one is selected with probability p.
        noisy = np.random.ranf(pixels.shape[:2]) < p

        # An independent offset from [r1, r2] for every channel of every pixel.
        offsets = np.random.randint(r1, r2 + 1, size=pixels.shape, dtype=np.int32)
        pixels[noisy] += offsets[noisy]

        # Clamp to the valid 8-bit range before converting back to an image.
        return Image.fromarray(np.clip(pixels, 0, 255).astype(np.uint8))

    totalTransform = transforms.Compose([
        transforms.Lambda(image_noise),
    ])

    return totalTransform

# Preview pipeline: resize, add the random pixel noise, convert to a tensor.
noise = transforms.Compose([transforms.Resize((224, 224)), RandomNoise(), transforms.ToTensor()])

VisualizeTransform(noise)

<font size="4"><b>Random Scaled Crop</b></font>

One thing that we can see in our dataset is that many of the birds are centered directly in the middle of the image. There are a lot of background pixels that don't have anything to do with the bird around the edges of the image.

We could take advantage of this fact by trying to crop out a portion of the edges of the image to make the central portion larger. This will increase the fidelity of the bird portion of the image, which might give our network more information to work with in classifying images. 

To implement the random scaled crop, we define the following function, ```RandomScaledCrop```. This function takes 3 parameters: p, marginL, marginH. The p determines the probability that the image will receive the scaled crop transformation at all. Then, a random value M between marginL and marginH is chosen uniformly. This M value is the fraction of the width and height that is cut off from each edge. As an example, an M value of 0.1 means that 10% of the width and height will be removed from each edge.

In [None]:
def RandomScaledCrop(p = 1, marginL = .20, marginH = 0.25):
    """Return a transform that crops a random margin off all four sides.

    Args:
        p: Probability, drawn for every image, that the crop is applied.
        marginL: Lower bound of the margin as a fraction of width and height.
        marginH: Upper bound of the margin; must be below 0.5.

    Returns:
        A ``transforms.Compose`` wrapping the crop. When the margins are
        invalid it wraps an identity function instead, so the result can
        always be placed inside another ``Compose``.
    """
    def identity(image):
        return image

    def image_crop(image):
        # The decision is made per image, not once when the transform is built.
        if np.random.ranf() > p:
            return image

        w = image.width
        h = image.height

        # Calculate a random offset between margin range
        # which will act as padding against all four sides
        randRange = np.random.ranf() * (marginH - marginL) + marginL

        widthPadd = w * randRange
        heightPadd = h * randRange

        # PIL expects the box as (left, upper, right, lower): x offsets come
        # from the width and y offsets from the height.
        return image.crop((widthPadd, heightPadd, w - widthPadd, h - heightPadd))

    # First verify that inputs are reasonable, otherwise
    # fall back to a transform that leaves the image unchanged
    if marginH >= 0.5 or marginL > marginH:
        return transforms.Compose([
            transforms.Lambda(identity),
        ])

    totalTransform = transforms.Compose([
        transforms.Lambda(image_crop),
    ])

    return totalTransform

# Preview pipeline: resize, apply the scaled crop, convert to a tensor.
scaledCenterCrop = transforms.Compose([transforms.Resize((224, 224)), RandomScaledCrop(), transforms.ToTensor()])

VisualizeTransform(scaledCenterCrop)

<font size="4"><b>Random Stitch</b></font>

Another thing that we can see in our dataset is that the bird area is very small compared to the background area. Ideally, we would prefer 50% or greater area with bird pixels, so the neural networks have a lot of relevant information to train on. We can try to counteract this by multiplying the bird area relative to the background area. This can be done by taking a crop of the bird and stitching multiple crops of the bird together. 

To implement random stitch, we define the following function, ```RandomStitch```. This function takes 2 parameters: p, side. The p determines the probability that the image will receive a stitch transformation at all. Then, side is the number of boxes along each side of the final stitch. The random stitch is then constructed from several random crops stitched together. 

In [None]:
def RandomStitch(p = 1, side = 2):
    """Return a transform that rebuilds an image from random crops of itself.

    The output is a ``side`` x ``side`` grid of tiles. Each tile is a randomly
    positioned crop of the input, ``width // side`` by ``height // side`` pixels.

    Args:
        p: Probability, drawn for every image, that the stitch is applied.
        side: Number of tiles along each side of the grid.

    Returns:
        A ``transforms.Compose`` wrapping the stitch. When ``side`` is not
        positive it wraps an identity function instead, so the result can
        always be placed inside another ``Compose``.
    """
    from PIL import Image

    def identity(image):
        return image

    def image_stitch(image):
        # The decision is made per image, not once when the transform is built.
        if np.random.ranf() > p:
            return image

        w = image.width
        h = image.height

        # We want side number of stitches
        wPerSide = w // side
        hPerSide = h // side

        # More tiles than pixels leaves nothing to crop; keep the image as is.
        if wPerSide == 0 or hPerSide == 0:
            return image

        # PIL sizes are (width, height).
        newImage = Image.new(mode = 'RGB', size = (w, h))

        # For each grid cell, copy a randomly positioned tile of the input.
        for i in range(side):
            for j in range(side):
                # The +1 makes the last valid offset reachable and keeps the
                # range non-empty when side == 1.
                randW = np.random.randint(0, w - wPerSide + 1)
                randH = np.random.randint(0, h - hPerSide + 1)
                tile = image.crop((randW, randH, randW + wPerSide, randH + hPerSide))
                newImage.paste(tile, (wPerSide * i, hPerSide * j))

        return newImage

    # First verify that inputs are reasonable, otherwise
    # fall back to a transform that leaves the image unchanged
    if side <= 0:
        return transforms.Compose([
            transforms.Lambda(identity),
        ])

    totalTransform = transforms.Compose([
        transforms.Lambda(image_stitch),
    ])

    return totalTransform

# Preview pipeline: resize, apply the random stitch, convert to a tensor.
randomStitch = transforms.Compose([transforms.Resize((224, 224)), RandomStitch(), transforms.ToTensor()])

VisualizeTransform(randomStitch)

<font size="4"><b>Experiments</b></font>

Now that we've got our basic augmentations written out, we need to test them on our data set. The following code sets up a test suite that tests each individual augmentation against a baseline model with no augmentation.

For each augmentation, we will apply it to a 224x224 image, then train it on EfficientNet2 for 6 epochs. Then, we will test the accuracy of the model on the testing set. We will also graph the losses of each model for visualization.

Afterwards, we will re-evaluate the results and see which augmentations were effective, and which were not. Then, we will need to decide which ones we will keep.

In addition to our custom transformations, we also define some additional ones using the standard pytorch library. We will also experiment with changing the colorspace (brightness, saturation, hue), the sharpness, and normalizing the dataset against a common mean and standard deviation. 

As a final note, unfortunately we did not have time to test all possible parameters, as our GPU compute and time is limited. We chose a select subset of parameters that we thought would be a good fit, and tested on that. However, it is entirely possible that a better set of parameters exist, which we did not test.

In [None]:
# Default: baseline run without augmentation
checkpoint_path = '/kaggle/working/default/'
os.makedirs(checkpoint_path, exist_ok=True)

net = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
net.classifier[1] = nn.Linear(net.classifier[1].in_features, 555)

# DefaultTransform() already ends with ToTensor(). Appending a second
# ToTensor() would hand it a tensor instead of a PIL image and raise a
# TypeError, so the default pipeline is used on its own.
transform = DefaultTransform()

data = getData(transform_train = transform)
losses = train(net, data['train'], epochs=6, schedule={0:.01, 3:.001}, print_every=10, checkpoint_path=checkpoint_path)
plt.plot(smooth(losses,50))
print(accuracy(net, data['test']))


# Occlusion: train with RandomOcclusion applied to the training images
checkpoint_path = '/kaggle/working/occlusion/'
if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path)

net = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
net.classifier[1] = nn.Linear(in_features=net.classifier[1].in_features, out_features=555)

transform = transforms.Compose([transforms.Resize((224, 224)), RandomOcclusion(), transforms.ToTensor()])

# 6 epochs; the schedule sets lr to 0.01 at epoch 0 and to 0.001 at epoch 3
data = getData(transform_train=transform)
losses = train(net, data['train'], epochs=6, print_every=10,
               schedule={0: 0.01, 3: 0.001}, checkpoint_path=checkpoint_path)
plt.plot(smooth(losses, 50))
print(accuracy(net, data['test']))


# Flip: train with RandomFlip applied to the training images
checkpoint_path = '/kaggle/working/flip/'
if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path)

net = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
net.classifier[1] = nn.Linear(in_features=net.classifier[1].in_features, out_features=555)

transform = transforms.Compose([transforms.Resize((224, 224)), RandomFlip(), transforms.ToTensor()])

# 6 epochs; the schedule sets lr to 0.01 at epoch 0 and to 0.001 at epoch 3
data = getData(transform_train=transform)
losses = train(net, data['train'], epochs=6, print_every=10,
               schedule={0: 0.01, 3: 0.001}, checkpoint_path=checkpoint_path)
plt.plot(smooth(losses, 50))
print(accuracy(net, data['test']))

# ScaledCenterCrop: crop first, then resize the result to 224 x 224
checkpoint_path = '/kaggle/working/SCC/'
if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path)

net = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
net.classifier[1] = nn.Linear(in_features=net.classifier[1].in_features, out_features=555)

transform = transforms.Compose([RandomScaledCrop(), transforms.Resize((224, 224)), transforms.ToTensor()])

# 6 epochs; the schedule sets lr to 0.01 at epoch 0 and to 0.001 at epoch 3
data = getData(transform_train=transform)
losses = train(net, data['train'], epochs=6, print_every=10,
               schedule={0: 0.01, 3: 0.001}, checkpoint_path=checkpoint_path)
plt.plot(smooth(losses, 50))
print(accuracy(net, data['test']))

# Sharpness: train with RandomAdjustSharpness(2) applied to the training images
checkpoint_path = '/kaggle/working/sharpness/'
if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path)

net = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
net.classifier[1] = nn.Linear(in_features=net.classifier[1].in_features, out_features=555)

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomAdjustSharpness(2),
    transforms.ToTensor(),
])

# 6 epochs; the schedule sets lr to 0.01 at epoch 0 and to 0.001 at epoch 3
data = getData(transform_train=transform)
losses = train(net, data['train'], epochs=6, print_every=10,
               schedule={0: 0.01, 3: 0.001}, checkpoint_path=checkpoint_path)
plt.plot(smooth(losses, 50))
print(accuracy(net, data['test']))

# ColorJitter: train with brightness, contrast, saturation and hue jitter of 0.2 each
checkpoint_path = '/kaggle/working/jitter/'
if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path)

net = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
net.classifier[1] = nn.Linear(in_features=net.classifier[1].in_features, out_features=555)

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
    transforms.ToTensor(),
])

# 6 epochs; the schedule sets lr to 0.01 at epoch 0 and to 0.001 at epoch 3
data = getData(transform_train=transform)
losses = train(net, data['train'], epochs=6, print_every=10,
               schedule={0: 0.01, 3: 0.001}, checkpoint_path=checkpoint_path)
plt.plot(smooth(losses, 50))
print(accuracy(net, data['test']))

# Test Normalize
checkpoint_path = '/kaggle/working/normalize/'
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(checkpoint_path, exist_ok=True)

# Start from ImageNet-pretrained EfficientNetV2-S and replace the final
# linear layer with a 555-output head.
net = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
net.classifier[1] = nn.Linear(net.classifier[1].in_features, 555)

# Normalization has to be applied to both splits, so train and test get the
# same pipeline: DefaultTransform, tensor conversion, then per-channel
# normalization with the mean/std values commonly used for ImageNet.
transformTrain = transforms.Compose([
    DefaultTransform(),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

transformTest = transforms.Compose([
    DefaultTransform(),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

data = getData(transform_train=transformTrain, transform_test=transformTest)
losses = train(net, data['train'], epochs=6, schedule={0: 0.01, 3: 0.001},
               print_every=10, checkpoint_path=checkpoint_path)
plt.plot(smooth(losses, 50))
print(accuracy(net, data['test']))

# Random Noise augmentation
checkpoint_path = '/kaggle/working/noise/'
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(checkpoint_path, exist_ok=True)

# Start from ImageNet-pretrained EfficientNetV2-S and replace the final
# linear layer with a 555-output head.
net = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
net.classifier[1] = nn.Linear(net.classifier[1].in_features, 555)

# Training-only pipeline: resize to 224 x 224, then apply the custom
# RandomNoise transform before tensor conversion. The test split keeps
# getData's default transform because transform_test is not passed.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    RandomNoise(),
    transforms.ToTensor(),
])

data = getData(transform_train=transform)
losses = train(net, data['train'], epochs=6, schedule={0: 0.01, 3: 0.001},
               print_every=10, checkpoint_path=checkpoint_path)
plt.plot(smooth(losses, 50))
print(accuracy(net, data['test']))

# Random Stitch augmentation
checkpoint_path = '/kaggle/working/stitch/'
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(checkpoint_path, exist_ok=True)

# Start from ImageNet-pretrained EfficientNetV2-S and replace the final
# linear layer with a 555-output head.
net = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
net.classifier[1] = nn.Linear(net.classifier[1].in_features, 555)

# Training-only pipeline: resize to 224 x 224, then apply the custom
# RandomStitch transform before tensor conversion. The test split keeps
# getData's default transform because transform_test is not passed.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    RandomStitch(),
    transforms.ToTensor(),
])

data = getData(transform_train=transform)
losses = train(net, data['train'], epochs=6, schedule={0: 0.01, 3: 0.001},
               print_every=10, checkpoint_path=checkpoint_path)
plt.plot(smooth(losses, 50))
print(accuracy(net, data['test']))

<font size="4"><b>Results</b></font>

According to our results, we had the following testing accuracy measurements:

**Default:** 0.7868257261410788

**Random Occlusion:** 0.7916234439834025

**Random Flip:** 0.8001815352697096

**Random Scaled Crop:** 0.7547977178423236

**Sharpness:** 0.7841026970954357

**Random Color Jitter:** 0.7593360995850622

**Normalize:** 0.79201244813278

**Random Noise:** 0.619242738589212

**Random Stitch:** 0.7141856846473029

According to our experiments, the only augmentations that show a definite testing accuracy improvement over the default base model are the Random Occlusion, Random Flip, and Normalize transformations. As such, we will add these to the final model and see how they benefit the final testing accuracy. Quite disappointingly, our invented augmentations such as Random Stitch and Random Noise did significantly worse than we expected, but perhaps we need to tune the parameters better or train for longer.

# ****Step 7: Tuning Hyperparameters****
Finally, our model is coming into focus. We will use an EfficientNet model with Random Occlusion, Random Flip, and Normalize data augmentations. However, there are a couple of last steps to be taken. We still need to fine-tune the hyperparameters that will be used during training. In particular, we will look at image input size, weight decay, and learning rate as the parameters that we will try to optimize.

<font size="4"><b>Image Size</b></font>

One of the questions that we still need to answer is how large our input image size should be. Intuitively, having a larger image input size should make the model better, since there is more information, but at the same time too large of an image input size may lead to underfitting, since now the image dimensions are too large for the model to learn effectively. Thus a balance must be achieved.

For our experiments, we will test four different image sizes: 224 x 224, 256 x 256, 384 x 384, and 512 x 512. Why 384 x 384? According to the original EfficientNetV2 paper, the model was trained using a 384 x 384 central crop on the images. Thus, we thought it would be important to test that particular size. 

The following code defines the experiment code. Each image size is input into an EfficientNet Model and then trained for 6 epochs. Then, the testing accuracy is calculated and output to standard out.

In [None]:
# Set up image pixel size experiments

# 224 x 224 input
checkpoint_path = '/kaggle/working/image224/'
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(checkpoint_path, exist_ok=True)

# Start from ImageNet-pretrained EfficientNetV2-S and replace the final
# linear layer with a 555-output head.
net = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
net.classifier[1] = nn.Linear(net.classifier[1].in_features, 555)

# The same size is given to DefaultTransform (train) and to getData, whose
# default test transform resizes to (size, size).
transform = transforms.Compose([
    DefaultTransform(size=224),
    transforms.ToTensor(),
])

data = getData(transform_train=transform, size=224)
losses = train(net, data['train'], epochs=6, schedule={0: 0.01, 3: 0.001},
               print_every=10, checkpoint_path=checkpoint_path)
plt.plot(smooth(losses, 50))
print(accuracy(net, data['test']))

# 256 x 256 input
checkpoint_path = '/kaggle/working/image256/'
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(checkpoint_path, exist_ok=True)

# Start from ImageNet-pretrained EfficientNetV2-S and replace the final
# linear layer with a 555-output head.
net = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
net.classifier[1] = nn.Linear(net.classifier[1].in_features, 555)

# The same size is given to DefaultTransform (train) and to getData, whose
# default test transform resizes to (size, size).
transform = transforms.Compose([
    DefaultTransform(size=256),
    transforms.ToTensor(),
])

data = getData(transform_train=transform, size=256)
losses = train(net, data['train'], epochs=6, schedule={0: 0.01, 3: 0.001},
               print_every=10, checkpoint_path=checkpoint_path)
plt.plot(smooth(losses, 50))
print(accuracy(net, data['test']))


# 384 x 384 input
checkpoint_path = '/kaggle/working/image384/'
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(checkpoint_path, exist_ok=True)

# Start from ImageNet-pretrained EfficientNetV2-S and replace the final
# linear layer with a 555-output head.
net = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
net.classifier[1] = nn.Linear(net.classifier[1].in_features, 555)

# The same size is given to DefaultTransform (train) and to getData, whose
# default test transform resizes to (size, size).
transform = transforms.Compose([
    DefaultTransform(size=384),
    transforms.ToTensor(),
])

# NOTE(review): batch drops from the default 64 to 16 here, presumably so the
# larger images fit in GPU memory. This also changes the number of optimizer
# steps per epoch relative to the 224/256 runs.
data = getData(batch=16, transform_train=transform, size=384)
losses = train(net, data['train'], epochs=6, schedule={0: 0.01, 3: 0.001},
               print_every=10, checkpoint_path=checkpoint_path)
plt.plot(smooth(losses, 50))
print(accuracy(net, data['test']))


# 512 x 512 input
checkpoint_path = '/kaggle/working/image512/'
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(checkpoint_path, exist_ok=True)

# Start from ImageNet-pretrained EfficientNetV2-S and replace the final
# linear layer with a 555-output head.
net = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
net.classifier[1] = nn.Linear(net.classifier[1].in_features, 555)

# The same size is given to DefaultTransform (train) and to getData, whose
# default test transform resizes to (size, size).
transform = transforms.Compose([
    DefaultTransform(size=512),
    transforms.ToTensor(),
])

# NOTE(review): batch drops from the default 64 to 8 here, presumably so the
# larger images fit in GPU memory. This also changes the number of optimizer
# steps per epoch relative to the other image-size runs.
data = getData(batch=8, transform_train=transform, size=512)
losses = train(net, data['train'], epochs=6, schedule={0: 0.01, 3: 0.001},
               print_every=10, checkpoint_path=checkpoint_path)
plt.plot(smooth(losses, 50))
print(accuracy(net, data['test']))

According to our run through, the relative testing accuracies are as follows:

**Image Size 224:** 0.7900674273858921

**Image Size 256:** 0.8031639004149378

**Image Size 384:** 0.8408973029045643

**Image Size 512:** 0.8220954356846473

From our results, we can deduce that an image size of 384 x 384 is by far the best choice. It has nearly a 2% lead over the next best image size, which was 512 x 512. 

<font size="4"><b>Weight Decay</b></font>

Next, we need to figure out the optimal weight decay for our model. The question of which weight decay to use is essentially the question of how complex our model should be. Should the model be allowed to fit a very complex function to the data, or should it be constrained in some way? We expect that a lower weight decay will be better, since the classification problem is very complex - there are 555 classes to fit with a very large 384 x 384 input size. 

For our experiments, we will test four different weight decays: 0.1, 0.01, 0.001, 0.0001. The following code defines the experiment code. Each weight decay is tuned for an EfficientNet Model for a 224 x 224 image and then trained for 6 epochs. Then, the testing accuracy is calculated and output to standard out.

In [None]:
# Test different weight decays
# 0.1
checkpoint_path = '/kaggle/working/decay1/'
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(checkpoint_path, exist_ok=True)

# Start from ImageNet-pretrained EfficientNetV2-S and replace the final
# linear layer with a 555-output head.
net = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
net.classifier[1] = nn.Linear(net.classifier[1].in_features, 555)

transform = transforms.Compose([
    DefaultTransform(size=224),
    transforms.ToTensor(),
])

data = getData(transform_train=transform, size=224)
# Only weight_decay varies between the decay experiments; the optimizer must
# be built after the new head is attached so its parameters are included.
# NOTE(review): lr=0.01 here is presumably overridden by `schedule` inside
# train() - confirm against train's implementation.
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=0.1)
losses = train(net, data['train'], epochs=6, schedule={0: 0.01, 3: 0.001},
               print_every=10, checkpoint_path=checkpoint_path, optimizer=optimizer)
plt.plot(smooth(losses, 50))
print(accuracy(net, data['test']))

# Test different weight decays
# 0.01
checkpoint_path = '/kaggle/working/decay01/'
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(checkpoint_path, exist_ok=True)

# Start from ImageNet-pretrained EfficientNetV2-S and replace the final
# linear layer with a 555-output head.
net = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
net.classifier[1] = nn.Linear(net.classifier[1].in_features, 555)

transform = transforms.Compose([
    DefaultTransform(size=224),
    transforms.ToTensor(),
])

data = getData(transform_train=transform, size=224)
# Only weight_decay varies between the decay experiments; the optimizer must
# be built after the new head is attached so its parameters are included.
# NOTE(review): lr=0.01 here is presumably overridden by `schedule` inside
# train() - confirm against train's implementation.
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=0.01)
losses = train(net, data['train'], epochs=6, schedule={0: 0.01, 3: 0.001},
               print_every=10, checkpoint_path=checkpoint_path, optimizer=optimizer)
plt.plot(smooth(losses, 50))
print(accuracy(net, data['test']))

# Test different weight decays
# 0.001
checkpoint_path = '/kaggle/working/decay001/'
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(checkpoint_path, exist_ok=True)

# Start from ImageNet-pretrained EfficientNetV2-S and replace the final
# linear layer with a 555-output head.
net = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
net.classifier[1] = nn.Linear(net.classifier[1].in_features, 555)

transform = transforms.Compose([
    DefaultTransform(size=224),
    transforms.ToTensor(),
])

data = getData(transform_train=transform, size=224)
# Only weight_decay varies between the decay experiments; the optimizer must
# be built after the new head is attached so its parameters are included.
# NOTE(review): lr=0.01 here is presumably overridden by `schedule` inside
# train() - confirm against train's implementation.
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=0.001)
losses = train(net, data['train'], epochs=6, schedule={0: 0.01, 3: 0.001},
               print_every=10, checkpoint_path=checkpoint_path, optimizer=optimizer)
plt.plot(smooth(losses, 50))
print(accuracy(net, data['test']))


# Test different weight decays
# 0.0001
checkpoint_path = '/kaggle/working/decay0001/'
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(checkpoint_path, exist_ok=True)

# Start from ImageNet-pretrained EfficientNetV2-S and replace the final
# linear layer with a 555-output head.
net = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
net.classifier[1] = nn.Linear(net.classifier[1].in_features, 555)

transform = transforms.Compose([
    DefaultTransform(size=224),
    transforms.ToTensor(),
])

data = getData(transform_train=transform, size=224)
# Only weight_decay varies between the decay experiments; the optimizer must
# be built after the new head is attached so its parameters are included.
# NOTE(review): lr=0.01 here is presumably overridden by `schedule` inside
# train() - confirm against train's implementation.
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=0.0001)
losses = train(net, data['train'], epochs=6, schedule={0: 0.01, 3: 0.001},
               print_every=10, checkpoint_path=checkpoint_path, optimizer=optimizer)
plt.plot(smooth(losses, 50))
print(accuracy(net, data['test']))

According to our run through, the relative testing accuracies are as follows:

**Decay 0.1:** 0.0016856846473029046

**Decay 0.01:** 0.38731846473029047

**Decay 0.001:** 0.785658713692946

**Decay 0.0001:** 0.7857883817427386

It looks like our hypothesis was mostly correct. It seems that a lower weight decay is the best choice for the model. This again makes sense, given the inherent complexity of the problem. For our model, we will choose a weight decay of 0.0001. 

<font size="4"><b>Learning Rate</b></font>

The learning rate of our model is perhaps the most important hyperparameter to tune, but also the hardest to optimize. It is difficult to say what learning rate is best, since we should also gradually lower the learning rate as the epochs increase. The best schedule for the model is difficult to find by brute-force search, so instead we will use a more heuristic approach. We will say that it is better to train the model with a larger learning rate in the beginning, and then decrease the rate by an order of magnitude every couple of epochs.

The schedule that we define for this problem is as follows: 4 epochs of 0.01 learning rate, 4 epochs of 0.001 learning rate, and 15 epochs of 0.0001 learning rate. The idea is that the model should move quickly toward a local optimum in the first four epochs, refine that solution at the lower rate over the next four, and then slowly settle into the final local optimum over the last 15 epochs of small steps.

The key here is that we will use a principle of early stopping. Early stopping is a training principle that continuously calculates the testing accuracy for each epoch. At any point, if the model's accuracy decreases from the previous epoch, then training is stopped. This method can help prevent losses from overfitting. We will rewrite the `train` function to implement the early stopping mechanism.




In [None]:
# Function to train model
def TrainEarlyStopping(net, dataloader, epochs=1, start_epoch=0, optimizer = None,
          verbose=1, print_every=10, state=None, schedule=None, checkpoint_path=None,
          testloader=None):
    """Train `net` and stop as soon as held-out accuracy drops.

    Trains with cross-entropy loss for epochs ``start_epoch .. epochs - 1``.
    After every epoch the model is scored with ``accuracy`` on the evaluation
    loader. If that score is lower than the previous epoch's score, training
    stops immediately and the losses collected so far are returned. The
    weights are NOT rolled back: `net` keeps the parameters from the epoch
    whose accuracy dropped (earlier epochs are on disk if `checkpoint_path`
    was given).

    Args:
        net: Model to train; it is moved to the module-level `device`.
        dataloader: Iterable of batches where ``batch[0]`` holds the inputs
            and ``batch[1]`` the labels.
        epochs: Exclusive upper bound on the epoch index.
        start_epoch: First epoch index to run. Replaced by ``state['epoch']``
            when `state` is given.
        optimizer: Optimizer to use. Defaults to SGD with lr=0.01,
            momentum=0.9, weight_decay=0.0005.
        verbose: When truthy, print the mean loss every `print_every` batches.
        print_every: Number of batches between loss printouts.
        state: Optional checkpoint dict with the keys 'net', 'optimizer',
            'epoch' and 'losses' (the format this function saves).
        schedule: Mapping ``epoch -> learning rate``; the rate is applied to
            every param group at the start of that epoch. ``None`` means no
            scheduled changes.
        checkpoint_path: Directory in which ``checkpoint-<epoch>.pkl`` is
            saved after every epoch; nothing is saved when falsy.
        testloader: Loader used for the per-epoch accuracy check. When
            omitted, the module-level name `testloader` is used, which is
            what this function has always read.

    Returns:
        List of per-batch loss values, including any restored from `state`.

    Raises:
        ValueError: If no evaluation loader is passed and no module-level
            `testloader` exists.
    """
    schedule = {} if schedule is None else schedule

    # Resolve the evaluation loader before training so a missing loader fails
    # immediately instead of after a full epoch of GPU time.
    eval_loader = testloader if testloader is not None else globals().get('testloader')
    if eval_loader is None:
        raise ValueError(
            "TrainEarlyStopping needs an evaluation loader: pass testloader=... "
            "or define a module-level `testloader`."
        )

    def set_learning_rate(lr):
        # Announce the rate and apply it to every param group.
        print ("Learning rate: %f"% lr)
        for group in optimizer.param_groups:
            group['lr'] = lr

    net.to(device)
    losses = []
    criterion = nn.CrossEntropyLoss()
    # We will use SGD for the default optimizer
    if optimizer is None:
        optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=0.0005)

    # Accuracy of the most recent completed epoch (0 before the first one)
    best_acc = 0

    # Load previous training state
    if state:
        net.load_state_dict(state['net'])
        optimizer.load_state_dict(state['optimizer'])
        start_epoch = state['epoch']
        losses = state['losses']

    # Fast forward lr schedule through already trained epochs
    for epoch in range(start_epoch):
        if epoch in schedule:
            set_learning_rate(schedule[epoch])

    for epoch in range(start_epoch, epochs):
        # Re-enter training mode every epoch: the evaluation pass at the end
        # of the previous epoch may have left the model in eval mode, which
        # would freeze batch-norm statistics and disable dropout.
        net.train()
        sum_loss = 0.0

        # Update learning rate when scheduled
        if epoch in schedule:
            set_learning_rate(schedule[epoch])

        for i, batch in enumerate(dataloader, 0):
            inputs, labels = batch[0].to(device), batch[1].to(device)

            optimizer.zero_grad()

            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()  # autograd magic, computes all the partial derivatives
            optimizer.step() # takes a step in gradient direction

            # Read the scalar once; each .item() call copies from the device.
            batch_loss = loss.item()
            losses.append(batch_loss)
            sum_loss += batch_loss

            if i % print_every == print_every-1:    # print every `print_every` mini-batches
                if verbose:
                    print('[%d, %5d] loss: %.3f' % (epoch, i + 1, sum_loss / print_every))
                sum_loss = 0.0

        if checkpoint_path:
            snapshot = {'epoch': epoch+1, 'net': net.state_dict(), 'optimizer': optimizer.state_dict(), 'losses': losses}
            # os.path.join keeps the file inside the directory even when
            # checkpoint_path has no trailing slash.
            torch.save(snapshot, os.path.join(checkpoint_path, 'checkpoint-%d.pkl'%(epoch+1)))

        # If the testing accuracy decreases, then return early
        curr_acc = accuracy(net, eval_loader)
        if curr_acc < best_acc:
            print("Testing accuracy decreased at epoch " + str(epoch))
            return losses
        best_acc = curr_acc

    return losses

# ****Step 8: Final Model and Results****
We've finally reached the final model! The exact parameters are as follows:

**Model:** EfficientNet2_s, pre-trained on ImageNet

**Input Image Size:** 384 x 384

**Learning Rate:** 4 epochs of 0.01 learning rate, 4 epochs of 0.001 learning rate, and 15 epochs of 0.0001 learning rate

**Weight Decay:** 0.0001

**Momentum:** 0.9

**Batch Size:** 16

**Augmentations:** Normalization, Random Occlusion, Random Flipping/Rotation

The code for implementing this model is shown below. 

With this model, we had a final testing accuracy of 0.8559387966804979. Not bad, given the GPU compute and time constraints!

In [None]:
# Final
checkpoint_path = '/kaggle/working/final/'
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(checkpoint_path, exist_ok=True)

# Start from ImageNet-pretrained EfficientNetV2-S and replace the final
# linear layer with a 555-output head.
net = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)
net.classifier[1] = nn.Linear(net.classifier[1].in_features, 555)

# Training pipeline: 384 x 384 resize, the two random augmentations selected
# earlier, then tensor conversion and per-channel normalization.
transformTrain = transforms.Compose([
    transforms.Resize((384, 384)),
    RandomFlip(),
    RandomOcclusion(),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

# Test pipeline: same resize and normalization, no random augmentation.
transformTest = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

data = getData(transform_train=transformTrain, transform_test=transformTest, size=384, batch=16)

# TrainEarlyStopping is called here without an explicit evaluation loader and
# picks up the module-level `testloader`. Bind it to this run's test split so
# early stopping is measured on 384 x 384 normalized images.
testloader = data['test']

# Use the weight decay selected in the tuning step (0.0001). Without an
# explicit optimizer, TrainEarlyStopping falls back to weight_decay=0.0005.
# The optimizer is built after the new head is attached so its parameters
# are included.
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=0.0001)

# 23 epochs total: 4 at 0.01, 4 at 0.001, then 15 at 0.0001.
losses = TrainEarlyStopping(net, data['train'], epochs=23, schedule={0: 0.01, 4: 0.001, 8: 0.0001},
                            print_every=10, checkpoint_path=checkpoint_path, optimizer=optimizer)
plt.plot(smooth(losses, 50))

predict(net, data['final'], os.path.join(checkpoint_path, "submissions.csv"))