In [None]:
import os
# Root folder of this assignment on Google Drive.
gdrive_path = "/content/drive/MyDrive/Murdoch/ICT303/A2"

# Mount Google Drive only when the assignment folder is not reachable yet.
if not os.path.exists(gdrive_path):
    from google.colab import drive
    drive.mount('/content/drive')
else:
    print("Drive is already mounted.")

Mounted at /content/drive


### **Downloading the Data Set**


In [None]:
import zipfile
from tqdm import tqdm

# Folder on Google Drive that holds the downloaded Kaggle archives.
data_dir = "/content/drive/MyDrive/Murdoch/ICT303/A2/kaggle_dog"

# Extract every archive into the folder it was downloaded to.
zipfiles = ['train.zip', 'test.zip', 'labels.csv.zip']
for archive_name in tqdm(zipfiles):
  archive_path = data_dir + '/' + archive_name
  with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall(data_dir)

100%|██████████| 3/3 [04:15<00:00, 85.01s/it]


### **1.2. Organizing the Data Set**

Next, we define the `reorg_train_valid` function to split a validation set off the original Kaggle competition training set. The parameter `valid_ratio` in this function is the ratio of the number of examples of each dog breed in the validation set to the number of examples of the
breed with the fewest examples (66) in the original training set.

After organizing the data, images of the same breed will be placed in the same folder so that we can read them later.

In [None]:
# Let's first install d2l package, since we will need some functions from this package
# (the version is pinned so that re-running the notebook installs the same release).
# NOTE(review): the cells visible here import d2l but never call it, and the next
# cell defines its own mkdir helper "instead of using the d2l module" — confirm
# this install is still needed.
! pip install d2l==1.0.0a1.post0

In [None]:
## Function to create directory if it doesnt exist, instead of using the d2l module
import os
def mkdir_if_not_exist(path):
    """Create the directory described by ``path`` unless it already exists.

    Args:
        path: Sequence of path components (for example
            ``[data_dir, input_dir, 'train', label]``). The components are
            joined with ``os.path.join`` to form the directory path; missing
            parent directories are created as well.

    Raises:
        FileExistsError: If the joined path exists but is not a directory.
    """
    # exist_ok=True makes the call idempotent without a separate exists()
    # check, so the path is joined once and there is no window between
    # checking for the directory and creating it.
    os.makedirs(os.path.join(*path), exist_ok=True)

In [None]:
import collections
import d2l
import shutil
import os
import math

def reorg_train_valid(data_dir, train_dir, input_dir, valid_ratio, idx_label):
  """Copy the labelled training images into per-label split folders.

  Every labelled file found in ``data_dir/train_dir`` is copied to
  ``data_dir/input_dir/train_valid/<label>``. In addition, the first
  ``n_valid_per_label`` files of each label (in sorted file-name order, so the
  split is the same on every run) are copied to ``.../valid/<label>`` and all
  remaining files of that label to ``.../train/<label>``.

  Args:
    data_dir: Root directory of the data set.
    train_dir: Sub-directory of ``data_dir`` holding the original training images.
    input_dir: Sub-directory of ``data_dir`` that receives the organised copies.
    valid_ratio: Ratio of validation examples per label to the number of
      examples of the least represented label.
    idx_label: Mapping from image id (file name up to the first '.') to label.
  """
  # The number of examples of the least represented breed in the training set
  # (0 when no labels are given, instead of failing on an empty counter).
  label_sizes = collections.Counter(idx_label.values())
  min_n_train_per_label = min(label_sizes.values(), default=0)

  # The number of examples of each breed in the validation set. At least one
  # image per label is always held out; this keeps the existing behaviour, in
  # which the first image of every label went to the validation set even when
  # the ratio rounded down to zero.
  n_valid_per_label = max(1, math.floor(min_n_train_per_label * valid_ratio))

  def copy_to_split(src, split, label):
    # Make sure the label folder of this split exists, then copy the image in.
    mkdir_if_not_exist([data_dir, input_dir, split, label])
    shutil.copy(src, os.path.join(data_dir, input_dir, split, label))

  src_dir = os.path.join(data_dir, train_dir)
  label_count = collections.Counter()
  # Sorted listing: os.listdir order is arbitrary, which would make the
  # train/valid split differ between runs.
  for train_file in sorted(os.listdir(src_dir)):
    label = idx_label.get(train_file.split('.')[0])
    if label is None:
      # Entry without a label (e.g. a stray file or checkpoint folder): skip it
      # rather than aborting the whole reorganisation with a KeyError.
      continue

    src = os.path.join(src_dir, train_file)
    copy_to_split(src, 'train_valid', label)

    if label_count[label] < n_valid_per_label:
      copy_to_split(src, 'valid', label)
      label_count[label] += 1
    else:
      copy_to_split(src, 'train', label)

## **Obtaining and Organizing the Data Set**

The competition data is divided into a training set and testing set:
- The training set contains $10,222$ color images.
- The testing set contains 10,357 color images.

The images in both sets are in JPEG format. Each image contains three channels (R, G and B). The images have  different heights and widths.

There are $120$ breeds of dogs in the training set, e.g., *Labradors, Poodles, Dachshunds,
Samoyeds, Huskies, Chihuahuas, and Yorkshire Terriers*.

The `reorg_dog_data` function below is used to read the training data labels, segment the validation set, and organize the training set.

In [None]:
def reorg_dog_data(data_dir, label_file, train_dir, test_dir, input_dir, valid_ratio):
  """Read the labels, split train/valid, and stage the test images.

  Args:
    data_dir: Root directory of the data set.
    label_file: CSV file inside ``data_dir`` with a header line followed by
      ``id,label`` rows.
    train_dir: Sub-directory of ``data_dir`` holding the training images.
    test_dir: Sub-directory of ``data_dir`` holding the test images.
    input_dir: Sub-directory of ``data_dir`` that receives the organised copies.
    valid_ratio: Passed through to ``reorg_train_valid``.

  Raises:
    ValueError: If a non-blank label row does not have exactly two fields.
  """
  # Read the training data labels.
  with open(os.path.join(data_dir, label_file), 'r') as f:
    # Skip the file header line (column name).
    lines = f.readlines()[1:]
  # Blank lines (e.g. a trailing newline at the end of the file) are ignored;
  # previously they broke the two-field unpacking below.
  tokens = [l.rstrip().split(',') for l in lines if l.strip()]
  idx_label = {idx: label for idx, label in tokens}

  reorg_train_valid(data_dir, train_dir, input_dir, valid_ratio, idx_label)

  # Organize the test set: the images have no labels, so they are all placed
  # in a single 'unknown' class folder.
  mkdir_if_not_exist([data_dir, input_dir, 'test', 'unknown'])
  test_src = os.path.join(data_dir, test_dir)
  test_dst = os.path.join(data_dir, input_dir, 'test', 'unknown')
  for test_file in os.listdir(test_src):
    src = os.path.join(test_src, test_file)
    # shutil.copy cannot copy directories; skip anything that is not a file.
    if os.path.isfile(src):
      shutil.copy(src, test_dst)

During actual training and testing, we would use the entire Kaggle Competition data set and call the reorg_dog_data function to organize the data set. Likewise, we would need to set the batch_size to a larger integer, such as 128.

In [None]:
# Names of the label file and image folders inside data_dir.
label_file = 'labels.csv'
train_dir = 'train'
test_dir = 'test'
# Output folder name, batch size, and validation ratio.
input_dir = 'train_valid_test'
batch_size = 128
valid_ratio = 0.1
# Build the train / valid / train_valid / test folder structure.
reorg_dog_data(
    data_dir=data_dir,
    label_file=label_file,
    train_dir=train_dir,
    test_dir=test_dir,
    input_dir=input_dir,
    valid_ratio=valid_ratio,
)

## DONE

## **Image Augmentation**

Sometimes, when we do not have enough images to train our deep learning model, we use data augmentation to simulate new data. For example, in the case of images, assume we only have $10$ images per class. We can create more instances by applying transformations to these images. For example, if the image is of a standing dog, we can rotate it by $90$ and $180$ degrees to create two additional instances of the same dog. We can also scale it, etc.

Here are some more image augmentation operations that might be useful.

Start by training your model on the data set, the way it is provided. Then, think of the types of transformations you can apply to the training images to improve the performance.

You can find more about how to apply transformations to images in this [link](https://pytorch.org/vision/stable/transforms.html).

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

# Load the Kaggle label table; the cells below read its `id` and `breed` columns.
labels_df = pd.read_csv(f"{gdrive_path}/kaggle_dog/labels.csv")

In [None]:
# Number of labelled images per breed.
labels_df['breed'].value_counts()

scottish_deerhound      126
maltese_dog             117
afghan_hound            116
entlebucher             115
bernese_mountain_dog    114
                       ... 
golden_retriever         67
brabancon_griffon        67
komondor                 67
eskimo_dog               66
briard                   66
Name: breed, Length: 120, dtype: int64

In [None]:
# Distinct image ids listed in the label table.
labels_df['id'].unique()

array(['000bec180eb18c7604dcecc8fe0dba07',
       '001513dfcb2ffafc82cccf4d8bbaba97',
       '001cdf01b096e06d78e9e5112d419397', ...,
       'ffe2ca6c940cddfee68fa3cc6c63213f',
       'ffe5f6d8e2bff356e9482a80a6e29aac',
       'fff43b07992508bc822f33d8ffd902ae'], dtype=object)

In [None]:
# Bar chart of the per-breed image counts.
px.bar(labels_df['breed'].value_counts(), title='Count of Dog Breeds')

## **Loading (Reading) the Data Set**

In [None]:
!pip install skorch

Collecting skorch
  Downloading skorch-0.15.0-py3-none-any.whl (239 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/239.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/239.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.3/239.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: skorch
Successfully installed skorch-0.15.0


In [None]:
#@title All imports
import os
from sklearn.model_selection import GridSearchCV
from skorch import NeuralNetClassifier
from torch import nn
from torch import optim
import torchvision
from torch.utils.data import DataLoader
from torchvision import transforms, models, datasets
from torchsummary import summary
import numpy as np
import torch
from skorch.callbacks import Checkpoint, TrainEndCheckpoint
from skorch.callbacks import LoadInitState
from skorch.callbacks import EpochScoring
from skorch.helper import SliceDataset
from PIL import ImageFile, Image
from skorch.callbacks import Callback
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F

In [None]:
## Custom transformations dictionary for all datasets for indexing use.
## This cell rebinds the name `transforms` (imported above as the torchvision
## module) to the dictionary below. The pipelines are therefore built through
## `torchvision.transforms`, so the cell can be run again without failing on
## `dict` having no attribute `Compose`.
## NOTE(review): torchvision documents ImageNet mean/std normalisation for its
## pretrained ResNet weights; the (0.5, 0.5, 0.5) values are kept unchanged
## here so existing checkpoints stay comparable — consider switching.
def _eval_transform():
    """Deterministic pipeline: resize to 256x256, convert to tensor, normalise."""
    return torchvision.transforms.Compose([
        torchvision.transforms.Resize(size=(256, 256)),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])


transforms = {
    # Only the 'train' split uses a random crop as augmentation.
    'train': torchvision.transforms.Compose([
        torchvision.transforms.RandomResizedCrop(size=256),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ]),
    'valid': _eval_transform(),
    'train_valid': _eval_transform(),
    'test': _eval_transform()
}


Similar to previous labs, write here the Python code that reads the training, validation and test sets.

In [None]:
## Root folder holding one sub-folder per split, each laid out as './class_name/xx.png'.
dir = f"{gdrive_path}/kaggle_dog/train_valid_test"

## Splits to load, in the order they are created.
data = ['train', 'valid', 'test', 'train_valid']

## For every split build the ImageFolder dataset and wrap its images (index 0)
## and labels (index 1) separately as SliceDataset objects, so that they can be
## passed to a grid search as X (input) and y (output).
ds = {}
data_dict = {}
for x in data:
    ds[x] = datasets.ImageFolder(os.path.join(dir, x), transforms[x])
    data_dict[x] = {'X': SliceDataset(ds[x], idx=0), 'y': SliceDataset(ds[x], idx=1)}

In [None]:
## Setting default device, GPU first or else CPU
def get_default_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

# Device used for the model and training in the rest of the notebook.
device = get_default_device()
print(device)

cuda


In [None]:
## Define ResNet class
class CustomResNet(nn.Module):
    """Backbone network followed by one linear classification layer.

    Args:
        pretrained_model: Module applied first; its output must have
            ``hidden_features`` features per sample.
        hidden_features: Input width of the classification layer. The default
            of 250 is the value previously hard-coded here.
        num_classes: Number of output classes. The default of 120 is the value
            previously hard-coded here.
    """

    def __init__(self, pretrained_model, hidden_features=250, num_classes=120):
        super().__init__()
        self.pretrained = pretrained_model
        # Custom classification layer on top of the backbone output.
        # NOTE(review): no activation is applied between the backbone output
        # and this layer; if the backbone ends in a linear layer the two
        # compose into a single linear map — confirm this is intended.
        self.append1 = nn.Linear(hidden_features, num_classes)

    def forward(self, X):
        """Compute the class scores for the input batch ``X``."""
        X = self.pretrained(X)
        X = self.append1(X)
        return X

In [None]:
## Define the pretrained ResNet model.
## `weights='DEFAULT'` is the current torchvision way to request pretrained
## weights (the `pretrained=True` flag is deprecated) and matches the later
## training cell of this notebook.
resnet_pretrained = models.resnet18(weights='DEFAULT')
num_ftrs = resnet_pretrained.fc.in_features

## Freeze all parameters of the pretrained network.
for param in resnet_pretrained.parameters():
    param.requires_grad = False

## Replace the final layer after freezing, so the new layer is created with
## trainable parameters.
resnet_pretrained.fc = nn.Linear(num_ftrs, 250)

## Instantiate the custom model
custom_model = CustomResNet(pretrained_model=resnet_pretrained)

## Move the model to the appropriate device (run on Cuda GPU)
custom_model = custom_model.to(device)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 152MB/s]


In [None]:
## Creating a checkpoint when validation loss improves.
## The parameter file name embeds the epoch number of the saved epoch.
cp = Checkpoint(dirname=f'{gdrive_path}/experiment12', f_params='params_{last_epoch[epoch]}.pt', monitor='valid_loss_best')

# Defining a neural net classifier based on the model defined earlier.
# The lr given here is only a starting value: the grid search below sets lr
# (and batch_size) for every candidate it evaluates.
net = NeuralNetClassifier(
    custom_model,
    criterion=nn.CrossEntropyLoss(),
    lr=0.1,
    optimizer=torch.optim.Adam,
    iterator_train__shuffle=True,
    device=device,
    callbacks=[cp, EpochScoring(scoring='accuracy', name='train_acc', on_train=True)]
)

## Grid of hyperparameters to run grid search over.
params = {
    'lr': [0.1, 0.01],
    # 'max_epochs': [5, 10],
    'batch_size': [128]
}

## Instantiating GridSearchCV for hyperparameter tuning.
## cv=2 requests 2-fold cross-validation here (sklearn's default would be 5 folds).
gs = GridSearchCV(net, params, refit=True, cv=2, scoring='accuracy')

## Running the grid search on the train_valid dataset. With refit=True the best
## set of hyperparameters found is refit on the whole dataset afterwards.
search = gs.fit(data_dict['train_valid']['X'] , data_dict['train_valid']['y'])

  epoch    train_acc    train_loss    valid_acc    valid_loss    cp      dur
-------  -----------  ------------  -----------  ------------  ----  -------
      1       [36m0.0250[0m     [32m1105.9892[0m       [35m0.0469[0m      [31m822.4342[0m     +  82.7755
      2       0.2243      [32m398.0304[0m       [35m0.3284[0m      [31m185.2923[0m     +  83.5915
      3       0.4190      [32m160.0951[0m       [35m0.4301[0m      [31m161.2912[0m     +  82.6588
      4       0.6049       [32m79.3159[0m       [35m0.5582[0m       [31m91.8884[0m     +  82.3307
      5       0.6734       [32m51.8980[0m       0.5318      113.8394        82.1623
      6       0.6539       59.0889       0.5396      103.4143        82.1744
      7       0.6959       [32m46.6743[0m       0.5327      110.8362        81.3448
      8       0.7370       [32m37.8860[0m       0.5455      105.3266        81.8155
      9       0.7224       45.4747       [35m0.5699[0m      105.0908        81.3315


In [None]:
## Save search results
import pickle

# Persist the fitted grid-search object to Google Drive.
search_path = f'{gdrive_path}/grid_search_results.pkl'
with open(search_path, 'wb') as search_file:
    pickle.dump(search, search_file)

In [None]:
# Generate matrix of predicted probabilities for the test set.
# Each row is later written to the submission file as one image's per-class values.
pred_prob = search.predict_proba(data_dict['test']['X'])

In [None]:
# Persist the predicted probabilities to Google Drive.
pred_prob_path = f'{gdrive_path}/pred_prob.pkl'
with open(pred_prob_path, 'wb') as pred_file:
    pickle.dump(pred_prob, pred_file)

In [None]:
# Quick look at the predicted probabilities (large arrays are abbreviated when printed).
print(pred_prob)

[[1.15323839e-09 1.58720177e-17 2.86248307e-19 ... 2.58924354e-21
  2.62892352e-22 1.32975296e-14]
 [8.39775791e-27 4.70906563e-33 6.33051386e-31 ... 2.59947617e-30
  8.35514677e-31 1.45292146e-30]
 [2.87608251e-17 1.22111292e-13 1.44774236e-16 ... 3.63143794e-14
  2.51296976e-13 2.49446341e-14]
 ...
 [2.87616840e-16 6.42553635e-17 1.65925506e-15 ... 9.06082143e-09
  3.51831786e-10 6.01781401e-14]
 [5.14962639e-12 5.35713546e-19 3.50179597e-22 ... 1.93557798e-17
  3.51228395e-15 2.21726131e-12]
 [2.10627439e-23 1.56444871e-10 3.91524105e-17 ... 7.89166562e-12
  1.10499815e-16 1.27379185e-24]]


## Start Testing

In [None]:
# Generate csv file of predicted probabilities.
# The file names are taken from the test dataset itself (ds['test'].samples is
# the list of (path, class_index) pairs the dataset iterates over), so row i of
# pred_prob is always paired with the image that produced it. A separate
# directory listing could differ from the dataset if the folder contains
# entries the dataset skips.
ids = [os.path.basename(path) for path, _ in ds['test'].samples]

# zip() would silently drop rows on a length mismatch; fail loudly instead.
if len(ids) != len(pred_prob):
    raise ValueError(
        f'{len(ids)} test images but {len(pred_prob)} prediction rows')

with open(f'{gdrive_path}/submission.csv', 'w') as f:
    # Column order must follow the class indices the model was trained with;
    # the grid search was fit on the train_valid dataset.
    f.write('id,' + ','.join(ds['train_valid'].classes) + '\n')
    for i, output in zip(ids, pred_prob):
        f.write(i.split('.')[0] + ',' + ','.join([str(num) for num in output]) + '\n')

In [None]:
# Custom callback to add training/validation loss/accuracy to Tensorboard.
class TensorboardMetrics(Callback):
    """Write the latest epoch's loss and accuracy values to a summary writer.

    Args:
        tb: Writer object providing ``add_scalars``; stored as ``self.writer``.
    """

    def __init__(self, tb):
        self.writer = tb

    def on_epoch_end(self, net, **kwargs):
        """Log the metrics of the epoch that just finished."""
        history = net.history

        # Latest recorded value of each metric, kept on the callback instance.
        self.train_loss = history[:, 'train_loss'][-1]
        self.valid_loss = history[:, 'valid_loss'][-1]
        self.train_acc = history[:, 'train_acc'][-1]
        self.valid_acc = history[:, 'valid_acc'][-1]

        # Add current epoch training/validation loss to tensorboard; the step
        # is the number of epochs recorded in the history so far.
        loss_values = {
            'Training Loss': self.train_loss,
            'Validation Loss': self.valid_loss,
        }
        self.writer.add_scalars('Loss', loss_values, len(net.history))

        # Add current epoch training/validation accuracy to tensorboard.
        accuracy_values = {
            'Training Accuracy': self.train_acc,
            'Validation Accuracy': self.valid_acc,
        }
        self.writer.add_scalars('Accuracy', accuracy_values, len(net.history))

In [None]:
# Retrieve the best parameters found by GridSearchCV
# (one value for each key of the `params` grid searched above).
best_params = search.best_params_
print(best_params)

{'batch_size': 128, 'lr': 0.01}


In [None]:
## Tensor board magic command
%reload_ext tensorboard
%tensorboard --logdir=f"{gdrive_path}/test_run/dog-breed-identify"

In [None]:
# Setting up the writer for Tensorboard.
writer = SummaryWriter(f"{gdrive_path}/test_run/dog-breed-identify")

# Defining the pretrained model: freeze the backbone, then replace the final
# layer so that only the new layers are trainable.
pretrained_model = models.resnet18(weights='DEFAULT')
num_ftrs = pretrained_model.fc.in_features
for param in pretrained_model.parameters():
    param.requires_grad = False
pretrained_model.fc = nn.Linear(num_ftrs, 250)

# Defining the model to be trained. The CustomResNet class defined earlier is
# reused instead of declaring an identical class a second time (the duplicate
# was also named `model` and was immediately shadowed by its own instance).
model = CustomResNet(pretrained_model=pretrained_model)
model = model.to(device)

# Creating checkpoints.
cp = Checkpoint(dirname=f"{gdrive_path}/experiment13", f_params="params_{last_epoch[epoch]}.pt")
load_state = LoadInitState(cp)

# Defining the neural net classifier.
net = NeuralNetClassifier(
    model,
    max_epochs=10,
    criterion=nn.CrossEntropyLoss(),
    lr=0.01,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
    device = device,
    optimizer = torch.optim.Adam,
    batch_size = 128,
    callbacks=[cp, load_state, EpochScoring(scoring='accuracy', name='train_acc', on_train=True),
    TensorboardMetrics(tb=writer)],
)

try:
    # Initializing the neural net classifier before loading parameters.
    net.initialize()
    net.load_params(f_params=f"{gdrive_path}/experiment12/params_4.pt")
    net.history = net.history.from_file(f"{gdrive_path}/experiment12/history.json")
    history = net.history

    # Adding the current history of metrics to Tensorboard. Steps are 1-based
    # so they line up with TensorboardMetrics, which logs each epoch at step
    # len(net.history).
    for i in range(len(history)):
        step = i + 1
        writer.add_scalars('Loss', {'Training Loss': history[i]['train_loss'], 'Validation Loss':
                                    history[i]['valid_loss']}, step)
        # Training accuracy is only logged for epochs that recorded it.
        accuracy = {'Validation Accuracy': history[i]['valid_acc']}
        if 'train_acc' in history[i]:
            accuracy = {'Training Accuracy': history[i]['train_acc'], 'Validation Accuracy':
                        history[i]['valid_acc']}
        writer.add_scalars('Accuracy', accuracy, step)

    # Fitting the model to the training/validation set.
    # NOTE(review): the recorded run printed "Re-initializing module /
    # criterion / optimizer" at this point, i.e. fit() initialises the net
    # again after load_params. If the intent is to continue from the loaded
    # checkpoint and history, confirm whether partial_fit (or warm_start=True)
    # is what is wanted here.
    net.fit(data_dict['train_valid']['X'], data_dict['train_valid']['y'])
finally:
    # Call flush() method to make sure that all pending events have been written to disk.
    writer.flush()

    # The summary writer is not needed anymore; close it even if training failed.
    writer.close()

Re-initializing module.
Re-initializing criterion.
Re-initializing optimizer.
  epoch    train_acc    train_loss    valid_acc    valid_loss    cp       dur
-------  -----------  ------------  -----------  ------------  ----  --------
      2       [36m0.7335[0m        [32m1.2893[0m       [35m0.6205[0m        [31m2.5326[0m     +  171.7916
      3       [36m0.7231[0m        1.5534       [35m0.6401[0m        2.9467        170.7920
      4       0.7336        1.6933       0.6176        3.7767        176.9011
      5       0.7384        1.8476       0.5956        4.2639        176.4366
      6       0.7537        1.8554       0.6191        4.6584        177.7974
      7       0.7478        2.2641       0.5951        5.4450        175.5464
      8       0.7539        2.3766       [35m0.6411[0m        5.7421        175.6464
      9       0.7697        2.3375       0.6156        6.4036        176.3444
     10       0.7898        2.3670       0.6240        6.7343        176.0965


some notes:
custom validation set and callbacks
grid search
resnet.history to get all the values of each epochs
PCA - closely linked to linear autoencoders