# 1. Import modules

In [1]:
import sys
import os
import numpy as np
import gc
from datetime import datetime

In [2]:
import torch
import torchvision
from torchvision import datasets, models, transforms
import torch.nn as nn

# 2. Define the device for training

In [3]:
DEVICE = "cuda:2" if torch.cuda.is_available() else "cpu"

# 3. Define model

In [4]:
def getVGGModel():
  vgg16 = models.vgg16_bn(weights=models.vgg.VGG16_BN_Weights.IMAGENET1K_V1)

  # Fix the conv layers parameters
  for conv_param in vgg16.features.parameters():
    conv_param.require_grad = False

  # Replace w/ new classification layers
  classifications = nn.Sequential(
    nn.Linear(25088,1024),
    nn.ReLU(inplace=True),
    nn.Dropout(p=0.5),
    nn.Linear(1024,3)
  )

  vgg16.classifier = classifications

  return vgg16

In [5]:
model = getVGGModel()
    
model.to(DEVICE)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU(inplace=True)
    (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU(inplace=True)
    (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (14): Conv2d(128, 256

# 4. Define hyperparameters

In [6]:
hp = {"lr":1e-5, "beta1":0.9, "beta2":0.999, "batch_size":16, "epochs":5}

# 5. Load in images and define data augmentation

In [7]:
def load_datasets(train_path, val_path, test_path):
  val_img_transform = transforms.Compose([transforms.Resize((244,244)),
                                         transforms.ToTensor()])
  train_img_transform = transforms.Compose([transforms.AutoAugment(),
                                           transforms.Resize((244,244)),
                                           transforms.ToTensor()])
  train_dataset = datasets.ImageFolder(train_path, transform=train_img_transform)
  val_dataset = datasets.ImageFolder(val_path, transform=val_img_transform) 
  test_dataset = datasets.ImageFolder(test_path, transform=val_img_transform) if test_path is not None else None
  print(f"Train set size: {len(train_dataset)}, Validation set size: {len(val_dataset)}")
  return train_dataset, val_dataset, test_dataset
    
def construct_dataloaders(train_set, val_set, test_set, batch_size, shuffle=True):
  train_dataloader = torch.utils.data.DataLoader(train_set, batch_size, shuffle)
  val_dataloader = torch.utils.data.DataLoader(val_set, batch_size) 
  test_dataloader = torch.utils.data.DataLoader(test_aset, batch_size) if test_path is not None else None
  return train_dataloader, val_dataloader, test_dataloader

In [8]:
# Please specify the path to train, cross_validation, and test images below:
train_path, val_path, test_path = "/tmp/Dataset_2/Train/", "/tmp/Dataset_2/Validation/", None
train_set, val_set, test_set = load_datasets(train_path, val_path, test_path)
train_dataloader, val_dataloader, test_dataloader = construct_dataloaders(train_set, val_set, test_set, hp["batch_size"], True)

Train set size: 1322, Validation set size: 363


# 6. Define optimizer

In [9]:
opt = torch.optim.Adam(model.parameters(),lr=hp["lr"], betas=(hp["beta1"], hp["beta2"]))

# 7. Define loss function
### To deal with mis-labeling of data
### $new\_onehot\_labels = onehot\_labels * (1 - label\_smoothing) + label\_smoothing / num\_classes$

### Assuming label_smoothing = 0.2
### 0 — not damaged, 1 — most damaged

### A most damaged image would have lable [0, 1]
### $new\_onehot\_labels = [0, 1] * (1 - 0.2) + 0.2 / 2 =[0, 1]*(0.8) + 0.1$
### $new\_onehot\_labels =[0.1, 0.9]$

In [10]:
loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1)

# 8. Define learning rate reducer

In [11]:
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min',factor=0.1, patience=5,min_lr=1e-8, verbose=True)

# 9.  Set up checkpoints

In [12]:
def load_checkpoint(checkpoint_path, DEVICE):
  checkpoint = torch.load(checkpoint_path, map_location=torch.device(DEVICE))
  return checkpoint

def load_model_fm_checkpoint(checkpoint, primitive_model):
  primitive_model.load_state_dict(checkpoint['model_state_dict'])
  return primitive_model 

In [13]:
# For saving the trained model
model_folder_path = os.getcwd()+"/output_model/"
os.makedirs(model_folder_path,exist_ok=True)

checkpoint_file = model_folder_path+"best_model.pt"

# load the checkpoint that has the best performance in previous experiments
prev_best_val_acc = None
checkpoint_file = model_folder_path+"best_model.pt"
if os.path.exists(checkpoint_file):
  checkpoint = load_checkpoint(checkpoint_file, DEVICE)
  prev_best_val_acc = checkpoint['accuracy']

# 10. Train model

## 10.1 Define train function

In [14]:
def train(train_loader, val_loader, model, opt, scheduler, loss_fn, epochs, DEVICE, checkpoint_file, prev_best_val_acc):
  n = len(train_loader)
  
  best_val_acc = torch.tensor(0.0).cuda() if prev_best_val_acc is None else prev_best_val_acc
    
  for epoch in range(epochs):
    model.train(True)
    
    avg_loss, val_loss, val_acc, avg_acc  = 0.0, 0.0, 0.0, 0.0
    
    start_time = datetime.now()
    
    for x, y in train_loader:
      x, y = x.to(DEVICE), y.to(DEVICE)
      pred = model(x)
      loss = loss_fn(pred,y)

      opt.zero_grad()
      loss.backward()
      opt.step()

      avg_loss += loss.item()/len(x)
      pred_label = torch.argmax(pred, axis=1)
      avg_acc += torch.sum(pred_label == y)/len(x)

    val_loss, val_acc = eval_model(val_loader, model, loss_fn, DEVICE)
    
    end_time = datetime.now()
    
    total_time = torch.tensor((end_time-start_time).seconds).cuda()
    
    # Learning rate reducer takes action
    scheduler.step(val_loss)
    
    avg_loss, avg_acc = avg_loss/n, avg_acc/n
        
    # Save the best model that has the highest val accuracy
    if val_acc.item() > best_val_acc.item():
      print(f"\nPrev Best Val Acc: {best_val_acc} < Cur Val Acc: {val_acc}")
      # print("Saving the new best model...")
      # torch.save({
      #         'epoch':epoch,
      #         'machine':local_rank,
      #         'model_state_dict':model.module.state_dict(),
      #         'accuracy':val_acc,
      #         'loss':val_loss
      # }, checkpoint_file)
      best_val_acc = val_acc
      print("Finished saving model\n")
        
    # Print the metrics (should be same on all machines)
    print(f"\n(Epoch {epoch+1}/{epochs}) Time: {total_time}s")
    print(f"(Epoch {epoch+1}/{epochs}) Average train loss: {avg_loss}, Average train accuracy: {avg_acc}")
    print(f"(Epoch {epoch+1}/{epochs}) Val loss: {val_loss}, Val accuracy: {val_acc}")  
    print(f"(Epoch {epoch+1}/{epochs}) Current best val acc: {best_val_acc}\n")  

## 10.2 Define evaluation function

In [15]:
@torch.no_grad()
def eval_model(data_loader, model, loss_fn, DEVICE):
  model.train(False)
  model.eval()
  loss, accuracy = 0.0, 0.0
  n = len(data_loader)

  for i, data in enumerate(data_loader):
    x,y = data
    x,y = x.to(DEVICE), y.to(DEVICE)
    pred = model(x)
    loss += loss_fn(pred, y)/len(x)
    pred_label = torch.argmax(pred, axis = 1)
    accuracy += torch.sum(pred_label == y)/len(x)
    
  return loss/n, accuracy/n

## 10.3 Start training

In [16]:
train(train_dataloader, val_dataloader, model, opt, scheduler, loss_fn, hp["epochs"], DEVICE, checkpoint_file, prev_best_val_acc)


(Epoch 1/5) Time: 440s
(Epoch 1/5) Average train loss: 0.06460661358502974, Average train accuracy: 0.4932228624820709
(Epoch 1/5) Val loss: 0.055886149406433105, Val accuracy: 0.6783596873283386
(Epoch 1/5) Current best val acc: 0.7795454859733582


(Epoch 2/5) Time: 424s
(Epoch 2/5) Average train loss: 0.051237494309982624, Average train accuracy: 0.6847891211509705
(Epoch 2/5) Val loss: 0.047844577580690384, Val accuracy: 0.7339426875114441
(Epoch 2/5) Current best val acc: 0.7795454859733582


(Epoch 3/5) Time: 442s
(Epoch 3/5) Average train loss: 0.044255879009703555, Average train accuracy: 0.7697288990020752
(Epoch 3/5) Val loss: 0.04582887887954712, Val accuracy: 0.7381422519683838
(Epoch 3/5) Current best val acc: 0.7795454859733582


(Epoch 4/5) Time: 422s
(Epoch 4/5) Average train loss: 0.040498484698045685, Average train accuracy: 0.8144577741622925
(Epoch 4/5) Val loss: 0.044687677174806595, Val accuracy: 0.7598813772201538
(Epoch 4/5) Current best val acc: 0.779545485973