In [43]:
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt


from PIL import Image
from datetime import datetime

import torch
from torch import nn, optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
import torch.utils.data as data
import torch.nn.functional as F

import torchvision
from torchvision import transforms
from torchvision import models

In [2]:
# dataset class to load data into the network
class mydata(data.Dataset):
    def __init__(self, image_folder_path):
        # image_folder_path is the train folder, validation folder, or test folder
        
        # folder of the images
        self.root_dir = image_folder_path
        
        # set up a list to hold the full path for each image
        self.image_names = []
        
        # set up a list to hold the labels of the images
        self.labels = []
        
        # obtain image file path and its label
        for wbc_type in os.listdir(self.root_dir):
            '''
            nn.CrossEntropyLoss expects logits in the shape of [batch_size, number_of_classes]
            and a target tensor of [batch_size] with class indices as its values.
            '''
            if not wbc_type.startswith('.'):
                if wbc_type in ['NEUTROPHIL']: 
                    label = 0 #label of neutrophil
                elif wbc_type in ['LYMPHOCYTE']: 
                    label = 1 #label of lymphocyte
                elif wbc_type in ['MONOCYTE']: 
                    label = 2 #label of monocyte
                elif wbc_type in ['EOSINOPHIL']:
                    label = 3 #label of eosinophil
                
                for image_filename in os.listdir(os.path.join(self.root_dir, wbc_type)):
                    self.image_names.append(os.path.join(self.root_dir, wbc_type, image_filename))
                    self.labels.append(label)
               
               
    def __len__(self):
        return len(self.image_names)
    
    
    def __getitem__(self, index):
        image = Image.open(self.image_names[index])
        label = self.labels[index]
        
        # convert the RGB image to tensor and normalize the tensor for loading into resnet-18
        preprocess = transforms.Compose([transforms.ToTensor(),
                                         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
        image_norm = preprocess(image)
        
        label_tensor = torch.tensor(label)
        #label_tensor = label_tensor.to(dtype=torch.long)
                
        return image_norm, label_tensor

# 2. resnet-18

In [23]:
# set hyperparameters
loss_func = nn.CrossEntropyLoss() # cross entropy loss is used for classification task
loss_func_name = 'cross entropy loss'

weight_init_pretrain = 'imagenet pretrained'
bias_init_pretrain = 'imagenet pretrained'

weight_init_other = 'random'
bias_init_other = 'random'

learn_rate = 0.0001
batch_size = 2

num_epochs = 125

optimizer_name = 'Adam'

imagenet_pretrained = False
layer_freeze = False
frozen_layers = 2 # freeze the first frozen_layers layers in resnet-18

# track the start of each hyperparameter run(each run contains num_epochs of epochs)
start_time = datetime.now()

# overfitting correction hyperparameters
#early_stopping_thresh = -1
#early_stopping_num_epochs = 7
#epoch_train_accu_thresh = 0.95

'''Create a unique ID for this hyperparameter run.
   It is a folder that all relevent files are saved to (hyperparams file, training logs, model weights, etc.)'''
run_id = "hpt_Adam_34"
os.mkdir(run_id)

# record all hyperparameters that might be useful to reference later
with open(run_id + '/hyperparams.csv', 'w') as wfil:
#    wfil.write("note," + 'initialized with best weights of hpt_Adam_2' + '\n')
    wfil.write("learning rate," + str(learn_rate) + '\n')
    wfil.write("batch size," + str(batch_size) + '\n')
    wfil.write("number epochs," + str(num_epochs) + '\n')
    
    if imagenet_pretrained:
        wfil.write("weight initialization," + weight_init_pretrain + '\n')
        wfil.write("bias initialization," + bias_init_pretrain + '\n')
    else:
        wfil.write("weight initialization," + weight_init_other + '\n')
        wfil.write("bias initialization," + bias_init_other + '\n')
    
    if imagenet_pretrained:
        if layer_freeze:
            wfil.write("layers frozen," + 'first ' + str(frozen_layers) + ' layers' + '\n')
        else:
            wfil.write("layers frozen," + 'NO layer is frozen' + '\n')
    
    wfil.write("loss function," + loss_func_name + '\n')
    wfil.write("optimizer," + optimizer_name + '\n')
    
#    wfil.write("early stopping threshold," + str(early_stopping_thresh) + '\n')
#    wfil.write("early stopping number of epochs necessary," + str(early_stopping_num_epochs) + '\n')
#    wfil.write("early stopping epoch train accuracy threshold," + str(epoch_train_accu_thresh) + '\n')
    
    wfil.write("start time," + str(start_time) + '\n')

# use resnet-18
model = models.resnet18(pretrained=imagenet_pretrained)

'''
modify the model for our classification task:
Since there are 4 cell types, the last layer have 4 neurons.
'''
model.fc = nn.Linear(model.fc.in_features, 4)

# use weights from previous tuning and perform further fine tuning
# identify where the weights you want to load are 
#weight_fil = "hpt_Adam_2/best_weights.pth"
# load weights
#model = torch.load(weight_fil)

# layer freezing: freeze the first few designated number of layers
if layer_freeze:
    print(f'The first {frozen_layers} layers are frozen.')
    ct = 0
    for child in model.children():
        if ct < frozen_layers:
            #print(child)
            for param in child.parameters():
                param.requires_grad = False
            print(f'Layer {ct+1} is frozen!')
        else:
            break
            
        ct += 1

# This implementation use CUDA for gpu acceleration
# check if CUDA is available
cuda = torch.cuda.is_available()
if cuda:
    # set the network to use cuda
    model = model.cuda()
    print("CUDA IS AVAILABLE!")
else:
    print("CUDA NOT AVAILABLE!")

# create loaders to feed data to the network in batches
# image size is 640X480, which is larger than 224X224

# training dataset loader
train_set = mydata('train')
trainloader = torch.utils.data.DataLoader(dataset = train_set , batch_size= batch_size , shuffle = True)

# validation dataset loader
valid_set = mydata('validation')
validloader = torch.utils.data.DataLoader(dataset = valid_set , batch_size= batch_size , shuffle = True)

# use Adam as optimizer
optimizer = optim.Adam(model.parameters(), lr=learn_rate)

# learning rate scheduler
#scheduler = lr_scheduler.StepLR(optimizer, step_size =30, gamma=0.1)

# track best val loss to know when to save best weights
best_valid_loss = "unset"

# track stuff for early stopping
early_stopping_counter = 0
es_valid_loss = 0.0

with open(run_id + '/log_file.csv', 'w') as log_fil:
    # write headers for log file
    log_fil.write("epoch,epoch duration,train loss,valid loss,train accuracy,valid accuracy\n")
    
    for epoch in range(0, num_epochs):
                       
        epoch_start = datetime.now()
        
        # track train and validation loss
        epoch_train_loss = 0.0
        epoch_valid_loss = 0.0
        
        # track train and validation accuracy
        epoch_train_accuracy = 0.0
        epoch_valid_accuracy = 0.0
        
        for i, (train_images, train_labels) in enumerate(trainloader):
            #print(f'batch {i}, time {datetime.now()}')
            # for each batch, set images and labels to be cuda compatible
            train_images = Variable(train_images).cuda()
            train_labels = Variable(train_labels).cuda()

            # zero out gradients for every batch or they will accumulate
            optimizer.zero_grad()
            
            # forward step
            train_outputs = model(train_images)
            # sigmoid activation for output layer to match cross entropy loss funciton
            train_outputs = torch.sigmoid(train_outputs)
            
            # for loss function, convert the shape of outputs and labels to match
            #train_outputs = train_outputs.reshape(train_outputs.shape[0])
            #train_labels  = train_labels.reshape(train_labels.shape[0])
                        
            # compute loss
            train_loss = loss_func(train_outputs, train_labels)
            #print(f'train loss for batch {i} is {train_loss}')
             
            # backwards step
            train_loss.backward()         
            
            # update weights and biases
            optimizer.step()
            
            # track training loss
            epoch_train_loss += train_loss.item()
            
            # get training accuracy
            for i, image_label in enumerate(train_outputs):
                temp_max, temp_index = image_label.max(0) # the index of the max value for each image's output is the image's class
                #print(f'train output is {image_label}')
                #print(f'Output label is {temp_index}, true label is {train_labels[j]}. They are same: {temp_index==train_labels[j]}')
                if temp_index==train_labels[i]:
                    epoch_train_accuracy += 1
        
        epoch_train_accuracy = epoch_train_accuracy/len(train_set)
        
        # learning rate scheduler
        #scheduler.step()
        
        # track valid loss - the torch.no_grad() ensures gradients will not be updated based on validation set
        with torch.no_grad():
            for i, (valid_images, valid_labels) in enumerate(validloader):
                # for each batch, set images and labels to be cuda compatible
                valid_images = Variable(valid_images).cuda()
                valid_labels = Variable(valid_labels).cuda()

                valid_outputs = model(valid_images)
                valid_outputs = torch.sigmoid(valid_outputs)
                # for loss function, convert the shape of outputs and labels to match
                #valid_outputs = valid_outputs.reshape(valid_outputs.shape[0])
                #valid_labels  = valid_labels.reshape(valid_labels.shape[0])
            
                valid_loss = loss_func(valid_outputs, valid_labels)
                # track validation loss
                epoch_valid_loss += valid_loss.item()
                
                # get validation accuracy
                for i, image_label in enumerate(valid_outputs):
                    temp_max, temp_index = image_label.max(0)
                    if temp_index==valid_labels[i]:
                        epoch_valid_accuracy += 1
                
        epoch_valid_accuracy = epoch_valid_accuracy/len(valid_set)
            
           
                                        
        # track total epoch time
        epoch_end = datetime.now()
        epoch_time = (epoch_end - epoch_start).total_seconds()
        
        # save best weights
        if (best_valid_loss=="unset") or (epoch_valid_loss < best_valid_loss):
            best_valid_loss = epoch_valid_loss
            torch.save(model, run_id + "/best_weights.pth")
        
        # save most recent weights
        torch.save(model, run_id + "/last_weights.pth")
        
        # save epoch results in log file
        log_fil.write(str(epoch) + ',' +
                      str(epoch_time) + ',' +
                      str(epoch_train_loss) + ',' + 
                      str(epoch_valid_loss) + ',' + 
                      str(epoch_train_accuracy) + ',' +
                      str(epoch_valid_accuracy) + '\n')
        
        # print out epoch level training details
        print("epoch: " + str(epoch) + " - ("+ str(round(epoch_time)) + " seconds)" +
              " - train loss: " + str(round(epoch_train_loss, 3)) +
              " - valid loss: " + str(round(epoch_valid_loss, 3)) + 
              " - train accuracy: " + str(round(epoch_train_accuracy, 3)) + 
              " - valid accuracy: " + str(round(epoch_valid_accuracy, 3)))

'''
        # implement early stopping
        if es_valid_loss == 0.0:
            early_stopping_counter = 0
            es_valid_loss = epoch_valid_loss
            
        if es_valid_loss - epoch_valid_loss < early_stopping_thresh:
            early_stopping_counter += 1
        else:
            early_stopping_counter = 0
            es_valid_loss = epoch_valid_loss
        
        if (early_stopping_counter >= early_stopping_num_epochs):
            print("Stopped early by continuous increase of validation loss.")
            break
            
        
        if  (epoch_train_accuracy > epoch_train_accu_thresh):
            print('Stopped early by reaching epoch train accuracy threshold.')
            break
'''

end_time = datetime.now()
with open(run_id + '/hyperparams.csv', 'a') as wfil:
    wfil.write("end time," + str(end_time) + '\n')

CUDA IS AVAILABLE!
epoch: 0 - (75 seconds) - train loss: 635.838 - valid loss: 36.766 - train accuracy: 0.47 - valid accuracy: 0.345
epoch: 1 - (74 seconds) - train loss: 582.903 - valid loss: 35.079 - train accuracy: 0.564 - valid accuracy: 0.436
epoch: 2 - (74 seconds) - train loss: 545.586 - valid loss: 30.167 - train accuracy: 0.649 - valid accuracy: 0.6
epoch: 3 - (74 seconds) - train loss: 505.191 - valid loss: 31.636 - train accuracy: 0.748 - valid accuracy: 0.6
epoch: 4 - (74 seconds) - train loss: 481.32 - valid loss: 29.582 - train accuracy: 0.799 - valid accuracy: 0.6
epoch: 5 - (74 seconds) - train loss: 457.92 - valid loss: 31.81 - train accuracy: 0.849 - valid accuracy: 0.473
epoch: 6 - (74 seconds) - train loss: 451.2 - valid loss: 27.002 - train accuracy: 0.856 - valid accuracy: 0.8
epoch: 7 - (74 seconds) - train loss: 429.661 - valid loss: 27.862 - train accuracy: 0.896 - valid accuracy: 0.727
epoch: 8 - (74 seconds) - train loss: 420.68 - valid loss: 26.493 - train a

epoch: 71 - (74 seconds) - train loss: 384.125 - valid loss: 25.009 - train accuracy: 0.974 - valid accuracy: 0.836
epoch: 72 - (74 seconds) - train loss: 373.249 - valid loss: 25.916 - train accuracy: 0.999 - valid accuracy: 0.836
epoch: 73 - (74 seconds) - train loss: 372.052 - valid loss: 25.079 - train accuracy: 1.0 - valid accuracy: 0.836
epoch: 74 - (74 seconds) - train loss: 372.116 - valid loss: 27.268 - train accuracy: 1.0 - valid accuracy: 0.727
epoch: 75 - (74 seconds) - train loss: 389.214 - valid loss: 24.685 - train accuracy: 0.96 - valid accuracy: 0.836
epoch: 76 - (74 seconds) - train loss: 374.129 - valid loss: 24.21 - train accuracy: 0.998 - valid accuracy: 0.873
epoch: 77 - (74 seconds) - train loss: 374.424 - valid loss: 24.108 - train accuracy: 0.997 - valid accuracy: 0.873
epoch: 78 - (74 seconds) - train loss: 376.656 - valid loss: 27.055 - train accuracy: 0.99 - valid accuracy: 0.764
epoch: 79 - (74 seconds) - train loss: 378.315 - valid loss: 23.603 - train acc

## Inference of Validation Dataset

In [163]:
# Inference for validation dataset after tuning

# identify where the weights you want to load are 
weight_fil = "hpt_Adam_34/best_weights.pth"
es_epoch = 125

# set necessary hyperparameters
loss_func = nn.CrossEntropyLoss()
batch_size = 2

# load weights
model = torch.load(weight_fil)

# put model in evaluation mode (sets dropout and batch normalization layers to evaluation mode before running inference. Failing to do this will yield inconsistent inference results)
model.eval()

# This implementation use CUDA for gpu acceleration
# check if CUDA is available
cuda = torch.cuda.is_available()
if cuda:
    # set the network to use cuda
    model = model.cuda()
    print("CUDA IS AVAILABLE!")
else:
    print("CUDA NOT AVAILABLE!")

# create loaders to feed data to the network in batches
# image size is 640X480, which is larger than 224X224

# training dataset loader
eval_set = mydata('validation')

eval_loader = torch.utils.data.DataLoader(dataset = eval_set , batch_size= batch_size , shuffle = False)


# track metrics over dataset
eval_loss = 0.0
eval_accuracy = 0.0

# track true label and predicted label for each image
image_names = eval_set.image_names
true_labels = []
predicted_labels = []

# loop through eval data
for i, (images, labels) in enumerate(eval_loader):
    
    # for each batch, set images and labels to be cuda compatible
    images = Variable(images).cuda()
    labels = Variable(labels).cuda()
    
    # run the model on the eval batch
    outputs = model(images)
    outputs = torch.sigmoid(outputs)
    
    # compute eval loss
    loss = loss_func(outputs, labels)
    # track loss for testing dataset
    eval_loss += loss.item()
    
    # get testing accuracy
    for i, image_label in enumerate(outputs):
        temp_max, temp_index = image_label.max(0)
        
        true_labels.append(labels[i])
        predicted_labels.append(temp_index)
        
        if temp_index==labels[i]:
            eval_accuracy += 1
                
eval_accuracy = eval_accuracy/len(eval_set)

# save the inference accuracy
print(f'Early stop (epoch {es_epoch}, {weight_fil}): tuned accuracy for validation dataset is {eval_accuracy}',
      file=open("tuned_result_valid.txt", "a"))

CUDA IS AVAILABLE!


## Inference of Testing Dataset 

In [154]:
# Inference for testing dataset after tuning

# identify where the weights you want to load are 
weight_fil = "hpt_Adam_29/best_weights.pth"

# set necessary hyperparameters
loss_func = nn.CrossEntropyLoss()
batch_size = 2

# load weights
model = torch.load(weight_fil)

# put model in evaluation mode (sets dropout and batch normalization layers to evaluation mode before running inference. Failing to do this will yield inconsistent inference results)
model.eval()

# This implementation use CUDA for gpu acceleration
# check if CUDA is available
cuda = torch.cuda.is_available()
if cuda:
    # set the network to use cuda
    model = model.cuda()
    print("CUDA IS AVAILABLE!")
else:
    print("CUDA NOT AVAILABLE!")

# create loaders to feed data to the network in batches
# image size is 640X480, which is larger than 224X224

# training dataset loader
eval_set = mydata('test')

eval_loader = torch.utils.data.DataLoader(dataset = eval_set , batch_size= batch_size , shuffle = False)


# track metrics over dataset
eval_loss = 0.0
eval_accuracy = 0.0

# track true label and predicted label for each image
image_names = eval_set.image_names
true_labels = []
predicted_labels = []

# loop through eval data
for i, (images, labels) in enumerate(eval_loader):
    
    # for each batch, set images and labels to be cuda compatible
    images = Variable(images).cuda()
    labels = Variable(labels).cuda()
    
    # run the model on the eval batch
    outputs = model(images)
    outputs = torch.sigmoid(outputs)
    
    # compute eval loss
    loss = loss_func(outputs, labels)
    # track loss for testing dataset
    eval_loss += loss.item()
    
    # get testing accuracy
    for i, image_label in enumerate(outputs):
        temp_max, temp_index = image_label.max(0)
        
        true_labels.append(labels[i])
        predicted_labels.append(temp_index)
        
        if temp_index==labels[i]:
            eval_accuracy += 1
                
eval_accuracy = eval_accuracy/len(eval_set)
            
# save the inference loss and accuracy
print(f'{weight_fil}: \nTuned loss for testing dataset is {eval_loss}.\nTuned accuracy for testing dataset is {eval_accuracy}.\n',
      file=open("tuned_result.txt", "a"))


# save the inferenced labels and true labels
infer_list = list(zip(image_names, true_labels, predicted_labels))                 
infer_df   = pd.DataFrame(infer_list, columns = ['image name', 'true label', 'predicted label'])
infer_df.to_csv('inference_result_29_test.csv', header = True, index = False)

CUDA IS AVAILABLE!
