<a href="https://colab.research.google.com/github/veerendra12/CS598-DL4H-Project/blob/main/notebooks/TrainEvalUtil.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import time

import numpy as np
import pandas as pd

import torch
import torch.optim as optim

import matplotlib.pyplot as plt
import sklearn.metrics as sklm

import import_ipynb
from Utils import get_device, memory_report
from Configuration import CONFIG
from Utils import save_checkpoint

In [None]:
def train_model(model, train_dataloader, val_dataloader, optimizer, criterion, checkpoint_prefix):
    """
    Train `model`, validating, logging and checkpointing after every epoch.

    Epochs run over range(CONFIG['EPOCH_START'], CONFIG['NUM_EPOCHS']). After
    each epoch the accumulated per-epoch summary is written to
    CONFIG['BASE_DIR'] + 'results/summary.csv' and a checkpoint containing the
    model and optimizer state dicts is saved next to it. When
    CONFIG['RESUME_TRAINING'] is truthy, the history stored in that summary
    file is loaded first so that the new epochs are appended to it.

    Args:
        model: the model to train; called as model(inputs).
        train_dataloader: iterable of training batches; each batch is indexed
            as batch[0] (inputs) and batch[1] (labels).
        val_dataloader: iterable of validation batches, same layout.
        optimizer: optimizer used for the training updates.
        criterion: loss function, called as criterion(y_hat, labels) with the
            labels cast to float.
        checkpoint_prefix: file-name prefix of the per-epoch checkpoints
            (BASE_DIR + "results/" + checkpoint_prefix + str(epoch) + ".pth").
    Return:
        model: the trained model.
        results: pandas DataFrame with one row per epoch (losses, timings in
            minutes and memory statistics). If no epoch was run, it contains
            only the history loaded for resuming (possibly no rows).
    """
    DEVICE = get_device()
    NUM_EPOCHS = CONFIG['NUM_EPOCHS']
    # When resuming a partial training session
    EPOCH_START = CONFIG['EPOCH_START']
    BASE_DIR = CONFIG['BASE_DIR']

    # Keys read from memory_report() and stored as summary columns.
    MEMORY_KEYS = ('cpu-ram-free',
                   'cuda-memory-allocated',
                   'cuda-memory-cached',
                   'gpu-0-mem-free',
                   'gpu-0-mem-total',
                   'gpu-0-mem-util')

    train_epoch_losses, validate_epoch_losses = [], []
    train_times, validate_times = [], []
    memories = {key: [] for key in MEMORY_KEYS}

    if CONFIG['RESUME_TRAINING']:
        print("Loading earlier run summary data for resuming the training")
        prev_summary_df = pd.read_csv(BASE_DIR + 'results/summary.csv')

        # extend() with the stored values: append() would insert the whole
        # column as a single list element and corrupt the per-epoch history.
        train_epoch_losses.extend(prev_summary_df['Training Loss'].tolist())
        validate_epoch_losses.extend(prev_summary_df['Validation Loss'].tolist())
        train_times.extend(prev_summary_df['Training Time (mns)'].tolist())
        validate_times.extend(prev_summary_df['Validation Time (mns)'].tolist())
        for key in MEMORY_KEYS:
            memories[key].extend(prev_summary_df[key].tolist())

    # Defined up front so the return value exists even when the epoch range
    # is empty (EPOCH_START >= NUM_EPOCHS).
    results = _build_training_summary(train_epoch_losses, validate_epoch_losses,
                                      train_times, validate_times, memories)

    for epoch in range(EPOCH_START, NUM_EPOCHS):
        # ---- training pass ----
        epoch_start_time = time.time()

        model.train()
        train_curr_epoch_loss = 0.0
        for data in train_dataloader:
            inputs = data[0].to(DEVICE)
            # Cast before moving so the labels are transferred only once.
            labels = data[1].float().to(DEVICE)
            optimizer.zero_grad()
            y_hat = model(inputs).to(DEVICE)
            loss = criterion(y_hat, labels)
            loss.backward()
            optimizer.step()

            train_curr_epoch_loss += loss.item()

        train_curr_epoch_loss = train_curr_epoch_loss / len(train_dataloader)
        train_epoch_losses.append(train_curr_epoch_loss)
        train_time = time.time() - epoch_start_time
        train_times.append(round(train_time/60, 4))

        print(f"Epoch {epoch}: Train curr_epoch_loss={train_curr_epoch_loss}")
        print('Training complete in {:.0f}m {:.0f}s'.format(train_time // 60, train_time % 60))

        # ---- validation pass ----
        epoch_start_time = time.time()

        model.eval()

        validate_curr_epoch_loss = 0.0
        # No gradients are needed for validation; disabling autograd avoids
        # building the graph and keeps memory usage down.
        with torch.no_grad():
            for data in val_dataloader:
                inputs = data[0].to(DEVICE)
                labels = data[1].float().to(DEVICE)
                y_hat = model(inputs).to(DEVICE)
                loss = criterion(y_hat, labels)

                validate_curr_epoch_loss += loss.item()

        validate_curr_epoch_loss = validate_curr_epoch_loss / len(val_dataloader)
        validate_epoch_losses.append(validate_curr_epoch_loss)
        validation_time = time.time() - epoch_start_time
        validate_times.append(round(validation_time/60, 4))

        print(f"Epoch {epoch}: Validate curr_epoch_loss={validate_curr_epoch_loss}")
        print('Validation complete in {:.0f}m {:.0f}s'.format(validation_time // 60, validation_time % 60))

        # ---- bookkeeping: memory stats, summary file, checkpoint ----
        memory = memory_report()
        for key in MEMORY_KEYS:
            memories[key].append(memory[key])

        # Rewritten every epoch so an interrupted run keeps its history.
        results = _build_training_summary(train_epoch_losses, validate_epoch_losses,
                                          train_times, validate_times, memories)
        results.to_csv(BASE_DIR + 'results/summary.csv')

        # save model
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer":optimizer.state_dict(),
        }
        save_checkpoint(checkpoint, BASE_DIR + "results/" + checkpoint_prefix + str(epoch) + ".pth")

    return model, results


def _build_training_summary(train_losses, validate_losses, train_times, validate_times, memories):
    """Assemble the per-epoch summary DataFrame from the accumulated history lists."""
    summary = {'Iteration': range(len(train_losses)),
               'Training Loss': train_losses,
               'Validation Loss': validate_losses,
               'Training Time (mns)': train_times,
               'Validation Time (mns)': validate_times}
    # One column per memory statistic, in the insertion order of `memories`.
    summary.update(memories)
    return pd.DataFrame(summary)



In [None]:
def eval_model(model, dataset, dataloader):
    """
    Run the trained/best model over a dataloader and compute per-label AUC.

    Args:
    model: trained model; called as model(inputs).
    dataset: dataset behind the dataloader; dataset.df.index supplies the
        "Image Index" value of every row.
    dataloader: dataloader to evaluate on; each batch is indexed as
        batch[0] (inputs) and batch[1] (labels).
        NOTE(review): rows are mapped to dataset.df.index by position
        (BATCH_SIZE * batch_number + offset), which presumably requires an
        unshuffled dataloader whose batch size is CONFIG['BATCH_SIZE'] -
        confirm against the caller.
    :return:
        pred_df: DataFrame with "Image Index" and one "prob_<label>" column
            per entry of CONFIG['CLASS_LABELS'] holding the model outputs.
        true_df: DataFrame with "Image Index" and one column per class label
            holding the truth labels.
        auc_df: per-label AUC table returned by compute_auc.
    """
    DEVICE = get_device()
    BATCH_SIZE = CONFIG['BATCH_SIZE']
    CLASS_LABELS = CONFIG['CLASS_LABELS']

    model.eval()

    # Rows are collected in plain lists and turned into DataFrames once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    pred_rows = []
    true_rows = []
    # Inference only, so autograd bookkeeping is disabled.
    with torch.no_grad():
        for i, data in enumerate(dataloader):
            inputs = data[0].to(DEVICE)
            # Labels are only needed on the host; no device transfer required.
            true_labels = data[1].detach().cpu().numpy()
            probs = model(inputs).detach().cpu().numpy()

            for j in range(true_labels.shape[0]):
                image_index = dataset.df.index[BATCH_SIZE * i + j]
                thisrow = {"Image Index": image_index}
                truerow = {"Image Index": image_index}

                for k, label in enumerate(CLASS_LABELS):
                    thisrow["prob_" + label] = probs[j, k]
                    truerow[label] = true_labels[j, k]

                pred_rows.append(thisrow)
                true_rows.append(truerow)

    # With no rows, keep the frames the original produced for an empty
    # dataloader: a single, empty "Image Index" column.
    if pred_rows:
        pred_df = pd.DataFrame(pred_rows)
        true_df = pd.DataFrame(true_rows)
    else:
        pred_df = pd.DataFrame(columns=["Image Index"])
        true_df = pd.DataFrame(columns=["Image Index"])

    auc_df = compute_auc(pred_df, true_df)

    return pred_df, true_df, auc_df

In [None]:
def compute_auc(pred_df, true_df):
    """
    Compute the ROC AUC of every class label and persist the evaluation tables.

    For each column of true_df that is listed in CONFIG['CLASS_LABELS'], the
    truth values (cast to int) are scored against the "prob_<label>" column of
    pred_df. A label whose AUC cannot be computed is reported on stdout and
    kept in the table with an AUC of NaN.

    Side effects: writes pred_df, the AUC table and true_df to
    CONFIG['BASE_DIR'] + "results/" as <RUN_PREFIX>_preds.csv,
    <RUN_PREFIX>_aucs.csv and <RUN_PREFIX>_true.csv, and prints the AUC table.

    Args:
        pred_df: DataFrame of predictions with "prob_<label>" columns.
        true_df: DataFrame of truth labels with one column per label.
    Return:
        auc_df: DataFrame with the columns "label" and "auc".
    """
    BASE_DIR = CONFIG['BASE_DIR']
    CLASS_LABELS = CONFIG['CLASS_LABELS']
    RUN_PREFIX = CONFIG['RUN_PREFIX']

    # Rows are gathered in a list and converted once; DataFrame.append was
    # removed in pandas 2.0.
    auc_rows = []
    for column in true_df:
        # Skip bookkeeping columns such as "Image Index".
        if column not in CLASS_LABELS:
            continue
        actual = true_df[column]
        pred = pred_df["prob_" + column]
        auc = np.nan
        try:
            auc = sklm.roc_auc_score(actual.values.astype(int), pred.values)
        except Exception as e:
            # Best effort: keep the label with a NaN AUC. Exception (rather
            # than BaseException) lets KeyboardInterrupt/SystemExit through.
            print("can't calculate auc for " + str(column))
            print(e)
        auc_rows.append({'label': column, 'auc': auc})

    auc_df = pd.DataFrame(auc_rows, columns=["label", "auc"])

    pred_df.to_csv(BASE_DIR + "results/" + RUN_PREFIX + "_preds.csv", index=False)
    auc_df.to_csv(BASE_DIR + "results/" + RUN_PREFIX + "_aucs.csv", index=False)
    true_df.to_csv(BASE_DIR + "results/" + RUN_PREFIX + "_true.csv", index=False)
    print(auc_df)
    return auc_df

In [None]:
def plot_epoch_loss(results):
    """
    Plot the training and validation loss curves against the epoch number.

    Args:
        results: table indexable by 'Training Loss', 'Validation Loss' and
            'Iteration' (e.g. the summary returned by train_model).
    """
    train_curve = results['Training Loss']
    val_curve = results['Validation Loss']
    x_axis = results['Iteration']

    # (y values, matplotlib format string, legend entry) for each curve.
    curves = (
        (train_curve, 'g', 'Training loss'),
        (val_curve, 'b', 'validation loss'),
    )
    for y_values, fmt, legend_entry in curves:
        plt.plot(x_axis, y_values, fmt, label=legend_entry)

    plt.title('Training and Validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()