<a href="https://colab.research.google.com/github/vialucis/BreastCancerML/blob/master/Breast_Cancer_Final_Project_ResNet_Training_without_Validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Detecting Breast Cancer Using Image Classification without Validation



In [0]:
import os
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

import torchvision
from torchvision import transforms, datasets, models

from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

import matplotlib.pyplot as plt
import time
import copy
from tqdm.notebook import tqdm

print("PyTorch Version: ", torch.__version__)
print("Torchvision Version: ", torchvision.__version__)

PyTorch Version:  1.4.0
Torchvision Version:  0.5.0


In [0]:
# import data folder
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
PROJ_ROOT = "/content/drive/My Drive/Dataset(reshaped)/200X"

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Mounted at /content/drive


## Load Data

Adapted from Github ML project https://gist.github.com/kevinzakka/d33bf8d6c7f06a9d8c76d97a7879f5cb

In [0]:
def get_train_test_loader(data_dir, batch_size=32, test_percent = 0.1,
                          shuffle=True, random_seed = 432,
                          num_workers=2):
  """
  Loads and returns train and test DataLoaders (object of class
  torch.utils.data.DataLoader)

  Params
  ------
  - data_dir: path directory to the dataset
  - batch_size: how many samples per batch to load
  - test_percent: percentage test-train split
  - shuffle: whether to shuffle dataset
  - random_seed: fixed seed for shuffle
  - num_workers: number of subprocesses to use when loading the dataset

  Returns
  -------
  - train_loader: training set iterator
  - test_loader: test set iterator

  Adapted from https://gist.github.com/kevinzakka/d33bf8d6c7f06a9d8c76d97a7879f5cb
  """

  assert ((test_percent >= 0) and (test_percent <= 1)),"Error! test_size should be in range [0, 1]."
  
  normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                   std=[0.229, 0.224, 0.225])
  # data normalization and augmentation for training
  train_transforms = transforms.Compose([
                      transforms.RandomResizedCrop(224),
                      transforms.RandomHorizontalFlip(),
                      transforms.ToTensor(),
                      normalize])
  # normalization for test
  test_transforms = transforms.Compose([
                      transforms.ToTensor(),
                      normalize])
  
  print("Initializing datasets and dataloaders...")

  # load datasets
  train_dataset = datasets.ImageFolder(root=data_dir, transform=train_transforms)
  test_dataset = datasets.ImageFolder(root=data_dir, transform=test_transforms)

  n = len(train_dataset)
  indices = list(range(n))
  split = int(np.floor(test_percent * n))

  # shuffle input images
  if shuffle:
    np.random.seed(random_seed)
    np.random.shuffle(indices)
  
  # subset datasets
  train_idx, test_idx = indices[split:], indices[:split]
  train_sampler = SubsetRandomSampler(train_idx)
  test_sampler = SubsetRandomSampler(test_idx)

  # create dataloaders
  train_loader = DataLoader(train_dataset, batch_size=batch_size,
                            sampler=train_sampler, num_workers=num_workers)
  test_loader = DataLoader(test_dataset, batch_size=batch_size,
                           sampler=test_sampler, num_workers=num_workers)

  print("Total number of images:", n)
  print("Number of test images:", len(test_sampler))
  print("Number of train images:", len(train_sampler))
  
  return train_loader, test_loader

In [0]:
# load datasets and dataloaders
train_loader, test_loader = get_train_test_loader(PROJ_ROOT)

## ResNet Training Function

In [0]:
def train_model(model, dataloader, criterion, optimizer, num_epochs):
  """
  Trains a DL model for a number of epochs. Does not return anything

  Params
  ------
  - model: deep learning neural network
  - dataloader: training set iterator of PyTorch object class DataLoader
  - criterion: loss function
  - optimizer: 
  - num_epochs: number of epochs
  """
  begin = time.time()

  #best_model_wts = copy.deepcopy(model.state.dict())
  #best_acc = 0.0

  pbar_outer = tqdm(range(num_epochs), leave=True)
  for epoch in pbar_outer:
    pbar_outer.set_description("Epoch {}/{}".format(epoch, num_epochs-1))
    print('-' * 10)

    # set model to training mode
    model.train()
    running_loss = 0.0
    running_corrects = 0

    # Iterate over data
    pbar_inner = tqdm(enumerate(dataloader), leave=False)
    for batch_idx, (inputs, labels) in pbar_inner:
      pbar_inner.set_description("Iterating over batch {}/{}".format(batch_idx, len(dataloader)))
      inputs = inputs.to(device)
      labels = labels.to(device)

      # zero the parameter gradients
      optimizer.zero_grad()

      # forward
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      _, preds = torch.max(outputs, 1)

      # backward propand optimize
      loss.backward()
      optimizer.step()

      # running counts
      running_loss += loss.item() * inputs.size(0)
      running_corrects += torch.sum(preds == labels.data)
  
    # average loss over entire dataset length rather than batch size to reduce
    # bias (ie. dataset length not evenly divisible by batch size)
    epoch_loss = running_loss / len(dataloader.dataset)
    epoch_acc = running_corrects.double() / len(dataloader.dataset)
    print('Training Loss: {:.4f} | Accuracy: {:.4f}\n'.format(epoch_loss, epoch_acc))

  time_elapsed = time.time() - begin
  print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60,
                                                      time_elapsed % 60))
  
  return






## Initialize and Reshape ResNet152
Adapted from PyTorch Tutorial on "Finetuning Torchvision Models" by Nathan Inkawhich:
https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html

In [0]:
def initialize_model(num_classes, use_pretrained=False, feature_extracting=False):
  """
  Initializes PyTorch ResNet152 model.

  Params
  ------
  - num_classes: number of categories in image dataset
  - use_pretrained: whether to use model pretrained on ImageNet
  - feature_extracting: whether finetuning model or feature extracting

  Returns
  -------
  - model_ft: reshaped ResNet152 model
  """

  model_ft = models.resnet152(pretrained=use_pretrained)
  set_parameters_requires_grad(model_ft, feature_extracting)
  num_ftrs = model_ft.fc.in_features
  model_ft.fc = nn.Linear(num_ftrs, num_classes)

  return model_ft


def set_parameters_requires_grad(model, feature_extracting):
  """
  Sets model to only compute gradients for newly initialized layer.
  """
  if feature_extracting:
    for param in model.parameters():
      param.requires_grad = False



## Run Training
1. Load data
2. Initialize model
3. Define optimizer and loss function
4. Run training
5. Evaluate

### Define Hyperparameters

In [0]:
num_classes = 1
num_epochs = 15
learning_rate = 0.001
momentum = 0.9

criterion = nn.CrossEntropyLoss()

### Not-Pretrained ResNet

In [0]:
# initialize ResNet model
# untrained_model = initialize_model(num_classes)
untrained_model = models.resnet152(pretrained=False).cuda()
untrained_model = untrained_model.to(device).cuda()
# print(untrained_model)

# create optimizer and loss function
untrained_optimizer = optim.SGD(untrained_model.parameters(), lr=learning_rate,
                                momentum=momentum)

# train model
train_model(untrained_model, train_loader, criterion, untrained_optimizer, num_epochs)

### Pretrained Finetuning ResNet

In [0]:
# initialize ResNet finetuning model
pretrained_model_ft = initialize_model(num_classes, use_pretrained=True).cuda()
pretrained_model_ft = pretrained_model_ft.to(device).cuda()
# print(pretrained_model_ft)

# create optimizer and loss function
pretrained_optimizer_ft = optim.SGD(pretrained_model_ft.parameters(), lr=learning_rate,
                                    momentum=momentum)

# train model
train_model(pretrained_model_ft, train_loader, criterion, pretrained_optimizer_ft, num_epochs)

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))

----------


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training Loss: 1.3556 | Accuracy: 0.4460

----------


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training Loss: 0.8765 | Accuracy: 0.5970

----------


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training Loss: 0.6210 | Accuracy: 0.6861

----------


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training Loss: 0.4950 | Accuracy: 0.7238

----------


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training Loss: 0.3924 | Accuracy: 0.7629

----------


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training Loss: 0.3351 | Accuracy: 0.7787

----------


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training Loss: 0.2816 | Accuracy: 0.7946

----------


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training Loss: 0.2602 | Accuracy: 0.8084

----------


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training Loss: 0.2347 | Accuracy: 0.8153

----------


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training Loss: 0.2326 | Accuracy: 0.8173

----------


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training Loss: 0.1866 | Accuracy: 0.8322

----------


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training Loss: 0.1732 | Accuracy: 0.8332

----------


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training Loss: 0.1705 | Accuracy: 0.8332

----------


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training Loss: 0.1430 | Accuracy: 0.8475

----------


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training Loss: 0.1443 | Accuracy: 0.8465


Training complete in 10m 47s


### Pretrained Feature Extraction ResNet

In [0]:
# initialize ResNet feature extraction model
pretrained_model_fext = initialize_model(num_classes, use_pretrained=True, feature_extracting=True)
pretrained_model_fext = pretrained_model_fext.to(device)
# print(pretrained_model_fext)

# create optimizer
params_to_update = []
for name, param in pretrained_model_fext.named_parameters():
  if param.requires_grad == True:
    params_to_update.append(param)
    # print("\t", name)
pretrained_optimizer_fext = optim.SGD(params_to_update, lr=learning_rate,
                                      momentum=momentum)

# train model
best_pretrained_fext, pretrained_fext_acc_hist = train_model(pretrained_model_fext, train_loader,
                                                             criterion, pretrained_optimizer_fext, num_epochs)