In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F # stateless functions
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import sampler

import os
import pandas as pd
from skimage import io, transform
import numpy as np

import matplotlib.pyplot as plt

#import torchvision.datasets as dset
import torchvision.transforms as T

In [2]:
# ---- Data location and split ----
image_dir = '../data/train_images'   # directory the dataset reads the .tiff images from
NUM_TRAIN = 6800 #8492 (presumably the full dataset size -- TODO confirm)

# ---- Reproducibility ----
# Seed the global NumPy and PyTorch RNGs so that weight initialisation and the
# order produced by the random samplers repeat after Restart Kernel -> Run All.
SEED = 1
np.random.seed(SEED)
torch.manual_seed(SEED)

# ---- Runtime options ----
USE_GPU = False
dtype = torch.float32   # use float throughout the training
print_every = 10        # report loss / validation accuracy every `print_every` iterations

# Fall back to the CPU whenever the GPU is not requested or not available.
if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print('using device:', device)

using device: cpu


In [3]:
# Customized Dataset
class ProstateCancerDataset(Dataset):
    """Prostate cancer biopsy dataset.

    Every item is a dict with the keys ``'image'``, ``'isup_grade'`` and
    ``'gleason_score'``; an optional transform is applied to that dict.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        """Load the label table and remember where the images are stored.

        Args:
            csv_file (string): Path to the csv file
            root_dir (string): Path to the directory with all images
            transform (callable, optional): Optional transform applied to each sample dict
        """
        self.transform = transform
        self.root_dir = root_dir
        # The rows are shuffled once, here, with a fixed seed so the order is
        # the same on every run. (Alternative: keep the file order and let the
        # DataLoader do the shuffling.)
        label_table = pd.read_csv(csv_file)
        self.cancer_df = label_table.sample(frac=1, random_state=1)

    def __len__(self):
        """Number of rows in the label table."""
        return len(self.cancer_df.index)

    def __getitem__(self, idx):
        """Build the sample stored at positional row ``idx``.

        Columns are read by position: column 0 is used as the image file stem,
        column 2 is returned as ``'isup_grade'`` and column 3 as
        ``'gleason_score'``.
        """
        table = self.cancer_df
        tiff_path = os.path.join(self.root_dir, '{}.tiff'.format(table.iloc[idx, 0]))
        # MultiImage exposes the frames stored in the file; only the last one
        # is used (presumably the lowest-resolution level, i.e. the recommended
        # downsample rate of 16, which speeds up resizing -- TODO confirm).
        # conserve_memory=True is the default; turning it off may improve performance.
        frames = io.MultiImage(tiff_path)
        isup, gleason = (table.iloc[idx, column] for column in (2, 3))
        sample = dict(image=frames[-1], isup_grade=isup, gleason_score=gleason)

        return self.transform(sample) if self.transform else sample

In [4]:
# Customize Transforms
class Rescale(object):
    """Resize the image of a sample to a fixed size; labels pass through unchanged.

    Args:
        output_size (tuple): Desired output size, handed to ``transform.resize``
            as the target shape.
    """

    def __init__(self, output_size):
        # Only tuples are accepted; any other type trips the assertion.
        assert isinstance(output_size, tuple)
        self.output_size = output_size

    def __call__(self, sample):
        """Return a new sample dict whose ``'image'`` has been resized."""
        rescaled = {'image': transform.resize(sample['image'], self.output_size)}
        # Copy the two label entries over untouched.
        for label_key in ('isup_grade', 'gleason_score'):
            rescaled[label_key] = sample[label_key]
        return rescaled
    

class ToTensor(object):
    """Convert the image ndarray of a sample to a channel-first torch Tensor.

    The label entries are passed through unchanged.
    """

    def __call__(self, sample):
        """Return a new sample dict whose ``'image'`` is a ``[C,H,W]`` Tensor.

        Args:
            sample (dict): Must hold ``'image'`` (a 3-D ndarray with the color
                axis last), ``'isup_grade'`` and ``'gleason_score'``.

        Returns:
            dict: Same three keys; ``'image'`` is a contiguous ``torch.Tensor``
            that keeps the dtype of the input array.
        """
        img = sample['image']
        # Swap color axis to [C,H,W]. transpose() only returns a strided view,
        # so materialise it contiguously before wrapping it in a Tensor.
        chw = np.ascontiguousarray(img.transpose(2, 0, 1))
        return {'image': torch.from_numpy(chw),
                'isup_grade': sample['isup_grade'],
                'gleason_score': sample['gleason_score']}

# Data Preparation

In [5]:
# Compose the Rescale and ToTensor transforms into one preprocessing pipeline.
# More transforms to try: crop, normalize, etc.
# Transforms are also a useful tool for data augmentation.
IMAGE_SIZE = (512, 512)   # target size every image is rescaled to
NUM_VAL = 100             # small validation split for debugging (8492 is presumably the full size)
BATCH_SIZE_TRAIN = 8
BATCH_SIZE_VAL = 4
NUM_WORKERS = 4           # worker processes per DataLoader

biopsy_train = ProstateCancerDataset(csv_file='train_512.csv',
                                     root_dir=image_dir,
                                     transform=T.Compose([
                                         Rescale(IMAGE_SIZE),
                                         ToTensor()
                                     ]))

# Fail fast: both samplers draw positional indices from the same dataset, and
# an index past its end would otherwise only surface as an IndexError inside a
# worker process in the middle of a run.
assert NUM_TRAIN + NUM_VAL <= len(biopsy_train), (
    'dataset has {:d} rows but the train/val split needs {:d}'.format(
        len(biopsy_train), NUM_TRAIN + NUM_VAL))

# Training draws from the first NUM_TRAIN rows, validation from the NUM_VAL
# rows right after them, so the two splits never overlap.
loader_train = DataLoader(biopsy_train, batch_size=BATCH_SIZE_TRAIN, num_workers=NUM_WORKERS,
                          sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN)))

loader_val = DataLoader(biopsy_train, batch_size=BATCH_SIZE_VAL, num_workers=NUM_WORKERS,
                        sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN, NUM_TRAIN + NUM_VAL)))

# Two-Layer Network

In [6]:
def check_accuracy(loader, model):
    """Print the classification accuracy of ``model`` over ``loader``.

    Inputs:
    - loader: Iterable of batch dicts holding an 'image' and an 'isup_grade' entry.
    - model: A PyTorch Module mapping an image batch to per-class scores;
      the prediction is the index of the highest score along dim 1.

    Output: Prints 'Got <correct>/<total> correct <percent>'. An empty loader
    is reported as 0/0 with 0.00 accuracy instead of dividing by zero.

    The model is evaluated in eval mode and afterwards put back into whatever
    mode (train/eval) it was in when this function was called.
    """
    num_correct = 0
    num_samples = 0
    was_training = model.training
    model.eval() # Set model to evaluation mode
    try:
        with torch.no_grad():
            for batch in loader:
                x = batch['image'].to(device=device, dtype=dtype)
                y = batch['isup_grade'].to(device=device, dtype=torch.long)
                scores = model(x)
                _, preds = scores.max(1)
                # Accumulate plain Python ints so the '{:d}' format below
                # never has to format a tensor.
                num_correct += int((preds == y).sum())
                num_samples += preds.size(0)
    finally:
        # Restore the caller's mode even if evaluation fails, so a training
        # loop that calls this does not silently continue in eval mode.
        model.train(was_training)
    acc = float(num_correct) / num_samples if num_samples else 0.0
    print('Got {:d}/{:d} correct {:.2f}'.format(num_correct, num_samples, acc*100))

def flatten(x):
    """Collapse every dimension after the first into a single one.

    Inputs:
    - x: Tensor whose first dimension is the batch size N, e.g. (N, C, H, W).

    Output: Tensor of shape (N, -1) holding the same elements in the same order.
    """
    N = x.shape[0] # read in N, C, H, W
    # reshape() returns a view when the memory layout allows it (exactly what
    # view() did) and copies otherwise, so non-contiguous inputs work as well.
    return x.reshape(N, -1)

def train_sequential(model, optimizer, epochs=1):
    """
    Train a model using PyTorch Sequential API.

    Batches come from the module-level `loader_train`; every `print_every`
    iterations the current loss is printed and the accuracy on the
    module-level `loader_val` is reported through `check_accuracy`.

    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model.
    - epochs: Number of full passes over `loader_train`.

    Output: Print model accuracies.
    """
    model = model.to(device=device)
    for e in range(epochs):
        for t, batch in enumerate(loader_train):
            # Evaluation puts the model in eval mode; re-enable training mode
            # before every step so training never depends on the evaluator
            # restoring it.
            model.train()

            x = batch['image'].to(device=device, dtype=dtype)
            y = batch['isup_grade'].to(device=device, dtype=torch.long)

            scores = model(x)
            loss = F.cross_entropy(scores, y)

            # Zero out all of the gradients for the variables which the optimizer
            # will update.
            optimizer.zero_grad()

            # Backward pass: compute the gradient of the loss with respect to
            # each parameter of the model.
            loss.backward()

            # Update the parameters of the model using the gradients computed by
            # the backward pass.
            optimizer.step()

            if t % print_every == 0:
                print('Iteration {:d}, loss = {:.4f}'.format(t, loss.item()))
                check_accuracy(loader_val, model)

In [None]:
class Flatten(nn.Module):
    """Module wrapper around `flatten` so it can be used as a layer in nn.Sequential."""

    def forward(self, x):
        """Return `x` with everything after the batch dimension flattened."""
        flat = flatten(x)
        return flat
    
# Hyper-parameters of the two-layer fully connected baseline.
hidden_layer_size = 50
learning_rate = 1e-3

# Flatten -> Linear -> ReLU -> Linear.
# The first layer expects 3*512*512 inputs per image (three channels at the
# 512x512 rescale size, assuming RGB input); the last one emits 6 class scores.
model = nn.Sequential(
    Flatten(),
    nn.Linear(3 * 512 * 512, hidden_layer_size),
    nn.ReLU(),
    nn.Linear(hidden_layer_size, 6),
)

# Plain SGD with Nesterov momentum.
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, nesterov=True)

# Two passes over the training loader.
train_sequential(model, optimizer, epochs=2)

Iteration 0, loss = 1.8226
Got 32/100 correct 32.00
Iteration 10, loss = 1.7507
Got 13/100 correct 13.00
Iteration 20, loss = 1.7715
Got 13/100 correct 13.00
Iteration 30, loss = 1.7680
Got 13/100 correct 13.00
Iteration 40, loss = 1.8237
Got 13/100 correct 13.00
Iteration 50, loss = 1.8036
Got 13/100 correct 13.00
Iteration 60, loss = 1.7774
Got 13/100 correct 13.00
Iteration 70, loss = 1.7950
Got 13/100 correct 13.00
Iteration 80, loss = 1.7736
Got 13/100 correct 13.00
Iteration 90, loss = 1.7442
Got 32/100 correct 32.00
Iteration 100, loss = 1.8084
Got 32/100 correct 32.00
Iteration 110, loss = 1.7659
Got 32/100 correct 32.00
Iteration 120, loss = 1.7239
Got 32/100 correct 32.00
Iteration 130, loss = 1.7792
Got 32/100 correct 32.00
Iteration 140, loss = 1.7925
Got 32/100 correct 32.00
Iteration 150, loss = 1.7965
Got 32/100 correct 32.00
Iteration 160, loss = 1.6866
Got 32/100 correct 32.00
Iteration 170, loss = 1.7905
Got 32/100 correct 32.00
Iteration 180, loss = 1.6924
Got 32/100