In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
import os, os.path

%matplotlib inline

### Part 1: Visualize data

In [2]:
# #load images from sample folder
# imgs = []
# filenames = []
# path = "data/sample"
# valid_images = [".tif", ".jpg",".gif",".png",".tga"]
# for f in os.listdir(path):
#     fn = os.path.splitext(f)[0]
#     ext = os.path.splitext(f)[1]
#     if ext.lower() not in valid_images:
#         continue
#     imgs.append(Image.open(os.path.join(path,f)))
#     filenames.append(fn)
# print("length of imgs: {}".format(len(imgs)))
# #print(filenames)

# #get labels for these images
# labels = pd.read_csv("data/train_labels.csv")
# filelabels = []
# for fn in filenames:
#     filelabels.append(labels.loc[labels.id == fn, 'label'].values[0])
# #print(filelabels)


# #visualize loaded images
# fig, axes = plt.subplots(4, 4, figsize=(10, 12))
# for image, label, ax in zip(imgs, filelabels, axes.ravel()):
#     ax.imshow(image)
#     ax.set_title("label: {}".format(label))

### Part 2: train and test split, balance train data

In [36]:
# #train and test split
# from sklearn.model_selection import train_test_split
# data = pd.read_csv("data/train_labels.csv")
# print("data shape: {}".format(data.shape))
# print("positive sample number: {}".format(sum(data.label)))
# train_x, test_x, train_y, test_y = train_test_split(data.loc[:, 'id'], data.loc[:, 'label'], 
#                                                     test_size=0.05, random_state=16)
# train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, 
#                                                       test_size=0.05, random_state=16)
# train_data = pd.DataFrame({'id': train_x, 'label': train_y})
# valid_data = pd.DataFrame({'id': valid_x, 'label': valid_y})
# test_data = pd.DataFrame({'id': test_x, 'label': test_y})
# print("samples in train: {}".format(train_data.shape[0]))
# print("samples in validation: {}".format(valid_data.shape[0]))
# print("samples in test: {}".format(test_data.shape[0]))
# train_data.to_csv("data/train.csv")
# valid_data.to_csv("data/valid.csv")
# test_data.to_csv("data/test.csv")

data shape: (220025, 2)
positive sample number: 89117
samples in train: 198571
samples in validation: 10452
samples in test: 11002


In [4]:
# train = pd.read_csv("data/test.csv")
# train.loc[:, 'label'].value_counts()

### Part 3: load images as PyTorch Dataset

In [5]:
from skimage import io, transform
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, models
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import roc_curve, auc

In [6]:
class HistoDataset(Dataset):
    """Kaggle histo dataset: image ids and labels listed in a csv file.

    Images are read lazily, one per ``__getitem__`` call, from
    ``<root_dir>/<id>.tif``.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with id and label.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on the image of a sample. Defaults to None (image returned
                exactly as read).
        """
        self.id = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

        # Resolve the columns by name when possible so the dataset works both
        # for csv files saved with an extra leading index column and for files
        # that hold only 'id' and 'label'. Files without those headers keep
        # the historical positional layout (column 1 = id, column 2 = label).
        columns = list(self.id.columns)
        if 'id' in columns and 'label' in columns:
            self._id_col = columns.index('id')
            self._label_col = columns.index('label')
        else:
            self._id_col, self._label_col = 1, 2

    def __len__(self):
        """Return the number of rows (samples) in the csv file."""
        return len(self.id)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``."""
        img_name = os.path.join(self.root_dir,
                                self.id.iloc[idx, self._id_col] + '.tif')
        image = io.imread(img_name)
        label = self.id.iloc[idx, self._label_col]

        if self.transform is not None:
            image = self.transform(image)

        return image, label
    

In [41]:
def train_model(model, model_param, epoch_num, batch_size):
    """Train ``model`` and report the validation AUC after every epoch.

    Training samples come from "data/train.csv" and validation samples from
    "data/valid.csv"; both read their images from "data/train". After each
    epoch the whole model object is saved to "model/cnn_epoch_<n>.pkl", with
    ``n`` starting at 1.

    Args:
        model: network to train; its output is fed to ``nn.BCELoss`` together
            with the labels reshaped to ``(batch, 1)``.
        model_param: iterable of parameters handed to the SGD optimizer.
        epoch_num (int): number of passes over the training set.
        batch_size (int): batch size used by both data loaders.
    """
    #generate dataset
    train_dataset = HistoDataset("data/train.csv", "data/train")
    val_dataset = HistoDataset("data/valid.csv", "data/train")
    #load data
    train_data = DataLoader(dataset=train_dataset, batch_size=batch_size,
                            shuffle=True, num_workers=2)
    # AUC does not depend on sample order, so validation needs no shuffling
    val_data = DataLoader(dataset=val_dataset, batch_size=batch_size,
                          shuffle=False, num_workers=2)

    #setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #link model to device
    model.to(device)
    #define loss function
    criterion = nn.BCELoss()
    #define optimizer
    optimizer = optim.SGD(model_param, lr=0.001, momentum=0.9)

    # torch.save does not create missing directories
    os.makedirs("model", exist_ok=True)

    for epoch in range(epoch_num):
        model.train()
        # Wrap the loader itself (not enumerate(...)) so tqdm can read its
        # length and show real progress instead of a bar without a total.
        for samples, labels in tqdm(train_data):
            # NOTE(review): if the loader yields channel-last batches
            # (N, 96, 96, 3), view() reinterprets memory instead of moving the
            # channel axis; permute(0, 3, 1, 2) would be the layout-preserving
            # conversion. Left unchanged so existing checkpoints stay
            # compatible -- confirm before changing.
            inputs = samples.view((samples.size(0), 3, 96, 96)).float().to(device)
            targets = labels.view(labels.size(0), 1).float().to(device)

            # Call the module rather than .forward() so registered hooks run
            y_pred = model(inputs)
            loss = criterion(y_pred, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        #evaluate the performance on validation set
        pred_values = []
        true_values = []
        #set model in eval mode and shut down grad to save memory
        model.eval()
        with torch.no_grad():
            for samples, labels in val_data:
                inputs = samples.view((samples.size(0), 3, 96, 96)).float().to(device)
                # model output is (batch, 1): keep the single score per sample
                pred_values.extend(model(inputs).cpu().numpy()[:, 0].tolist())
                true_values.extend(labels.numpy().tolist())

        fpr, tpr, thres = roc_curve(np.array(true_values), np.array(pred_values))
        auc_score = auc(fpr, tpr)
        print("epoch: {}".format(epoch))
        print("auc: {}".format(auc_score))

        #save the model
        torch.save(model, "model/cnn_epoch_" + str(epoch + 1) + ".pkl")




In [8]:
class CNN(nn.Module):
    """Three conv stages followed by a fully connected head with one sigmoid output.

    Each stage is Conv2d(3x3, padding 1) -> BatchNorm2d -> ReLU -> MaxPool2d.
    The first linear layer expects 3*3*256 features, i.e. a 3x3 map with 256
    channels coming out of the last stage.
    """

    @staticmethod
    def _stage(channels_in, channels_out, pool):
        """Build one conv/batch-norm/ReLU/max-pool stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels=channels_in, out_channels=channels_out,
                      kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(channels_out),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=pool, stride=pool),
        )

    def __init__(self):
        super().__init__()

        # Feature extractor: channels 3 -> 64 -> 128 -> 256
        self.conv1 = self._stage(3, 64, pool=4)
        self.conv2 = self._stage(64, 128, pool=4)
        self.conv3 = self._stage(128, 256, pool=2)

        # Classifier head
        flat_features = 3 * 3 * 256
        hidden = 1024
        self.fc = nn.Sequential(
            nn.Linear(flat_features, hidden),
            nn.BatchNorm1d(hidden),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(hidden, hidden),
            nn.BatchNorm1d(hidden),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(hidden, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Run the conv stages, flatten per sample, and apply the head."""
        features = self.conv3(self.conv2(self.conv1(x)))
        flattened = features.view(features.size(0), -1)
        return self.fc(flattened)



In [42]:
# torch.load unpickles a complete model object, which can execute arbitrary
# code -- only load checkpoints that come from a trusted source.
# map_location lets a checkpoint saved from a GPU load on a CPU-only machine.
model = torch.load(
    "model/cnn.pkl",
    map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
train_model(model, model.parameters(), 10, 32)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 
auc: 0.924113682175554


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


epoch: 
auc: 0.8895023863703181


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

KeyboardInterrupt: 

In [43]:
#load model
# torch.load unpickles a complete model object, which can execute arbitrary
# code -- only load checkpoints that come from a trusted source.
# map_location places the model on the GPU when one is available and on the
# CPU otherwise, so a GPU-saved checkpoint still loads on a CPU-only machine.
model = torch.load(
    "model/cnn.pkl",
    map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
model

CNN(
  (conv1): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Sequential(
    (0): Linear(in_features=2304, out_features=1024, bias=True)
    (1): BatchNorm1d(1024, eps=1e-

In [39]:
def compute_auc(model, csv_file_name, image_folder, batch_size):
    """Compute, print and return the ROC AUC of ``model`` on a labelled csv split.

    Args:
        model: trained network; called on float batches reshaped to
            ``(batch, 3, 96, 96)``. Only the first output column is scored.
        csv_file_name (string): csv file listing the sample ids and labels.
        image_folder (string): directory holding the images named in the csv.
        batch_size (int): batch size used for inference.

    Returns:
        The area under the ROC curve. It is also printed as "auc: <value>".
    """
    dataset = HistoDataset(csv_file_name, image_folder)
    data = DataLoader(dataset=dataset, batch_size=batch_size,
                      shuffle=False, num_workers=2)
    #setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Inputs are moved to `device` below; the model has to live there as well,
    # otherwise a CPU-resident model fails on a machine with a GPU.
    model.to(device)

    #evaluate the performance on the given split
    pred_values = []
    true_values = []
    #set model in eval mode and shut down grad to save memory
    model.eval()
    with torch.no_grad():
        for samples, labels in data:
            # NOTE(review): if the loader yields channel-last batches
            # (N, 96, 96, 3), view() reinterprets memory instead of moving the
            # channel axis. Kept as-is to match how the model was trained --
            # confirm before changing.
            inputs = samples.view((samples.size(0), 3, 96, 96)).float().to(device)
            # Call the module rather than .forward() so registered hooks run
            pred_values.extend(model(inputs).cpu().numpy()[:, 0].tolist())
            true_values.extend(labels.numpy().tolist())

    fpr, tpr, thres = roc_curve(np.array(true_values), np.array(pred_values))
    auc_score = auc(fpr, tpr)
    print("auc: {}".format(auc_score))
    return auc_score
    
#     plt.plot(fpr, tpr, color='blue')
#     plt.xlabel("false positive rate", fontsize=14)
#     plt.ylabel("true positive rate", fontsize=14)
#     plt.title("roc", fontsize=18)

In [44]:
# Score the loaded model on the validation split, then on the held-out test split
for split_csv in ("data/valid.csv", "data/test.csv"):
    compute_auc(model, split_csv, "data/train", 32)

auc: 0.9345478704533579
auc: 0.9312275002112591
