In [1]:
# Sheldon Gu
# Feb 17, 2019
# copied from Kaggle kernel v13, with these modifications:
#   - changed the file paths
#   - shrank the network size
#   - save the model to a local folder


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from PIL import Image
import os, os.path
import matplotlib.pyplot as plt


from skimage import io, transform
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, models
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

#pytorch dataset class
class Dataset(Dataset):
    """kaggle histo dataset: image patches looked up through a table of ids.

    Args:
        dataframe: table read positionally -- column 0 holds the image id
            (the file name without the ``.tif`` extension) and column 1
            holds the label.
        root_dir (string): Directory with all the images.
    """

    def __init__(self, dataframe, root_dir):
        self.root_dir = root_dir
        self.id = dataframe

    def __len__(self):
        # one sample per row of the id/label table
        return len(self.id)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx`` of the table."""
        file_name = self.id.iloc[idx, 0] + '.tif'
        patch = io.imread(os.path.join(self.root_dir, file_name))
        # keep rows/columns 16..79 only, i.e. a 64-pixel square window
        # (the centre of the patch, assuming 96 x 96 inputs -- TODO confirm);
        # resizing to 224 x 224 for a pretrained model is intentionally not done
        lo, hi = 16, 80
        patch = patch[lo:hi, lo:hi]
        target = self.id.iloc[idx, 1]
        return patch, target
    

# a simple cnn with three convolutional blocks and a three-layer fully connected head
class CNN(nn.Module):
    """Small binary classifier: three conv stages followed by a dense head.

    Each conv stage is Conv2d(3x3, stride 1, padding 1) -> BatchNorm2d ->
    ReLU -> MaxPool2d. The pools shrink the spatial size by 4, 2 and 2, and
    the head expects 4*4*256 flattened features, so inputs must be
    ``(N, 3, 64, 64)``. The output is ``(N, 1)`` and has passed through a
    sigmoid.
    """

    def __init__(self):
        super(CNN, self).__init__()
        # stages are created in order conv1, conv2, conv3, fc so that layer
        # names and random initialisation order stay the same
        self.conv1 = self._conv_stage(3, 64, 4)
        self.conv2 = self._conv_stage(64, 128, 2)
        self.conv3 = self._conv_stage(128, 256, 2)

        head = [
            nn.Linear(4*4*256, 4096),
            nn.BatchNorm1d(4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(1024, 1),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*head)

    @staticmethod
    def _conv_stage(n_in, n_out, pool):
        """Build one conv -> batch-norm -> ReLU -> max-pool stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels=n_in, out_channels=n_out,
                      kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(n_out),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=pool, stride=pool),
        )

    def forward(self, x):
        for stage in (self.conv1, self.conv2, self.conv3):
            x = stage(x)
        # flatten everything except the batch dimension
        flat = x.view(x.size(0), -1)
        return self.fc(flat)

#train the model using train data, leave about 5% for validation
def train_model(model, parameters, train_rate, epoch_num, batch_size):
    """Train ``model`` and save a checkpoint after every epoch.

    The labels in ``data/train_labels.csv`` are split by position: the first
    95% of the rows are used for training, the remaining rows for validation
    (the file is not shuffled before splitting). After each epoch the
    validation AUC is printed and the whole model object is written to
    ``model/cnn_epoch_<n>.pkl``.

    Args:
        model: network to train; it is moved to the GPU when one is
            available, otherwise it stays on the CPU.
        parameters: iterable of parameters handed to the SGD optimizer
            (the notebook passes ``model.parameters()``).
        train_rate: learning rate for SGD (momentum is fixed at 0.9).
        epoch_num: number of passes over the training split.
        batch_size: mini-batch size used for both data loaders.

    Returns:
        None.
    """
    #generate dataset
    #read train labels from file
    data = pd.read_csv("data/train_labels.csv")
    #split for train and validation
    split_at = int(len(data)*0.95)
    train_df = data.iloc[:split_at, :]
    val_df = data.iloc[split_at:, :]
    #convert to dataset
    train_dataset = Dataset(train_df, "data/train")
    val_dataset = Dataset(val_df, "data/train")
    #load data using DataLoader
    train_data = DataLoader(dataset=train_dataset, batch_size=batch_size,
                            shuffle=True, num_workers=0)
    val_data = DataLoader(dataset=val_dataset, batch_size=batch_size,
                          shuffle=False, num_workers=0)

    #make sure the checkpoint folder exists up front, so a missing directory
    #cannot make torch.save fail after a whole epoch of training
    os.makedirs("model", exist_ok=True)

    #setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #link model to device
    model.to(device)
    #define loss function (the model already ends in a sigmoid)
    criterion = nn.BCELoss()
    #define optimizer
    optimizer = optim.SGD(parameters, lr=train_rate, momentum=0.9)

    for epoch in range(epoch_num):
        model.train()
        #force gradients on even if the caller is inside a no_grad block
        with torch.set_grad_enabled(True):
            #wrap the loader itself so tqdm can read its length and show a total
            for samples, labels in tqdm(train_data):
                #batches arrive channels-last; permute(0, 3, 2, 1) moves the
                #channels to dim 1 and also swaps the two spatial axes.
                #predict_prob uses the same permutation, and existing
                #checkpoints were trained with it, so it is kept as is
                inputs = samples.permute(0, 3, 2, 1).float().to(device)
                targets = labels.view(labels.size(0), 1).float().to(device)

                #call the module (not .forward) so that hooks are honoured
                y_pred = model(inputs)
                loss = criterion(y_pred, targets)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        #evaluate the performance on validation set
        pred_values = []
        true_values = []
        #set model in eval mode and shut down grad to save memory
        model.eval()
        with torch.no_grad():
            for samples, labels in val_data:
                inputs = samples.permute(0, 3, 2, 1).float().to(device)
                #model output is (batch, 1); flatten to one score per sample
                pred_values.extend(model(inputs).cpu().numpy().ravel())
                true_values.extend(labels.numpy())
        fpr, tpr, thres = roc_curve(np.array(true_values), np.array(pred_values))
        auc_score = auc(fpr, tpr)
        print("epoch: {}".format(epoch+1))
        print("auc: {}".format(auc_score))

        #save the model (the full pickled module, not only its state_dict)
        torch.save(model, "model/cnn_epoch_" + str(epoch + 1) + ".pkl")

#prediction and save to file
def predict_prob(model):
    """Score every test image and write the submission file.

    Reads the ids from ``data/sample_submission.csv``, loads the matching
    images from ``data/test``, stores the model output for each image in the
    ``label`` column, prints the first rows and writes the table to
    ``sample_submission.csv`` in the current working directory.

    Args:
        model: trained network; it is moved to the GPU when one is
            available (otherwise the CPU) and switched to eval mode.

    Returns:
        None.
    """
    df = pd.read_csv("data/sample_submission.csv")
    dataset = Dataset(df, "data/test")
    data = DataLoader(dataset=dataset, batch_size=32,
                      shuffle=False, num_workers=0)
    #setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #the inputs are sent to `device`, so the model has to live there too;
    #without this a CPU-resident model fails on a CUDA machine
    model.to(device)

    #collect one predicted probability per test image
    pred_values = []
    #set model in eval mode and shut down grad to save memory
    model.eval()
    with torch.no_grad():
        #wrap the loader itself so tqdm can read its length and show a total;
        #the second item of each batch is the placeholder label column
        for samples, _ in tqdm(data):
            #same axis permutation as used during training
            inputs = samples.permute(0, 3, 2, 1).float().to(device)
            #model output is (batch, 1); flatten to one score per sample
            pred_values.extend(model(inputs).cpu().numpy().ravel())
    #item assignment always targets the column, unlike attribute assignment
    df["label"] = pred_values
    print(df.head())
    df.to_csv("sample_submission.csv", index=False)



In [2]:
# model = CNN() 
# train_model(model, model.parameters(), 0.001, 30, 32)
# the best checkpoint (epoch 15, validation AUC 0.96) was saved and then fine-tuned further

In [4]:
# Load the stored checkpoint and write the submission file.
# NOTE(security): torch.load unpickles arbitrary objects -- only load files you trust.
# NOTE: recent PyTorch releases default to weights_only=True, which rejects a
# fully pickled module like this one; pass weights_only=False there.
# map_location lets a checkpoint that was saved on a GPU be loaded on a
# CPU-only machine instead of raising at load time.
model = torch.load(
    "model/cnn.pkl",
    map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)
# fine-tuning runs that were applied to the stored model (lower learning rates):
#train_model(model, model.parameters(), 0.0001, 30, 32)
#train_model(model, model.parameters(), 0.00001, 30, 32)
predict_prob(model)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

                                         id     label
0  0b2ea2a822ad23fdb1b5dd26653da899fbd2c0d5  0.159315
1  95596b92e5066c5c52466c90b69ff089b39f2737  0.627113
2  248e6738860e2ebcf6258cdc1f32f299e0c76914  0.000048
3  2c35657e312966e9294eac6841726ff3a748febf  0.041215
4  145782eb7caa1c516acbe2eda34d9a3f31c41fd6  0.000030
