In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import os
import numpy as np
from torch.utils import data
import torchvision
from torchvision import transforms, datasets,models
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
from torchsummary import summary
from torch.optim import lr_scheduler
import torch.optim as optim
from torch.autograd import Variable
from scipy import spatial
from sklearn.neighbors import BallTree
from scipy.stats import mode
import torch.nn.functional as F
import os, time
import sys
sys.path.append("/storage/groups/qscd01/projects/HistologyColourNorm/MDMMsrc")
%matplotlib inline

ModuleNotFoundError: No module named 'torchsummary'

In [4]:
from pytorch_metric_learning import losses, miners, samplers, trainers, testers
from pytorch_metric_learning.utils import common_functions
import pytorch_metric_learning.utils.logging_presets as logging_presets
from pytorch_metric_learning.utils.inference import MatchFinder, InferenceModel
from pytorch_metric_learning.utils.accuracy_calculator import AccuracyCalculator
import numpy as np
import torchvision
from torchvision import datasets, transforms
import torch
import torch.nn as nn
from PIL import Image
import logging
import matplotlib.pyplot as plt
import umap
from cycler import cycler
import record_keeper
import pytorch_metric_learning
# Let INFO-level records from the root logger through, then log the installed
# pytorch_metric_learning version. The version is passed as a logging argument
# so the message is only formatted when the record is actually emitted.
logging.getLogger().setLevel(logging.INFO)
logging.info("VERSION %s", pytorch_metric_learning.__version__)

INFO:root:VERSION 0.9.87.dev3


# Experiments
We'll go through learning feature embeddings using different loss functions on the TCGA-Colon dataset, for classification and hashing.

For every experiment the same embedding network is used and we don't do any hyperparameter search.

# Simple Model definition

In [5]:
class MLP(nn.Module):
    """Stack of ``nn.Linear`` layers built from a list of layer widths.

    ``layer_sizes[0]`` is the dimension of the input and ``layer_sizes[-1]``
    is the dimension of the output; one ``nn.Linear`` is created for every
    consecutive pair of sizes.

    Each ``nn.ReLU`` is inserted *in front of* a linear layer, i.e. it acts on
    that layer's input. With ``final_relu=False`` every linear layer except the
    last one is preceded by a ReLU; with ``final_relu=True`` the last linear
    layer is preceded by a ReLU as well. The network always ends with a
    linear layer.

    Args:
        layer_sizes: sequence of layer widths (each entry is cast to ``int``).
        final_relu: whether the last linear layer is also preceded by a ReLU.

    Attributes:
        net: the assembled ``nn.Sequential``.
        last_linear: the last module of ``net``.
    """

    def __init__(self, layer_sizes, final_relu=False):
        super().__init__()
        layer_list = []
        layer_sizes = [int(x) for x in layer_sizes]
        num_layers = len(layer_sizes) - 1
        final_relu_layer = num_layers if final_relu else num_layers - 1
        for i in range(num_layers):
            input_size = layer_sizes[i]
            curr_size = layer_sizes[i + 1]
            if i < final_relu_layer:
                # Not in-place: the first ReLU operates directly on the tensor
                # handed in by the caller. An in-place ReLU would overwrite that
                # tensor and fails outright on a leaf tensor that requires grad.
                layer_list.append(nn.ReLU(inplace=False))
            layer_list.append(nn.Linear(input_size, curr_size))
        self.net = nn.Sequential(*layer_list)
        self.last_linear = self.net[-1]

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.net(x)

# Initialize models, optimizers and image transforms

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set trunk model and replace its final fully connected layer (`fc`) with an
# identity function, so the trunk returns the features that `fc` would have received.
trunk = torchvision.models.resnet18(pretrained=True)
trunk_output_size = trunk.fc.in_features
trunk.fc = common_functions.Identity()
trunk = torch.nn.DataParallel(trunk.to(device))

# Set embedder model. This takes in the output of the trunk and outputs 64 dimensional embeddings
embedder = torch.nn.DataParallel(MLP([trunk_output_size, 64]).to(device))

# Set the classifier. The classifier takes the 64 dimensional embeddings and
# outputs a 9 dimensional vector (our training set consists of 9 classes).
# We'll specify the classification loss further down in the code.
# The module is moved to the device before wrapping, like trunk and embedder above.
classifier = torch.nn.DataParallel(MLP([64, 9]).to(device))

# Set optimizers (the trunk uses a 10x smaller learning rate than the two heads)
trunk_optimizer = torch.optim.Adam(trunk.parameters(), lr=0.00001, weight_decay=0.0001)
embedder_optimizer = torch.optim.Adam(embedder.parameters(), lr=0.0001, weight_decay=0.0001)
classifier_optimizer = torch.optim.Adam(classifier.parameters(), lr=0.0001, weight_decay=0.0001)

# Per-channel normalisation statistics (the usual ImageNet values, matching the pretrained trunk)
mean = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])

# Set the image transforms: training adds random horizontal/vertical flips,
# validation only resizes and normalises.
data_transforms = {
    'train': transforms.Compose([
        transforms.Resize(224),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ]),
    'val': transforms.Compose([
        transforms.Resize(224),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])}

def print_decision(is_match):
    """Print "Same class" when ``is_match`` is truthy, otherwise "Different class"."""
    verdict = "Same class" if is_match else "Different class"
    print(verdict)

In [14]:
from torchsummary import summary

# Print a layer summary for each network; every tuple is the input size
# handed to torchsummary for that network.
for network, input_size in (
    (trunk, (3, 224, 224)),
    (embedder, (1, 512)),
    (classifier, (1, 64)),
):
    summary(network, input_size)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]          36,864
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
       BasicBlock-11           [-1, 64, 56, 56]               0
           Conv2d-12           [-1, 64, 56, 56]          36,864
      BatchNorm2d-13           [-1, 64, 56, 56]             128
             ReLU-14           [-1, 64,

# Create the dataset and class-disjoint train/val splits

Split training, validation and test data

In [6]:
class Args:
    """Hard-coded option values for the MDMM model.

    An instance (``opts``) is passed to ``MD_multi`` and also supplies the
    checkpoint path through ``resume``. The meaning of the individual options
    is defined by the consuming MDMM code, not here.
    """

    # Filesystem locations
    dataroot = '/storage/groups/qscd01/projects/HistologyColourNorm/DRIT-master/datasets/Hist1/'
    result_dir = '/storage/groups/qscd01/projects/HistologyColourNorm/outputs'
    resume = '/storage/groups/qscd01/projects/HistologyColourNorm/MDMMsrc/results/Histology/00004.pth'
    display_dir = 'logs'

    # Run identification and mode
    name = 'Histology2'
    phase = 'test'
    gpu = 0
    nThreads = 4
    display_freq = 1

    # Image handling
    input_dim = 3
    resize_size = 1000
    crop_size = 512

    # Model / translation options
    a2b = 1
    concat = 1
    num = 2
    num_domains = 5
    isDcontent = False
    dis_norm = None
    dis_scale = 3
    dis_spectral_norm = False


opts = Args()

In [23]:
# Project-local modules; presumably resolved through the MDMMsrc directory
# appended to sys.path in the import cell — confirm if the import fails.
from options import TestOptions
from datasets import dataset_single
from model import MD_multi
from saver import save_imgs, save_concat_imgs

# Build the MDMM model from the hard-coded options and prepare it for inference.
print('\n--- load model ---')
model = MD_multi(opts)
# presumably restores the state stored at opts.resume; the exact effect of
# train=False is defined in model.py — TODO confirm
model.resume(opts.resume, train=False)
# Freeze every parameter: no gradients are needed for feature extraction.
for p in model.parameters():
    p.requires_grad = False
# Move the model to the GPU and switch it to evaluation mode.
model.cuda()
model.eval()


--- load model ---
<generator object Module.parameters at 0x7fb87df79350>


MD_multi(
  (dis1): MD_Dis(
    (model): Sequential(
      (0): LeakyReLUConv2d(
        (model): Sequential(
          (0): ReflectionPad2d((1, 1, 1, 1))
          (1): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2))
          (2): LeakyReLU(negative_slope=0.01, inplace=True)
        )
      )
      (1): LeakyReLUConv2d(
        (model): Sequential(
          (0): ReflectionPad2d((1, 1, 1, 1))
          (1): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2))
          (2): LeakyReLU(negative_slope=0.01, inplace=True)
        )
      )
      (2): LeakyReLUConv2d(
        (model): Sequential(
          (0): ReflectionPad2d((1, 1, 1, 1))
          (1): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2))
          (2): LeakyReLU(negative_slope=0.01, inplace=True)
        )
      )
      (3): LeakyReLUConv2d(
        (model): Sequential(
          (0): ReflectionPad2d((1, 1, 1, 1))
          (1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2))
          (2): LeakyReLU(negative_slope=0

In [31]:
import glob
# Preprocessing shared by the feature-extraction helpers: convert to a tensor,
# then normalise each of the three channels as (x - 0.5) / 0.5.
general_transforms = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]
)

class PandasDataset(torch.utils.data.Dataset):
    """Dataset over the ``image*.jpg`` files found directly inside one directory.

    Each item is the transformed image only; no label is returned.

    Attributes:
        name_frame: sorted list of the matching file names (without directory).
        root_dir: the directory the files were found in.
        transform: callable applied to every loaded image.
    """

    def __init__(self, root_dir, transform=None):
        """
        Args:
            root_dir (string): directory that is searched for ``image*.jpg`` files
            transform (callable, optional): applied to each loaded RGB PIL image;
                defaults to ``transforms.ToTensor()`` when omitted
        """
        image_paths = glob.glob(os.path.join(root_dir, 'image*.jpg'))
        # glob returns matches in arbitrary order; sort so that dataset indices
        # (and any file list derived from ``name_frame``) are reproducible.
        self.name_frame = sorted(os.path.basename(path) for path in image_paths)
        self.root_dir = root_dir
        self.transform = transform if transform is not None else transforms.ToTensor()

    def __len__(self):
        """Number of images found in ``root_dir``."""
        return len(self.name_frame)

    def __getitem__(self, idx):
        """Load image number ``idx`` as RGB and return it after ``transform``."""
        img_name = self.name_frame[idx]
        # The context manager closes the file handle as soon as the pixel data
        # has been read; convert() returns an image independent of that handle.
        with Image.open(os.path.join(self.root_dir, img_name)) as img_file:
            image = img_file.convert("RGB")
        return self.transform(image)
    
def load_image(path):
    """Load the image at ``path`` as RGB and apply ``general_transforms``.

    Args:
        path (str): location of the image file.

    Returns:
        The result of ``general_transforms`` applied to the RGB image.
    """
    # convert("RGB") keeps the first three channels of RGB/RGBA files like the
    # previous ``[:, :, :3]`` array slice did, but also handles greyscale and
    # palette images, for which that slice raised an IndexError. The context
    # manager closes the file handle once the pixel data has been read.
    with Image.open(path) as im_file:
        rgb_image = im_file.convert("RGB")
    return general_transforms(rgb_image)

def analyze_folder(folder):
    """Extract one 256-value feature vector per image in ``folder``, one image at a time.

    Every entry of ``folder`` whose name contains ``'.jpg'`` and does not
    contain ``'mask'`` is loaded with ``load_image`` and passed through
    ``content_code``.

    Args:
        folder (str): directory to scan (not recursive).

    Returns:
        tuple: ``(feature_dframe, feature_arr)``. ``feature_dframe`` is a
        DataFrame with a single ``name`` column; ``feature_arr`` is a
        ``(n_images, 256)`` array whose row ``i`` belongs to the file in row
        ``i`` of ``feature_dframe``.
    """
    # NOTE(review): ``content_code`` is not defined in the visible part of this
    # notebook — confirm it exists and returns a CPU tensor holding 256 values
    # per image before relying on this function.

    # Select the image files once; the same list drives both outputs, so the
    # row order of the frame and of the feature array always agrees.
    image_names = [
        name for name in os.listdir(folder)
        if '.jpg' in name and 'mask' not in name
    ]

    # Build the frame in a single call: DataFrame.append was removed in
    # pandas 2.0, and growing a frame row by row is quadratic.
    feature_dframe = pd.DataFrame({'name': image_names}, columns=['name'])

    # then load all images and extract features
    feature_arr = np.zeros((len(image_names), 256))
    for row, name in enumerate(image_names):
        image = load_image(os.path.join(folder, name))
        features = content_code(image.unsqueeze(0))
        feature_arr[row, :] = features.numpy()
    return feature_dframe, feature_arr

def analyze_folder_batchwise(folder,batch_size=16):
    """Extract one 256-value feature vector per ``image*.jpg`` in ``folder``, in batches.

    The images are read through ``PandasDataset`` with ``general_transforms``,
    pushed through the global ``model.enc_c`` on the GPU, and each encoder
    output is averaged over everything but its 256 leading per-image values.

    Args:
        folder (str): directory containing the images.
        batch_size (int): number of images per forward pass.

    Returns:
        tuple: ``(feature_dframe, feature_arr)``. ``feature_dframe`` is a
        DataFrame with a single ``name`` column in dataset order;
        ``feature_arr`` is a ``(n_images, 256)`` array whose row ``i`` belongs
        to the file in row ``i`` of ``feature_dframe``.
    """
    # assumes the encoder output has 256 channels per image — TODO confirm
    # against the MDMM content encoder
    feature_dim = 256

    # shuffle=False keeps the loader order identical to dataset.name_frame
    dataset = PandasDataset(folder, transform=general_transforms)
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size, shuffle=False,
        num_workers=4, pin_memory=True)

    feature_dframe = pd.DataFrame({'name': dataset.name_frame}, columns=['name'])

    # then load all images and extract features
    feature_arr = np.zeros((len(dataset), feature_dim))
    filled_in = 0
    # Inference only: no_grad guarantees no autograd graph is kept, independent
    # of whether the model's parameters were frozen beforehand.
    with torch.no_grad():
        for images in loader:
            features = model.enc_c.forward(images.cuda())
            # Use the real size of this batch, so the final (possibly shorter)
            # batch needs no special case.
            n_images = features.size(0)
            # Average over all positions of each channel. Reshaping to
            # (n, feature_dim, -1) equals the former (n, -1, 128*128) for
            # 128x128 maps, but does not hard-code the spatial size.
            pooled = features.reshape(n_images, feature_dim, -1).mean(dim=2)
            feature_arr[filled_in:filled_in + n_images, :] = pooled.cpu().numpy()
            filled_in += n_images
    return feature_dframe, feature_arr

In [33]:
# launch feature extraction and save it
super_directory = '/storage/groups/qscd01/datasets/panda_2/sample_data'
super_directory_save = '/storage/groups/qscd01/datasets/panda_2/mdmm_feature'

# NOTE: the [0:1] slice restricts the run to the first directory entry only.
for patient in os.listdir(super_directory)[0:1]:
    time_before = time.time()

    feature_dframe, feature_arr = analyze_folder_batchwise(os.path.join(super_directory, patient))
    save_dir = os.path.join(super_directory_save, patient)
    # makedirs(exist_ok=True) also creates a missing parent directory and has no
    # window between the existence check and the creation.
    os.makedirs(save_dir, exist_ok=True)
    # The CSV records the file order that the rows of features.npy follow.
    feature_dframe.to_csv(os.path.join(save_dir, "file_order.csv"), index=False)
    np.save(os.path.join(save_dir, "features.npy"), feature_arr)

    time_after = time.time()
    print("---- patient: " + patient + " with " + str(len(feature_dframe)) + " images in " + str(time_after-time_before) + "s")

---- patient: f41758ea4ca2fa1520cfe98a4c064e54 with 216 images in 5.393353700637817s
