In [None]:
%matplotlib inline
# Script to consolidate abs path of CAMELYON16 data, view them with summary on dimensions, and do basic crop to

# camelyon object will contain 3 ordered list of tumor, normal, and test data
# each orderd list has structure {(<img label>,(<path_to_img>,<path_to_mask or None>))}
# initializtaion will dump paths according to shuffle/split into 2 lists 'listTrain' and 'listValid'
# CAMELYON16 tiffs have multiple resolution levels, accessible & segmentable by openslide lib
# CAMELYON16 pic proportion & tissue position in each image varies & probably needs to be preprocessed
import os, re, csv, collections
import numpy as np
import matplotlib.pyplot as plt
import openslide
import scipy.ndimage as ndimage

def GetImage(path,level):
    oslideimg = openslide.OpenSlide(path)
    assert (level < oslideimg.level_count), "level > level_count"
    img = oslideimg.read_region((0,0),level%oslideimg.level_count,oslideimg.level_dimensions[level])
    attr = [(oslideimg.level_dimensions[ll],oslideimg.level_downsamples[ll]) for ll in range(0,oslideimg.level_count)]
    return img, attr

def CropImageRatio(image_in,w_min,h_min,w_max,h_max):
    wd, hd = image_in.size
    w1, w2 = int(w_min*wd + 0.5), int(w_max*wd + 0.5)
    h1, h2 = int(h_min*hd + 0.5), int(h_max*hd + 0.5)
    return image_in.crop((w1,h1,w2,h2))

# Takes an nparray 3-channel image & returns the ratio along x and y axis for crop
# Needs skimage, numpy
def camelyon16_crop(image_in,mask_threshold=0.007,gb_sigma=30):
    img_in = np.array(image_in)
    assert img_in.shape[2] == 4, "camelyon16_crop: Wrong # channels in input image"
    img_in = img_in[:,:,0:3]
    # Get average intensity of 3 channels from original color image
    img_avg = np.average(img_in,axis=2)
    img_avg = np.repeat(img_avg[:,:,np.newaxis],3,axis=2)
    # Get abs difference between the color image and averaged intensity image
    # Any bright areas after this => non-neutral color areas
    img_diff = np.absolute(img_in - img_avg)
    img_diff = np.sum(img_diff,axis=2)/340.0 # Max diff is 340 (1 saturated channel)

    # After getting colored areas, blur it to get selection blob
    # img_mask = filters.gaussian(img_diff,sigma=gb_sigma,preserve_range=True)
    img_mask = ndimage.filters.gaussian_filter(img_diff,sigma=gb_sigma)
    # Paremetrized selection is for blobs with intensity of 0.007/1.0 or more
    img_mask = img_mask > mask_threshold

    # Get min & max coordinate along x & y axis, expressed in terms of fraction of img dim
    xx = np.nonzero(np.sum(img_mask,axis=0))
    yy = np.nonzero(np.sum(img_mask,axis=1))
    ydim, xdim = img_mask.shape
    xx_min, xx_max = 1.0*np.min(xx)/xdim, 1.0*np.max(xx)/xdim
    yy_min, yy_max = 1.0*np.min(yy)/ydim, 1.0*np.max(yy)/ydim
    
    return (xx_min, xx_max, yy_min, yy_max)

class CAMELYON16dataset():
    # Directory structure of folder
    directory = {'test_tumormask':'Testset/Ground_Truth/Masks',
                'test_tumor':'Testset/Images',
                'train_tumormask':'TrainingData/Ground_Truth/Mask',
                'train_tumor':'TrainingData/Train_Tumor',
                'train_normal':'TrainingData/Train_Normal',
                'test_attributes':'Testset/Ground_Truth/GT.csv'}
    trainIndex = 0
    validIndex = 0
    listFolder = []
    listTrain = []
    listValid = []
    
    def __init__(self, parentdir):
        assert os.path.exists(parentdir), "no such path directory"
        self.parentdir = parentdir
        self.absdir = None
        
    def AbleToRetrieveData(self):
            _notfound = [path for path in self.absdir.values() if os.path.exists(path)==False]
            if len(_notfound) > 0:
                print "Some paths were not found:"
                for path in _notfound:
                    print path
                return False
            else:
                return True

    def InitDataset(self,splitRatio=1.0, shuffle=False, sets=''):
        np.random.seed(0)
        # Get abs path of folders where images are stored
        self.absdir = {k:os.path.join(self.parentdir,v) for k,v in self.directory.iteritems()}
        assert self.AbleToRetrieveData(), "not able to retrieve data from path."
        
        # Match tumors & mask for training set - list by full path. All tumor images must have mask
        # key = lowercase name of tumor file, value = list of abs path of tumor+mask
        _train = {os.path.splitext(k)[0].lower():[os.path.join(self.absdir['train_tumor'],k),None]
                            for k in os.listdir(self.absdir['train_tumor'])}
        # Match filenames of Mask files to Tumors
        for k in os.listdir(self.absdir['train_tumormask']):
            prefix = re.sub('_Mask$','',os.path.splitext(k)[0]).lower()
            try:
                _train[prefix][1] = os.path.join(self.absdir['train_tumormask'],k)
            except:
                # OK to have extra masks
                print 'Warning: Unable to match Training Mask to Training Tumor: ', k
            
        # Named exclusions (delete if not in use)
        #print "WARNING: HACK EXCLUSION"
        #_train.pop('tumor_108',None)
        #_train.pop('tumor_109',None)
        
        # Convert img-mask pair to tuple to avoid accidental changes
        for k in _train.iterkeys():
            _train[k] = tuple(_train[k])
        self.train_tumor = collections.OrderedDict(sorted(_train.items()))
        
        # Not OK to have extra tumors w/o mask
        _train_havemask = [k[1][1]!=None for k in self.train_tumor.items()]
        if (min(_train_havemask)==False):
            print "Some Training Tumors have no matching Masks:"
            for name in [v[1] for v in zip(_train_havemask,self.train_tumor.keys()) if v[0]==False]:
                print name
            raise Exception('Training Tumor has no Mask')
            
        
        # Normal tissue (no masks)
        _normal = {os.path.splitext(k)[0].lower():[os.path.join(self.absdir['train_normal'],k),None]
                            for k in os.listdir(self.absdir['train_normal'])}
        # Convert img-mask pair to tuple to avoid accidental changes
        for k in _normal.iterkeys():
            _normal[k] = tuple(_normal[k])
        self.train_normal = collections.OrderedDict(sorted(_normal.items()))

    
        # Attribute list available for test set
        with open(self.absdir['test_attributes'], 'rb') as ff:
            reader = csv.reader(ff)
            test_attr = {rows[0].lower():rows[1] for rows in reader}
        
        # Match tumor & mask for test set
        # Some test data are for normal cells, and these will have no mask files
        _test = {os.path.splitext(k)[0].lower():[os.path.join(self.absdir['test_tumor'],k),None]
                            for k in os.listdir(self.absdir['test_tumor'])}
        for k in os.listdir(self.absdir['test_tumormask']):
            prefix = re.sub('_Mask$','',os.path.splitext(k)[0]).lower()
            try:
                _test[prefix][1] = os.path.join(self.absdir['test_tumormask'],k)
            except:
                # OK to have extra masks
                print 'Warning: Unable to match Test Mask to Test Tumor: ', k
        # Named exclusions (this test has to annotation)
        _test.pop('test_114',None)
                
        # Convert img-mask pair to tuple to avoid accidental changes
        for k in _test.iterkeys():
            _test[k] = tuple(_test[k])
        self.test_tumor = collections.OrderedDict(sorted(_test.items()))
        
        # Not OK to have extra tumors w/o mask
        _test_havemask = [(k[1][1]!=None and test_attr[k[0]]=='Tumor')
                            or ((k[1][1]==None and test_attr[k[0]]=='Normal'))
                          for k in self.test_tumor.items()]
        if (min(_test_havemask)==False):
            print "Some Test Tumors have no matching Masks:"
            for name in [v[1] for v in zip(_test_havemask,self.test_tumor.keys()) if v[0]==False]:
                print name
            raise Exception('Test Tumor has no Mask')

        # Prepare training set
        self.training_list = []
        _setsflag = set(sets.split("-"))
        if ('Tumor' in _setsflag):
            self.training_list += self.train_tumor.items()
            _setsflag.discard('Tumor')
        if ('Normal' in _setsflag):
            self.training_list += self.train_normal.items()
            _setsflag.discard('Normal')
        if ('Test' in _setsflag):
            self.training_list += self.test_tumor.items()
            _setsflag.discard('Test')
        assert (len(_setsflag) == 0), 'Invalid sets supplied'
        
        _length = len(self.training_list)
        if shuffle == False:
            sampleIndex = range(_length)[:int(np.round(_length*splitRatio))]
        else:
            sampleIndex = np.random.choice(range(_length),int(np.round(_length*splitRatio)),False)

        self.listTrain = [self.training_list[i] for i in sampleIndex]
        self.listValid = [self.training_list[i] for i in range(_length) if i not in sampleIndex]
        
#===================================================================================================#
# Initialize dataset paths
camelyon = CAMELYON16dataset("/home/data/CAMELYON16/")


# Declare training & validation samples
camelyon.InitDataset(splitRatio=1.0, shuffle=False, sets='Tumor')
#for i in range(0,len(camelyon.listValid)):
#    print i, camelyon.listTrain[i]

# View sets
export_level = 6
export_path = "/home/data/CAMELYON16/temp/camelyon16_export4/"
export_ext, export_type = ".png","PNG"
for key, value in camelyon.train_tumor.items()[:]:
    # Original image
    img1, attr1 = GetImage(value[0],export_level)
    fig = plt.figure()
    fig.suptitle(key)
    
    # Mask if available
    if (value[1] != None):
        img2, attr2 = GetImage(value[1],export_level)
        # Choose crop based on smallest of mask vs. image
        new_w, new_h = min(img1.size[0],img2.size[0]), min(img1.size[1],img2.size[1])
        # Crop mask
        img2 = img2.crop((0,0,new_w,new_h))
    else:
        new_h, new_w = img1.size
    # Crop original image too
    img1 = img1.crop((0,0,new_w,new_h))

    # Get fractional coordinate of image crop
    w1, w2, h1, h2 = camelyon16_crop(img1,0.005,150)
    img_tissue = CropImageRatio(img1,w1,h1,w2,h2)
    fig.add_subplot(121)
    plt.imshow(img_tissue)

    if (value[1] != None):
        img_mask = CropImageRatio(img2,w1,h1,w2,h2)
        fig.add_subplot(122)
        plt.imshow(img_mask)
        
    plt_txt = "Tumor:\n"
    plt_idx = 0
    for line in attr1:
        plt_txt += "Level: %3d Dim: %-20s Reduction: %5d\n" % (plt_idx, line[0], line[1])
        plt_idx+=1
    if (value[1] != None):
        plt_txt += "\nMask:\n"
        plt_idx = 0
        for line in attr2:
            plt_txt += "Level: %3d Dim: %-20s Reduction: %5d\n" % (plt_idx, line[0], line[1])
            plt_idx+=1
        
        print key," Adj. Orig. Size = ",img1.size," Adj. Mask Size = ",img2.size
        print key," Crop Orig. Size = ",img_tissue.size," Crop Mask Size = ",img_mask.size
    else:
        print key," Adj. Orig. Size = ",img1.size
        print key," Crop Orig. Size = ",img_tissue.size
        
    fig.text(1,0.25,plt_txt)
    
    # Export some images
    # Tumor
    exp_name = key + "_lvl_" + str(export_level) + "_fac_" + str(int(attr1[export_level][1])) + export_ext
    exp_name = os.path.join(export_path,exp_name)
    print "Exporting ",exp_name
    img1.save(exp_name,export_type)

    # TumorCrop
    '''
    exp_name = key + "_crop_lvl_" + str(export_level) + "_fac_" + str(int(attr1[export_level][1])) + export_ext
    exp_name = os.path.join(export_path,exp_name)
    print "Exporting ",exp_name
    img_tissue.save(exp_name,export_type)
    '''
    exp_name = None
    if (value[1] != None):
        exp_name = key + "_mask_lvl_" + str(export_level) + "_fac_" + str(int(attr1[export_level][1])) + export_ext
        exp_name = os.path.join(export_path,exp_name)
        print "Exporting ",exp_name
        img2.save(exp_name,export_type)
        '''
        exp_name = key + "_mask_crop_lvl_" + str(export_level) + "_fac_" + str(int(attr1[export_level][1])) + export_ext
        exp_name = os.path.join(export_path,exp_name)
        print "Exporting ",exp_name
        img_mask.save(exp_name,export_type)
        '''

tumor_001  Adj. Orig. Size =  (1528, 3456)  Adj. Mask Size =  (1528, 3456)
tumor_001  Crop Orig. Size =  (1527, 1979)  Crop Mask Size =  (1527, 1979)
Exporting  /home/data/CAMELYON16/temp/camelyon16_export4/tumor_001_lvl_6_fac_62.png
Exporting  /home/data/CAMELYON16/temp/camelyon16_export4/tumor_001_mask_lvl_6_fac_62.png
tumor_002  Adj. Orig. Size =  (1528, 3432)  Adj. Mask Size =  (1528, 3432)
tumor_002  Crop Orig. Size =  (1458, 2418)  Crop Mask Size =  (1458, 2418)
Exporting  /home/data/CAMELYON16/temp/camelyon16_export4/tumor_002_lvl_6_fac_62.png
Exporting  /home/data/CAMELYON16/temp/camelyon16_export4/tumor_002_mask_lvl_6_fac_62.png
tumor_003  Adj. Orig. Size =  (1528, 3456)  Adj. Mask Size =  (1528, 3456)
tumor_003  Crop Orig. Size =  (1527, 2807)  Crop Mask Size =  (1527, 2807)
Exporting  /home/data/CAMELYON16/temp/camelyon16_export4/tumor_003_lvl_6_fac_62.png
Exporting  /home/data/CAMELYON16/temp/camelyon16_export4/tumor_003_mask_lvl_6_fac_62.png
tumor_004  Adj. Orig. Size =  (



tumor_021  Adj. Orig. Size =  (1528, 3448)  Adj. Mask Size =  (1528, 3448)
tumor_021  Crop Orig. Size =  (1527, 2712)  Crop Mask Size =  (1527, 2712)
Exporting  /home/data/CAMELYON16/temp/camelyon16_export4/tumor_021_lvl_6_fac_62.png
Exporting  /home/data/CAMELYON16/temp/camelyon16_export4/tumor_021_mask_lvl_6_fac_62.png


In [11]:
# Quick-n-dirty import single
%matplotlib inline

import openslide
import os
import matplotlib.pyplot as plt
import numpy as np

def get_image(oslideimg,level):
    if (level >= oslideimg.level_count):
            print "level > level_count"
            return
    else:
        img = oslideimg.read_region((0,0),level,oslideimg.level_dimensions[level])
        return img

#def view_image(img):
#    imshow(np.asarray(img))

img = openslide.OpenSlide("/home/data/CAMELYON16/TrainingData/Train_Tumor/tumor_101.tif")
mask = openslide.OpenSlide("/home/data/CAMELYON16/TrainingData/Ground_Truth/Mask/Tumor_101_Mask.tif")

print "# Levels: ",img.level_count
print "Dimensions for each Level in img:"
for i in range(-1,img.level_count):
    print i,": ",img.level_dimensions[i]," Downsampling = ",img.level_downsamples[i]

print "# Levels: ",mask.level_count
print "Dimensions for each Level in mask:"
for i in range(-1,mask.level_count):
    print i,": ",mask.level_dimensions[i]," Downsampling = ",mask.level_downsamples[i]

# Set 'level' i.e. resolution to extract (0 = best)
level = 2
img_ext = get_image(img,level)
mask_ext = get_image(mask,level)

#plt.imshow(img_ext)
img_ext.save("./Tumor_101_L2.png","PNG")
mask_ext.save("./Mask_101_L2.png","PNG")

# Levels:  8
Dimensions for each Level in img:
-1 :  (512, 512)  Downsampling =  112.5
0 :  (61440, 53760)  Downsampling =  1.0
1 :  (30720, 27136)  Downsampling =  1.99056603774
2 :  (15360, 13824)  Downsampling =  3.94444444444
3 :  (7680, 7168)  Downsampling =  7.75
4 :  (4096, 3584)  Downsampling =  15.0
5 :  (2048, 2048)  Downsampling =  28.125
6 :  (1024, 1024)  Downsampling =  56.25
7 :  (512, 512)  Downsampling =  112.5
# Levels:  7
Dimensions for each Level in mask:
-1 :  (960, 840)  Downsampling =  64.0
0 :  (61440, 53760)  Downsampling =  1.0
1 :  (30720, 26880)  Downsampling =  2.0
2 :  (15360, 13440)  Downsampling =  4.0
3 :  (7680, 6720)  Downsampling =  8.0
4 :  (3840, 3360)  Downsampling =  16.0
5 :  (1920, 1680)  Downsampling =  32.0
6 :  (960, 840)  Downsampling =  64.0
