<a href="https://colab.research.google.com/github/vaguiar/camelyon_dl_2019/blob/data-exploration/data_extraction_at_levels_from_GCP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the OpenSlide C library and Python bindings
!apt-get install openslide-tools
!pip install openslide-python

Reading package lists... Done
Building dependency tree       
Reading state information... Done
openslide-tools is already the newest version (3.4.1+dfsg-2).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 16 not upgraded.


In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from openslide import open_slide, __library_version__ as openslide_version
import os
import io
from PIL import Image
from skimage.color import rgb2gray

### Setting up Google Cloud Platform

In [0]:
from google.colab import auth
auth.authenticate_user()

In [0]:
# Google Cloud project used for the `gcloud` configuration and the storage client.
GCP_PROJECT_ID = 'triple-voyage-239123'
# Cloud Storage bucket that the extracted patch images are uploaded to.
GCP_BUCKET_NAME = 'vaa2114_dl_2019'

In [5]:
!gcloud config set project {GCP_PROJECT_ID}

Updated property [core/project].


In [0]:
from google.cloud import storage

# Module-level storage client and bucket handle; `save_to_gcp` writes through BUCKET_.
CLIENT = storage.Client(project=GCP_PROJECT_ID)
BUCKET_ = CLIENT.bucket(GCP_BUCKET_NAME)

def save_to_gcp(gcp_file_name, file_buf, content_type='image/png'):
  """Upload the contents of an in-memory buffer to the configured GCS bucket.

  Args:
    gcp_file_name: Object name (path inside the bucket) to write to.
    file_buf: Buffer exposing ``getvalue()`` (e.g. ``io.BytesIO``); its full
      contents are uploaded.
    content_type: MIME type stored with the uploaded object. Defaults to
      ``'image/png'``, which is what the patch extractor produces.

  Returns:
    The blob's ``public_url``. Previously this value was computed and then
    dropped; existing callers that ignore the return value are unaffected.
  """
  blob = BUCKET_.blob(gcp_file_name)

  # Upload the buffer contents to GCS.
  blob.upload_from_string(file_buf.getvalue(), content_type=content_type)

  return blob.public_url
  

### Setting up Drive to read data

In [7]:
## Load data from Google Drive
from google.colab import drive
drive.mount('/tmp/drive')

Drive already mounted at /tmp/drive; to attempt to forcibly remount, call drive.mount("/tmp/drive", force_remount=True).


In [0]:
BASE_DIR = '/tmp/drive/My Drive/slides/'
SLIDE_SUFFIX = '.tif'
MASK_SUFFIX = '_mask.tif'
# Slides deliberately left out of extraction (the reason is not recorded in
# this notebook). Listing them here instead of calling `list.remove` means a
# missing file no longer raises ValueError.
EXCLUDED_SLIDES = {'tumor_038.tif'}

images = sorted(os.listdir(BASE_DIR))
available_masks = {s for s in images if s.endswith(MASK_SUFFIX)}
candidate_slides = [
    s for s in images
    if s.endswith(SLIDE_SUFFIX)
    and s not in available_masks
    and s not in EXCLUDED_SLIDES
]

# Pair every slide with its mask BY NAME. Zipping two independently sorted
# lists silently pairs the wrong files as soon as one slide lacks a mask (or
# one mask lacks a slide), which would corrupt every label after that point.
masks_slides = []
for slide_name in candidate_slides:
    mask_name = slide_name[:-len(SLIDE_SUFFIX)] + MASK_SUFFIX
    if mask_name in available_masks:
        masks_slides.append((mask_name, slide_name))
    else:
        print("Skipping {}: no matching mask file found".format(slide_name))

# Kept for compatibility with code that reads the individual lists; both are
# in the same (sorted-by-slide-name) order as `masks_slides`.
masks = [mask_name for mask_name, _ in masks_slides]
slides = [slide_name for _, slide_name in masks_slides]

### Read slides from Google Cloud Platform bucket

In [0]:
# BASE_DIR = 'gs://vaa2114_dl_2019/slides'

# images = !gsutil ls {BASE_DIR} | cut -d / -f 5

# masks = [s for s in images if "_mask.tif" in s]
# masks.sort()
# slides = list(set(images).difference(set(masks)))
# slides.sort()

# print(masks)
# print(slides)

# masks_slides = zip(masks, slides)
# masks_slides  = list(masks_slides)

# print(masks_slides)

In [0]:
import random

def getSlide_Mask(slide_file, tumor_mask_file, num_levels=8):
  """Open a slide and its tumor mask and sanity-check their pyramid levels.

  Args:
    slide_file: File name of the slide, relative to ``BASE_DIR``.
    tumor_mask_file: File name of the matching mask, relative to ``BASE_DIR``.
    num_levels: Number of leading pyramid levels that must exist and agree
      between slide and mask. Defaults to 8, the number of levels this
      notebook uses (some slides have more).

  Returns:
    ``(slide, tumor_mask)`` as opened by ``open_slide``. The caller owns both
    handles and is responsible for closing them.

  Raises:
    ValueError: if ``num_levels`` is smaller than 1.
    AssertionError: if either file has fewer than ``num_levels`` levels, if
      the level dimensions differ between slide and mask, or if the deepest
      checked level does not scale back to the level-0 dimensions.
  """
  if num_levels < 1:
    raise ValueError("num_levels must be >= 1, got {}".format(num_levels))

  slide_path = os.path.join(BASE_DIR, slide_file)
  tumor_mask_path = os.path.join(BASE_DIR, tumor_mask_file)

  # Alternative source (disabled): copy the files from the GCS bucket first.
  #   slide_path_gcp = os.path.join(BASE_DIR, slide_file)
  #   tumor_mask_path_gcp = os.path.join(BASE_DIR, tumor_mask_file)
  #   slide_path = os.path.join("/tmp/files/", slide_file)
  #   tumor_mask_path = os.path.join("/tmp/files/", tumor_mask_file)
  #   !gsutil cp {slide_path_gcp} {slide_path}
  #   !gsutil cp {tumor_mask_path_gcp} {tumor_mask_path}

  slide = open_slide(slide_path)
  tumor_mask = None
  try:
    tumor_mask = open_slide(tumor_mask_path)

    # Fail with a readable message instead of an IndexError further down.
    assert len(slide.level_dimensions) >= num_levels, (
        "{} has only {} levels, need {}".format(
            slide_file, len(slide.level_dimensions), num_levels))
    assert len(tumor_mask.level_dimensions) >= num_levels, (
        "{} has only {} levels, need {}".format(
            tumor_mask_file, len(tumor_mask.level_dimensions), num_levels))

    # Some slides have more levels than we use, so only the first
    # `num_levels` are required to match between slide and mask.
    for i in range(num_levels):
      assert tumor_mask.level_dimensions[i][0] == slide.level_dimensions[i][0], (
          "width mismatch at level {} for {}".format(i, slide_file))
      assert tumor_mask.level_dimensions[i][1] == slide.level_dimensions[i][1], (
          "height mismatch at level {} for {}".format(i, slide_file))

    # Verify downsampling works as expected at the deepest level we use.
    deepest = num_levels - 1
    width, height = slide.level_dimensions[deepest]
    assert width * slide.level_downsamples[deepest] == slide.level_dimensions[0][0]
    assert height * slide.level_downsamples[deepest] == slide.level_dimensions[0][1]
  except Exception:
    # Do not leak the open file handles when validation (or opening the
    # mask) fails; the caller never receives them in that case.
    if tumor_mask is not None:
      tumor_mask.close()
    slide.close()
    raise

  return slide, tumor_mask

def read_slide(slide, x, y, level, width, height, as_float=False):
    """Read a rectangular region of a slide as an RGB numpy array.

    Args:
        slide: Object exposing ``read_region(location, level, size)``.
        x, y: Location handed to ``read_region`` as ``(x, y)``. Per the
            OpenSlide API this is expressed in level-0 coordinates.
        level: Pyramid level to read from.
        width, height: Size of the region, in pixels of ``level``.
        as_float: When true, return ``float32`` values; otherwise keep the
            dtype produced by converting the image to an array.

    Returns:
        Array of shape ``(height, width, 3)``.
    """
    region = slide.read_region((x, y), level, (width, height))
    rgb = region.convert('RGB')  # discard the alpha channel
    target_dtype = np.float32 if as_float else None
    pixels = np.asarray(rgb, dtype=target_dtype)
    assert pixels.shape == (height, width, 3)
    return pixels
  
  
def find_tissue_pixels(image, intensity=0.8):
    """Locate pixels whose grayscale value is at or below ``intensity``.

    Args:
        image: Image array; only its first two dimensions are used to
            validate the grayscale conversion.
        intensity: Grayscale cut-off; pixels with ``gray <= intensity`` are
            reported.

    Returns:
        A one-shot ``zip`` iterator of ``(row, col)`` index pairs.
    """
    gray = rgb2gray(image)
    n_rows, n_cols = image.shape[0], image.shape[1]
    assert gray.shape == (n_rows, n_cols)
    rows, cols = np.where(gray <= intensity)
    return zip(rows, cols)

  

  

def getTraining_Data(level, slide, 
                     tumor_mask, 
                     slide_window, 
                     tumor_path, 
                     non_tumor_path,
                     slide_file,
                     is_sampling=False,
                     min_x = 0, 
                     min_y = 0, 
                     cancer_thres = 0, 
                     tissue_thres = .2):
  '''
  Slide a square window over one pyramid level and upload labelled patches.

  The window advances in steps of `slide_window` pixels (at `level`) in both
  directions. For every full, square window:

    * if the sum of mask channel 0 exceeds `cancer_thres`, the patch is
      uploaded under `tumor_path`;
    * otherwise, if the fraction of tissue pixels (see `find_tissue_pixels`)
      is at least `tissue_thres`, the patch is uploaded under
      `non_tumor_path` -- unless `is_sampling` is set, in which case a coin
      flip (`random.randint(0, 1)`) discards about half of them;
    * everything else is discarded.

  Windows clipped by the slide border into a non-square shape are skipped
  before any pixels are read.

  Args:
    level: pyramid level to extract from.
    slide, tumor_mask: opened slide and mask, as returned by `getSlide_Mask`.
    slide_window: window edge length, in pixels at `level`.
    tumor_path, non_tumor_path: bucket folders for the two classes.
    slide_file: source file name, appended to every patch name.
    is_sampling: randomly drop roughly half of the non-tumor patches.
    min_x, min_y: starting offset of the scan, in pixels at `level`.
    cancer_thres: mask sum above which a patch counts as tumor.
    tissue_thres: minimum tissue fraction for a non-tumor patch to be kept.

  NOTE(review): patches are rendered through a 10x10in / 100dpi matplotlib
  figure, so the uploaded PNG contains axes and padding rather than the raw
  pixels. Left unchanged because downstream code may depend on it.
  '''
  down_sample = int(slide.level_downsamples[level])
  level_width, level_height = slide.level_dimensions[level]
  pixel_width = int(down_sample * level_width)
  pixel_height = int(down_sample * level_height)
  step = slide_window * down_sample

  def patch_name(save_dir, x, y):
    # level_<level>_<x>_<y>_<window>_<file>, with x/y in level-0 pixels.
    return (save_dir + "/" + "level_" + str(level) + "_" + str(x) + "_" +
            str(y) + "_" + str(slide_window) + "_" + slide_file)

  # x and y walk the slide in level-0 coordinates.
  for x in range(min_x * down_sample, pixel_width, step):
    for y in range(min_y * down_sample, pixel_height, step):

      # Clip the window where it runs past the slide border. x and y are
      # always multiples of down_sample, so integer division is exact.
      patch_width = min(slide_window, level_width - x // down_sample)
      patch_height = min(slide_window, level_height - y // down_sample)

      # Non-square windows sit on the slide border and are never saved, so
      # skip them before paying for the two region reads.
      if patch_width != patch_height:
        continue

      slide_image = read_slide(slide,
                               x=x,
                               y=y,
                               level=level,
                               width=patch_width,
                               height=patch_height)

      # Fraction of the window covered by tissue. Count lazily instead of
      # materialising a list of coordinate tuples, and divide by rows * cols
      # (the original divided by rows * rows).
      num_tissue = sum(1 for _ in find_tissue_pixels(slide_image))
      pcnt_tissue = num_tissue / float(slide_image.shape[0] * slide_image.shape[1])

      mask_image = read_slide(tumor_mask,
                              x=x,
                              y=y,
                              level=level,
                              width=patch_width,
                              height=patch_height)
      mask_image = mask_image[:, :, 0]

      # Amount of tumor in the window (sum over the first mask channel).
      num_cancer = np.sum(mask_image)

      if num_cancer > cancer_thres:
        save_dir = tumor_path
      elif pcnt_tissue >= tissue_thres:
        # Non-tumor patch with enough tissue; optionally thin these out.
        if is_sampling and random.randint(0, 1):
          print("Discard Non tumor image: ", patch_name('DISCARD', x, y))
          continue
        save_dir = non_tumor_path
      else:
        # Mostly background: nothing worth keeping.
        continue

      name = patch_name(save_dir, x, y)
      # The second figure is the tissue fraction (it was mislabelled as tumor).
      print("# tumor pxls:{}, % tissue pxls:{}, file_name:{}".format(num_cancer, pcnt_tissue, name))

      fig = plt.figure(figsize=(10,10), dpi=100)
      try:
        plt.imshow(slide_image)
        with io.BytesIO() as buf:
          plt.savefig(buf, format='png')
          save_to_gcp(name, buf)
      finally:
        # Always release the figure, even if rendering or the upload fails;
        # otherwise figures pile up across thousands of windows.
        plt.close(fig)

In [0]:
def extract_data_by_level(level, 
                          masks_slides,
                          slide_window,
                          is_sampling):
  """Extract patches at one pyramid level for every (mask, slide) pair.

  Slides are assigned to a split by their 1-based position in
  `masks_slides`: every 5th goes to validation, every 9th (that is not also
  a 5th) goes to test, and the rest go to training. Patches are uploaded to
  `<split>/level<level>/tumor` or `<split>/level<level>/non_tumor`.

  Args:
    level: pyramid level to extract from.
    masks_slides: iterable of `(mask_file, slide_file)` name pairs.
    slide_window: window edge length, in pixels at `level`.
    is_sampling: whether to randomly thin out non-tumor patches. This is
      applied to training slides only; validation and test are never sampled.
  """
  GCP_TRAIN_TUMOR_FOLDER = "train/level{}/tumor".format(level)
  GCP_TRAIN_NON_TUMOR_FOLDER = "train/level{}/non_tumor".format(level)

  GCP_VAL_TUMOR_FOLDER = "validate/level{}/tumor".format(level)
  GCP_VAL_NON_TUMOR_FOLDER = "validate/level{}/non_tumor".format(level)

  GCP_TEST_TUMOR_FOLDER = "test/level{}/tumor".format(level)
  GCP_TEST_NON_TUMOR_FOLDER = "test/level{}/non_tumor".format(level)

  print("GCP Train Data Paths:")
  print(GCP_TRAIN_TUMOR_FOLDER)
  print(GCP_TRAIN_NON_TUMOR_FOLDER)

  print("\nGCP Validate Data Paths:")
  print(GCP_VAL_TUMOR_FOLDER)
  print(GCP_VAL_NON_TUMOR_FOLDER)

  print("\nGCP Test Data Paths:")
  print(GCP_TEST_TUMOR_FOLDER)
  print(GCP_TEST_NON_TUMOR_FOLDER)

  for i, (tumor_mask_file, slide_file) in enumerate(masks_slides):

    print("\nExtracting Data w/ level={}, window_size={}, image_number={}".format(level, slide_window, i))
    slide, tumor_mask = getSlide_Mask(slide_file, tumor_mask_file)

    # Decide the split and the sampling flag PER SLIDE. Overwriting the
    # `is_sampling` argument here (as the code used to) switched sampling off
    # for every training slide that came after the first validation slide.
    if (i+1)%5 == 0:
      tumor_folder = GCP_VAL_TUMOR_FOLDER
      non_tumor_folder = GCP_VAL_NON_TUMOR_FOLDER
      sample_this_slide = False      # never sample the validation set
    elif (i+1)%9 == 0:
      tumor_folder = GCP_TEST_TUMOR_FOLDER
      non_tumor_folder = GCP_TEST_NON_TUMOR_FOLDER
      sample_this_slide = False      # never sample the test set
    else:
      tumor_folder = GCP_TRAIN_TUMOR_FOLDER
      non_tumor_folder = GCP_TRAIN_NON_TUMOR_FOLDER
      sample_this_slide = is_sampling

    try:
      getTraining_Data(level,
                       slide,
                       tumor_mask,
                       slide_window,
                       tumor_folder,
                       non_tumor_folder,
                       slide_file,
                       sample_this_slide)
    finally:
      # Release the slide handles before moving on to the next file.
      tumor_mask.close()
      slide.close()

## Running data extractor by level

The new image file stored has the following naming convention:

```
level_<level_num>_<window_x_coord>_<window_y_coord>_<window_size>_<filename>
```

**Sliding Window Size Calculations**

| Level  |   Zoom Factor |  Window Size | 
|---|---|---|
|  7 |  1x |  200  |
|  6 |  2x |   |
|   5|  4x |  400 |
|   4|  8x |   |
|   3|  16x | 800 |
|   2| 32x |   |
|   1|  64x |   3200|
|   0|  128x |   |


### Running for level 7 with window size 50

In [12]:
# Extraction settings for this run: deepest level, 50px window, keep every
# qualifying non-tumor patch.
LEVEL = 7
SLIDING_WINDOW = 50
IS_SAMPLING = False

extract_data_by_level(level=LEVEL,
                      masks_slides=masks_slides,
                      slide_window=SLIDING_WINDOW,
                      is_sampling=IS_SAMPLING)

GCP Train Data Paths:
train/level7/tumor
train/level7/non_tumor

GCP Validate Data Paths:
validate/level7/tumor
validate/level7/non_tumor

GCP Test Data Paths:
test/level7/tumor
test/level7/non_tumor

Extracting Data w/ level=7, window_size=50, image_number=0
# tumor pxls:0, % tumor pxls:0.2628, file_name:train/level7/non_tumor/level_7_0_0_50_tumor_001.tif
# tumor pxls:0, % tumor pxls:0.3332, file_name:train/level7/non_tumor/level_7_0_121600_50_tumor_001.tif
# tumor pxls:0, % tumor pxls:0.8436, file_name:train/level7/non_tumor/level_7_0_128000_50_tumor_001.tif
# tumor pxls:0, % tumor pxls:0.3112, file_name:train/level7/non_tumor/level_7_0_134400_50_tumor_001.tif
# tumor pxls:0, % tumor pxls:0.258, file_name:train/level7/non_tumor/level_7_6400_0_50_tumor_001.tif
# tumor pxls:0, % tumor pxls:0.212, file_name:train/level7/non_tumor/level_7_6400_12800_50_tumor_001.tif
# tumor pxls:0, % tumor pxls:0.3264, file_name:train/level7/non_tumor/level_7_6400_76800_50_tumor_001.tif
# tumor pxls:0, %

### Running for level 5 with window size 400

In [0]:
# LEVEL = 5
# SLIDING_WINDOW = 400
# IS_SAMPLING = False

# extract_data_by_level(LEVEL, 
#                       masks_slides,
#                       SLIDING_WINDOW,
#                       IS_SAMPLING)

### Running for level 3 with window size 800

In [16]:
LEVEL = 3
SLIDING_WINDOW = 800
IS_SAMPLING = True
SAMPLING_SEED = 0

# With sampling on, getTraining_Data drops qualifying non-tumor patches on a
# coin flip from the `random` module. Seed it so that re-running this cell
# selects the same patches.
random.seed(SAMPLING_SEED)

extract_data_by_level(LEVEL, 
                      masks_slides,
                      SLIDING_WINDOW, 
                      IS_SAMPLING)

GCP Train Data Paths:
train/level3/tumor
train/level3/non_tumor

GCP Validate Data Paths:
validate/level3/tumor
validate/level3/non_tumor

GCP Test Data Paths:
test/level3/tumor
test/level3/non_tumor

Extracting Data w/ level=3, window_size=800, image_number=0
# tumor pxls:0, % tumor pxls:0.252125, file_name:train/level3/non_tumor/level_3_0_0_800_tumor_001.tif
Discard Non tumor image:  DISCARD/level_3_0_121600_800_tumor_001.tif
# tumor pxls:0, % tumor pxls:0.803678125, file_name:train/level3/non_tumor/level_3_0_128000_800_tumor_001.tif
# tumor pxls:0, % tumor pxls:0.28148125, file_name:train/level3/non_tumor/level_3_0_134400_800_tumor_001.tif
Discard Non tumor image:  DISCARD/level_3_6400_0_800_tumor_001.tif
Discard Non tumor image:  DISCARD/level_3_6400_76800_800_tumor_001.tif
# tumor pxls:0, % tumor pxls:0.242978125, file_name:train/level3/non_tumor/level_3_6400_83200_800_tumor_001.tif
Discard Non tumor image:  DISCARD/level_3_6400_121600_800_tumor_001.tif
# tumor pxls:0, % tumor pxl

### Running for level 1 with window size 3200

In [0]:
# LEVEL = 1
# SLIDING_WINDOW = 3200
# IS_SAMPLING = True

# extract_data_by_level(LEVEL, 
#                       masks_slides,
#                       SLIDING_WINDOW,
#                       IS_SAMPLING)