# Tiling and Embeddings notebook

This notebook helps you:
* explore the data set
* compute and save the tiles from the WSI images
* compute and save the embeddings from the tiles

In [None]:
import os

# On Windows the OpenSlide native DLLs must be registered before
# `import openslide`. The location can be overridden with the
# OPENSLIDE_DLL_DIR environment variable; the historical hard-coded path is
# kept as the default. The call is skipped where `os.add_dll_directory` does
# not exist or when the directory is absent, so the cell also runs on
# machines where OpenSlide is installed some other way.
openslide_dll_dir = os.environ.get(
    "OPENSLIDE_DLL_DIR",
    "C:\\Users\\33631\\Desktop\\openslide-win64-20171122\\bin",
)
if hasattr(os, "add_dll_directory") and os.path.isdir(openslide_dll_dir):
    os.add_dll_directory(openslide_dll_dir)
import openslide

In [None]:
from PIL import Image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import os
import openslide
import random
import cv2
import matplotlib
from sklearn.model_selection import train_test_split
from skimage.filters import threshold_otsu
import re

from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torchvision.models as models
import torchvision
import torch
import torch.nn as nn
from torch.optim import Adam
from torch import LongTensor as LongTensor
from torch import FloatTensor as FloatTensor
import pickle

In [None]:
# Raw inputs: metadata tables, slide folders and the training label masks.
train_set_path, test_set_path = "Data/raw_data/train.csv", "Data/raw_data/test.csv"
train_images_folder, test_images_folder = (
    "Data/raw_data/train/train/",
    "Data/raw_data/test/test/",
)
train_mask_folder = "Data/raw_data/train_label_masks/train_label_masks/"

# Processed outputs: extracted tiles and their pickled encodings.
train_tiles_folder, test_tiles_folder = (
    "Data/processed_data/train_tiles_grid/",
    "Data/processed_data/test_tiles_grid/",
)
train_tiles_folder_encoding, test_tiles_folder_encoding = (
    "Data/processed_data/train_tiles_encoding_grid/",
    "Data/processed_data/test_tiles_encoding_grid/",
)

## Useful stuff

In [None]:
# Discrete colour map with six entries, used below to draw the label masks.
cmap = matplotlib.colors.ListedColormap(
    colors=['black', 'gray', 'green', 'yellow', 'orange', 'red'],
)

## Load and explore data

In [None]:
# Load the train and test metadata tables (train first, then test).
train_set, test_set = (
    pd.read_csv(csv_path) for csv_path in (train_set_path, test_set_path)
)

In [None]:
# Check the correspondence between ISUP grade and Gleason score:
# list the distinct Gleason scores observed for each ISUP grade.
train_set[['isup_grade', 'gleason_score']].groupby('isup_grade').agg(
    {'gleason_score': lambda scores: scores.unique()}
)

In [None]:
# Number of rows (non-null Gleason scores) per ISUP grade.
train_set.groupby('isup_grade')[['gleason_score']].count()

In [None]:
# Number of rows (non-null Gleason scores) per image provider.
train_set.groupby('data_provider')[['gleason_score']].count()

## Visualisations

In [None]:
def visualise_mask_and_image(id, level = 3):
    """
    Plot a training slide next to its label mask.

    The row at position `id` of the global `train_set` is looked up, the slide
    is read at its last pyramid level and shown on the left. The mask, when it
    can be opened, is read the same way and its first channel is shown on the
    right with the global `cmap`. When no mask can be opened the right panel
    is left empty instead of crashing.

    inputs:
        - id: positional index (iloc) of the row in train_set
        - level: currently unused, kept so existing calls stay valid; the
          last level of each file is always the one displayed
    """
    line = train_set.iloc[id]
    image_id = line.image_id

    image = openslide.OpenSlide(train_images_folder + image_id + '.tiff')
    mask = None
    mask_data = None
    try:
        print(image.level_dimensions)
        image_data = image.read_region((0, 0), image.level_count - 1, image.level_dimensions[-1])
        try:
            mask = openslide.OpenSlide(train_mask_folder + image_id + '.tiff')
            mask_data = mask.read_region((0, 0), mask.level_count - 1, mask.level_dimensions[-1])
        except (openslide.OpenSlideError, OSError):
            # A missing or unreadable mask is expected for some slides.
            print('no mask for this image')
            mask_data = None
    finally:
        # Always release the slide handles, even if reading failed.
        if mask is not None:
            mask.close()
        image.close()

    fig, axes = plt.subplots(1, 2, figsize=(8, 8))
    axes[0].imshow(image_data)
    if mask_data is not None:
        # Only the first channel of the mask is displayed.
        axes[1].imshow(np.asarray(mask_data)[:, :, 0], cmap=cmap, interpolation='nearest', vmin=0, vmax=5)
    else:
        axes[1].set_title('no mask')
        axes[1].axis('off')

    title = f"gleason score: {line.gleason_score} - isup grade: {line.isup_grade} \n data provider: {line.data_provider}"
    fig.suptitle(title, fontsize=14)

    fig.tight_layout()
    plt.show()


In [None]:
visualise_mask_and_image(4)

In [None]:
visualise_mask_and_image(2)

In [None]:
visualise_mask_and_image(15)

# Tiling functions

In [None]:
from histolab.tiler import RandomTiler, GridTiler
from histolab.slide import Slide

In [None]:
def create_tiles(df, folder_dest, folder_source, tile_shape = 224, level = 0, n_tiles = 128, random = True):
    """
    Extract and save tiles for every slide listed in a data set.

    A slide is skipped when the directory `folder_dest/<image_id>/` already
    exists, so an interrupted run can be resumed by calling the function again.

    inputs:
        - df: train or test set (one row per slide, with an `image_id` column)
        - folder_dest: where to save the tiles
        - folder_source: where the tiff images are stored
        - tile_shape: side of the square tiles, in pixels (default: 224)
        - level: level on which the tiles are extracted
        - n_tiles: number of tiles to extract, only used if random = True
        - random: True for random tiling (RandomTiler, fixed seed 42,
          tissue_percent 80.0), False for grid tiling (GridTiler, no overlap,
          tissue_percent 90.0)
    """
    for row in tqdm(range(df.shape[0]), position = 0):
        slide_id = df.iloc[row].image_id
        # Already processed (or at least started): leave this slide alone.
        if os.path.isdir(folder_dest + f"{slide_id}/"):
            continue

        slide = Slide(folder_source + slide_id + ".tiff", processed_path=folder_dest)

        # Options shared by both tilers; the prefix puts each slide's tiles
        # under a sub-path named after the slide id.
        shared_options = {
            "tile_size": (tile_shape, tile_shape),
            "level": level,
            "check_tissue": True,
            "prefix": f'{slide_id}/',
            "suffix": ".png",
        }
        if random:
            tiler = RandomTiler(n_tiles=n_tiles, seed=42, tissue_percent=80.0, **shared_options)
        else:
            tiler = GridTiler(tissue_percent=90.0, pixel_overlap=0, **shared_options)

        tiler.extract(slide)

## Tiles encoding functions

In [None]:
# Tile file names are expected to end with "<x_ul>-<y_ul>-<x_br>-<y_br>.png".
_TILE_COORDS_RE = re.compile(r'([0-9]+)-([0-9]+)-([0-9]+)-([0-9]+)\.png$')


def extract_tiles_coords(img_name):
    """
    Extract the tile coordinates from a tile file name.

    inputs:
        - img_name: file name ending with "<x_ul>-<y_ul>-<x_br>-<y_br>.png"

    returns:
        the tuple of ints (x_ul, y_ul, x_br, y_br)

    raises:
        ValueError if the name does not end with four dash-separated integers
        followed by ".png"
    """
    match = _TILE_COORDS_RE.search(img_name)
    if match is None:
        raise ValueError(f"cannot extract tile coordinates from file name: {img_name!r}")
    x_ul_wsi, y_ul_wsi, x_br_wsi, y_br_wsi = (int(group) for group in match.groups())
    return (x_ul_wsi, y_ul_wsi, x_br_wsi, y_br_wsi)

In [None]:
# Encoder model: EfficientNet-B2, chosen for its good performance and its
# relatively light weight. The last child module of the torchvision model is
# dropped (presumably the classification head) so the network outputs
# features; it is then put in eval mode and moved to the GPU.
encoder = models.efficientnet_b2(pretrained=True)
encoder = nn.Sequential(*encoder.children())[:-1]
encoder = encoder.eval().to("cuda")

In [None]:
def encode_tiles(df, source_dir, dest_dir, encoder):
    """
    Compute and save the embeddings of all the tiles of every slide in df.

    For each `image_id` in df, every ".png" file found in
    `source_dir/<image_id>/` is encoded, and the result is pickled to
    `dest_dir/<image_id>.pkl` as a dict mapping the tile coordinates
    (x_ul, y_ul, x_br, y_br) to the embedding as a numpy array.
    Slides whose pickle file already exists are skipped, so an interrupted
    run can be resumed by calling the function again.

    inputs:
        - df: test or train data set (with an `image_id` column)
        - source_dir: directory path containing one tiles folder per slide
        - dest_dir: where to save the embeddings (created if missing)
        - encoder: model used to encode the tiles

    raises:
        whatever os.listdir raises when a slide has no tiles folder
    """
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)

    # Loop-invariant preprocessing step, built once.
    normalize = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

    # Feed the tiles on the device the encoder lives on; fall back to "cuda"
    # (the historical behaviour) when it cannot be determined.
    first_param = next(iter(encoder.parameters()), None) if hasattr(encoder, "parameters") else None
    device = first_param.device if first_param is not None else "cuda"

    for i in tqdm(range(df.shape[0]), position = 0):
        image_id = df.iloc[i].image_id
        encoding_path = dest_dir + f'{image_id}.pkl'
        # The encodings are stored as a file, so test for a file (isdir would
        # never be true and every slide would be recomputed).
        if os.path.isfile(encoding_path):
            continue

        image_folder = source_dir + image_id + '/'
        encoding_dict = {}
        for img_name in os.listdir(image_folder):
            # Ignore anything that is not a tile image (e.g. OS metadata files).
            if not img_name.endswith('.png'):
                continue
            coords = extract_tiles_coords(img_name)
            # Load the tile as RGB and close the file handle right away.
            with Image.open(image_folder + img_name) as tile_file:
                tile_image = tile_file.convert('RGB')
            # to 0/1 range and channel-first layout
            tile_tensor = torchvision.transforms.functional.to_tensor(tile_image)
            batch = normalize(tile_tensor).unsqueeze(0).to(device)
            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                encoding = encoder(batch).squeeze(-1).squeeze(-1)[0].cpu().detach().numpy()
            encoding_dict[coords] = encoding

        # Write to a temporary file first so that an interrupted run never
        # leaves a truncated pickle that the skip check above would accept.
        tmp_path = encoding_path + '.tmp'
        with open(tmp_path, 'wb') as handle:
            pickle.dump(encoding_dict, handle)
        os.replace(tmp_path, encoding_path)

## Compute and encode tiles

In [None]:
# params

# tile side in pixels, pyramid level to tile, number of tiles for random tiling
tiles_shape, level, n_tiles = 224, 0, 300

### For the train set

In [None]:
# Grid tiling of every slide of the train set.
create_tiles(
    train_set,
    folder_dest=train_tiles_folder,
    folder_source=train_images_folder,
    tile_shape=tiles_shape,
    level=level,
    n_tiles=n_tiles,
    random=False,
)

In [None]:
# Encode the tiles of every slide of the train set.
encode_tiles(
    train_set,
    source_dir=train_tiles_folder,
    dest_dir=train_tiles_folder_encoding,
    encoder=encoder,
)

### For the test set

In [None]:
# Grid tiling of every slide of the test set.
create_tiles(
    test_set,
    folder_dest=test_tiles_folder,
    folder_source=test_images_folder,
    tile_shape=tiles_shape,
    level=level,
    n_tiles=n_tiles,
    random=False,
)

In [None]:
# Encode the tiles of every slide of the test set.
encode_tiles(
    test_set,
    source_dir=test_tiles_folder,
    dest_dir=test_tiles_folder_encoding,
    encoder=encoder,
)
