## Segmentation-free profiling of organoid screens

In [None]:
import javabridge
import bioformats as bf
import skimage
import numpy as np
import matplotlib.pyplot as plt
import sys
import os
import seaborn as sn
import pandas as pd

# Start the Java VM with the Bio-Formats JARs on its class path.
# NOTE(review): presumably required by the image reader used below - confirm.
javabridge.start_vm(class_path=bf.JARS)

In [None]:
# Directory holding the raw breast images (relative path).
path = '../data/BortenMA_OrganoSeg_FileS3/Breast Raw Images/'

In [None]:
# File names in the 'BT549' sub-directory that contain 'tiff', in sorted order.
imgpath = os.path.join(path, 'BT549')
fnames = sorted(name for name in os.listdir(imgpath) if 'tiff' in name)

In [None]:
from bioimg import read_image, plot_channels

In [None]:
# Read every listed file from the current image directory into a list.
imgs = [read_image(os.path.join(imgpath, f), verbose=False) for f in fnames]

In [None]:
imgs[0].shape

In [None]:
# Show each slice along the last axis of the first image, side by side in gray.
example = imgs[0]
planes = [example[:, :, k] for k in range(example.shape[-1])]
plot_channels(planes, nrow=1, ncol=3, scale_x=6, scale_y=6, cmap='gray')
plt.show()

In [None]:
# File names in the 'DCIS.CoM' sub-directory that contain 'tiff', in sorted order.
imgpath = os.path.join(path, 'DCIS.CoM')
fnames = sorted(name for name in os.listdir(imgpath) if 'tiff' in name)

In [None]:
# Read every listed file from the current image directory into a list.
imgs = [read_image(os.path.join(imgpath, f), verbose=False) for f in fnames]

In [None]:
# Show each slice along the last axis of the second image, side by side in gray.
# The slice count is taken from the image being plotted (it was previously
# read from imgs[0], which breaks if the two images differ in depth).
example = imgs[1]
planes = [example[:, :, k] for k in range(example.shape[-1])]
plot_channels(planes,
              scale_x=6, scale_y=6,
              nrow=1, ncol=3, cmap='gray')
plt.show()

## Train an Unsupervised Segmentation-Free Model
Load three randomly sampled images from each breast cancer cell line.

In [None]:
# One entry per cell-line sub-directory of `path`.
# - hidden entries (leading '.') and stray non-directory files are skipped,
#   because every entry is later passed to os.listdir();
# - the list is sorted because os.listdir() returns entries in arbitrary
#   order, which would make the seeded sampling and the positional indices
#   used further down differ between machines.
cell_lines = sorted(
    f for f in os.listdir(path)
    if not f.startswith('.') and os.path.isdir(os.path.join(path, f))
)

In [None]:
import random

In [None]:
def load_cell_line(cell_line, n_samples=3):
    """Load a random sample of images for one cell line.

    Parameters
    ----------
    cell_line : str
        Name of the sub-directory of the module-level ``path`` to read from.
    n_samples : int, optional
        Number of files to draw without replacement (default 3).

    Returns
    -------
    imgs : list
        Images returned by ``read_image`` for the sampled files.
    fnames : list of str
        Names of the sampled files, in the same order as ``imgs``.

    Raises
    ------
    ValueError
        If the directory holds fewer than ``n_samples`` matching files
        (raised by ``random.sample``).
    """
    imgpath = os.path.join(path, cell_line)
    # Sort first: os.listdir() order is arbitrary, so sampling from the raw
    # listing would not be reproducible for a fixed random seed.
    candidates = sorted(f for f in os.listdir(imgpath) if 'tiff' in f)
    # The original drew the sample but discarded it and returned None.
    fnames = random.sample(candidates, n_samples)
    imgs = [read_image(os.path.join(imgpath, f), verbose=False) for f in fnames]
    return imgs, fnames

In [None]:
# Draw three images per cell line with a fixed seed and keep the file names
# alongside the images so profile rows can be labelled later.
random.seed(2207)
imgs_br = []
titles = []
for cl in cell_lines:
    imgpath = os.path.join(path, cl)
    # Sort before sampling: os.listdir() order is arbitrary, so without the
    # sort the fixed seed would not select the same files on every machine.
    fnames = sorted(f for f in os.listdir(imgpath) if 'tiff' in f)
    # sample randomly 3 files
    fnames = random.sample(fnames, 3)
    imgs = [read_image(os.path.join(imgpath, f), verbose=False) for f in fnames]
    imgs_br.extend(imgs)
    titles.extend(fnames)

In [None]:
from skimage.filters import gaussian

In [None]:
def subtract_background(img, sigma):
    """Subtract a Gaussian-blurred copy of ``img`` from ``img``.

    Parameters
    ----------
    img : array
        Single-channel image.
    sigma : float
        Standard deviation of the Gaussian kernel used for the blurred copy.

    Returns
    -------
    array
        ``img`` minus its blurred version (may contain negative values).
    """
    # preserve_range=True keeps the blurred image in the value range of
    # `img`. Without it skimage rescales integer input to floats in [0, 1],
    # so the difference below would mix two different intensity scales.
    # For float input the flag leaves the values unchanged.
    background = gaussian(img, sigma=sigma, preserve_range=True)
    return img - background

In [None]:
# Background-subtract each slice along the last axis of every sampled image
# (sigma=50) and re-stack the slices along the last axis.
img_bgsub = [
    np.stack(
        [subtract_background(im[:, :, ch], sigma=50) for ch in range(im.shape[-1])],
        axis=-1,
    )
    for im in imgs_br
]

In [None]:
def minmax_scale(a):
    """Linearly rescale ``a`` so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    a : numpy.ndarray
        Non-empty array of numbers.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``a`` with values in [0, 1]. A constant
        array (max == min) is mapped to all zeros instead of NaN.
    """
    lo = a.min()
    span = a.max() - lo
    if span == 0:
        # Constant input: dividing by 1 avoids 0/0 -> NaN and yields zeros
        # with the same dtype the regular path would produce.
        span = 1
    return (a - lo) / span

def normalize_channels(img):
    """Min-max scale every slice along the last axis of ``img`` independently.

    Each ``img[:, :, c]`` is passed through ``minmax_scale`` and the results
    are stacked back along the last axis.
    """
    n_channels = img.shape[-1]
    scaled = []
    for c in range(n_channels):
        scaled.append(minmax_scale(img[:, :, c]))
    return np.stack(scaled, axis=-1)

In [None]:
# Scale each channel of every background-subtracted image to [0, 1].
imgs_norm = [normalize_channels(img) for img in img_bgsub ]

In [None]:
from bioimg import SegfreeProfiler
# Profiler configured with 30x30 tiles, 20 block types, 20 "supblock" types
# and 50 components (see SegfreeProfiler for the meaning of each setting).
segf = SegfreeProfiler(tile_size=(30,30),
                       n_block_types=20,
                       n_supblock_types=20,
                       n_components=50)

In [None]:
# Tile the normalized images; the result is indexed per image below.
tiles = segf.tile_color_images(imgs_norm)

In [None]:
# Show every entry of tiles[0] (presumably the tiles of the first image -
# TODO confirm) in a grid of 34 columns by 25 rows.
plot_channels([tiles[0][i] for i in range(tiles[0].shape[0])], 
              ncol=34, nrow=25, wspace=0.05, hspace=0.05)
plt.show()

In [None]:
# Fit the profiler on the normalized images and compute their profiles.
segf_prof = segf.fit_transform(imgs_norm)

In [None]:
# Cumulative explained-variance ratio of the profiler's PCA, with a dotted
# reference line at 1.
plt.plot(np.cumsum(segf.pca.explained_variance_ratio_), linewidth=3)
sn.despine()
plt.axhline(y=1, color='black', linestyle=':')
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative explained variance')

In [None]:
# number of optical sections: 3
nstacks = 3
# Reshape every PCA component to (*tile_size, nstacks) so it can be viewed as
# a small image ("eigentile").
eigentiles = segf.pca.components_.reshape((segf.n_components, *segf.tile_size, nstacks))

In [None]:
# For each component, plot the maximum of its eigentile over the last axis,
# arranged in a 5 x 10 grid.
plot_channels([np.max(eigentiles[i], axis=-1) for i in range(segf.n_components)],
              nrow=5, ncol=10, scale_x=2, scale_y=2)
plt.show()

In [None]:
# Label the profile rows with the sampled file names, with '.tiff' removed.
segf_prof.index = [t.replace('.tiff', '') for t in titles]

In [None]:
'''from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=1e-4).fit(segf_prof)
hmap = sn.clustermap(segf_prof.loc[:,sel.get_support()], metric='euclidean')'''

## Feature Selection by Replicate Correlation

In [None]:
def load_replicate(path, cell_lines, random_state):
    """Load one randomly chosen image per cell line.

    Parameters
    ----------
    path : str
        Directory that contains one sub-directory per cell line.
    cell_lines : iterable of str
        Sub-directory names to draw from, in output order.
    random_state : int
        Seed passed to ``random.seed``; note this reseeds the global
        ``random`` generator.

    Returns
    -------
    list
        One image (as returned by ``read_image``) per entry of ``cell_lines``.

    Raises
    ------
    IndexError
        If a cell-line directory contains no file whose name includes 'tiff'
        (raised by ``random.choice`` on an empty list).
    """
    random.seed(random_state)
    imgs = []
    for cl in cell_lines:
        imgpath = os.path.join(path, cl)
        # Sort before choosing: os.listdir() order is arbitrary, so without
        # the sort the same seed could pick different files on another run.
        fnames = sorted(f for f in os.listdir(imgpath) if 'tiff' in f)
        chosen = random.choice(fnames)
        imgs.append(read_image(os.path.join(imgpath, chosen), verbose=False))
    return imgs

In [None]:
# Three replicate sets; each draws one image per cell line with its own seed.
rep1 = load_replicate(path=path, cell_lines=cell_lines, random_state=2407)
rep2 = load_replicate(path=path, cell_lines=cell_lines, random_state=7042)
rep3 = load_replicate(path=path, cell_lines=cell_lines, random_state=2020)

In [None]:
def _preprocess_replicate(imgs, sigma=50):
    """Background-subtract and min-max scale each channel of every image.

    Mirrors the preprocessing applied to the images the profiler was fitted
    on (background subtraction with sigma=50, then per-channel scaling).
    """
    bgsub = [
        np.stack([subtract_background(im[:, :, i], sigma=sigma)
                  for i in range(im.shape[-1])], axis=-1)
        for im in imgs
    ]
    return [normalize_channels(im) for im in bgsub]


# The profiler was fitted on background-subtracted, normalized images, so the
# replicates must go through the same preprocessing before being transformed;
# feeding raw images would put them on a different intensity scale.
rep1_df = segf.transform(_preprocess_replicate(rep1))
rep2_df = segf.transform(_preprocess_replicate(rep2))
rep3_df = segf.transform(_preprocess_replicate(rep3))

In [None]:
# Stack the three replicate pairings (1 vs 2, 1 vs 3, 2 vs 3) row-wise and
# correlate each feature column between the two stacks.
# ignore_index=True gives both stacks a fresh 0..n-1 row index, so corrwith
# pairs rows strictly by position instead of relying on the duplicated row
# labels of the concatenated frames happening to line up.
df1 = pd.concat([rep1_df, rep1_df, rep2_df], axis=0, ignore_index=True)
df2 = pd.concat([rep2_df, rep3_df, rep3_df], axis=0, ignore_index=True)
repcor = df1.corrwith(df2)

In [None]:
repcor[repcor > 0.6]

In [None]:
from sklearn.preprocessing import StandardScaler
from bioimg.singlecell import scale_data, check_data
# Keep only the features whose replicate correlation exceeds 0.6.
# Note: this overwrites segf_prof with the reduced table.
segf_prof = segf_prof[repcor[repcor > 0.6].index]
# Fit a standard scaler on the retained features and apply it.
scaler = StandardScaler().fit(X=segf_prof)
segf_scaled = scale_data(segf_prof, scaler=scaler)
# Put the sample labels back on the scaled table.
segf_scaled.index = segf_prof.index
# Clustered heatmap of the scaled profiles (Euclidean distance).
hmap = sn.clustermap(segf_scaled, metric='euclidean')

In [None]:
segf_scaled.index[hmap.dendrogram_row.reordered_ind]

In [None]:
np.array(hmap.dendrogram_row.reordered_ind)

In [None]:
# Show each slice along the last axis of normalized image 18 in gray.
# The slice count is taken from the image being plotted (it was previously
# read from imgs_norm[0]).
example = imgs_norm[18]
plot_channels([example[:, :, i] for i in range(example.shape[-1])],
              scale_x=6, scale_y=6,
              nrow=1, ncol=3, cmap='gray')
plt.show()

## Colorectal Cancer Organoids in 3 Patients

In [None]:
# Rebind `path` to the raw colon images; everything below reads from here.
path = '../data/BortenMA_OrganoSeg_FileS3/Colon Raw Images/'

In [None]:
# One entry per patient sub-directory of `path`.
# Hidden entries and stray non-directory files are skipped because each entry
# is later passed to os.listdir(). The list is sorted because os.listdir()
# order is arbitrary, which would make positional image indices used further
# down differ between machines.
patients = sorted(
    f for f in os.listdir(path)
    if not f.startswith('.') and os.path.isdir(os.path.join(path, f))
)

In [None]:
# Load every image of every patient and keep the file names alongside.
random.seed(2207)
imgs_crc = []
titles = []
for pat in patients:
    imgpath = os.path.join(path, pat)
    # Sort: os.listdir() order is arbitrary, and later cells address images
    # by position, so the order must be the same on every run.
    fnames = sorted(f for f in os.listdir(imgpath) if 'tif' in f)
    imgs = [read_image(os.path.join(imgpath, f), verbose=False) for f in fnames]
    imgs_crc.extend(imgs)
    titles.extend(fnames)

In [None]:
# Keep only images whose last axis has length 3 (three optical sections),
# filtering titles and images together so they stay aligned.
kept = [(t, im) for t, im in zip(titles, imgs_crc) if im.shape[-1] == 3]
titles = [t for t, _ in kept]
imgs_crc = [im for _, im in kept]

In [None]:
len(imgs_crc)

In [None]:
# Background-subtract each slice along the last axis of every colon image
# (sigma=50) and re-stack the slices along the last axis.
img_bgsub = [
    np.stack(
        [subtract_background(im[:, :, ch], sigma=50) for ch in range(im.shape[-1])],
        axis=-1,
    )
    for im in imgs_crc
]

In [None]:
# Scale each channel of every background-subtracted image to [0, 1].
imgs_norm = [normalize_channels(img) for img in img_bgsub ]

In [None]:
# New profiler for the colon data: 20x20 tiles, 10 block types,
# 20 "supblock" types and 50 components.
segf = SegfreeProfiler(tile_size=(20,20),
                       n_block_types=10,
                       n_supblock_types=20,
                       n_components=50)

In [None]:
# Tile the normalized colon images.
tiles = segf.tile_color_images(imgs_norm)

In [None]:
'''plot_channels([tiles[0][i] for i in range(tiles[0].shape[0])], 
              ncol=43, nrow=32, wspace=0.05, hspace=0.05)
plt.show()'''

In [None]:
# Fit the profiler on the normalized colon images and compute their profiles.
segf_prof = segf.fit_transform(imgs_norm)

In [None]:
# Cumulative explained-variance ratio of the profiler's PCA, with a dotted
# reference line at 1.
plt.plot(np.cumsum(segf.pca.explained_variance_ratio_), linewidth=3)
sn.despine()
plt.axhline(y=1, color='black', linestyle=':')
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative explained variance')

In [None]:
# number of optical sections: 3
nstacks = 3
# Reshape every PCA component to (*tile_size, nstacks) so it can be viewed as
# a small image ("eigentile").
eigentiles = segf.pca.components_.reshape((segf.n_components, *segf.tile_size, nstacks))

In [None]:
# For each component, plot the maximum of its eigentile over the last axis,
# arranged in a 5 x 10 grid.
plot_channels([np.max(eigentiles[i], axis=-1) for i in range(segf.n_components)],
              nrow=5, ncol=10, scale_x=2, scale_y=2)
plt.show()

In [None]:
# Label the profile rows with the image file names without their extension.
# os.path.splitext strips whatever extension is present; the previous
# replace('.tif', '') turned 'name.tiff' into 'namef', and the file filter
# above ('tif' in f) admits both '.tif' and '.tiff' files.
segf_prof.index = [os.path.splitext(t)[0] for t in titles]
#segf_prof = segf_prof[[col for col in segf_prof.columns.values if 'superblock' in col]]

Path to colon cancer organoid morphological profiles:

In [None]:
# Directory with the Excel sheets of colon organoid measurements (relative path).
datadir = '../data/BortenMA_OrganoSeg_FileS3/Colon Excel Sheets/'

In [None]:
def parse_xls(fname):
    """Return the per-sheet column means of an Excel workbook.

    All sheets of ``fname`` are read and stacked, the 'Spheroid #' column is
    dropped, and the remaining columns are averaged within each sheet. The
    result has one row per sheet, indexed by sheet name.
    """
    # sheet_name=None makes read_excel return a dict {sheet name: DataFrame}.
    sheets = pd.read_excel(fname, sheet_name=None)
    # Concatenating the dict yields a (sheet name, row) MultiIndex; dropping
    # the last level leaves the sheet name as the only row label.
    stacked = pd.concat(sheets)
    stacked = stacked.droplevel(-1)
    stacked = stacked.drop(columns=['Spheroid #'])
    return stacked.groupby(stacked.index).mean()

In [None]:
# Parse every workbook of every patient into a list of per-sheet mean tables.
morph_data = []
for pat in patients:
    patdir = os.path.join(datadir, pat)
    # Skip hidden files (e.g. '.DS_Store') and Excel lock files ('~$...'),
    # which read_excel cannot parse; sort for a deterministic order.
    xls_files = sorted(
        f for f in os.listdir(patdir) if not f.startswith(('.', '~$'))
    )
    morph_data += [parse_xls(os.path.join(patdir, f)) for f in xls_files]

In [None]:
# Stack the per-workbook tables into a single DataFrame.
morph_df = pd.concat(morph_data)

In [None]:
# Keep only the morphology rows whose label also appears in the profile table.
morph_df = morph_df.loc[morph_df.index.isin(segf_prof.index)]

In [None]:
# Fit a standard scaler on the morphology table and apply it.
scaler = StandardScaler().fit(X=morph_df)
morphdf_scaled = scale_data(morph_df, scaler=scaler)

In [None]:
from sklearn.feature_selection import VarianceThreshold
# Drop near-constant profile features (variance threshold 1e-4), then fit a
# standard scaler on the remaining columns and apply it.
sel = VarianceThreshold(threshold=1e-4).fit(segf_prof)
segf_kept = segf_prof.loc[:, sel.get_support()]
scaler = StandardScaler().fit(X=segf_kept)
segf_scaled = scale_data(segf_kept, scaler=scaler)

In [None]:
# Put the row labels of the unscaled morphology table on the scaled one.
morphdf_scaled.index = morph_df.index

In [None]:
# Put the row labels of the unscaled profile table on the scaled one.
segf_scaled.index = segf_prof.index

In [None]:
# Correlate every profile feature with every morphology feature: join both
# tables column-wise under the keys 'segf' and 'morph', compute the full
# correlation matrix, and keep the block with 'segf' rows and 'morph' columns.
cormap = pd.concat([segf_scaled, morphdf_scaled], axis=1, keys=['segf', 'morph']).corr().loc['segf', 'morph']

In [None]:
# Largest absolute correlation in each column of cormap
# (one value per morphology feature).
maxcor = cormap.abs().max()

In [None]:
maxcor[maxcor > 0.5]

In [None]:
cormap.max().max()

In [None]:
from bioimg.singlecell import plot_heatmap
# Heatmap of the profile-vs-morphology correlations with all tick labels shown.
plot_heatmap(cormap, 
             xticklabels=True,
             yticklabels=True,
             size=(12.5,12))

## Feature Selection by Replicate Correlation
Create "virtual" replicates by splitting each image in two along its second axis (columns):

In [None]:
# Split every image into two halves along its second axis. The split column
# is derived from each image's own width instead of the hard-coded 432, so
# images of any width are cut in the middle (432 for an 864-column image).
rep1 = [img[:, :img.shape[1] // 2, :] for img in imgs_norm]
rep2 = [img[:, img.shape[1] // 2:, :] for img in imgs_norm]

In [None]:
# Sanity check: both replicate lists hold the same number of images.
assert(len(rep1) == len(rep2))

In [None]:
# Profile the first halves with the already fitted profiler.
rep1_df = segf.transform(rep1)

In [None]:
# Profile the second halves with the already fitted profiler.
rep2_df = segf.transform(rep2)

In [None]:
# Column-wise correlation between the two replicate profile tables.
repcor = rep1_df.corrwith(rep2_df)

In [None]:
repcor[repcor > 0.7]

In [None]:
# Cluster the scaled profiles using only features with replicate correlation
# above 0.7. segf_scaled holds just the columns that passed the variance
# threshold, while repcor covers every profile feature, so the selection is
# intersected with the available columns to avoid a KeyError.
selected = segf_scaled.columns.intersection(repcor[repcor > 0.7].index)
hmap = sn.clustermap(segf_scaled[selected], metric='euclidean')

In [None]:
#segf_scaled[repcor[repcor > 0.6].index].to_csv('colorectal_organoids.csv')

In [None]:
segf_scaled.index[hmap.dendrogram_row.reordered_ind]

In [None]:
np.array(hmap.dendrogram_row.reordered_ind)

In [None]:
# Show the maximum projection (over the last axis) of one normalized image,
# titled with its file name without extension.
ind = 27
plt.figure(figsize=(10,8))
plt.imshow(np.max(imgs_norm[ind], axis=-1), cmap='gray')
# splitext handles both '.tif' and '.tiff'; replace('.tif', '') would leave a
# trailing 'f' on '.tiff' names.
plt.title(os.path.splitext(titles[ind])[0])
plt.axis('off')