# Introduction: Segmentation-free profiling

In [None]:
# load third-party Python modules
import javabridge
import bioformats as bf
import skimage
import numpy as np
import matplotlib.pyplot as plt
import sys
import os
import pandas as pd

# Start the Java VM with the bioformats JARs on its class path.
# NOTE(review): presumably required by the image readers used below; the VM is
# never shut down in this notebook (javabridge.kill_vm) — confirm if intended.
javabridge.start_vm(class_path=bf.JARS)

In [None]:
# Root directory of the image set; load_drug_images looks for each plate's
# images in a sub-directory named by the plate ID.
# NOTE(review): absolute, machine-specific mount point — adjust before running elsewhere.
path = '/Volumes/gitlab/microscopy-notebooks/data/cytodata/datasets/CDRPBIO-BBBC036-Bray/images/CDRPBIO-BBBC036-Bray'

In [None]:
# Plate annotation table (tab-separated). Columns used later in this notebook:
# 'CPD_NAME', 'broad_sample', 'plateID' and 'well_position'.
chemannot = pd.read_csv("../data/Bray-metadata/plate_annot.txt", sep='\t')

In [None]:
# Wells flagged as noisy: keep only the rows whose 'count' exceeds 3.
noisywells = pd.read_csv('../data/Bray-noisywells.csv', index_col=0)
noisywells = noisywells.loc[noisywells['count'] > 3].reset_index(drop=True)

# Anti-join: merge on the shared columns with an indicator column, then keep
# only the annotation rows that have no match among the noisy wells.
chemannot = chemannot.merge(noisywells[['plateID', 'well_position']],
                            how='outer', indicator=True)
chemannot = chemannot.loc[chemannot['_merge'] == 'left_only']
chemannot = chemannot.drop(columns='_merge').reset_index(drop=True)

In [None]:
# Compounds with strong phenotypes, selected from the mean well profiles.
drugs = [
    'vinblastine sulfate',
    'dorsomorphin',
    'amthamine',
    'carboxin',
    'resveratrol',
    'KN-93',
]

In [None]:
def load_drug_images(path, chemannot, drug, which=0):
    '''Return the CellPainting image file names for one well treated with
    `drug` and for one DMSO control well on the same plate.

    Parameters
    ----------
    path : str
        Directory holding one sub-directory per plate, named by plate ID.
    chemannot : pandas.DataFrame
        Annotation table with the columns 'CPD_NAME', 'broad_sample',
        'plateID' and 'well_position'.
    drug : str
        Compound name, matched exactly against 'CPD_NAME'.
    which : int
        Position of the well to use among the rows annotated with `drug`.
        The same position selects the DMSO well among that plate's controls.

    Returns
    -------
    imgpath : str
        Directory of the selected plate.
    cmpd_imgs : list of str
        File names in `imgpath` containing "_<well>_" for the compound well.
    ctrl_imgs : list of str
        File names in `imgpath` containing "_<well>_" for the control well.
        Both lists are in lexicographic order.

    Raises
    ------
    KeyError
        If `which` does not address an existing compound or control row.
    '''
    cmpd_annot = chemannot[chemannot['CPD_NAME'] == drug].reset_index(drop=True)
    if not 0 <= which < len(cmpd_annot):
        raise KeyError("no well number {} annotated with compound {!r} "
                       "({} available)".format(which, drug, len(cmpd_annot)))
    plate = cmpd_annot['plateID'][which]
    cmpd_well = cmpd_annot['well_position'][which]

    imgpath = os.path.join(path, str(plate))
    # os.listdir returns entries in arbitrary order. Sort them so the order of
    # the returned files (and of any channel stack built from them) is
    # reproducible across runs and file systems.
    imgfiles = sorted(os.listdir(imgpath))
    cmpd_imgs = [f for f in imgfiles if "_" + cmpd_well + "_" in f]

    # DMSO control wells located on the same plate as the compound well.
    is_ctrl = (chemannot['broad_sample'] == 'DMSO') & (chemannot['plateID'] == plate)
    ctrl_annot = chemannot.loc[is_ctrl, :].reset_index(drop=True)
    if which >= len(ctrl_annot):
        raise KeyError("no DMSO well number {} on plate {} "
                       "({} available)".format(which, plate, len(ctrl_annot)))
    ctrl_well = ctrl_annot['well_position'][which]
    ctrl_imgs = [f for f in imgfiles if "_" + ctrl_well + "_" in f]

    return imgpath, cmpd_imgs, ctrl_imgs

In [None]:
# Example: first annotated vinblastine sulfate well and the DMSO control
# selected on the same plate.
imgpath, cmpd_imgs, ctrl_imgs = load_drug_images(
    path=path,
    chemannot=chemannot,
    drug='vinblastine sulfate',
    which=0,
)

In [None]:
from bioimg import read_image, load_image_series
from bioimg import plot_channels, combine_channels
# Read every image of the compound well whose file name contains 's1'.
# NOTE(review): 's1' is a plain substring test — presumably it selects one
# field of view; it would also match names containing e.g. 's10'. Confirm.
imgs = [
    read_image(fname=os.path.join(imgpath, name))
    for name in cmpd_imgs
    if 's1' in name
]

In [None]:
# Display settings forwarded as keyword arguments to combine_channels;
# each list has one entry per channel (five channels).
col_params = dict(
    colors=['blue', 'yellow', 'green', 'white', 'red'],
    blend=[2, 0.5, 1, 0.5, 1],
    gamma=[0.5, 0.8, 0.6, 0.5, 0.7],
)

In [None]:
# Display the compound-well images merged by combine_channels with the
# per-channel settings in col_params, without axes.
# NOTE(review): combine_channels presumably returns one colour composite — confirm in bioimg.
plt.figure(figsize=(10,8))
plt.imshow(combine_channels(imgs, **col_params))
plt.axis('off')

In [None]:
# Read every image of the DMSO control well whose file name contains 's1'
# (same substring filter as used for the compound well).
dmso_imgs = [
    read_image(fname=os.path.join(imgpath, name))
    for name in ctrl_imgs
    if 's1' in name
]

In [None]:
# Display the DMSO control images merged by combine_channels with the same
# per-channel settings, without axes, for comparison with the compound well.
plt.figure(figsize=(10,8))
plt.imshow(combine_channels(dmso_imgs, **col_params))
plt.axis('off')

## Segmentation-free profiling

**Load images (Hoechst) and run K-means to cluster the blocks, save centroids**

In [None]:
# Compounds used to build the training set.
# NOTE(review): this repeats the `drugs` list defined earlier in the notebook.
drugs = [
    'vinblastine sulfate',
    'dorsomorphin',
    'amthamine',
    'carboxin',
    'resveratrol',
    'KN-93',
]

In [None]:
# Build the training set. For each drug and each of its first three annotated
# wells, stack the selected images along a new last axis: first the compound
# well, then the DMSO control well returned for the same plate.
# NOTE(review): 's3' is a substring filter on the file name — presumably a
# field-of-view tag; confirm. The stacking order follows the file-name order
# returned by load_drug_images.
fview = 's3'
imglist = []
for d in drugs:
    for i in range(3):
        imgpath, cmpd_imgs, ctrl_imgs = load_drug_images(
            path=path, chemannot=chemannot, drug=d, which=i)

        imgs = [read_image(fname=os.path.join(imgpath, f), verbose=False)
                for f in cmpd_imgs if fview in f]
        dmso_imgs = [read_image(fname=os.path.join(imgpath, f), verbose=False)
                     for f in ctrl_imgs if fview in f]

        imglist.append(np.stack(imgs, axis=-1))
        imglist.append(np.stack(dmso_imgs, axis=-1))

In [None]:
# Keep only the first channel (index 0 on the last axis) of every stack.
# NOTE(review): presumably channel 0 is the Hoechst stain, as the variable
# name suggests — this depends on the order in which files were stacked; confirm.
hoechst = [stack[:, :, 0] for stack in imglist]

In [None]:
from bioimg import threshold_img
from skimage.util import img_as_ubyte
def preprocess(imgs, thresh=True):
    '''Prepare a collection of images for profiling.

    Each image is optionally passed through `threshold_img` with
    method='otsu' (when `thresh` is true) and then converted with
    `img_as_ubyte`. Returns the processed images as a list.
    '''
    prepared = imgs
    if thresh:
        prepared = [threshold_img(frame, method='otsu') for frame in prepared]
    return list(map(img_as_ubyte, prepared))

In [None]:
# Threshold (default thresh=True) and convert the first-channel images.
train_imgs = preprocess(imgs=hoechst)

In [None]:
from bioimg import SegfreeProfiler
# Profiler configured for 10x10 tiles, 20 block types and 20 super-block types.
# NOTE(review): the meaning of the *_types parameters is defined in bioimg —
# presumably numbers of clusters; confirm.
segf = SegfreeProfiler(tile_size=(10,10),
                       n_block_types=20,
                       n_supblock_types=20)

In [None]:
# Tile the training images with the profiler.
# NOTE(review): `tiles` is only referenced by the disabled preview cell that
# follows; this cell may be unnecessary otherwise.
tiles = segf.tile_images(train_imgs)

In [None]:
# Disabled tile preview: the call is wrapped in a string literal, so it is
# not executed.
'''plot_channels(images=[tiles[1][i] for i in range(tiles[1].shape[0])],
              nrow=10, ncol=12, wspace=0.05, hspace=0.05)'''

In [None]:
#segf = segf.fit(train_imgs)

In [None]:
# Fit the profiler on the preprocessed first-channel images and compute their
# profiles in a single call.
segf_prof = segf.fit_transform(train_imgs)

Re-run everything above:

In [None]:
#segf_prof = segf.transform(train_imgs)

In [None]:
# Inspect the dimensions of the profile table.
segf_prof.shape

In [None]:
import seaborn as sn
from sklearn.feature_selection import VarianceThreshold
# Fit a low-variance filter (threshold 1e-4) on the profiles and draw a
# clustered heatmap of the features it keeps.
sel = VarianceThreshold(threshold=1e-4).fit(segf_prof)
sn.clustermap(segf_prof.loc[:,sel.get_support()])

In [None]:
# Names of the features kept by the variance filter.
segf_prof.columns[sel.get_support()]

## Aggregate all channels

In [None]:
# Fit the same profiler object on the full multi-channel stacks and rebind
# segf_prof to the resulting profiles.
#segf = segf.fit(imglist)
segf_prof = segf.fit_transform(imglist)

In [None]:
import seaborn as sn
# Cumulative explained-variance ratio of the profiler's PCA, with a dotted
# reference line at 1.
plt.plot(np.cumsum(segf.pca.explained_variance_ratio_), linewidth=3)
sn.despine()
plt.axhline(y=1, color='black', linestyle=':')
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative explained variance')

In [None]:
# Number of channels used as the last axis of the reshape below.
nchan = 5
# View every PCA component as an array of shape (*tile_size, nchan), giving
# eigentiles the shape (n_components, *tile_size, nchan).
# NOTE(review): assumes each component was flattened in (row, column, channel)
# order — confirm against SegfreeProfiler.
eigentiles = segf.pca.components_.reshape((segf.n_components, *segf.tile_size, nchan))

In [None]:
# Show the first 50 eigentiles on a 5 x 10 grid, each rendered by
# combine_channels from its five channel planes.
# NOTE(review): 5 and 50 are hard-coded; this needs at least 50 components.
plot_channels(
    [combine_channels([eigentiles[comp, :, :, chan] for chan in range(5)],
                      **col_params)
     for comp in range(50)],
    nrow=5, ncol=10, scale_x=2, scale_y=2)
plt.show()

In [None]:
#segf_prof = segf.transform(imglist)

In [None]:
# Refit the low-variance filter (threshold 1e-4) on the multi-channel
# profiles and draw a clustered heatmap of the features it keeps.
sel = VarianceThreshold(threshold=1e-4).fit(segf_prof)
sn.clustermap(segf_prof.loc[:,sel.get_support()])

Normalize each channel before fitting:

In [None]:
def minmax_scale(a):
    '''Linearly rescale the values of `a` to the range [0, 1].

    Parameters
    ----------
    a : numpy.ndarray
        Values to rescale. Any object providing `min()`, `max()` and
        element-wise arithmetic is handled the same way.

    Returns
    -------
    The rescaled values, ``(a - min) / (max - min)``. When all values are
    equal (zero range) zeros are returned instead of the NaNs that a 0/0
    division would produce.
    '''
    lo = a.min()
    span = a.max() - lo
    # A constant input has zero range: avoid 0/0 -> NaN, which would
    # otherwise propagate into anything fitted on the result.
    if np.ndim(span) == 0 and span == 0:
        return (a - lo) * 0.0
    return (a - lo) / span

def normalize_channels(img):
    '''Apply `minmax_scale` to every channel of `img` independently.

    Channels are the slices ``img[:, :, i]`` along the last axis; the scaled
    slices are stacked back along a new last axis.
    '''
    scaled = []
    for chan in range(img.shape[-1]):
        scaled.append(minmax_scale(img[:, :, chan]))
    return np.stack(scaled, axis=-1)

In [None]:
# Per-channel min-max normalization of every image stack.
imgs_norm = list(map(normalize_channels, imglist))

In [None]:
# Fit the same profiler object on the normalized stacks and rebind segf_prof.
# NOTE(review): the disabled line below still refers to imglist, not imgs_norm.
#segf = segf.fit(imglist)
segf_prof = segf.fit_transform(imgs_norm)

In [None]:
# Eigentiles of the PCA fitted on the normalized images: view each component
# as an array of shape (*tile_size, nchan) and show the first 50 on a
# 5 x 10 grid.
# NOTE(review): 50 is hard-coded; this needs at least 50 components.
nchan = 5
eigentiles = segf.pca.components_.reshape(
    (segf.n_components, *segf.tile_size, nchan))
plot_channels(
    [combine_channels([eigentiles[comp, :, :, chan] for chan in range(nchan)],
                      **col_params)
     for comp in range(50)],
    nrow=5, ncol=10, scale_x=2, scale_y=2)
plt.show()

In [None]:
# Refit the low-variance filter (threshold 1e-4) on the profiles of the
# normalized images and draw a clustered heatmap of the features it keeps.
sel = VarianceThreshold(threshold=1e-4).fit(segf_prof)
sn.clustermap(segf_prof.loc[:,sel.get_support()])

In [None]:
# Names of the features kept by the variance filter.
segf_prof.loc[:,sel.get_support()].columns