## Morphological Feature Selection

In [None]:
# load third-party Python modules
import javabridge
import bioformats as bf
import skimage
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd
import re
import os
import sys
import h5py
import json
from sklearn.manifold import TSNE

# start the Java VM with the Bio-Formats JARs on its class path
# NOTE(review): no javabridge.kill_vm() call and no bioformats reader call is
# visible in this notebook -- confirm the JVM is actually needed here
javabridge.start_vm(class_path=bf.JARS)

Function for loading viable cancer cells (`class = 2` output by the object detection model):

In [None]:
def load_viable_cells(platedir, wells, annot):
    """Load the per-cell feature rows of viable cells (``class == 2``).

    Parameters
    ----------
    platedir : str
        Directory holding one ``<well>.csv`` table per well.
    wells : iterable of str
        Well identifiers; each one is read from ``<platedir>/<well>.csv``.
    annot : pandas.DataFrame
        Annotation table with a ``well`` column; it is merged onto the labels.

    Returns
    -------
    imgdf : pandas.DataFrame
        Rows with ``class == 2`` from all wells, without the ``class`` and
        ``file`` columns, re-indexed from 0.
    labels : pandas.DataFrame
        ``class``, ``file`` and derived ``well`` of those rows, merged with
        ``annot`` on ``well``.
    """
    viable = []
    for w in wells:
        df = pd.read_csv(os.path.join(platedir, w + '.csv'))
        viable.append(df[df['class'] == 2])
    imgdf = pd.concat(viable).reset_index(drop=True)
    # explicit copy: columns are added/overwritten below, which must not be
    # done on a slice of imgdf (chained assignment / SettingWithCopyWarning)
    labels = imgdf[['class', 'file']].copy()
    imgdf = imgdf.drop(['class', 'file'], axis=1)
    # the well id is the file name with everything from 'f<digit>' onwards removed
    labels['well'] = labels['file'].replace(regex=r'f[0-9].+', value='')
    labels['class'] = labels['class'].apply(lambda x: 'Viable' if x == 2 else 'Apoptotic')
    # default (inner) merge: rows whose well has no entry in `annot` are
    # dropped from `labels` but remain in `imgdf`
    labels = pd.merge(labels, annot, on='well')
    return imgdf, labels

Load plate annotation file:

In [None]:
# load plate annotation (tab-separated text file)
annot_df = pd.read_csv('../data/AML_trainset/drugannot.txt',
                      sep='\t')

In [None]:
# preview the first rows of the annotation table
annot_df.head()

In [None]:
# annotation rows of the DMSO control wells, re-indexed from 0
is_dmso = annot_df['Drug'] == 'DMSO'
dmso = annot_df.loc[is_dmso].reset_index(drop=True)
# distinct well ids among those rows
dmso_wells = dmso['well'].unique()

Load viable cells from DMSO control wells:

In [None]:
# viable-cell features and labels of the DMSO wells on plate 181109_Plate1
ctrl_df, ctrl_annot = load_viable_cells(platedir='../imgdata/181109_Plate1',
                            wells=dmso_wells, annot=dmso)

Remove features with near-zero variance, using `threshold=1e-8`:

In [None]:
from bioimg.singlecell import select_features
from sklearn.feature_selection import VarianceThreshold
# fit the variance filter on the control cells only; the fitted selector `sel`
# is passed to preprocess_data for both replicate plates further down
sel = VarianceThreshold(threshold=1e-8).fit(ctrl_df)

In [None]:
# apply the fitted selector to the control table (project helper; presumably
# keeps only the columns retained by `sel` -- verify in bioimg.singlecell)
ctrl_df = select_features(df=ctrl_df, sel=sel)

Apply generalized logarithm-transform:

$$g(x) =  \log{\left(\frac{x+\sqrt{x^2+c^2}}{2}+ \varepsilon\right) }$$

where $c$ is the minimum value of each feature and $\varepsilon$ is a small constant that prevents taking the logarithm of zero.

In [None]:
def glog_transform(df, eps=1e-8):
    """Apply the generalized-log transform to every column of ``df``.

    Each column ``x`` is mapped to ``log((x + sqrt(x**2 + c**2)) / 2 + eps)``
    with ``c`` the minimum of that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are transformed independently.
    eps : float, optional
        Constant added inside the logarithm.

    Returns
    -------
    pandas.DataFrame
        Transformed table as returned by ``df.apply``.
    """
    def _glog(col):
        # c is column-specific: the smallest observed value of this feature
        c = col.min()
        root = np.sqrt(col**2 + c**2)
        return np.log((col + root)/2 + eps)

    return df.apply(_glog)

In [None]:
# glog-transform the control features
# NOTE: ctrl_df is overwritten, so re-running this cell transforms the data again
ctrl_df = glog_transform(df=ctrl_df)

In [None]:
from sklearn.preprocessing import StandardScaler
from bioimg.singlecell import scale_data, check_data

In [None]:
# fit centring/scaling statistics on the (transformed) control cells
scaler = StandardScaler().fit(ctrl_df)
# scale with the project helper and reset the row index to 0..n-1
ctrl_scaled = scale_data(ctrl_df, scaler=scaler).reset_index(drop=True)

In [None]:
# project helper; presumably a sanity check of the scaled table -- see
# bioimg.singlecell.check_data for what is verified
check_data(ctrl_scaled)

In [None]:
# feature correlation
# np.corrcoef treats rows as variables by default, hence the transpose
# (features become rows); the result is labelled with the feature names
featcor = pd.DataFrame(np.corrcoef(ctrl_scaled.T),
                       index=ctrl_scaled.columns,
                       columns=ctrl_scaled.columns )

In [None]:
from bioimg.singlecell import plot_heatmap
# draw the feature-by-feature correlation matrix (project plotting helper)
plot_heatmap(featcor)

Inspect features that are only weakly correlated with the rest (sum of absolute correlations below 30):

In [None]:
# first 30 feature names whose summed absolute correlation over all features
# (the diagonal self-correlation included) is below 30
featcor.columns[featcor.abs().sum(axis=1) < 30].values[:30]

## Feature selection based on replicate correlation
Load all wells from 2 biological replicate plates:

In [None]:
# well ids of every row of the annotation table (not de-duplicated)
all_wells = annot_df['well'].values

In [None]:
# replicate 1: viable cells of all annotated wells on plate 181109_Plate1
rep1_df, rep1_annot = load_viable_cells(platedir='../imgdata/181109_Plate1', 
                                        wells=all_wells,
                                           annot=annot_df)

In [None]:
from bioimg.singlecell import preprocess_data
# reuse the selector fitted on the control wells; glog=True presumably applies
# the glog transform inside the helper -- verify in bioimg.singlecell
rep1_df = preprocess_data(df=rep1_df, sel=sel, glog=True)
# center and scale by control wells
# NOTE(review): the boolean mask is built from rep1_annot, so this relies on
# rep1_df and rep1_annot sharing the same row index
scaler = StandardScaler().fit(rep1_df[rep1_annot['Drug']=='DMSO'])
rep1_scaled = scale_data(rep1_df, scaler=scaler)

In [None]:
# replicate 2: viable cells of all annotated wells on plate 181109_Plate4
rep2_df, rep2_annot = load_viable_cells(platedir='../imgdata/181109_Plate4', 
                                        wells=all_wells,
                                       annot=annot_df)

In [None]:
# same preprocessing as for replicate 1, with the selector fitted on the controls
rep2_df = preprocess_data(df=rep2_df, sel=sel, glog=True)
# center and scale by control wells
# NOTE(review): the boolean mask is built from rep2_annot, so this relies on
# rep2_df and rep2_annot sharing the same row index
scaler = StandardScaler().fit(rep2_df[rep2_annot['Drug']=='DMSO'])
rep2_scaled = scale_data(rep2_df, scaler=scaler)

In [None]:
# sanity check: both preprocessed replicates have the same number of columns
rep1_df.shape[1] == rep2_df.shape[1]

In [None]:
# sanity check: both scaled replicates have the same number of columns
rep1_scaled.shape[1] == rep2_scaled.shape[1]

Compute mean well profiles for both replicate plates:

In [None]:
from bioimg.singlecell import aggregate_profiles
# collapse single-cell rows into one profile per well for each replicate
# (project helper; the result is used below with a 'well' column)
prof_rep1 = aggregate_profiles(rep1_scaled, rep1_annot)
prof_rep2 = aggregate_profiles(rep2_scaled, rep2_annot)

In [None]:
# Restrict both profile tables to the wells they have in common:
# first drop replicate-1 wells missing from replicate 2, then the reverse
in_rep2 = np.isin(prof_rep1['well'], prof_rep2['well'])
prof_rep1 = prof_rep1[in_rep2]
in_rep1 = np.isin(prof_rep2['well'], prof_rep1['well'])
prof_rep2 = prof_rep2[in_rep1]

Simple function to compute, for every morphological feature, the correlation of its well-level values between two biological replicate plates:

In [None]:
def get_repcor(prof1, prof2):
    """Correlate every feature between two replicate well-profile tables.

    ``DataFrame.corrwith`` pairs rows by index label, not by position, so
    sorting both tables by ``well`` does not line the wells up. The tables are
    therefore indexed by ``well`` first, which pairs each well with itself
    regardless of row order or of the original index.

    Parameters
    ----------
    prof1, prof2 : pandas.DataFrame
        Per-well profiles with a ``well`` column and one column per feature.
        Each well is assumed to occur at most once per table.

    Returns
    -------
    pandas.Series
        Correlation per numeric feature column, computed over the wells
        present in both tables.
    """
    # non-numeric columns (e.g. annotations) cannot be correlated
    rep1 = prof1.set_index('well').select_dtypes(include='number')
    rep2 = prof2.set_index('well').select_dtypes(include='number')
    repcor = rep1.corrwith(rep2)
    return repcor

In [None]:
# per-feature correlation between the two replicate plates
repcor = get_repcor(prof_rep1, prof_rep2)

In [None]:
# names of features with replicate correlation of at least 0.6
# (NaN correlations fail the comparison and are excluded)
sel_feats = repcor[repcor >= 0.6].index.values
# scaled control cells restricted to those features
X_subset = ctrl_scaled[sel_feats]

In [None]:
# create a dictionary with various selected feature lists
featdict = dict()
# 'repcor': features that passed the replicate-correlation cut-off
featdict['repcor'] = sel_feats

In [None]:
# feature correlation
# correlation among the replicate-reproducible features, computed on the
# scaled control cells (transpose: np.corrcoef expects variables in rows)
featcor = pd.DataFrame(np.corrcoef(X_subset.T),
                       index=X_subset.columns,
                       columns=X_subset.columns )
plot_heatmap(featcor)

In [None]:
# replicate correlations of the selected features whose name contains 'Calcein'
repcor[[f for f in sel_feats if 'Calcein' in f]]

In [None]:
# replicate correlations of the selected features whose name contains 'Hoechst'
repcor[[f for f in sel_feats if 'Hoechst' in f]]

In [None]:
# replicate correlations of the selected features whose name contains 'Lysosomal'
repcor[[f for f in sel_feats if 'Lysosomal' in f]]

In [None]:
# NOTE: strict '>' here, whereas sel_feats above was selected with '>= 0.6'
highcor = repcor[repcor > 0.6]

In [None]:
# table with one row per highly correlated feature: its name and correlation
highcor_df = pd.DataFrame({'feature': highcor.index, 'repcor': highcor.values})

In [None]:
# placeholder channel label, filled in per channel in the next cell
# np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
# object dtype so that string labels can be assigned without a dtype conflict
highcor_df['channel'] = pd.Series(np.nan, index=highcor_df.index, dtype=object)

In [None]:
# tag each feature with the channel named in its identifier; channels are
# processed in this order, so a later match overrides an earlier one
for channel_name in ('Lysosomal', 'Hoechst', 'Calcein'):
    has_channel = highcor_df['feature'].str.contains(channel_name)
    highcor_df.loc[has_channel, 'channel'] = channel_name

In [None]:
# xkcd colour names for the three channel bars
# NOTE(review): bar order comes from the groupby on 'channel' in the plotting
# cell -- confirm each colour lands on the intended channel
colors = ["medium green", "denim blue", "pale red"]
pal = sn.xkcd_palette(colors)

In [None]:
plt.figure(figsize=(5,4))
# number of rows per channel; after the count, the 'feature' column holds
# the number of features in each channel
channel_counts = highcor_df.groupby('channel', as_index=False).agg('count')
sn.barplot(data=channel_counts, y='channel', x='feature', palette=pal)
plt.xlabel('Number of features ($r>0.6$)')
plt.ylabel('')
sn.despine()
#plt.savefig('../figures/repcor-features.pdf', bbox_inches='tight')

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
# dashed identity line as visual reference
ax.plot([-2, 0.5], [-2, 0.5], linewidth=1.5, linestyle='--', color='black')
# per-well values of both replicates joined on the well id; the merge
# suffixes the shared column name with _x (rep 1) and _y (rep 2)
ecc_pairs = pd.merge(prof_rep1[['well', 'ch-Calcein-eccentricity']],
                     prof_rep2[['well', 'ch-Calcein-eccentricity']],
                     on='well')
sn.scatterplot(x='ch-Calcein-eccentricity_x', y='ch-Calcein-eccentricity_y',
               data=ecc_pairs, ax=ax, facecolor='green')

sn.despine()
# print the replicate correlation in the upper-left corner (axes coordinates)
ax.annotate("r = {:.2f}".format(repcor['ch-Calcein-eccentricity']),
            xy=(.1, .9), xycoords=ax.transAxes)
ax.set_xlim((-2,0.5))
ax.set_ylim((-2,0.5))

ax.set_xlabel('Calcein eccentricity (biol rep 1)')
ax.set_ylabel('Calcein eccentricity (biol rep 2)')
# hide the label of the first major tick on each axis
xticks = ax.xaxis.get_major_ticks()
xticks[0].label1.set_visible(False)
yticks = ax.yaxis.get_major_ticks()
yticks[0].label1.set_visible(False)
# called after the figure is built, so presumably only influences later figures
sn.set_palette(sn.xkcd_palette(['medium green']))
sn.set(font_scale=1.4, style='white')
#fig.savefig('../figures/eccentricity-repcor.pdf', bbox_inches='tight')

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
# dashed identity line as visual reference
ax.plot([-3, 2], [-3, 2], linewidth=1.5, linestyle='--', color='black')
# per-well values of both replicates joined on the well id; the merge
# suffixes the shared column name with _x (rep 1) and _y (rep 2)
lyso_pairs = pd.merge(prof_rep1[['well', 'ch-Lysosomal-mean_intensity']],
                      prof_rep2[['well', 'ch-Lysosomal-mean_intensity']],
                      on='well')
sn.scatterplot(x='ch-Lysosomal-mean_intensity_x', y='ch-Lysosomal-mean_intensity_y',
               data=lyso_pairs, ax=ax, facecolor='firebrick')
sn.despine()
# print the replicate correlation in the upper-left corner (axes coordinates)
ax.annotate("r = {:.2f}".format(repcor['ch-Lysosomal-mean_intensity']),
            xy=(.1, .9), xycoords=ax.transAxes)
ax.set_xlim((-3.1,2.1))
ax.set_ylim((-3.1,2.1))
ax.set_xlabel('Mean lysosomal intensity (biol rep 1)')
ax.set_ylabel('Mean lysosomal intensity (biol rep 2)')
# hide the label of the first major tick on each axis
xticks = ax.xaxis.get_major_ticks()
xticks[0].label1.set_visible(False)
yticks = ax.yaxis.get_major_ticks()
yticks[0].label1.set_visible(False)
# called after the figure is built, so presumably only influences later figures
sn.set(font_scale=1.4, style='white')
#fig.savefig('../figures/lysos-intensity-repcor.pdf', bbox_inches='tight')

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
# dashed identity line as visual reference
ax.plot([-2, 2], [-2, 2], linewidth=1.5, linestyle='--', color='black')
# per-well values of both replicates joined on the well id; the merge
# suffixes the shared column name with _x (rep 1) and _y (rep 2)
infomeas_pairs = pd.merge(prof_rep1[['well', 'ch-Hoechst-InfoMeas1-d7-3']],
                          prof_rep2[['well', 'ch-Hoechst-InfoMeas1-d7-3']],
                          on='well')
sn.scatterplot(x='ch-Hoechst-InfoMeas1-d7-3_x', y='ch-Hoechst-InfoMeas1-d7-3_y',
               data=infomeas_pairs, ax=ax)
sn.despine()
# print the replicate correlation in the upper-left corner (axes coordinates)
ax.annotate("r = {:.2f}".format(repcor['ch-Hoechst-InfoMeas1-d7-3']),
            xy=(.1, .9), xycoords=ax.transAxes)
ax.set_xlim((-2.1,2))
ax.set_ylim((-2.1,2))
ax.set_xlabel('Hoechst InfoMeas1 [$d=7$] (biol rep 1)')
ax.set_ylabel('Hoechst InfoMeas1 [$d=7$] (biol rep 2)')
# hide the label of the first major tick on each axis
xticks = ax.xaxis.get_major_ticks()
xticks[0].label1.set_visible(False)
yticks = ax.yaxis.get_major_ticks()
yticks[0].label1.set_visible(False)
# called after the figure is built, so presumably only influences later figures
sn.set(font_scale=1.4, style='white')
#fig.savefig('../figures/Hoechst-infomeas-repcor.pdf', bbox_inches='tight')

In [None]:
# keep only the replicate-reproducible feature columns in both profile tables
# (the tables are overwritten, so any column not in sel_feats is gone afterwards)
prof_rep1, prof_rep2 = prof_rep1[sel_feats], prof_rep2[sel_feats]

## Feature Selection Based on Residuals
Select initial feature set:

In [None]:
from bioimg.singlecell import select_residcor
# residual-based selection (project helper), started from one feature per channel;
# `sel` is presumably the initial feature set named in the text above -- verify
# NOTE: sel_feats is rebound here and no longer holds the replicate-correlation list
sel_feats = select_residcor(prof1=prof_rep1, prof2=prof_rep2,
                            sel = ['ch-Calcein-eccentricity', 
                                   'ch-Hoechst-mean_intensity',
                                   'ch-Lysosomal-mean_intensity'])

In [None]:
# report how many features the residual-based selection returned
print("Number of selected features: %d" % len(sel_feats))

In [None]:
# 'residcor': features returned by select_residcor
featdict['residcor'] = sel_feats

In [None]:
# scaled control cells restricted to the residual-selected features
X_subset = ctrl_scaled[sel_feats]
# axis labels: feature names with the 'ch-' substring removed
featnames = [f.replace('ch-', '') for f in X_subset.columns]
# feature correlation
featcor = pd.DataFrame(np.corrcoef(X_subset.T),
                       index=featnames,
                       columns=featnames )
plot_heatmap(featcor, xticklabels=True, size=(12,14))
#plt.savefig('../figures/residcor-featsel.pdf', bbox_inches='tight')

## Further Feature Selection

In [None]:
# binary target for the control cells: Mono-culture -> 0, Co-culture -> 1
culture_labels = ctrl_annot['Culture'].values
y = np.copy(culture_labels)
is_mono = y == 'Mono-culture'
y[is_mono] = 0
is_co = y == 'Co-culture'
y[is_co] = 1
# any other label left in the array makes this integer cast fail
y = y.astype(int)

Univariate feature selection method:

In [None]:
from sklearn.feature_selection import SelectKBest, SelectFdr, f_classif
from bioimg.singlecell import select_features
# keep the 20 features with the highest f_classif score for the culture label
# The selector gets its own name so this cell does not rebind `sel`, the fitted
# VarianceThreshold that the preprocess_data calls further up depend on
kbest_sel = SelectKBest(f_classif, k=20).fit(X=X_subset, y=y)
X_kbest = select_features(df=X_subset, sel=kbest_sel)

In [None]:
# names of the features kept by SelectKBest
X_kbest.columns

In [None]:
# univariate selection by false discovery rate (alpha = 0.01) on f_classif p-values
# The selector gets its own name so this cell does not rebind `sel`, the fitted
# VarianceThreshold that the preprocess_data calls further up depend on
fdr_sel = SelectFdr(f_classif, alpha=0.01).fit(X=X_subset, y=y)
X_fdr = select_features(df=X_subset, sel=fdr_sel)

In [None]:
# (rows, columns) of the table after FDR-based selection
X_fdr.shape

In [None]:
# first 20 column names of the candidate feature table, for comparison
X_subset.columns[:20]

In [None]:
from bioimg.singlecell import recursive_elim

In [None]:
# project helper; presumably recursive feature elimination down to n_feat=9
# features, removing elim_step=1 per iteration -- verify in bioimg.singlecell
rfe = recursive_elim(df=X_subset, y=y, n_feat=9, elim_step=1)

In [None]:
# candidate table reduced with the fitted elimination object
X_rfe = select_features(df=X_subset, sel=rfe)

## Visualize Single Cells Based on Selected Features

In [None]:
# 2-D t-SNE embedding of the control cells on the candidate features;
# random_state is fixed so the embedding is repeatable
X_tsne = TSNE(n_components=2, random_state=21, perplexity=50).fit_transform(X_subset)

In [None]:
# embedding coordinates side by side with the cell annotations
# column-wise concat aligns on the row index; assumes ctrl_annot carries a
# default 0..n-1 index -- TODO confirm
X_df = pd.concat([pd.DataFrame(X_tsne, columns=['tsne1', 'tsne2']), ctrl_annot], axis=1)

In [None]:
# names of the features kept by recursive elimination
feat_subset =X_rfe.columns.values
# their scaled values in the control cells
Xfeat = ctrl_scaled.loc[:,feat_subset]
# NOTE: X_df is extended in place of itself, so re-running this cell appends
# the same feature columns a second time
X_df = pd.concat([X_df, Xfeat], axis=1)

In [None]:
from bioimg.singlecell import plot_dimred
# embedding plot coloured by the 'Culture' column (project plotting helper)
plot_dimred(X_df, 
            hue='Culture',
            title='DMSO control wells')

In [None]:
from bioimg.singlecell import facet_dimred
# project plotting helper; a 3x3 grid matches the 9 features kept by the
# elimination step above (presumably one panel per feature -- verify)
facet_dimred(X_df, feat_subset=feat_subset,
            nrows=3, ncols=3)

In [None]:
# 'Culture' plus the selected feature columns
colsub = ['Culture'] + feat_subset.tolist()
# reshape to long form: one row per (cell, feature) with the value in 'val'
X_long = pd.melt(X_df[colsub], 
                 id_vars=['Culture'], 
                 value_vars=feat_subset,
                 var_name='feature', 
                 value_name='val')

In [None]:
from bioimg.singlecell import facet_density
# project plotting helper; presumably one density panel per value of
# 'feature', split by 'Culture' -- verify in bioimg.singlecell
facet_density(X_long, feat_column='feature',
             ncols=3, hue='Culture',
             sharey=True, size=(14,10), xlim=(-5,5))

## Constructing Non-Redundant Set of Features

In [None]:
# stack the scaled single-cell tables of both replicates row-wise
both = pd.concat([rep1_scaled, rep2_scaled], axis=0)
# sel_feats still holds the residual-based selection at this point
candidate_feats = both[sel_feats]

In [None]:
from bioimg.singlecell import select_uncorrelated
# project helper, given the same three per-channel features as `sel`
# (presumably the starting set of the non-redundant selection -- verify)
# NOTE: sel_feats is rebound again; the cell above must not be re-run afterwards
sel_feats = select_uncorrelated(df=candidate_feats, 
                                sel=['ch-Calcein-eccentricity', 
       'ch-Hoechst-mean_intensity',
       'ch-Lysosomal-mean_intensity'])

In [None]:
# 'uncorr': features returned by select_uncorrelated
featdict['uncorr'] = sel_feats

In [None]:
# scaled control cells restricted to the non-redundant features
X_subset = ctrl_scaled[sel_feats]
# feature correlation
featcor = pd.DataFrame(np.corrcoef(X_subset.T),
                       index=X_subset.columns,
                       columns=X_subset.columns )
# colour scale pinned to the full correlation range [-1, 1]
plot_heatmap(featcor, vmax=1, vmin=-1,
             xticklabels=True, size=(10,12))

In [None]:
# report how many features select_uncorrelated returned
print("Number of non-redundant features: %d" % len(sel_feats))

In [None]:
# same elimination helper as before, now with n_feat=6 on the non-redundant set
# NOTE: rfe and X_rfe from the earlier section are overwritten
rfe = recursive_elim(df=X_subset, y=y, n_feat=6, elim_step=1)
X_rfe = select_features(df=X_subset, sel=rfe)

In [None]:
# 2-D t-SNE embedding of the control cells on the non-redundant features,
# with the same settings and seed as the earlier embedding
X_tsne = TSNE(n_components=2, random_state=21, perplexity=50).fit_transform(X_subset)

In [None]:
# rebuild X_df from the new embedding plus the cell annotations
# column-wise concat aligns on the row index; assumes ctrl_annot carries a
# default 0..n-1 index -- TODO confirm
X_df = pd.concat([pd.DataFrame(X_tsne, columns=['tsne1', 'tsne2']), ctrl_annot], axis=1)

In [None]:
# names of the features kept by the second elimination run
feat_subset = X_rfe.columns.values
# their scaled values in the control cells
Xfeat = ctrl_scaled.loc[:,feat_subset]
# NOTE: X_df is extended in place of itself, so re-running this cell appends
# the same feature columns a second time
X_df = pd.concat([X_df, Xfeat], axis=1)

In [None]:
from bioimg.singlecell import plot_dimred
# embedding plot coloured by the 'Culture' column (project plotting helper)
plot_dimred(X_df, 
            hue='Culture',
            title='DMSO control wells')

In [None]:
from bioimg.singlecell import facet_dimred
# project plotting helper; a 2x3 grid matches the 6 features kept by the
# elimination step above (presumably one panel per feature -- verify)
facet_dimred(X_df, feat_subset=feat_subset,
            nrows=2, ncols=3)

In [None]:
# 'Culture' plus the selected feature columns
colsub = ['Culture'] + feat_subset.tolist()
# reshape to long form: one row per (cell, feature) with the value in 'val'
X_long = pd.melt(X_df[colsub], 
                 id_vars=['Culture'], 
                 value_vars=feat_subset,
                 var_name='feature', 
                 value_name='val')

In [None]:
from bioimg.singlecell import facet_density
# global seaborn styling for the density panels
sn.set(font_scale=1.1)
sn.set_style('white')
# NOTE(review): despine() runs before facet_density has drawn anything, so it
# acts on whatever figure is current at this point -- probably meant to be
# called after plotting; confirm
sn.despine()
facet_density(X_long, feat_column='feature',
             ncols=3,  hue='Culture', sharey=False)

In [None]:
# json cannot serialise NumPy arrays, so turn every feature collection into a
# plain list: array-likes via .tolist(), any other iterable (list, tuple, ...)
# via list()
featdict = {k: (v.tolist() if hasattr(v, 'tolist') else list(v))
            for k, v in featdict.items()}

In [None]:
# write the selected feature lists as JSON into the working directory
with open('featselect.json', 'w') as handle:
    json.dump(featdict, handle)