## Morphological Feature Selection

In [None]:
# load third-party Python modules
import javabridge
import bioformats as bf
import skimage
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd
import re
import os
import sys
import h5py
from sklearn.manifold import TSNE

# Start the Java VM with the Bio-Formats JARs on its class path.
# NOTE(review): none of the cells visible in this notebook call bioformats,
# and the VM is never shut down with javabridge.kill_vm() — confirm that
# starting it is still required.
javabridge.start_vm(class_path=bf.JARS)

Function for loading viable cancer cells (`class = 2` output by the object detection model):

In [None]:
def load_viable_cells(platedir, wells, annot):
    """Load per-cell features of viable cells for a set of wells.

    Reads ``<platedir>/<well>.csv`` for every entry of ``wells``, keeps the
    rows with ``class == 2`` and joins the per-cell labels with the plate
    annotation on the ``well`` column.

    Parameters
    ----------
    platedir : str
        Directory holding one CSV file per well.
    wells : iterable of str
        Well identifiers; each needs a matching ``<well>.csv`` file with
        ``class`` and ``file`` columns.
    annot : pandas.DataFrame
        Plate annotation with a ``well`` column.

    Returns
    -------
    imgdf : pandas.DataFrame
        Feature table (every CSV column except ``class`` and ``file``).
    labels : pandas.DataFrame
        ``class``, ``file``, ``well`` plus the remaining columns of ``annot``.
        Row ``i`` of ``labels`` describes row ``i`` of ``imgdf``.

    Raises
    ------
    ValueError
        If ``wells`` is empty.
    """
    frames = []
    for w in wells:
        df = pd.read_csv(os.path.join(platedir, w + '.csv'))
        frames.append(df[df['class'] == 2])
    if not frames:
        raise ValueError('wells is empty: no well CSV files to load')
    imgdf = pd.concat(frames).reset_index(drop=True)

    # Explicit copy: columns are added/overwritten below, which on a frame
    # sliced from imgdf triggers SettingWithCopyWarning.
    labels = imgdf[['class', 'file']].copy()
    imgdf = imgdf.drop(['class', 'file'], axis=1)
    # The well ID is the file name with everything from the field suffix
    # ('f' followed by a digit) onwards removed.
    labels['well'] = labels['file'].replace(regex=r'f[0-9].+', value='')
    labels['class'] = labels['class'].apply(lambda x: 'Viable' if x == 2 else 'Apoptotic')

    # The inner merge drops cells whose well is missing from `annot` (and
    # repeats cells if a well is annotated more than once). Carry the row
    # position through the merge so the feature table keeps exactly the same
    # rows and both returned tables stay aligned.
    row_key = '__row__'
    merged = pd.merge(labels.rename_axis(row_key).reset_index(), annot, on='well')
    kept_rows = merged.pop(row_key).to_numpy()
    imgdf = imgdf.loc[kept_rows].reset_index(drop=True)
    labels = merged.reset_index(drop=True)
    return imgdf, labels

Load plate annotation file:

In [None]:
# load plate annotation
# Tab-separated table; later cells read its 'well' and 'Drug' columns.
annot_df = pd.read_csv('../data/AML_trainset/drugannot.txt',
                      sep='\t')

In [None]:
# Preview the first rows of the plate annotation.
annot_df.head()

In [None]:
# Annotation rows of the DMSO control wells, re-indexed from 0,
# and the distinct well IDs they cover.
dmso = annot_df.loc[annot_df['Drug'] == 'DMSO'].reset_index(drop=True)
dmso_wells = pd.unique(dmso['well'])

Load viable cells from DMSO control wells:

In [None]:
# Viable cells of the DMSO wells on plate 181109_Plate1; the labels are
# joined with the DMSO annotation rows only.
ctrl_df, ctrl_annot = load_viable_cells(platedir='../imgdata/181109_Plate1',
                            wells=dmso_wells, annot=dmso)

Remove features with near-zero variance, using a variance threshold of `1e-12`:

In [None]:
from bioimg.singlecell import select_features
from sklearn.feature_selection import VarianceThreshold
# Fit the variance filter on the control cells only; the fitted selector is
# reused further down for both replicate plates.
# NOTE(review): the name `sel` is later rebound to a plain list of feature
# names and to other selectors, so the replicate cells only work when the
# notebook is executed top to bottom.
sel = VarianceThreshold(threshold=1e-12).fit(ctrl_df)

In [None]:
# Apply the fitted variance filter to the control features.
# select_features is a bioimg helper; presumably it keeps the columns the
# selector retained — confirm against its implementation.
ctrl_df = select_features(df=ctrl_df, sel=sel)

Apply generalized logarithm-transform:

$$g(x) =  \log{\left(\frac{x+\sqrt{x^2+c^2}}{2}+ \varepsilon\right) }$$

where $c$ is the minimum value of each feature and $\varepsilon$ is a small constant that prevents taking the logarithm of zero.

In [None]:
# Small offset keeping the argument of the logarithm strictly positive.
eps = 1e-8
# Generalized log, applied column by column with c = that column's minimum.
# Note: the cell overwrites ctrl_df, so running it twice transforms twice.
ctrl_df = ctrl_df.apply(
    lambda col: np.log((col + np.sqrt(col**2 + col.min()**2))/2 + eps)
)

In [None]:
from sklearn.preprocessing import StandardScaler
from bioimg.singlecell import scale_data, check_data

In [None]:
# Fit the standardization on the (glog-transformed) control cells and apply
# it to the same cells; the result gets a fresh 0..n-1 index.
scaler = StandardScaler().fit(ctrl_df)
ctrl_scaled = scale_data(ctrl_df, scaler=scaler).reset_index(drop=True)

In [None]:
# Sanity check of the scaled control data (check_data is a bioimg helper;
# what exactly it verifies is not visible in this notebook).
check_data(ctrl_scaled)

In [None]:
# feature correlation
featcor = pd.DataFrame(np.corrcoef(ctrl_scaled.T),
                       index=ctrl_scaled.columns,
                       columns=ctrl_scaled.columns )

In [None]:
from bioimg.singlecell import plot_heatmap
# Heatmap of the feature-by-feature correlation matrix of the control cells.
plot_heatmap(featcor)

Check out features with few correlations:

In [None]:
# Names of (at most the first 30) features whose summed absolute correlation
# with all features is below 30.
featcor.columns[featcor.abs().sum(axis=1) < 30].values[:30]

## Feature selection based on replicate correlation
Load all wells from 2 biological replicate plates:

In [None]:
# Distinct well IDs of the plate. De-duplicated (as dmso_wells is) so that a
# well listed more than once in the annotation is not read from disk twice.
all_wells = annot_df['well'].unique()

In [None]:
# Replicate 1: viable cells of all annotated wells on plate 181109_Plate1.
rep1_df, rep1_annot = load_viable_cells(platedir='../imgdata/181109_Plate1', 
                                        wells=all_wells,
                                       annot=annot_df)

Remove the near-zero-variance features identified on the DMSO control cells:

In [None]:
# Reuse the variance filter fitted on the control cells, so replicate 1 gets
# the feature selection that was determined on the controls.
rep1_df = select_features(rep1_df, sel=sel)

In [None]:
# Same generalized log as for the controls; c is recomputed as the per-feature
# minimum over this plate, and eps comes from the control-transform cell.
rep1_df = rep1_df.apply(lambda x: np.log((x + np.sqrt(x**2 + x.min()**2))/2 + eps))

In [None]:
# Fit the scaler on this plate's DMSO cells only, then apply it to every cell.
# The boolean mask is built from rep1_annot, so this relies on rep1_df and
# rep1_annot sharing the same row index.
scaler = StandardScaler().fit(rep1_df[rep1_annot['Drug']=='DMSO'])
rep1_scaled = scale_data(rep1_df, scaler=scaler).reset_index(drop=True)

In [None]:
# Replicate 2: viable cells of all annotated wells on plate 181109_Plate4,
# joined with the same annotation table as replicate 1.
rep2_df, rep2_annot = load_viable_cells(platedir='../imgdata/181109_Plate4', 
                                        wells=all_wells,
                                       annot=annot_df)

Also remove zero-variance features from the second biological replicate:

In [None]:
# Reuse the variance filter fitted on the control cells for replicate 2.
rep2_df = select_features(rep2_df, sel=sel)

In [None]:
# Generalized log with c = per-feature minimum over this plate; eps comes from
# the control-transform cell.
rep2_df = rep2_df.apply(lambda x: np.log((x + np.sqrt(x**2 + x.min()**2))/2 + eps))

In [None]:
# Fit the scaler on this plate's DMSO cells only, then apply it to every cell.
# Relies on rep2_df and rep2_annot sharing the same row index. The name
# `scaler` is overwritten here; the replicate-1 scaler is no longer available.
scaler = StandardScaler().fit(rep2_df[rep2_annot['Drug']=='DMSO'])
rep2_scaled = scale_data(rep2_df, scaler=scaler).reset_index(drop=True)

In [None]:
# Both replicates should have the same number of feature columns (expect True).
rep1_df.shape[1] == rep2_df.shape[1]

In [None]:
# Same check on the scaled tables (expect True).
rep1_scaled.shape[1] == rep2_scaled.shape[1]

In [None]:
from bioimg.singlecell import aggregate_profiles

Compute mean well profiles for both replicate plates:

In [None]:
# Well profiles are aggregated from the scaled features of each replicate
# (not from the unscaled glog-transformed tables).
prof_rep1 = aggregate_profiles(rep1_scaled, rep1_annot)
prof_rep2 = aggregate_profiles(rep2_scaled, rep2_annot)

Make sure that the same wells are present in both biological replicates:

In [None]:
# Keep only wells profiled on both plates and bring both tables into the same
# well order with a fresh 0..n-1 index. Later cells compare the replicates row
# by row (DataFrame.corrwith aligns on index labels; the residual correlation
# pairs rows by position), so filtering alone is not enough.
# Assumes one profile row per well in each table — TODO confirm against
# aggregate_profiles.
shared_wells = np.intersect1d(prof_rep1['well'], prof_rep2['well'])
prof_rep1 = (prof_rep1[np.isin(prof_rep1['well'], shared_wells)]
             .sort_values(by='well', kind='mergesort')
             .reset_index(drop=True))
prof_rep2 = (prof_rep2[np.isin(prof_rep2['well'], shared_wells)]
             .sort_values(by='well', kind='mergesort')
             .reset_index(drop=True))

In [None]:
# Per-feature correlation between the well profiles of the two replicates.
# NOTE(review): DataFrame.corrwith pairs rows by index label, so the
# sort_values calls do not by themselves match wells — the result is only a
# replicate correlation if both tables carry the same index label for the
# same well; confirm.
repcor = prof_rep1.sort_values(by='well').corrwith(prof_rep2.sort_values(by='well'))

In [None]:
# Features reproducible across replicates (replicate correlation >= 0.6),
# and the matching columns of the scaled control cells.
sel_feats = repcor.index[repcor >= 0.6].values
X_subset = ctrl_scaled.loc[:, sel_feats]

In [None]:
# feature correlation
featcor = pd.DataFrame(np.corrcoef(X_subset.T),
                       index=X_subset.columns,
                       columns=X_subset.columns )
plot_heatmap(featcor)

In [None]:
# Replicate correlations of the selected features whose name contains 'Calcein'.
repcor[[f for f in sel_feats if 'Calcein' in f]]

In [None]:
# Replicate correlations of the selected features whose name contains 'Hoechst'.
repcor[[f for f in sel_feats if 'Hoechst' in f]]

In [None]:
# Replicate correlations of the selected features whose name contains 'Lysosomal'.
repcor[[f for f in sel_feats if 'Lysosomal' in f]]

## Feature Selection Based on Residuals
Select initial feature set:

In [None]:
import statsmodels.api as sm

def get_residuals(df, y):
    """Return the residuals of an OLS fit of ``y`` on the columns of ``df``.

    A constant column is added to ``df`` via ``sm.add_constant`` before
    fitting, so the model includes an intercept.

    Parameters
    ----------
    df : pandas.DataFrame
        Predictor columns.
    y : pandas.Series
        Response values, one per row of ``df``.

    Returns
    -------
    numpy.ndarray
        Residual of every observation, in row order.
    """
    design = sm.add_constant(df)
    fitted = sm.OLS(y, design).fit()
    return fitted.resid.values

def get_cor_residuals(rep1, rep2, sel, col):
    """Correlate, across replicates, what ``col`` adds beyond the features in ``sel``.

    In each replicate ``col`` is regressed on the ``sel`` columns; the two
    residual vectors are then correlated position by position, so both tables
    must list the same observations in the same row order.

    Parameters
    ----------
    rep1, rep2 : pandas.DataFrame
        Profile tables of the two replicates.
    sel : list of str
        Names of the already selected feature columns (predictors).
    col : str
        Name of the candidate feature column (response).

    Returns
    -------
    float
        Correlation coefficient between the two residual vectors.
    """
    residuals = [get_residuals(df=rep[sel], y=rep[col]) for rep in (rep1, rep2)]
    return np.corrcoef(x=residuals[0], y=residuals[1])[0, 1]

In [None]:
# Seed features the greedy selection starts from.
# Note: this rebinds `sel`, which until here held the fitted variance filter.
sel = ['ch-Calcein-eccentricity', 
       'ch-Hoechst-mean_intensity',
       'ch-Lysosomal-mean_intensity']
stop_criterion = 1
# Greedy forward selection: in every round add the candidate whose residuals
# (after regressing on the current selection) correlate best between the two
# replicates. Stop once at most half of the candidates evaluated in a round
# still have a positive residual correlation.
while stop_criterion > 0.5:
    feats_to_check = np.setdiff1d(sel_feats, sel)
    if len(feats_to_check) == 0:
        # Every candidate has been selected; without this guard np.argmax
        # and the division below fail on an empty array.
        break
    # correlations of residuals
    resid_cor = np.array([get_cor_residuals(rep1=prof_rep1, rep2=prof_rep2,
                      sel=sel, col=col) for col in feats_to_check])
    sel = sel + [feats_to_check[np.argmax(resid_cor)]]
    stop_criterion = np.sum(resid_cor > 0) / len(resid_cor)

In [None]:
# Size of the selection (the three seed features plus the greedily added ones).
print("Number of selected features: %d" % len(sel))

In [None]:
# Control cells restricted to the residual-selected features, and the
# correlation matrix of those features shown as a labelled heatmap.
X_subset = ctrl_scaled[sel]
featcor = pd.DataFrame(
    data=np.corrcoef(X_subset.T),
    index=X_subset.columns,
    columns=X_subset.columns,
)
plot_heatmap(featcor, xticklabels=True, size=(12,12))

## Further Feature Selection

In [None]:
# Integer class vector for the control cells:
# 'Mono-culture' -> 0, 'Co-culture' -> 1.
y = np.copy(ctrl_annot['Culture'].values)
for culture, code in (('Mono-culture', 0), ('Co-culture', 1)):
    y[y == culture] = code
y = y.astype(int)

Univariate feature selection method:

In [None]:
from sklearn.feature_selection import SelectKBest, SelectFdr, f_classif
from bioimg.singlecell import select_features
# Keep the 20 features with the highest ANOVA F-score for separating the two
# culture classes. Note: `sel` is rebound again (from the list of feature
# names to a fitted selector).
sel = SelectKBest(f_classif, k=20).fit(X=X_subset, y=y)
X_kbest = select_features(df=X_subset, sel=sel)

In [None]:
# Names of the features kept by SelectKBest.
X_kbest.columns

In [None]:
# Alternative univariate filter: keep features whose F-test p-value passes
# false-discovery-rate control at alpha = 0.01.
sel = SelectFdr(f_classif, alpha=0.01).fit(X=X_subset, y=y)
X_fdr = select_features(df=X_subset, sel=sel)

In [None]:
# (n_cells, n_features) of the table kept by the FDR filter.
X_fdr.shape

In [None]:
# First 20 feature names of the subset the univariate filters were run on.
X_subset.columns[:20]

In [None]:
from bioimg.singlecell import recursive_elim

In [None]:
# recursive_elim is a bioimg helper; presumably recursive feature elimination
# down to n_feat=9 features, removing elim_step=1 feature per round — confirm.
rfe = recursive_elim(df=X_subset, y=y, n_feat=9, elim_step=1)

In [None]:
# Apply the fitted elimination result to the control-cell feature subset.
X_rfe = select_features(df=X_subset, sel=rfe)

In [None]:
# Names of the features kept after recursive elimination.
X_rfe.columns

## Visualize Single Cells Based on Selected Features

In [None]:
# 2-D t-SNE embedding of the control cells on the residual-selected features;
# random_state is fixed so the embedding is reproducible.
X_tsne = TSNE(n_components=2, random_state=21, perplexity=50).fit_transform(X_subset)

In [None]:
# Put the embedding next to the cell annotation. The concat aligns on index
# labels, so ctrl_annot is expected to carry a 0..n-1 index in cell order.
X_df = pd.concat([pd.DataFrame(X_tsne, columns=['tsne1', 'tsne2']), ctrl_annot], axis=1)

In [None]:
# Append the scaled values of the RFE-selected features to the plotting table.
# Note: X_df is extended in place of itself, so re-running this cell adds the
# same columns a second time.
feat_subset =X_rfe.columns.values
Xfeat = ctrl_scaled.loc[:,feat_subset]
X_df = pd.concat([X_df, Xfeat], axis=1)

In [None]:
from bioimg.singlecell import plot_dimred
# Embedding of the control cells, colored by culture condition.
plot_dimred(X_df, 
            hue='Culture',
            title='DMSO control wells')

In [None]:
from bioimg.singlecell import facet_dimred
# One panel per selected feature on a 3 x 3 grid (matches the 9 features
# requested from recursive_elim).
facet_dimred(X_df, feat_subset=feat_subset,
            nrows=3, ncols=3)

In [None]:
# Long format for faceted plotting: one row per (cell, feature) pair with the
# culture condition kept as identifier.
colsub = ['Culture'] + feat_subset.tolist()
X_long = X_df[colsub].melt(
    id_vars=['Culture'],
    value_vars=feat_subset,
    var_name='feature',
    value_name='val',
)

In [None]:
from bioimg.singlecell import facet_density
# Per-feature value distributions, split by culture condition; each panel
# gets its own y-axis.
facet_density(X_long, feat_column='feature',
             ncols=3, hue='Culture',
             sharey=False)

## Constructing Non-Redundant Set of Features

In [None]:
# Seed features of the non-redundant set.
sel = ['ch-Calcein-eccentricity', 
       'ch-Hoechst-mean_intensity',
       'ch-Lysosomal-mean_intensity']
stop_criterion = 0
# Pool the single cells of both replicate plates for the correlation estimate.
both = pd.concat([rep1_scaled, rep2_scaled], axis=0)
# The greedy loop indexes cor_df[sel], so every seed must be a column of the
# correlation matrix even if it did not pass the replicate-correlation filter
# that defines sel_feats. Seeds already in sel_feats are not duplicated, so
# the column order is unchanged in that case.
known_feats = set(sel_feats)
missing_seeds = [f for f in sel if f not in known_feats]
candidate_feats = both[list(sel_feats) + missing_seeds]
cor_df = candidate_feats.corr()

In [None]:
# Greedy construction of a non-redundant set: in every round add the candidate
# whose strongest absolute correlation with the already selected features is
# smallest, and stop as soon as even that candidate reaches |r| >= 0.5.
while stop_criterion < 0.5:
    feats_to_check = np.setdiff1d(sel_feats, sel)
    if len(feats_to_check) == 0:
        # all candidates are already selected
        break
    cand_cor = cor_df[sel]
    cand_cor = cand_cor[np.isin(cand_cor.index, feats_to_check)]
    # Absolute value first, then the maximum: a strong negative correlation
    # is as redundant as a strong positive one.
    max_cor = cand_cor.abs().max(axis=1)
    stop_criterion = max_cor.min()
    if not stop_criterion < 0.5:
        # The least redundant candidate already exceeds the threshold (or the
        # criterion is NaN); do not add it to the selection.
        break
    sel = sel + [max_cor.idxmin()]

In [None]:
# Control cells restricted to the non-redundant feature set.
X_subset = ctrl_scaled[sel]
# feature correlation
featcor = pd.DataFrame(np.corrcoef(X_subset.T),
                       index=X_subset.columns,
                       columns=X_subset.columns )
# Fixed color range [-1, 1] so the heatmap is comparable across feature sets.
plot_heatmap(featcor, vmax=1, vmin=-1,
             xticklabels=True, size=(10,12))

In [None]:
# Size of the non-redundant selection (seed features included).
print("Number of non-redundant features: %d" % len(sel))

In [None]:
# Repeat the recursive elimination (bioimg helper, n_feat=9) on the
# non-redundant feature set; `rfe` and `X_rfe` from the earlier section are
# overwritten.
rfe = recursive_elim(df=X_subset, y=y, n_feat=9, elim_step=1)
X_rfe = select_features(df=X_subset, sel=rfe)

In [None]:
# 2-D t-SNE embedding of the control cells on the non-redundant features,
# with the same seed and perplexity as the earlier embedding.
X_tsne = TSNE(n_components=2, random_state=21, perplexity=50).fit_transform(X_subset)

In [None]:
# Rebuild the plotting table from scratch: new embedding plus annotation
# (aligned on index labels, so ctrl_annot is expected to be indexed 0..n-1).
X_df = pd.concat([pd.DataFrame(X_tsne, columns=['tsne1', 'tsne2']), ctrl_annot], axis=1)

In [None]:
# Append the scaled values of the newly RFE-selected features.
# Re-running this cell alone would add the same columns again.
feat_subset =X_rfe.columns.values
Xfeat = ctrl_scaled.loc[:,feat_subset]
X_df = pd.concat([X_df, Xfeat], axis=1)

In [None]:
from bioimg.singlecell import plot_dimred
# Embedding on the non-redundant features, colored by culture condition.
plot_dimred(X_df, 
            hue='Culture',
            title='DMSO control wells')

In [None]:
from bioimg.singlecell import facet_dimred
# One panel per selected feature on a 3 x 3 grid (9 features from recursive_elim).
facet_dimred(X_df, feat_subset=feat_subset,
            nrows=3, ncols=3)

In [None]:
# Long format for faceted plotting: one row per (cell, feature) pair, keeping
# the culture condition as identifier column.
colsub = ['Culture'] + feat_subset.tolist()
X_long = pd.melt(X_df[colsub], 
                 id_vars=['Culture'], 
                 value_vars=feat_subset,
                 var_name='feature', 
                 value_name='val')

In [None]:
from bioimg.singlecell import facet_density
# Per-feature value distributions split by culture condition; panels do not
# share a y-axis.
facet_density(X_long, feat_column='feature',
             ncols=3,  hue='Culture', sharey=False)