# Dimension Reduction of Single-Cell Coculture Data
Morphological feature space is high-dimensional, often with hundreds of features. To explore the structure of the data, one can use dimensionality reduction methods to embed the data in fewer dimensions (here, 2D).

## DMSO Control Wells
We will start with DMSO control wells and we will also use these for feature standardization:

In [None]:
# load third-party Python modules
import javabridge
import bioformats as bf
import skimage
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd
import re
import os
import sys
import h5py
import json
from sklearn.manifold import TSNE

# Start the Java VM with the Bio-Formats JARs on its class path.
# NOTE(review): no later cell shown here calls bioformats or stops the VM -
# confirm whether this start-up is still needed.
javabridge.start_vm(class_path=bf.JARS)

Simple function for loading pre-computed morphological features:

In [None]:
def load_cells(platedir, wells, annot, which=(1, 2)):
    """Load pre-computed per-cell features for a set of wells.

    Parameters
    ----------
    platedir : str
        Directory holding one ``<well>.csv`` feature table per well.
    wells : iterable of str
        Well identifiers; each one is read from ``<platedir>/<well>.csv``.
    annot : pandas.DataFrame
        Plate annotation with a ``well`` column; it is joined onto the
        per-cell labels.
    which : int or sequence of int, optional
        Values of the ``class`` column to keep. Defaults to ``(1, 2)``.

    Returns
    -------
    imgdf : pandas.DataFrame
        Feature table without the ``class`` and ``file`` columns.
    labels : pandas.DataFrame
        ``class``, ``file`` and ``well`` plus the annotation columns, in the
        same row order as ``imgdf`` (assumes ``annot`` has at most one row
        per well).
    """
    frames = []
    for w in wells:
        df = pd.read_csv(os.path.join(platedir, w + '.csv'))
        frames.append(df[np.isin(df['class'], which)])
    imgdf = pd.concat(frames).reset_index(drop=True)
    # Work on an explicit copy so that adding the 'well' column below does
    # not write into a slice of `imgdf`.
    labels = imgdf[['class', 'file']].copy()
    imgdf = imgdf.drop(['class', 'file'], axis=1)
    # The well id is the file name with everything from the field marker
    # ('f' followed by a digit) onwards removed.
    labels['well'] = labels['file'].replace(regex=r'f[0-9].+', value='')
    # Left join: cells from wells missing in `annot` are kept (with NaN
    # annotation) so the labels stay aligned row-for-row with `imgdf`.
    labels = pd.merge(labels, annot, on='well', how='left')
    return imgdf, labels

Choose a plate and load the plate map:

In [None]:
# Plate directory used by the loading cells below
platedir = '../imgdata/180528_Plate3/'
# Plate annotation: tab-separated table
annot_df = pd.read_csv('../data/AML_trainset/drugannot.txt', sep='\t')

Load only control wells:

In [None]:
# Annotation rows whose drug is the DMSO control
is_dmso = annot_df['Drug'] == 'DMSO'
dmso = annot_df.loc[is_dmso].reset_index(drop=True)
# Distinct well identifiers of those rows
dmso_wells = dmso['well'].unique()

In [None]:
# Load features and labels for the DMSO wells; `which` is left at its
# default, so cells of class 1 and class 2 are both kept.
imgdf, labels = load_cells(platedir=platedir, wells=dmso_wells, annot=annot_df)

Map integer classes to their names:

In [None]:
# Class 2 is named 'Viable'; every other class value becomes 'Apoptotic'
labels['class'] = ['Viable' if cls == 2 else 'Apoptotic'
                   for cls in labels['class']]

In [None]:
from sklearn.feature_selection import VarianceThreshold
from bioimg.singlecell import preprocess_data

# Fit a variance-threshold selector (threshold 1e-12) on the DMSO cells ...
sel = VarianceThreshold(threshold=1e-12)
sel.fit(imgdf)
# ... and pass it to preprocess_data together with glog=True
imgdf = preprocess_data(df=imgdf, sel=sel, glog=True)

Standardize data (center and scale):

In [None]:
from sklearn.preprocessing import StandardScaler
from bioimg.singlecell import scale_data, check_data

# Fit the scaler on the preprocessed DMSO features, then apply it to them
scaler = StandardScaler()
scaler.fit(X=imgdf)
imgdf_scaled = scale_data(imgdf, scaler=scaler)

In [None]:
# Run the check_data helper on the scaled DMSO features.
# NOTE(review): what exactly it validates is defined in bioimg.singlecell.
check_data(imgdf_scaled)

In [None]:
def subset_ncells(imgdf, labels, n, replace=False):
    """Draw a random subset of cells from a feature table and its labels.

    Parameters
    ----------
    imgdf : pandas.DataFrame
        Feature table; rows are sampled by index label.
    labels : pandas.DataFrame
        Label table sharing the index of ``imgdf``.
    n : int
        Number of cells to draw.
    replace : bool, optional
        Sample with replacement. Defaults to ``False`` so that no cell
        appears twice in the subset; in that case ``n`` is capped at the
        number of available cells.

    Returns
    -------
    imgdf_s, labels_s : pandas.DataFrame
        The sampled rows of both tables, in the same order, with a fresh
        0..k-1 index.

    Notes
    -----
    Uses the global NumPy random state; call ``np.random.seed`` beforehand
    for a reproducible subset.
    """
    if not replace:
        # Without replacement we cannot draw more cells than exist.
        n = min(n, len(imgdf))
    subset = np.random.choice(imgdf.index, size=n, replace=replace)
    imgdf_s = imgdf.loc[subset, :].reset_index(drop=True)
    labels_s = labels.loc[subset, :].reset_index(drop=True)
    return imgdf_s, labels_s

Subset to $\approx$ 2000 cells to speed up embedding:

In [None]:
# Seed NumPy's global random state so the random subset is repeatable
np.random.seed(2906)
# Draw 2000 cells from the scaled DMSO features and their labels
imgdf_sub, labels_sub = subset_ncells(imgdf=imgdf_scaled, labels=labels, n=2000)

Run t-SNE on the first 20 principal components:

In [None]:
from sklearn.decomposition import PCA
# Seed PCA as well as t-SNE: PCA may pick a randomized SVD solver, and an
# unseeded PCA would make the embedding differ between runs.
pcs = PCA(n_components=20, random_state=21).fit_transform(imgdf_sub)
# 2-D t-SNE embedding of the 20 principal components
X_tsne = TSNE(n_components=2, random_state=21, perplexity=50).fit_transform(pcs)

In [None]:
# Place the two t-SNE coordinates next to the labels of the sampled cells
# (column-wise concat, aligned on the index)
X_df = pd.concat([pd.DataFrame(X_tsne, columns=['tsne1', 'tsne2']), labels_sub], axis=1)

In [None]:
from bioimg.singlecell import plot_dimred
# Plot the embedding with hue set to the culture condition and marker style
# set to the cell class
plot_dimred(X_df, 
            hue='Culture',
            style='class',
            title='DMSO control wells',
            style_order=['Viable', 'Apoptotic'])
# Anchor the legend's lower-right corner at (1.3, 0.3)
plt.legend(loc='lower right',
           bbox_to_anchor=(1.3,0.3))

## Dimension reduction in viable leukemia cells
We are not interested in the coculture-monoculture difference within the apoptotic cell population, so we load only viable cells from the DMSO wells:

In [None]:
# which=2 keeps only cells of class 2, i.e. the 'Viable' cancer cells
ctrl_df, ctrl_annot = load_cells(platedir=platedir, wells=dmso_wells, annot=annot_df, which=2)

In [None]:
# Re-fit the variance-threshold selector on the viable control cells;
# `sel` is overwritten here
sel = VarianceThreshold(threshold=1e-12)
sel.fit(ctrl_df)
ctrl_df = preprocess_data(df=ctrl_df, sel=sel, glog=True)

In [None]:
# Re-fit the scaler on the viable control cells; `scaler` is overwritten here
scaler = StandardScaler()
scaler.fit(X=ctrl_df)
ctrl_scaled = scale_data(ctrl_df, scaler=scaler)

Run t-SNE on the first 20 principal components:

In [None]:
# Seed PCA as well as t-SNE: PCA may pick a randomized SVD solver, and an
# unseeded PCA would make the embedding differ between runs.
pcs = PCA(n_components=20, random_state=34).fit_transform(ctrl_scaled)
# 2-D t-SNE embedding of the 20 principal components
X_tsne = TSNE(n_components=2, random_state=34, perplexity=30).fit_transform(pcs)

In [None]:
# Place the two t-SNE coordinates next to the annotation of the viable
# control cells (column-wise concat, aligned on the index)
X_ctrl = pd.concat([pd.DataFrame(X_tsne, columns=['tsne1', 'tsne2']),
                    ctrl_annot], axis=1)

In [None]:
# Plot the embedding of viable control cells with hue set to the culture
# condition
plot_dimred(X_ctrl, hue='Culture',
           title='Viable cells in control wells')
# Anchor the legend's lower-right corner at (1.2, 0.05)
plt.legend(loc='lower right',
           bbox_to_anchor=(1.2,0.05))

## Load Feature Selection Dictionary
In `feature-selection.ipynb` we applied a number of methods to identify reproducible, non-redundant morphological features. Let's read in the feature sets:

In [None]:
# Read the feature-set dictionary. The encoding is given explicitly because
# the default used by open() depends on the platform locale.
with open('featselect.json', encoding='utf-8') as file:
    featdict = json.load(file)

In [None]:
# Number of entries in each feature set
{k : len(v) for k,v in featdict.items()}

+ `repcor` features: reproducible features with biological replicate correlation > 0.6
+ `residcor`: non-redundant feature set determined by correlation of regression residuals
+ `rfe`: recursive feature elimination (RFE) with top 9 features that separate coculture from monoculture
+ `uncorr`: uncorrelated features by greedy search
+ `uncorr_rfe`: RFE on the uncorrelated feature set (`uncorr`) with top 6 features that separate coculture from monoculture

A robust method of identifying non-redundant features is based on the correlation of regression residuals: an initial feature set is used to fit linear models that predict all other features, the correlation between replicate residuals is computed, and in each iteration the feature with the highest residual correlation is added to the feature set.

We will use the `residcor` features to identify what morphological variables separate viable cells in coculture from those in monoculture.

In [None]:
from bioimg.singlecell import plot_heatmap

# Restrict the scaled control data to the `residcor` feature set
X_subset = ctrl_scaled[featdict['residcor']]
# Feature-by-feature correlation matrix, labelled with the feature names
feat_names = X_subset.columns
corr_values = np.corrcoef(X_subset.T)
featcor = pd.DataFrame(corr_values, index=feat_names, columns=feat_names)
plot_heatmap(featcor, xticklabels=True, size=(12,14))

Use recursive feature elimination (RFE) to identify top 9 features that separate leukemia cells in coculture from those in monoculture:

In [None]:
from bioimg.singlecell import recursive_elim, select_features

# Encode the culture condition as an integer target:
# 'Mono-culture' -> 0, 'Co-culture' -> 1. Working on a copy leaves
# ctrl_annot untouched.
y=np.copy(ctrl_annot['Culture'].values)
y[y=='Mono-culture'] = 0
y[y=='Co-culture'] = 1
y = y.astype(int)

# Recursive feature elimination down to 9 features, one feature per step
rfe = recursive_elim(df=X_subset, y=y, n_feat=9, elim_step=1)
X_rfe = select_features(df=X_subset, sel=rfe)
# Take an owned copy of the column names: `.values` can share memory with
# the column Index, and a later cell edits `feat_subset` in place.
feat_subset = X_rfe.columns.to_numpy(copy=True)

Top 9 features that differentiate coculture from monoculture cancer cells are:

In [None]:
# Display the names of the selected features
feat_subset

Some of these features are difficult to interpret. However, looking at the correlation heatmap we see that
+ `ch-Calcein-moments_hu-1` is highly correlated with and can be replaced by `ch-Calcein-eccentricity`
+ `ch-Calcein-moments_central-2-2` can be replaced by `ch-Calcein-convex_area`
+ `ch-Lysosomal-InfoMeas2-d5-0` can be replaced by `ch-Lysosomal-mean_intensity`

In [None]:
# Swap hard-to-interpret features for their more readable counterparts
# (edits `feat_subset` in place)
_replacements = {
    'ch-Calcein-moments_hu-1': 'ch-Calcein-eccentricity',
    'ch-Calcein-moments_central-2-2': 'ch-Calcein-convex_area',
    'ch-Lysosomal-InfoMeas2-d5-0': 'ch-Lysosomal-mean_intensity',
}
for _old, _new in _replacements.items():
    feat_subset[feat_subset == _old] = _new

In [None]:
# Append the scaled values of the selected features as extra columns.
# NOTE(review): re-running this cell appends the same columns again.
X_ctrl = pd.concat([X_ctrl, ctrl_scaled.loc[:,feat_subset]], axis=1)

In [None]:
# Plot the control-cell embedding again with hue set to the culture
# condition; X_ctrl now also carries the selected feature columns
plot_dimred(X_ctrl, hue='Culture',
           title='Viable cells in control wells')
# Anchor the legend's lower-right corner at (1.2, 0.05)
plt.legend(loc='lower right',
           bbox_to_anchor=(1.2,0.05))

In [None]:
from bioimg.singlecell import facet_dimred
# Faceted view of the embedding for the 9 selected features on a 3x3 grid.
# NOTE(review): presumably one panel per feature - confirm in facet_dimred.
facet_dimred(X_ctrl, feat_subset=feat_subset,
            nrows=3, ncols=3)

In [None]:
# Long format: one row per (cell, feature) with the culture condition kept
# as the identifier column
colsub = ['Culture', *feat_subset.tolist()]
X_long = X_ctrl[colsub].melt(id_vars=['Culture'],
                             value_vars=feat_subset,
                             var_name='feature',
                             value_name='val')

In [None]:
from bioimg.singlecell import facet_density
# Faceted plot over the 'feature' column with hue set to the culture
# condition, three facets per row, independent y axes
facet_density(X_long, feat_column='feature',
             ncols=3, hue='Culture',
             sharey=False, size=(10,8),
             aspect=1.4)

Embed using UMAP:

In [None]:
# UMAP embedding, currently disabled: the code below sits inside a string
# literal and is therefore not executed. If re-enabled it would overwrite
# X_ctrl with the UMAP coordinates plus ctrl_annot.
'''
import umap
X_umap = umap.UMAP(random_state=29,
                   n_neighbors=30,
                   min_dist=0.2,
                   metric='correlation').fit_transform(pcs)
X_ctrl = pd.concat([pd.DataFrame(X_umap, columns=['umap1', 'umap2']), ctrl_annot], axis=1)
plot_dimred(X_ctrl, dims='umap', hue='Culture',
           title='Viable cells in control wells')
plt.legend(loc='lower right',
           bbox_to_anchor=(1.2,0.05))
'''

## Dimension Reduction on Drug Wells

Now load some drug wells:

In [None]:
# Drugs whose wells are analysed below (order is used later when the list
# is split into chunks for plotting)
drug_sel = [
    'Tofacitinib',
    'Midostaurin',
    'Ganetespib',
    'Lenalidomide',
    'Pyridone 6',
    'UMI-77',
    'Bafilomycin A1',
    'Quizartinib',
    'Hydroxychloroquine',
    'Fludarabine',
    'Vorinostat',
    'Thioguanine',
    'Nutlin 3a',
    'Palbociclib',
    'Carfilzomib',
    'JQ1',
    'Cytarabine',
    'BAY61-3606',
    'Venetoclax',
    'Ixazomib',
]

In [None]:
# Annotation rows whose drug is one of the selected drugs
in_selection = annot_df['Drug'].isin(drug_sel)
drugs = annot_df.loc[in_selection].reset_index(drop=True)
# Distinct well identifiers of those rows
drug_wells = drugs['well'].unique()

In [None]:
# which=2 keeps only cells of class 2, i.e. the 'Viable' cancer cells
drug_df, drug_annot = load_cells(platedir=platedir, wells=drug_wells, annot=annot_df, which=2)

In [None]:
# Reuse `sel` and `scaler` as last assigned above (fitted on the viable
# DMSO control cells) rather than fitting new ones on the drug wells
drug_df = preprocess_data(df=drug_df, sel=sel, glog=True)
# scale by control wells
drug_scaled = scale_data(drug_df, scaler=scaler)

In [None]:
# Seed PCA as well as t-SNE: PCA may pick a randomized SVD solver, and an
# unseeded PCA would make the embedding differ between runs.
pcs = PCA(n_components=20, random_state=21).fit_transform(drug_scaled)
# 2-D t-SNE embedding of the 20 principal components
X_tsne = TSNE(n_components=2, random_state=21, perplexity=50).fit_transform(pcs)

In [None]:
# t-SNE coordinates next to the per-cell drug annotation ...
tsne_df = pd.DataFrame(X_tsne, columns=['tsne1', 'tsne2'])
X_df = pd.concat([tsne_df, drug_annot], axis=1)
# ... followed by the scaled values of the selected features
selected_feats = drug_scaled.loc[:, feat_subset]
X_df = pd.concat([X_df, selected_feats], axis=1)

In [None]:
# Plot the embedding of viable cells from the drug wells with hue set to
# the culture condition
plot_dimred(X_df, hue='Culture',
           title='Viable cells in drug-treated wells')

In [None]:
# Faceted view of the drug-well embedding for the selected features on a
# 3x3 grid.
# NOTE(review): presumably one panel per feature - confirm in facet_dimred.
facet_dimred(X_df, feat_subset=feat_subset,
            nrows=3, ncols=3)

In [None]:
# Long format: one row per (cell, feature) with the culture condition kept
# as the identifier column
colsub = ['Culture', *feat_subset.tolist()]
X_long = X_df[colsub].melt(id_vars=['Culture'],
                           value_vars=feat_subset,
                           var_name='feature',
                           value_name='val')

In [None]:
# Same import as in the control-well section; repeated so this cell can be
# run on its own
from bioimg.singlecell import facet_density
# Faceted plot over the 'feature' column with hue set to the culture
# condition, three facets per row, independent y axes
facet_density(X_long, feat_column='feature',
             ncols=3, hue='Culture', sharey=False,
             aspect=1.4)

In [None]:
# Long format: one row per (cell, feature) with drug and culture condition
# kept as identifier columns
colsub = ['Drug', 'Culture', *feat_subset.tolist()]
X_long = X_df[colsub].melt(id_vars=['Drug', 'Culture'],
                           value_vars=feat_subset,
                           var_name='feature',
                           value_name='val')

In [None]:
from bioimg.singlecell import facet_boxplot
# Faceted box plots over the 'feature' column on a 3x3 grid: drug on the
# x axis, feature value on the y axis, hue set to the culture condition
facet_boxplot(X_long, x='Drug',
              y='val', feat_column='feature',
              ncols=3,
              nrows=3, hue='Culture')

In [None]:
# Split the drug list into consecutive groups of five (the last group may
# be shorter)
chunk_size = 5
drug_chunks = [drug_sel[start:start + chunk_size]
               for start in range(0, len(drug_sel), chunk_size)]

In [None]:
# Set the seaborn theme BEFORE creating the figure: style settings are read
# when the axes are created, so setting them afterwards does not restyle
# axes that already exist.
sn.set(font_scale=1.2)
sn.set_style('white')
fig, ax = plt.subplots(ncols=2, nrows=2,
                       figsize=(14, 12))
# One panel per chunk of drugs, filled row by row. zip() stops at the
# shorter of the two sequences, so fewer than four chunks leaves the
# remaining panels empty instead of raising an IndexError.
for panel, chunk in zip(ax.flat, drug_chunks):
    sn.scatterplot(x='tsne1', y='tsne2',
                   data=X_df[np.isin(X_df['Drug'], chunk)],
                   hue='Drug',
                   s=40, alpha=0.8, ax=panel)
    panel.legend(loc='lower right', bbox_to_anchor=(1.4, 0.7))
    panel.set_xlabel('TSNE 1')
    panel.set_ylabel('TSNE 2')
# Remove the top and right spines on every panel of this figure
sn.despine(fig=fig)
# Extra horizontal space between the columns for the legends
fig.subplots_adjust(wspace=0.5)