# Comparison of cell subpopulations 

In [None]:
# load third-party Python modules
import javabridge
import bioformats as bf
import skimage
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd
import re
import os
import sys
import h5py
import json
from sklearn.manifold import TSNE

# Start the Java VM with the Bio-Formats JARs on its class path.
# NOTE(review): no visible cell uses bioformats afterwards or stops the VM
# (javabridge.kill_vm) — confirm this start-up is still required.
javabridge.start_vm(class_path=bf.JARS)

In [None]:
def load_cells(platedir, wells, annot, which=(1, 2)):
    """Load the per-well single-cell tables of one plate and annotate them.

    Parameters
    ----------
    platedir : str
        Directory holding one ``<well>.csv`` file per well.
    wells : iterable of str
        Well identifiers; wells without a CSV file in ``platedir`` are skipped.
    annot : pandas.DataFrame
        Annotation table with a ``well`` column used as the merge key.
    which : int or sequence of int, optional
        Values of the ``class`` column to keep (default ``(1, 2)``).

    Returns
    -------
    imgdf : pandas.DataFrame
        Rows of the selected classes, without the ``class`` and ``file``
        columns, indexed 0..n-1.
    labels : pandas.DataFrame
        ``class``, ``file`` and the derived ``well`` column merged with
        ``annot``.

    Raises
    ------
    ValueError
        If none of ``wells`` has a CSV file in ``platedir``.
    """
    frames = []
    for w in wells:
        csv_path = os.path.join(platedir, w + '.csv')
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            frames.append(df[np.isin(df['class'], which)])
    if not frames:
        # pd.concat([]) would raise a far less helpful "No objects to concatenate"
        raise ValueError(
            'No CSV files found in {!r} for the requested wells'.format(platedir))
    imgdf = pd.concat(frames).reset_index(drop=True)
    # Copy so that adding the 'well' column does not write into a view of imgdf
    labels = imgdf[['class', 'file']].copy()
    imgdf = imgdf.drop(['class', 'file'], axis=1)
    # The well ID is the part of the file name before the field suffix 'f<digit>...'
    labels['well'] = labels['file'].replace(regex=r'f[0-9].+', value='')
    # NOTE(review): inner merge — labels stays row-aligned with imgdf only if
    # annot has exactly one row for every well that was loaded.
    labels = pd.merge(labels, annot, on='well')
    return imgdf, labels

## Coculture vs Monoculture Subpopulations in 180528_Plate5

In [None]:
# Plate analysed in this section (alternative location on the mounted volume:
# '/Volumes/gitlab/microscopy-notebooks/imgdata/180528_Plate5/')
platedir = '../data/imgdata/180528_Plate5/'

# Tab-separated plate annotation: drug layout and the HCL plate layout
annot_df = pd.read_csv('../data/AML_trainset/drugannot.txt', sep='\t')
hcl_annot = pd.read_csv('../data/coculture_metafiles/HCL_plate_layout.txt', sep='\t')

In [None]:
# Tab-separated patient annotation table
patannot = pd.read_csv('../data/coculture_metafiles/patannot.txt', sep='\t')

In [None]:
# Annotation rows of the DMSO wells and the distinct well IDs among them
is_dmso = annot_df['Drug'] == 'DMSO'
dmso = annot_df.loc[is_dmso].reset_index(drop=True)
dmso_wells = dmso['well'].unique()

In [None]:
# Load the DMSO wells of this plate; which=2 selects only 'Viable' cancer cells
ctrl_df, ctrl_annot = load_cells(
    platedir=platedir,
    wells=dmso_wells,
    annot=annot_df,
    which=2,
)

In [None]:
def subset_ncells(imgdf, labels, n):
    """Randomly subsample up to ``n`` cells, keeping features and labels paired.

    Rows are drawn without replacement through NumPy's global random state
    (seed it with ``np.random.seed`` for reproducibility), so no cell appears
    twice in the subset. If ``n`` exceeds the number of rows, all rows are
    returned in random order.

    Parameters
    ----------
    imgdf : pandas.DataFrame
        Feature table; its index labels are used for sampling.
    labels : pandas.DataFrame
        Annotation table sharing the index of ``imgdf``.
    n : int
        Number of cells to draw.

    Returns
    -------
    imgdf_s, labels_s : pandas.DataFrame
        The sampled rows of both tables, re-indexed 0..k-1 in the same order.
    """
    # Cap n because sampling without replacement cannot return more rows than exist
    n = min(n, len(imgdf.index))
    subset = np.random.choice(imgdf.index, size=n, replace=False)
    imgdf_s = imgdf.loc[subset, :].reset_index(drop=True)
    labels_s = labels.loc[subset, :].reset_index(drop=True)
    return imgdf_s, labels_s

In [None]:
# Disabled optional subsampling of the control cells to 5000 rows; the code is
# kept inside a string literal so it is not executed.
'''np.random.seed(107)
if ctrl_df.shape[0] > 5000:
    ctrl_df, ctrl_annot = subset_ncells(imgdf=ctrl_df, labels=ctrl_annot, n=5000)'''

Load feature selection dictionary:

In [None]:
with open('featselect.json') as fh:
    featdict = json.load(fh)
# Number of entries in each feature selection
{name: len(feats) for name, feats in featdict.items()}

In [None]:
from bioimg.singlecell import preprocess_data

# 'repcor' feature selection plus two lysosomal features
extra_feats = ['ch-Lysosomal-area', 'ch-Lysosomal-extent']
sel = featdict['repcor'] + extra_feats
ctrl_df = preprocess_data(df=ctrl_df, sel=sel, glog=True)

In [None]:
from sklearn.preprocessing import StandardScaler
from bioimg.singlecell import scale_data, check_data

# Fit the scaler on the control cells, then apply it to the same table
scaler = StandardScaler()
scaler.fit(ctrl_df)
ctrl_scaled = scale_data(ctrl_df, scaler=scaler)

Run the Mann-Whitney-Wilcoxon test on every feature (mono-culture vs. co-culture):

In [None]:
# Test all scaled features; the commented-out alternative restricts the table
# to the 'residcor' feature selection.
#X_ctrl = ctrl_scaled[featdict['residcor']]
X_ctrl = ctrl_scaled

In [None]:
from scipy.stats import mannwhitneyu, ks_2samp
from statsmodels.stats.multitest import multipletests

# Group masks, computed once instead of once per feature
is_mono = ctrl_annot.Culture == 'Mono-culture'
is_co = ctrl_annot.Culture == 'Co-culture'

# Per-feature Mann-Whitney U test, mono- vs co-culture. The alternative is
# stated explicitly: older SciPy releases default to a deprecated one-sided
# p-value (half the two-sided value), so the result would otherwise depend on
# the installed SciPy version.
pvaldf = pd.DataFrame(
    X_ctrl.apply(lambda x: mannwhitneyu(x[is_mono], x[is_co],
                                        alternative='two-sided')[1]),
    columns=['pval'])
# Benjamini-Hochberg adjustment across features
pvaldf['padj'] = multipletests(pvaldf['pval'], alpha=0.01, method='fdr_bh')[1]

In [None]:
# (rows, columns) of the features with adjusted p-value below 0.01
pvaldf.loc[pvaldf['padj'] < 0.01].shape

In [None]:
# Per-feature difference of medians: co-culture minus mono-culture
is_co = ctrl_annot.Culture == 'Co-culture'
is_mono = ctrl_annot.Culture == 'Mono-culture'
median_shift = X_ctrl.apply(
    lambda col: np.median(col[is_co]) - np.median(col[is_mono]))
med_diff = pd.DataFrame(median_shift, columns=['diff_medians'])

In [None]:
# Show p-values next to the median differences
pd.concat([pvaldf, med_diff], axis='columns')

Two-sample Kolmogorov-Smirnov test on every feature (mono-culture vs. co-culture):

In [None]:
# Per-feature two-sample KS test (asymptotic p-value), mono- vs co-culture
is_mono = ctrl_annot.Culture == 'Mono-culture'
is_co = ctrl_annot.Culture == 'Co-culture'
ks_pvals = X_ctrl.apply(
    lambda col: ks_2samp(col[is_mono], col[is_co], mode='asymp')[1])
pvaldf = pd.DataFrame(ks_pvals, columns=['pval'])
# Benjamini-Hochberg adjustment across features
adjusted = multipletests(pvaldf['pval'], alpha=0.01, method='fdr_bh')
pvaldf['padj'] = adjusted[1]

In [None]:
# (rows, columns) of the features with adjusted p-value below 0.01
pvaldf.loc[pvaldf['padj'] < 0.01].shape

In [None]:
# Hand-picked features to test individually
feat_subset = [
    'ch-Hoechst-mean_intensity',
    'ch-Lysosomal-mean_intensity',
    'ch-Calcein-convex_area',
    'ch-Hoechst-SumAverage-d3-0',
    'ch-Calcein-eccentricity',
    'ch-Lysosomal-InfoMeas1-d7-0',
    'ch-Hoechst-weighted_moments-0-1',
    'ch-Hoechst-InfoMeas1-d7-3',
    'ch-Lysosomal-Contrast-d7-3',
]
X_subset = X_ctrl.loc[:, feat_subset]

In [None]:
# Group masks, computed once instead of once per feature
is_mono = ctrl_annot.Culture == 'Mono-culture'
is_co = ctrl_annot.Culture == 'Co-culture'

# Mann-Whitney U test on the hand-picked features. The alternative is stated
# explicitly: older SciPy releases default to a deprecated one-sided p-value
# (half the two-sided value).
pvaldf = pd.DataFrame(
    X_subset.apply(lambda x: mannwhitneyu(x[is_mono], x[is_co],
                                          alternative='two-sided')[1]),
    columns=['pval'])
# Benjamini-Hochberg adjustment across the tested features
pvaldf['padj'] = multipletests(pvaldf['pval'], alpha=0.01, method='fdr_bh')[1]

In [None]:
# Display raw and adjusted p-values of the test computed in the previous cell
pvaldf

In [None]:
# Two-sample KS test (asymptotic p-value) on the hand-picked features
is_mono = ctrl_annot.Culture == 'Mono-culture'
is_co = ctrl_annot.Culture == 'Co-culture'
ks_pvals = X_subset.apply(
    lambda col: ks_2samp(col[is_mono], col[is_co], mode='asymp')[1])
pvaldf = pd.DataFrame(ks_pvals, columns=['pval'])
# Benjamini-Hochberg adjustment across the tested features
adjusted = multipletests(pvaldf['pval'], alpha=0.01, method='fdr_bh')
pvaldf['padj'] = adjusted[1]

In [None]:
# Display raw and adjusted p-values of the test computed in the previous cell
pvaldf

In [None]:
# Extend the hand-picked features with further candidates
feat_subset = feat_subset + ["ch-Calcein-moments_central-2-2",
                             "ch-Hoechst-SumAverage-d3-0",
                             "ch-Calcein-moments_hu-1",
                             "ch-Lysosomal-InfoMeas1-d7-0",
                             "ch-Hoechst-InfoMeas1-d7-3",
                             "ch-Hoechst-InfoMeas1-d5-3",
                             "ch-Hoechst-SumAverage-d7-1",
                             "ch-Hoechst-SumAverage-d5-3",
                             'ch-Lysosomal-area',
                             'ch-Lysosomal-extent']
# Drop duplicates while keeping first-occurrence order. list(set(...)) yields a
# different order from run to run (string hash randomisation), which would
# shuffle the columns, result rows and plot panels derived from this list.
feat_subset = list(dict.fromkeys(feat_subset))

## Aggregate All Coculture Wells

In [None]:
#path = '/Volumes/gitlab/microscopy-notebooks/imgdata/'
path = '../data/imgdata/'
# One sub-directory per plate. Sorting fixes the processing order (os.listdir
# order is arbitrary, and the seeded subsampling further down depends on it);
# keeping directories only drops stray files such as '.DS_Store'.
plates = sorted(entry for entry in os.listdir(path)
                if os.path.isdir(os.path.join(path, entry)))

In [None]:
# Per plate: load the DMSO control cells, subsample to at most 5000, scale, and
# test every feature in feat_subset for a mono- vs co-culture difference.
np.random.seed(107)
X_ctrl = []
X_annot = []
pvals = []
for plate in plates:
    platedir = os.path.join(path, plate)
    # Diagnosis of the plate decides which plate layout applies
    diagnosis = patannot.loc[patannot['plate'] == plate.replace('.csv', ''),
                             'Diagnosis']
    if diagnosis.empty:
        # Without this guard an unannotated entry aborts the loop with IndexError
        print('Skipping {}: no entry in patannot'.format(plate))
        continue
    plate_annot = hcl_annot if diagnosis.values[0] == 'HCL' else annot_df
    ctrl_df, ctrl_annot = load_cells(platedir=platedir,
                                     wells=dmso_wells,
                                     annot=plate_annot, which=2)
    if ctrl_df.shape[0] > 5000:
        ctrl_df, ctrl_annot = subset_ncells(imgdf=ctrl_df,
                                            labels=ctrl_annot,
                                            n=5000)
    ctrl_df = preprocess_data(df=ctrl_df, sel=sel, glog=True)
    # Scaling is fitted per plate
    scaler = StandardScaler().fit(X=ctrl_df)
    ctrl_scaled = scale_data(ctrl_df, scaler=scaler)
    ctrl_annot['plate'] = plate

    # Group masks, computed once per plate instead of once per feature
    is_mono = ctrl_annot.Culture == 'Mono-culture'
    is_co = ctrl_annot.Culture == 'Co-culture'

    X_subset = ctrl_scaled[feat_subset]
    # Explicit two-sided alternative: older SciPy releases default to a
    # deprecated one-sided p-value (half the two-sided value)
    pvaldf = pd.DataFrame(
        X_subset.apply(lambda x: mannwhitneyu(x[is_mono], x[is_co],
                                              alternative='two-sided')[1]),
        columns=['pval'])
    pvaldf['plate'] = plate
    # Difference of medians: co-culture minus mono-culture
    med_diff = pd.DataFrame(
        X_subset.apply(lambda x: np.median(x[is_co]) - np.median(x[is_mono])),
        columns=['diff_medians'])
    pvaldf = pd.concat([pvaldf, med_diff], axis=1)

    X_ctrl.append(ctrl_scaled)
    X_annot.append(ctrl_annot)
    pvals.append(pvaldf)

In [None]:
# Stack the per-plate result tables and adjust all p-values together
# (Benjamini-Hochberg)
pvals = pd.concat(pvals)
adjusted = multipletests(pvals['pval'], alpha=0.01, method='fdr_bh')
pvals['padj'] = adjusted[1]

In [None]:
# Write the per-plate test results (p-values, median differences, adjusted
# p-values) to CSV
pvals.to_csv('../data/diff_DMSO.csv')

In [None]:
# Stack the per-plate scaled feature tables row-wise into one DataFrame.
# NOTE(review): the per-plate row labels are kept (no ignore_index), so labels
# presumably repeat across plates — later cells appear to rely on the annotation
# table being concatenated the same way; confirm before changing.
X_ctrl = pd.concat(X_ctrl)

In [None]:
# Append the concatenated per-plate annotation columns to the feature table.
# NOTE(review): presumably both frames carry identical row labels in identical
# order; a column-wise concat aligns on the index, so verify this holds.
X_ctrl = pd.concat([X_ctrl, pd.concat(X_annot)], axis=1)

In [None]:
# Long format: one row per (cell, feature) with the culture condition attached
colsub = ['Culture'] + feat_subset
X_long = X_ctrl[colsub].melt(id_vars=['Culture'],
                             value_vars=feat_subset,
                             var_name='feature',
                             value_name='val')

In [None]:
# Two-sample KS test (asymptotic p-value) on the cells pooled over all plates
X_subset = X_ctrl[feat_subset]
ctrl_annot = pd.concat(X_annot)
is_mono = ctrl_annot.Culture == 'Mono-culture'
is_co = ctrl_annot.Culture == 'Co-culture'
ks_pvals = X_subset.apply(
    lambda col: ks_2samp(col[is_mono], col[is_co], mode='asymp')[1])
pvaldf = pd.DataFrame(ks_pvals, columns=['pval'])
# Benjamini-Hochberg adjustment across the tested features
adjusted = multipletests(pvaldf['pval'], alpha=0.01, method='fdr_bh')
pvaldf['padj'] = adjusted[1]

In [None]:
# (number of pooled cells, number of tested features)
X_subset.shape

In [None]:
# Display raw and adjusted KS p-values for the pooled cells
pvaldf

In [None]:
from bioimg.singlecell import facet_density

# Per-feature densities, coloured by culture condition
sn.set(font_scale=1.3)
sn.set_style('white')
facet_density(
    X_long,
    feat_column='feature',
    hue='Culture',
    hue_order=['Mono-culture', 'Co-culture'],
    ncols=3,
    sharey=False,
    size=(10,8),
    aspect=1.4,
)
#plt.savefig('../figures/DMSO-allwells-featdist.pdf', bbox_inches='tight')

In [None]:
# Cumulative distribution of every feature, one panel per feature, coloured by
# culture condition. FacetGrid creates its own figure, so no separate
# plt.figure() call is made (it would only leave an empty figure behind).
sn.set(font_scale=1.3)
sn.set_style('white')
g = sn.FacetGrid(X_long,
                 col='feature',
                 hue='Culture',
                 hue_order=['Mono-culture', 'Co-culture'],
                 col_wrap=3,
                 sharex=False,
                 aspect=1.3)
# kdeplot(cumulative=True) draws the same curve as the deprecated
# distplot(hist=False, kde_kws={'cumulative': True, ...})
g.map(sn.kdeplot, "val", cumulative=True, lw=3).add_legend()
feat_subset = X_long['feature'].unique()
axes = g.axes.flatten()
# Panel titles: feature name without the 'ch-' prefix
for ax, feat in zip(axes, feat_subset):
    ax.set_title(feat.replace('ch-', ''))
    ax.set_xlabel('')
    ax.set_xlabel('')

## Recursive Feature Elimination

In [None]:
from bioimg.singlecell import recursive_elim

In [None]:
# Binary target for feature elimination: 0 = mono-culture, 1 = co-culture
culture_codes = {'Mono-culture': 0, 'Co-culture': 1}
y = ctrl_annot['Culture'].map(culture_codes).astype(int).values

In [None]:
# Feature matrix for the elimination: the 'residcor' feature selection
X_subset = X_ctrl[featdict['residcor']]

In [None]:
# Recursive feature elimination against the culture label.
# NOTE(review): presumably n_feat=9 is the number of features to keep and
# elim_step=1 the number dropped per iteration — confirm against bioimg.
rfe = recursive_elim(df=X_subset, y=y, n_feat=9, elim_step=1)

In [None]:
from bioimg.singlecell import select_features
# Reduce the feature matrix using the elimination result `rfe`
X_rfe = select_features(df=X_subset, sel=rfe)

In [None]:
# Long format of the RFE-selected features with the culture condition attached
rfe_feats = X_rfe.columns.values
colsub = ['Culture'] + rfe_feats.tolist()
X_long = X_ctrl[colsub].melt(id_vars=['Culture'],
                             value_vars=rfe_feats,
                             var_name='feature',
                             value_name='val')

In [None]:
# Densities of the RFE-selected features, coloured by culture condition
sn.set(font_scale=1.3)
sn.set_style('white')
facet_density(
    X_long,
    feat_column='feature',
    hue='Culture',
    hue_order=['Mono-culture', 'Co-culture'],
    ncols=3,
    sharey=False,
    size=(10,8),
    aspect=1.4,
)
plt.savefig('../figures/DMSO-allwells-featdist.pdf', bbox_inches='tight')

In [None]:
# Use the column names of X_rfe as the feature subset from here on
# (a NumPy array of names rather than a list)
feat_subset = X_rfe.columns.values

In [None]:
# Per-plate mono- vs co-culture tests, repeated for the RFE-selected features.
# Kept consistent with the first per-plate loop: plates are read from `path`
# (this loop previously pointed at '../imgdata') and HCL plates use the HCL
# plate layout.
np.random.seed(107)
X_ctrl = []
X_annot = []
pvals = []
for plate in plates:
    platedir = os.path.join(path, plate)
    # Diagnosis of the plate decides which plate layout applies
    diagnosis = patannot.loc[patannot['plate'] == plate.replace('.csv', ''),
                             'Diagnosis']
    if diagnosis.empty:
        # Without this guard an unannotated entry aborts the loop with IndexError
        print('Skipping {}: no entry in patannot'.format(plate))
        continue
    plate_annot = hcl_annot if diagnosis.values[0] == 'HCL' else annot_df
    ctrl_df, ctrl_annot = load_cells(platedir=platedir,
                                     wells=dmso_wells,
                                     annot=plate_annot, which=2)
    if ctrl_df.shape[0] > 5000:
        ctrl_df, ctrl_annot = subset_ncells(imgdf=ctrl_df,
                                            labels=ctrl_annot,
                                            n=5000)
    ctrl_df = preprocess_data(df=ctrl_df, sel=sel, glog=True)
    # Scaling is fitted per plate
    scaler = StandardScaler().fit(X=ctrl_df)
    ctrl_scaled = scale_data(ctrl_df, scaler=scaler)
    ctrl_annot['plate'] = plate

    # Group masks, computed once per plate instead of once per feature
    is_mono = ctrl_annot.Culture == 'Mono-culture'
    is_co = ctrl_annot.Culture == 'Co-culture'

    X_subset = ctrl_scaled[feat_subset]
    # Explicit two-sided alternative: older SciPy releases default to a
    # deprecated one-sided p-value (half the two-sided value)
    pvaldf = pd.DataFrame(
        X_subset.apply(lambda x: mannwhitneyu(x[is_mono], x[is_co],
                                              alternative='two-sided')[1]),
        columns=['pval'])
    pvaldf['plate'] = plate
    # Difference of medians: co-culture minus mono-culture
    med_diff = pd.DataFrame(
        X_subset.apply(lambda x: np.median(x[is_co]) - np.median(x[is_mono])),
        columns=['diff_medians'])
    pvaldf = pd.concat([pvaldf, med_diff], axis=1)

    X_ctrl.append(ctrl_scaled)
    X_annot.append(ctrl_annot)
    pvals.append(pvaldf)

In [None]:
# Stack the per-plate result tables and adjust all p-values together
# (Benjamini-Hochberg)
pvals = pd.concat(pvals)
adjusted = multipletests(pvals['pval'], alpha=0.01, method='fdr_bh')
pvals['padj'] = adjusted[1]

In [None]:
#pvals.to_csv('../data/diff_DMSO.csv')