# Exploring mean well profiles of Cell Painting Experiment

In [None]:
import javabridge
import bioformats as bf
import skimage
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd
import re
import os
import sys
import h5py
import json
from sklearn.manifold import TSNE

# Start a Java VM with the Bio-Formats JARs on its class path.
# NOTE(review): no later cell visible in this notebook uses `bf` — confirm the JVM is still needed.
javabridge.start_vm(class_path=bf.JARS)

In [None]:
# Directory with the per-plate well-profile CSV files (relative to the notebook's working directory)
datadir = '../data/Bray-wellprofiles/'

## Normalized Well Profiles with Selected Features

In [None]:
# Mean well profiles: keep only the files whose name marks them as
# 'variable_selected'. os.listdir() returns entries in arbitrary order, so the
# list is sorted to make any positional slice of it reproducible across machines.
meanprof = sorted(f for f in os.listdir(datadir) if 'variable_selected' in f)

Load the first 10 plates and check feature correlation in this data set. The values in these files (`plate_normalized_variable_selected`) are already normalized with respect to DMSO wells, and feature selection has already been performed.

In [None]:
# Read the first 10 plates and split each table into metadata columns
# (name contains 'Metadata') and measured feature columns.
imgdata, annot = [], []
for prof in meanprof[:10]:
    plate_df = pd.read_csv(os.path.join(datadir, prof))
    is_meta = plate_df.columns.str.contains('Metadata')
    annot_df = plate_df.loc[:, is_meta]
    imgdf = plate_df.loc[:, ~is_meta]
    annot.append(annot_df)
    imgdata.append(imgdf)

In [None]:
# Stack the per-plate feature tables row-wise; `imgdata` changes from a list to a DataFrame
imgdata = pd.concat(imgdata)

In [None]:
# Stack the per-plate metadata tables row-wise; `annot` changes from a list to a DataFrame
annot = pd.concat(annot)

In [None]:
# Drop every feature column that contains at least one missing value
imgdata = imgdata.dropna(axis=1)

In [None]:
# Feature-by-feature correlation matrix. np.corrcoef treats rows as variables,
# hence the transpose of the wells-by-features table.
feat_names = imgdata.columns
featcor = pd.DataFrame(np.corrcoef(imgdata.T), index=feat_names, columns=feat_names)

In [None]:
# plot_heatmap is a project helper from bioimg.singlecell; here it is given the
# feature correlation matrix computed above.
from bioimg.singlecell import plot_heatmap
plot_heatmap(featcor)

In [None]:
# Heatmap of the wells-by-features table with tick labels switched off.
# NOTE(review): vmin/vmax presumably bound the colour scale to [-5, 5] — confirm in plot_heatmap.
plot_heatmap(imgdata,
            xticklabels=False, size=(10,14),
            yticklabels=False, vmin=-5, vmax=5)

## Unnormalized Well Profiles
Load well profiles with raw features and perform normalization and feature selection:

In [None]:
# Tab-separated plate annotation table; joined to the well metadata on plateID / well_position further down
chemannot = pd.read_csv("../data/Bray-metadata/plate_annot.txt", sep='\t')

In [None]:
# Preview the first six annotation rows
chemannot.head(6)

In [None]:
# Raw (unnormalized) profiles are the files whose name contains "<digits>.csv".
# The pattern is a raw string so that "\." is a regex escape rather than an
# invalid Python string escape; re.search replaces len(re.findall(...)) as the
# truth test. The listing is sorted because os.listdir() order is arbitrary and
# meanprof[0] is used as the example plate.
meanprof = sorted(f for f in os.listdir(datadir) if re.search(r'[0-9]+\.csv', f))

In [None]:
# Work through a single plate first: separate metadata columns from feature columns
plate_df = pd.read_csv(os.path.join(datadir, meanprof[0]))
is_meta = plate_df.columns.str.contains('Metadata')
annot_df, imgdf = plate_df.loc[:, is_meta], plate_df.loc[:, ~is_meta]

In [None]:
# Rename the plate / well columns to the key names used when merging with the plate annotation
key_names = {'Image_Metadata_Plate': 'plateID',
             'Image_Metadata_Well': 'well_position'}
annot_df = annot_df.rename(columns=key_names)

In [None]:
# Left join: keep every well of the plate and attach its chemical annotation
annot_df = annot_df.merge(chemannot, how='left', on=['plateID', 'well_position'])

In [None]:
# Feature rows of the DMSO (control) wells
ctrl_df = imgdf.loc[annot_df['broad_sample'].eq('DMSO')]

In [None]:
from bioimg.singlecell import select_features
from sklearn.feature_selection import VarianceThreshold
# Fit the variance filter on the control wells only; the fitted selector is
# passed to preprocess_data in later cells.
sel = VarianceThreshold(threshold=1e-6).fit(ctrl_df)

Remove features with variance < 1e-6 and log-transform the data:

In [None]:
from bioimg.singlecell import preprocess_data
# preprocess_data is a project helper. Per the notebook text it applies the fitted
# selector `sel` and, with glog=True, log-transforms the features — confirm in bioimg.
ctrl_df = preprocess_data(df=ctrl_df, sel=sel, glog=True)

Scale the data with respect to DMSO controls:

In [None]:
from sklearn.preprocessing import StandardScaler
from bioimg.singlecell import scale_data, check_data
# Estimate the scaling parameters from the control wells only
scaler = StandardScaler().fit(ctrl_df)
# scale_data is a project helper, presumably applying the fitted scaler — confirm in bioimg
ctrl_scaled = scale_data(ctrl_df, scaler=scaler).reset_index(drop=True)

In [None]:
# Feature rows of every well that is not a DMSO control
drugdf = imgdf.loc[annot_df['broad_sample'].ne('DMSO')]

In [None]:
# Reuse the selector and scaler fitted on the control wells so treated wells are
# expressed relative to the controls.
drugdf = preprocess_data(df=drugdf, sel=sel, glog=True)
# center and scale by control wells
drugdf = scale_data(drugdf, scaler=scaler)

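Load all unnormalized morphological features plate by plate. For each plate, remove features with variance < 1e-8 (computed over all wells of the plate; note that the single-plate example above uses 1e-6 on the DMSO wells only) and log-transform the data. Then center and scale the data with respect to the plate's DMSO controls: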

In [None]:
# Per plate: split metadata from features, attach the chemical annotation,
# filter near-constant features, transform, and scale by that plate's DMSO wells.
imgdata = []
annot = []
for prof in meanprof:
    plate_df = pd.read_csv(os.path.join(datadir, prof))
    is_meta = plate_df.columns.str.contains('Metadata')
    annot_df = plate_df.loc[:, is_meta]
    imgdf = plate_df.loc[:, ~is_meta]
    annot_df = annot_df.rename(columns={'Image_Metadata_Plate': 'plateID',
                                        'Image_Metadata_Well': 'well_position'})
    annot_df = pd.merge(left=annot_df, right=chemannot,
                        on=['plateID', 'well_position'],
                        how='left')

    # Variance filter fitted on all wells of this plate
    sel = VarianceThreshold(threshold=1e-8).fit(imgdf)
    imgdf = preprocess_data(df=imgdf, sel=sel, glog=True)
    # Scaling parameters come from this plate's DMSO wells only
    scaler = StandardScaler().fit(imgdf[annot_df['broad_sample']=='DMSO'])
    imgdf_scaled = scale_data(imgdf, scaler=scaler)

    # The feature and annotation tables are combined row-by-row later on, so
    # they must stay the same length (duplicate keys in chemannot would add rows).
    assert len(imgdf_scaled) == len(annot_df), \
        "feature/annotation row mismatch for plate file %s" % prof

    imgdata.append(imgdf_scaled)
    annot.append(annot_df)

# ignore_index gives both tables the same unique 0..N-1 index. Without it the
# per-plate indices repeat, and a later column-wise concat of the two tables
# only works if those duplicated indices happen to be identical.
imgdata = pd.concat(imgdata, ignore_index=True)
annot = pd.concat(annot, ignore_index=True)

In [None]:
# Drop every feature column that contains at least one missing value
# (concatenating plates leaves NaN in features that are absent from some plates)
imgdata = imgdata.dropna(axis=1)

In [None]:
# (rows, feature columns) remaining after dropping incomplete columns
imgdata.shape

In [None]:
# (rows, metadata columns) of the combined annotation table
annot.shape

In [None]:
# Number of wells per compound-name type (missing values are not counted)
annot['CPD_NAME_TYPE'].value_counts()

Compute feature correlation matrix:

In [None]:
# Feature-by-feature correlation matrix. np.corrcoef treats rows as variables,
# hence the transpose of the wells-by-features table.
feat_names = imgdata.columns
featcor = pd.DataFrame(np.corrcoef(imgdata.T), index=feat_names, columns=feat_names)

In [None]:
# Heatmap of the feature correlation matrix of the re-normalized data
plot_heatmap(featcor)

In [None]:
# Annotation rows of compound-treated wells only
drug_df = annot.loc[annot['ASSAY_WELL_ROLE'].eq('treated')]

Get the replicate pair map, a table that indicates which (plate - well) tuples are replicate measurements based on screen metadata:

In [None]:
# Replicate map: self-join the treated wells on the columns that identify a
# treatment; every output row pairs two wells (suffixes _x and _y).
treatment_keys = ['ASSAY_WELL_ROLE', 'broad_sample', 'mmoles_per_liter',
                  'CPD_NAME', 'CPD_NAME_TYPE', 'CPD_SMILES']
repl_map = drug_df.merge(drug_df, how='inner', on=treatment_keys)

In [None]:
# Keep only pairs whose two wells sit on different plates
repl_map = repl_map.loc[repl_map['plateID_x'].ne(repl_map['plateID_y'])]

In [None]:
# Order-independent identifier of a plate pair: the two plate IDs, sorted and
# joined with '_', so (A, B) and (B, A) get the same repID. assign() builds a
# new frame instead of writing a column into a filtered slice, which avoids
# pandas' SettingWithCopyWarning.
plate_pairs = zip(repl_map['plateID_x'].astype(str), repl_map['plateID_y'].astype(str))
repl_map = repl_map.assign(repID=['_'.join(sorted(pair)) for pair in plate_pairs])

In [None]:
# One row per (plate pair, compound, concentration)
pair_key = ['repID', 'CPD_NAME', 'mmoles_per_liter']
repl_map = repl_map.drop_duplicates(subset=pair_key).reset_index(drop=True)

In [None]:
# Plate / well coordinates of the first member of every replicate pair
rep_one = repl_map.loc[:, ['plateID_x', 'well_position_x']]

In [None]:
# Plate / well coordinates of the second member of every replicate pair
rep_two = repl_map.loc[:, ['plateID_y', 'well_position_y']]

In [None]:
# Attach the plate / well identifiers to the feature table. Any identifier
# columns left over from a previous run of this cell are dropped first, so
# re-running the cell does not append duplicate plateID / well_position columns.
well_keys = ['plateID', 'well_position']
imgdata = pd.concat([imgdata.drop(columns=well_keys, errors='ignore'),
                     annot[well_keys]], axis=1)

In [None]:
# Look up the feature profile of each first-replicate well; the merge joins on
# the columns the two frames share.
rep_one = (rep_one
           .rename(columns={'plateID_x': 'plateID', 'well_position_x': 'well_position'})
           .merge(imgdata, how='left'))

In [None]:
# Look up the feature profile of each second-replicate well; the merge joins on
# the columns the two frames share.
rep_two = (rep_two
           .rename(columns={'plateID_y': 'plateID', 'well_position_y': 'well_position'})
           .merge(imgdata, how='left'))

Remove replicate pairs that differ in cell count by more than the 75th percentile of absolute difference between replicate cell counts:

In [None]:
# 75th percentile of the absolute cell-count difference between paired replicates
count_col = 'Cells_Number_Object_Number'
count_gap = (rep_one[count_col] - rep_two[count_col]).abs()
thresh = np.percentile(count_gap.values, 75)

In [None]:
# Pairs whose cell-count difference is below the threshold are kept; the
# plate / well coordinates of the remaining pairs are recorded as bad replicates.
count_col = 'Cells_Number_Object_Number'
good_repl = (rep_one[count_col] - rep_two[count_col]).abs() < thresh
bad_repl = ~good_repl

well_keys = ['plateID', 'well_position']
badrep_one = rep_one.loc[bad_repl, well_keys]
badrep_two = rep_two.loc[bad_repl, well_keys]

rep_one = rep_one.loc[good_repl, :]
rep_two = rep_two.loc[good_repl, :]

In [None]:
# Per-feature correlation between the two replicates of every retained pair
well_keys = ['plateID', 'well_position']
repcor = rep_one.drop(columns=well_keys).corrwith(rep_two.drop(columns=well_keys))

In [None]:
# Features whose replicate correlation exceeds 0.5
sel_feats = repcor.loc[repcor.gt(0.5)].index.values

In [None]:
# Dictionary collecting the feature lists produced by the different selection steps
featdict = {'repcor': sel_feats}

In [None]:
# Restrict the feature table to the replicate-correlated features
X_subset = imgdata[sel_feats]

In [None]:
# Correlation matrix of the replicate-correlated features
feat_names = X_subset.columns
featcor = pd.DataFrame(np.corrcoef(X_subset.T), index=feat_names, columns=feat_names)
plot_heatmap(featcor)

In [None]:
# Scatter plot of one feature in replicate 1 versus replicate 2, annotated with
# its replicate correlation.
feat_plot = 'Cells_AreaShape_Eccentricity'

# The seaborn theme must be set BEFORE the figure is created; setting it at the
# end of the cell only affects later figures, so a fresh run would draw this
# plot with the default style.
sn.set(font_scale=1.4, style='white')

scat_df = pd.concat([rep_one[[feat_plot]],
                     rep_two[[feat_plot]]], axis=1)
scat_df.columns = [feat_plot + '_x', feat_plot + '_y']

fig, ax = plt.subplots(figsize=(6,6))
sn.scatterplot(x=feat_plot + '_x', y=feat_plot + '_y',
               data=scat_df, ax=ax, alpha=0.5)
sn.despine()
ax.annotate("r = {:.2f}".format(repcor[feat_plot]),
            xy=(.1, .9), xycoords=ax.transAxes)

ax.set_xlabel(feat_plot + ' (replicate 1)')
ax.set_ylabel(feat_plot + ' (replicate 2)')
# Hide the first tick label on each axis
ax.xaxis.get_major_ticks()[0].label1.set_visible(False)
ax.yaxis.get_major_ticks()[0].label1.set_visible(False)

In [None]:
# Replicate profiles restricted to the selected features, re-indexed 0..n-1 so
# both tables can be addressed with the same row labels
prof_rep1, prof_rep2 = (rep[sel_feats].reset_index(drop=True)
                        for rep in (rep_one, rep_two))

In [None]:
# Subset to (at most) 2000 replicate pairs to speed up computations.
# np.random.choice samples WITH replacement by default, which would put the same
# pair into the subset several times; replace=False draws distinct pairs. The
# size is capped at the number of available pairs, as required when sampling
# without replacement.
np.random.seed(607)
n_pairs = min(2000, len(prof_rep1))
subset = np.random.choice(prof_rep1.index, size=n_pairs, replace=False)

Run variable selection:

In [None]:
from bioimg.singlecell import select_residcor
# select_residcor is a project helper; it receives the subsampled replicate
# profiles and two features passed as `sel`.
# NOTE(review): `sel` is presumably the set of features the selection starts from — confirm in bioimg.
sel_feats = select_residcor(prof1=prof_rep1.loc[subset,:], prof2=prof_rep2.loc[subset,:],
                            sel = ['Cells_AreaShape_Eccentricity', 
                                   'Cells_Intensity_MeanIntensity_Mito'])

In [None]:
# Report how many features the selection returned
print("Number of selected features: {:d}".format(len(sel_feats)))

In [None]:
# Sort the selected feature names in place, then record them under 'residcor'
sel_feats.sort()
featdict['residcor'] = sel_feats

In [None]:
# JSON cannot serialize numpy arrays, so every entry that is not already a list
# is converted with .tolist(). isinstance() is the idiomatic type check and also
# accepts list subclasses, which have no .tolist().
featdict = {k: (v if isinstance(v, list) else v.tolist()) for k, v in featdict.items()}

# Written to the notebook's working directory
with open('featselect.json', 'w') as fp:
    json.dump(featdict, fp)

In [None]:
# Correlation matrix of the final selected features
X_subset = imgdata.loc[:, sel_feats]
feat_names = X_subset.columns
featcor = pd.DataFrame(np.corrcoef(X_subset.T), index=feat_names, columns=feat_names)
plot_heatmap(featcor, xticklabels=5, size=(12,14), vmax=1, vmin=-1)

Check out replicates with large absolute deviation in terms of cell count:

In [None]:
# Plate / well coordinates of first-replicate wells from pairs that failed the cell-count filter
badrep_one.head()

In [None]:
# Count how often each well appears among the bad replicate pairs.
# groupby(..., as_index=False).size() returns a DataFrame on pandas >= 1.1, and
# DataFrame.reset_index() has no `name` argument, so the original chain fails
# there. Grouping with the default as_index=True yields a Series on every
# version, which reset_index(name='count') turns into the intended table.
badrep_one = badrep_one.groupby(['plateID', 'well_position']).size().reset_index(name='count')

In [None]:
# Count how often each well appears among the bad replicate pairs.
# groupby(..., as_index=False).size() returns a DataFrame on pandas >= 1.1, and
# DataFrame.reset_index() has no `name` argument, so the original chain fails
# there. Grouping with the default as_index=True yields a Series on every
# version, which reset_index(name='count') turns into the intended table.
badrep_two = badrep_two.groupby(['plateID', 'well_position']).size().reset_index(name='count')

In [None]:
# Wells that appear in more than one bad replicate pair, listed once each
badwells = pd.concat([badrep_one.loc[badrep_one['count'] > 1],
                      badrep_two.loc[badrep_two['count'] > 1]])
badwells = badwells.drop_duplicates(['plateID', 'well_position'])

In [None]:
#badwells.to_csv('../data/Bray-noisywells.csv')

Here we observe an edge effect: wells whose cell counts disagree between replicates are mostly located near the plate edges, with one peak at the left edge (columns 1-3) and a second peak at the right edge (columns 22-24).

In [None]:
# Histogram of the plate column (the first run of digits in the well position)
# over the 100 most frequent noisy well positions
top_wells = badwells['well_position'].value_counts().index[:100].values
plate_cols = [int(re.findall('[0-9]+', s)[0]) for s in top_wells]
plt.hist(plate_cols)
sn.despine()
plt.xlabel('Plate column')
plt.title('Top noisy well distribution')
plt.xlim((1,24))
plt.show()

Remove wells with large deviations in cell count across multiple replicates:

In [None]:
# Outer merge with an indicator column; rows marked 'left_only' are wells that
# are not in the noisy-well list.
feature_cols = ['plateID', 'well_position'] + featdict['residcor']
imgfilt = imgdata[feature_cols].merge(badwells[['plateID', 'well_position']],
                                      how='outer', indicator=True)

In [None]:
# Keep only the wells that did not match a noisy well
imgfilt = imgfilt.loc[imgfilt['_merge'] == 'left_only'].reset_index(drop=True)

Subset only to drugs with common names:

In [None]:
# Wells whose compound has a common name, reduced to the merge keys plus
# compound name and concentration
has_common_name = annot['CPD_NAME_TYPE'].isin(['common', 'primary-common']).to_numpy()
annot_common = (annot[has_common_name]
                .reset_index(drop=True)
                [['plateID', 'well_position', 'CPD_NAME', 'mmoles_per_liter']])

In [None]:
# Inner join on the shared columns: keeps only wells with a common-name compound
imgfilt = imgfilt.merge(annot_common)

In [None]:
# aggregated compound profiles
# Aggregated compound profiles: mean of every selected feature per
# (compound, concentration). Only the selected feature columns are averaged;
# imgfilt still carries non-numeric columns (e.g. well_position and the `_merge`
# indicator), and averaging those raises a TypeError on pandas >= 2.0 instead of
# being silently skipped.
comp_prof = (imgfilt
             .groupby(['CPD_NAME', 'mmoles_per_liter'], as_index=True)[featdict['residcor']]
             .mean())

In [None]:
# Keep only the selected feature columns, in the order stored in featdict['residcor']
comp_prof = comp_prof[featdict['residcor']]

In [None]:
# Disabled alternative filter (50th-percentile cut-off). It is a bare string
# literal, so none of it is executed.
'''
thresh = np.percentile(comp_prof.abs().sum(axis=1), 50)
comp_prof = comp_prof[comp_prof.abs().sum(axis=1) > thresh]
'''

In [None]:
#comp_prof.to_csv('../data/drugprofiles_Bray.csv')

In [None]:
# Keep the compound profiles whose summed absolute feature values exceed the
# 90th percentile of that sum
profile_strength = comp_prof.abs().sum(axis=1)
thresh = np.percentile(profile_strength, 90)
comp_prof = comp_prof[profile_strength > thresh]

In [None]:
# Heatmap of the retained compound profiles with row labels shown and column labels hidden.
# NOTE(review): vmin/vmax presumably bound the colour scale to [-10, 10] — confirm in plot_heatmap.
plot_heatmap(comp_prof, xticklabels=False,
             yticklabels=True,
             size=(12,20), vmin=-10, vmax=10)