In [1]:
import pandas as pd
import numpy as np
import os
import synapseclient as syn
import synapseutils
from scipy.stats import ttest_ind as ttest
from scipy.stats import ks_2samp
from statsmodels.stats.multitest import fdrcorrection as fdr
from matplotlib import pyplot as plt
from umap import UMAP
from sklearn.cluster import DBSCAN
import hdbscan
import sys
sys.path.insert(0, '../../')
sys.path.insert(0, '../../cycif/')
from get_data import file2frame
from cycif import *
from common_apis import *
import random
# Seed every global RNG this notebook can draw from. Python's `random` alone
# does not cover numpy's global generator, which is what numpy-based
# libraries fall back to when they are given no explicit random state.
random.seed(50)
np.random.seed(50)

In [2]:
# Log in to Synapse and mirror the project folder 'syn14734328' under '/data'.
#
# The client is built from the package's full name rather than from the `syn`
# module alias: the original `syn = syn.Synapse()` replaced the alias with the
# client instance, so running this cell a second time failed. After this cell
# `syn` is still bound to the logged-in client, as before.
import synapseclient
syn = synapseclient.Synapse()
syn.login()
# `files` is the collection of synced file entities; later cells read `.path`
# from each element.
files = synapseutils.syncFromSynapse(syn, 'syn14734328',path='/data')

Welcome, Yunguan Wang!



In [None]:
# Whole-well differential analysis (no clustering).
#
# For each time point: pool the per-file tables whose path contains the time
# label, run `preprocessing_log_transform`, then compare every drug/dose
# condition against all DMSO cells with `differential_analysis`. The combined
# report is written to 'MCF10A commons report.xlsx' in the working directory.
# NOTE(review): files were synced to '/data' in the login cell; this assumes
# 'D:/data/' is the same folder - confirm.
os.chdir('D:/data/')

# Columns removed from every input table before pooling.
invalid_cols = ['DAPI0002', 'DAPI0003', 'DAPI0004', 'DAPI0005', 'DAPI0006', 'DAPI0007']
# Annotation columns that keep their name when the other columns get prefixed.
annotation_cols = ['well','DrugName','HMSLid','Conc']

# Per-condition reports are collected and concatenated once at the end
# (DataFrame.append no longer exists in pandas >= 2.0).
report_frames = []
for time in ['24h','48h','72h']:
    # Start from an empty list for every time point instead of probing
    # globals() for a leftover `pooled_data` from a previous (possibly
    # interrupted) run.
    channel_frames = []
    for fn in files:
        if time not in fn.path:
            continue
        print(fn.path)
        data = pd.read_csv(fn.path)
        data = data.drop(invalid_cols,axis = 1)
        data.index = ['Cell_'+str(x) for x in data.index]
        # First four columns are kept as per-cell annotations. Copy them so the
        # row drop and the 'condition' column added below act on an independent
        # frame rather than on a slice of `data`.
        metadata = data.iloc[:,:4].copy()
        # Prefix every non-annotation column with the last '_'-separated token
        # of the file path, minus its 4-character extension.
        prefix = fn.path.split('_')[-1][:-4]
        data.columns = [x if x in annotation_cols else prefix + '_' + x for x in data.columns]
        # The first file contributes annotations + measurements; later files
        # contribute measurements only.
        channel_frames.append(data if not channel_frames else data.iloc[:,4:])

    if not channel_frames:
        # Without this guard the code below would fail with a NameError or
        # silently reuse `data` from the previous time point.
        print('No input files found for time point: ', time)
        continue

    pooled_data = pd.concat(channel_frames, axis = 1)
    del channel_frames

    # Number of distinct DMSO wells, taken from the last file read for this time point.
    n_control_wells = data[data.DrugName=='DMSO'].well.unique().shape[0]
    pooled_data, invalid_cells = preprocessing_log_transform(pooled_data)
    metadata = metadata.drop(invalid_cells)

    # All remaining DMSO cells serve as the control population. (An earlier
    # variant restricted controls to the largest UMAP cluster; that step is
    # disabled here.)
    controls_cells = metadata[metadata.DrugName=='DMSO'].index
    control_norm = pooled_data.loc[controls_cells]

    # Get markers for each treated condition
    metadata['condition'] = metadata.DrugName + '_' + metadata.Conc.round(6).astype(str)
    for condition in metadata.condition.unique():
        test_cells = metadata[metadata.condition==condition].index
        # Split on the LAST underscore only: the dose string never contains
        # one, but a drug name may.
        test_drug, test_dose = condition.rsplit('_', 1)
        test = pooled_data.loc[test_cells]

        print('Processing: ', time, condition )
        DE_report = differential_analysis(test,control_norm)
        # Extra pseudo-marker row: log2 of the condition's cell count relative
        # to the control cell count scaled by the number of control wells.
        # NOTE(review): presumably assumes one well per condition - confirm.
        DE_report.loc['_CellNumber', 'logFC'] = np.log2(n_control_wells*len(test)/len(control_norm))
        DE_report['Dose'] = test_dose
        DE_report['DrugName'] = test_drug
        DE_report['Abs_logFC'] = DE_report.logFC.abs()
        DE_report['Time'] = time
        DE_report['Cluster'] = 'Whole well'

        report_frames.append(DE_report)

    # Release the pooled table before loading the next time point.
    del pooled_data

Report = pd.concat(report_frames) if report_frames else pd.DataFrame()
Report.to_excel('MCF10A commons report.xlsx')

In [12]:
# Make distribution plot per channel for each dose of drug
# df_logFC = pd.read_excel('MCF10A commons report.xlsx',index_col=0)
# df_logFC = df_logFC[df_logFC.Abs_logFC>=0.6]
# df_logFC['channel_time'] = df_logFC.index + ':' + df_logFC.Time
# df_logFC['drug_channel_time'] = df_logFC.DrugName + df_logFC.index + ':' + df_logFC.Time

# overall_data = overall_data.reset_index()
# overall_data = overall_data.melt('index',var_name='Channel',  value_name='Log2 Cycif intensity')
# overall_data.set_index('index',inplace=True)
# overall_data = overall_data.merge(metadata[['DrugName', 'Conc', 'time']], right_index=True, left_index=True, how='left')
# overall_data.sort_values('Conc',inplace=True)
# overall_data['channel_time'] = overall_data.Channel + ':' + overall_data.time
# overall_data['drug_channel_time'] = overall_data.DrugName+overall_data.Channel + ':' + overall_data.time
# overall_data = overall_data[overall_data.drug_channel_time.isin(df_logFC.drug_channel_time)]
# overall_data.drop(['drug_channel_time','time'],axis = 1, inplace = True)
# del df_logFC
# overall_data.sort_values('channel_time',inplace = True)

# One figure per drug: density of 'Log2 Cycif intensity' for every
# channel/time facet, one curve per concentration, with DMSO rows included
# for reference.
#
# NOTE: this cell depends on kernel state that no active code in the notebook
# creates: `overall_data` (a long-format table with at least the columns
# DrugName, Conc, channel_time and 'Log2 Cycif intensity') and `metadata`
# from whichever analysis cell ran last. The commented-out lines above show
# how `overall_data` was post-processed, not how it was first built.
sns.set(font_scale=1)  # loop-invariant, so set once
for test_drug in metadata.DrugName.unique():
    if test_drug == 'DMSO':
        continue
    df_test = overall_data[overall_data.DrugName.isin([test_drug,'DMSO'])]
    g = sns.FacetGrid(df_test,col='channel_time', hue="Conc", palette="RdYlGn_r",sharey = False ,sharex=False,height = 5,col_wrap=9)
    # `sns.distplot(..., hist=False, rug=False)` is deprecated; a KDE-only
    # curve is what `sns.kdeplot` draws.
    g.map(sns.kdeplot, "Log2 Cycif intensity")
    g.add_legend()
    plt.savefig(test_drug + ' abs log FC 0.6 Distribution vs DMSO.png')
    # Close the figure so figures do not pile up in memory across drugs.
    plt.close()

In [None]:
# Clustering-based per-well analysis.
#
# For each time point: pool the per-file tables whose path contains the time
# label, run `preprocessing_log_transform`, then for every drug/dose condition
# call `cluster_based_DE` against all DMSO cells, record the returned cluster
# labels in `metadata`, and plot the markers with |log2FC| >= 0.6 on a 2-D
# embedding. Per-time-point metadata is written to CSV and the combined report
# to 'MCF10A commons Per well report.xlsx' in the working directory.
# NOTE(review): files were synced to '/data' in the login cell; this assumes
# 'D:/data/' is the same folder - confirm.
os.chdir('D:/data/')

# Columns removed from every input table before pooling.
invalid_cols = ['DAPI0002', 'DAPI0003', 'DAPI0004', 'DAPI0005', 'DAPI0006', 'DAPI0007']
# Annotation columns that keep their name when the other columns get prefixed.
annotation_cols = ['well','DrugName','HMSLid','Conc']

# Per-condition reports are collected and concatenated once at the end
# (DataFrame.append no longer exists in pandas >= 2.0).
report_frames = []
for time in ['24h','48h','72h']:
    # Start from an empty list for every time point instead of probing
    # globals() for a leftover `pooled_data` from a previous (possibly
    # interrupted) run.
    channel_frames = []
    for fn in files:
        if time not in fn.path:
            continue
        print(fn.path)
        data = pd.read_csv(fn.path)
        data = data.drop(invalid_cols,axis = 1)
        data.index = ['Cell_'+str(x) for x in data.index]
        # First four columns are kept as per-cell annotations. Copy them so the
        # row drop and the columns added below act on an independent frame
        # rather than on a slice of `data`.
        metadata = data.iloc[:,:4].copy()
        # Prefix every non-annotation column with the last '_'-separated token
        # of the file path, minus its 4-character extension.
        prefix = fn.path.split('_')[-1][:-4]
        data.columns = [x if x in annotation_cols else prefix + '_' + x for x in data.columns]
        # The first file contributes annotations + measurements; later files
        # contribute measurements only.
        channel_frames.append(data if not channel_frames else data.iloc[:,4:])

    if not channel_frames:
        # Without this guard the code below would fail with a NameError or
        # silently reuse `metadata` from the previous time point.
        print('No input files found for time point: ', time)
        continue

    pooled_data = pd.concat(channel_frames, axis = 1)
    del channel_frames

    pooled_data, invalid_cells = preprocessing_log_transform(pooled_data)
    metadata = metadata.drop(invalid_cells)

    # All remaining DMSO cells serve as the control population. (An earlier
    # variant restricted controls to the largest UMAP cluster; that step is
    # disabled here.)
    controls_cells = metadata[metadata.DrugName=='DMSO'].index
    control_norm = pooled_data.loc[controls_cells]

    # Get markers for each treated condition
    # NOTE(review): doses are rounded to 3 decimals here but to 6 in the
    # whole-well analysis cell - confirm this difference is intended.
    metadata['condition'] = metadata.DrugName + '_' + metadata.Conc.round(3).astype(str)
    for condition in metadata.condition.unique():
        test_cells = metadata[metadata.condition==condition].index
        # Split on the LAST underscore only: the dose string never contains
        # one, but a drug name may.
        test_drug, test_dose = condition.rsplit('_', 1)
        test = pooled_data.loc[test_cells]
        fig_name = '_'.join([time, test_drug,str(test_dose),'.png'])

        print('Processing: ', time, condition )
        DE_report, labels = cluster_based_DE(test,control_norm,fig_name)
        # Nothing reported for this condition: skip annotation and plotting.
        if len(DE_report) == 0:
            continue

        # Annotate the report with the condition it belongs to
        DE_report['Dose'] = test_dose
        DE_report['DrugName'] = test_drug
        DE_report['Abs_logFC'] = DE_report.logFC.abs()
        DE_report['Time'] = time

        # Record each cell's cluster assignment in the metadata table
        metadata.loc[labels.index,'Cluster'] = labels.values

        # Markers whose absolute log2FC is >= 0.6 in this condition's report
        best_markers = DE_report[DE_report.Abs_logFC>=0.6].index.unique()
        report_frames.append(DE_report)

        # Plot treated + control cells together; control cells get the
        # label 'control'. pd.concat replaces the removed
        # DataFrame.append / Series.append.
        test_plus_norm = pd.concat([test, control_norm])
        labels = pd.concat([labels, pd.Series('control', index = control_norm.index)])
        # NOTE(review): `umap` is not bound by this notebook's visible imports
        # (only the `UMAP` class is); presumably a configured reducer exported
        # by one of the star imports - confirm.
        test_plus_norm_umap = umap.fit_transform(test_plus_norm)
        plot_expr_on_2D(test_plus_norm_umap, test_plus_norm[best_markers],'Expr '+ fig_name, labels)

    # Release the pooled table before loading the next time point.
    del pooled_data
    metadata.to_csv(time + ' MCF10A dataset metadata.csv')

Report = pd.concat(report_frames) if report_frames else pd.DataFrame()
Report.to_excel('MCF10A commons Per well report.xlsx')