In [None]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
from umap import UMAP
import seaborn as sns
import hdbscan
import sys
sys.path.insert(0, '../../')
sys.path.insert(0, '../../cycif/')
from get_data import file2frame
from cycif import *
from common_apis import *

In [None]:
# Build the pooled, normalised input for the UMAP dimension reduction
# (the reduction itself only needs to be run once).
#
# Every entry in the current working directory whose name contains 'pooled'
# is read as a CSV (first column used as the index) and shifted so that each
# column's 75th percentile sits at zero.
norm_frames = []
for file in os.listdir():
    if 'pooled' in file:
        print(file)
        data = pd.read_csv(file, index_col=0)
        data_norm = data - data.quantile(q=0.75)
        norm_frames.append(data_norm)

# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration; collect the pieces and concatenate once instead.
# pd.concat raises on an empty list, so fall back to an empty frame when no
# file matched (the result the original loop left behind in that case).
pooled_norm_data = pd.concat(norm_frames) if norm_frames else pd.DataFrame()

In [None]:
# Embed the pooled, normalised data into 5 UMAP components (30 neighbours)
# and write the embedding to disk without an index column.
# NOTE(review): the following cell reads 'MCF10A commons 533k cells UMAP data.csv'
# from d:/data, not the file written here - confirm which file is the intended one.
umap = UMAP(
    n_components=5,
    n_neighbors=30,
)
df_pooled_time_umap = umap.fit_transform(pooled_norm_data)
embedding_frame = pd.DataFrame(df_pooled_time_umap)
embedding_frame.to_csv('D:/MCF10A commons Umap data.csv', index=None)

In [None]:
# Reload the stored embedding together with its per-row metadata, attach the
# time / concentration / drug annotations positionally, and draw an overview
# scatter of the first two embedding columns.
os.chdir('d:/data')
embedding = pd.read_csv('MCF10A commons 533k cells UMAP data.csv', index_col=0)
metadata = pd.read_csv('MCF10A commons metadata.csv', index_col=0)

# `.values` drops the metadata index, so rows are matched by position only.
df_pooled_time_umap = embedding.assign(
    time=metadata.time.values,
    Conc=metadata.Conc.values,
    Drug=metadata.DrugName.values,
)
# Renaming all five columns at once requires the embedding file to hold
# exactly two data columns.
df_pooled_time_umap.columns = ['X', 'Y', 'time', 'Conc', 'Drug']

plt.scatter(df_pooled_time_umap.X, df_pooled_time_umap.Y, s=0.0001)

In [None]:
from hdbscan import HDBSCAN

# Sweep min_cluster_size over 10..1999 and record, for each setting, the
# median cluster persistence of an HDBSCAN fit on the X/Y embedding columns.
mcs_range = range(10, 2000)
umap_xy = df_pooled_time_umap.iloc[:, :2]  # same slice for every fit, so take it once
cp_list = []
for mcs in mcs_range:
    clustering = HDBSCAN(
        min_samples=15,
        min_cluster_size=mcs,
        memory='d:/temp',
    ).fit(umap_xy)
    cp_list.append(np.median(clustering.cluster_persistence_))

In [None]:
# NOTE(review): this cell recomputes the same min_cluster_size sweep as the
# preceding cell (same range, same parameters, same outputs).
from hdbscan import HDBSCAN

cp_list = []
mcs_range = range(10, 2000)
for mcs in mcs_range:
    # One HDBSCAN fit per candidate min_cluster_size on the X/Y columns.
    clustering = HDBSCAN(
        min_cluster_size=mcs,
        min_samples=15,
        memory='d:/temp',
    )
    clustering.fit(df_pooled_time_umap.iloc[:, :2])
    # Summarise the fit by the median persistence across its clusters.
    cp_list.append(np.median(clustering.cluster_persistence_))

In [None]:
# Final clustering of the X/Y embedding columns with min_cluster_size fixed at 619.
final_params = dict(min_cluster_size=619, min_samples=15, memory='d:/temp')
clustering = HDBSCAN(**final_params)
clustering.fit(df_pooled_time_umap.iloc[:, :2])

In [None]:
# Median cluster persistence as a function of min_cluster_size.
# x / y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments, while the keyword form also works on older releases.
sns.lineplot(x=list(mcs_range), y=cp_list)

In [None]:
# Distinct cluster labels from the fitted model and how many points carry each.
cluster_labels = clustering.labels_
np.unique(cluster_labels, return_counts=True)

In [None]:
# Scatter the exemplar points of every cluster from the fitted HDBSCAN model,
# one hue per cluster.
plt.close()
sns.set(font_scale=2)
ce_list = clustering.exemplars_
cluster_id = -1  # stays -1 if the model exposes no exemplar groups
plt.figure(figsize=(16,9))

# One small frame per cluster, labelled 'Cluster<i>' by its position in exemplars_.
exemplar_frames = []
for cluster_id, cluster in enumerate(ce_list):
    tmp_df = pd.DataFrame(cluster, columns=['X','Y'])
    tmp_df['cluster'] = 'Cluster' + str(cluster_id)
    exemplar_frames.append(tmp_df)

# DataFrame.append was removed in pandas 2.0 and copied the growing frame on
# every iteration; concatenate once instead. ignore_index gives the combined
# frame unique row labels (each per-cluster frame starts again at 0).
# pd.concat raises on an empty list, hence the empty-frame fallback.
df_ce = pd.concat(exemplar_frames, ignore_index=True) if exemplar_frames else pd.DataFrame()

sns.scatterplot(x='X',y='Y',hue='cluster',data=df_ce,legend='full')
plt.legend(bbox_to_anchor=(1, 1.1))

In [None]:
# Facet the embedding by drug (rows) and time point (columns), colour the
# points by concentration, and save the grid together with a shared
# horizontal colour bar.
norm = plt.Normalize(0,10)
sm = plt.cm.ScalarMappable(cmap="RdYlGn_r", norm=norm)
sm.set_array([])  # empty array: the mappable only backs the colour bar
# NOTE(review): the 0-10 range is hard-coded while seaborn assigns palette
# colours per distinct 'Conc' level - confirm the bar matches the hue mapping.
sns.set(font_scale=10)
g = sns.FacetGrid(df_pooled_time_umap, col = 'time',row ="Drug",hue = 'Conc',palette = 'RdYlGn_r',height = 36)
g = (g.map(plt.scatter, "X", "Y"))

# FacetGrid creates its own figure. The colour bar used to be added to a
# separate, empty plt.figure() (with an axes five figure-widths wide), while
# plt.savefig wrote the grid's figure - so the saved image had no colour bar.
# Attach the bar to the grid's figure and save that same figure.
fig = g.fig
# Rect is in figure fractions; the bar sits just below the grid, centred,
# and bbox_inches='tight' extends the saved canvas to include it.
cbar_ax = fig.add_axes([0.25, -0.03, 0.5, 0.01])
cb = fig.colorbar(sm,cax=cbar_ax,orientation='horizontal')
cb.set_label(label='Dose',weight='bold',size = 32)
cb.ax.tick_params(labelsize=32)
fig.savefig('overall fig.png', bbox_inches='tight')