In [1]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
from umap import UMAP
import seaborn as sns
import sys
from hdbscan import HDBSCAN

sys.path.insert(0, '../../')
sys.path.insert(0, '../../cycif/')
from get_data import file2frame
from cycif import *
from common_apis import *
import random
random.seed(50)

In [2]:
# Load the cached 2-D UMAP embedding and the quantile-normalised feature table.
# The UMAP reduction itself only has to be run once; afterwards its result is
# read back from the CSV whose name encodes the UMAP settings used.
os.chdir('D:/data')

# UMAP settings (distance function, n_neighbors, min_dist) that identify the file.
umap_dist, umap_nn, umap_md = 'correlation', 5, 0.1

# The pieces are joined with single spaces, so the name on disk has a space
# before '.csv' as well; keep it that way so the existing file is found.
umap_fn_parts = [
    'MCF10A commons 533k cells UMAP data v5',
    'distfun_', umap_dist,
    'NN_', str(umap_nn),
    'minDist_', str(umap_md),
    '.csv',
]
umap_data_fn = ' '.join(umap_fn_parts)

# Both tables use their first CSV column as the row index.
df_pooled_time_umap = pd.read_csv(umap_data_fn, index_col=0)
df_pooled = pd.read_csv('MCF10A 533k quantile normed data.csv', index_col=0)

In [7]:
# Fit HDBSCAN directly on the full feature table (not on the UMAP embedding).
# 'memory' points HDBSCAN at an on-disk cache directory.
hdbscan_settings = dict(min_cluster_size=491, min_samples=15, memory='d:/temp')
clustering = HDBSCAN(**hdbscan_settings)
# Left as the last expression so the notebook displays the fitted estimator.
clustering.fit(df_pooled)

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
    approx_min_span_tree=True, cluster_selection_method='eom',
    core_dist_n_jobs=4, gen_min_span_tree=False, leaf_size=40,
    match_reference_implementation=False, memory='d:/temp',
    metric='euclidean', min_cluster_size=491, min_samples=15, p=None,
    prediction_data=False)

In [None]:
# Two-step HDBSCAN clustering to generate cluster labels for the dataset.
# First step: overall clustering on the first two columns (the UMAP embedding).
# Second step: break the largest cluster, which is often the DMSO cluster,
# into smaller clusters.
clustering = HDBSCAN(min_cluster_size = 491,min_samples=15,memory='d:/temp')
clustering.fit(df_pooled_time_umap.iloc[:,:2])

# Identify the largest cluster by its actual label. HDBSCAN uses -1 for noise,
# which is not a cluster: it must not be picked as the "largest cluster", and
# it is not guaranteed to be present, so the label cannot be derived from the
# position in the counts array (the former `argmax() - 1`).
cluster_ids, cluster_counts = np.unique(clustering.labels_, return_counts=True)
print((cluster_ids, cluster_counts))
is_real_cluster = cluster_ids != -1
if not is_real_cluster.any():
    raise ValueError('HDBSCAN labelled every cell as noise; there is no cluster to sub-divide.')
DMSO_cluster_id = cluster_ids[is_real_cluster][cluster_counts[is_real_cluster].argmax()]

# Positional indices of the cells that belong to the largest cluster.
idx_DMSO_cluster = np.flatnonzero(clustering.labels_ == DMSO_cluster_id).tolist()

# Second pass: re-cluster only the cells of the largest cluster.
subclustering = HDBSCAN(min_cluster_size = 500)
subclustering.fit(df_pooled_time_umap.iloc[idx_DMSO_cluster,:2])
print(np.unique(subclustering.labels_, return_counts=True))

# Combine both passes into one label per cell: 'Overall <id>' everywhere,
# overwritten by 'DMSO_<sub id>' for the cells of the sub-divided cluster.
df_labels = pd.Series(['Overall ' + str(x) for x in clustering.labels_])
df_labels.iloc[idx_DMSO_cluster] = ['DMSO_' + str(x) for x in subclustering.labels_]
df_pooled_time_umap['cluster'] = df_labels.values

# Persist the labels next to the embedding (overwrites the input CSV).
df_pooled_time_umap.to_csv(umap_data_fn)

In [None]:
# Scatter plot of the first (overall) clustering iteration.
# Noise points (HDBSCAN label -1) are left out by label rather than by
# dropping the first unique value, which would silently drop a real cluster
# whenever the clustering produced no noise.
plt.figure(figsize=(32,18))
labels = pd.Series(clustering.labels_)
df_low_dim = df_pooled_time_umap.iloc[:,:2].values
for cluster in np.unique(labels[labels != -1]):
    cells_idx = labels.index[labels == cluster].values
    print('Processing cluster: {} with {} cells'.format(str(cluster),str(len(cells_idx))))
    plt.scatter(df_low_dim[cells_idx,0],df_low_dim[cells_idx,1],label=cluster, s = 0.25)
plt.legend(markerscale = 50,prop={'size': 32}, bbox_to_anchor = (1.1,1))
plt.savefig('Overall Umap scatterplot clustered one iteration.png')
plt.close()

# Scatter plot after sub-clustering the largest (DMSO) cluster.
# The combined labels are strings ('Overall <id>' / 'DMSO_<id>'); both noise
# groups end in '-1' and are excluded. The previous positional `[1:]` slice
# removed only the alphabetically first label, so 'Overall -1' was still drawn.
plt.figure(figsize=(32,18))
df_low_dim = df_pooled_time_umap.iloc[:,:2].values
cluster_names = sorted(name for name in df_labels.unique() if not name.endswith('-1'))
for cluster in cluster_names:
    cells_idx = df_labels.index[df_labels == cluster].values
    plt.scatter(df_low_dim[cells_idx,0],df_low_dim[cells_idx,1],label=cluster, s = 0.25)
plt.legend(markerscale = 50,prop={'size': 24}, ncol=8, bbox_to_anchor = (1.02,0))
plt.savefig('Overall Umap scatterplot clutered two iterations.png',bbox_inches='tight')
plt.close()

In [None]:
from scipy.stats import hypergeom

# Hypergeometric enrichment of every treatment condition in every cluster.
# A condition is the combination drug / dose / time point.
df_pooled_time_umap['condition'] = df_pooled_time_umap.Drug + '_' + df_pooled_time_umap.Conc.astype(str) + '_' + df_pooled_time_umap.time

# Cell counts per (cluster, condition), per cluster and per condition.
# .size() counts rows, so the result does not depend on whether some
# arbitrary first column happens to contain missing values.
df_freq = df_pooled_time_umap.groupby(['cluster','condition']).size()
cluster_sizes = df_freq.groupby('cluster').sum()
condition_sizes = df_pooled_time_umap.groupby('condition').size()
n_cells_total = len(df_pooled_time_umap)

# Float frame pre-filled with 0 (= "no enrichment" for unobserved pairs).
# Creating it as float up front avoids an object-dtype frame that np.log10
# cannot process unless pandas silently downcasts it.
df_cluster_distribution_pval = pd.DataFrame(0.0, index=cluster_sizes.index, columns=condition_sizes.index)
for cluster in cluster_sizes.index:
    N = cluster_sizes[cluster]                    # number of draws: cells in this cluster
    k_vector = df_freq.loc[cluster] - 1           # sf(k - 1) = P(X >= k)
    n_vector = condition_sizes.loc[k_vector.index]  # cells of each condition overall
    pval_vector = np.asarray(
        hypergeom.sf(k_vector.values, n_cells_total, n_vector.values, N), dtype=float)
    # p-values that underflow to 0 give inf here; that is expected, so silence
    # the divide-by-zero warning and cap those entries at 318 below.
    with np.errstate(divide='ignore'):
        log_pval_vector = -np.log10(pval_vector)
    # suppress super low p-values
    log_pval_vector[np.isinf(log_pval_vector)] = 318
    df_cluster_distribution_pval.loc[cluster, k_vector.index] = log_pval_vector

# abs() turns -0.0 (from p = 1) into 0.0; rows become conditions after transposing.
df_cluster_distribution_pval = df_cluster_distribution_pval.abs().transpose()
# Second log compresses the range: value = log10(-log10(p) + 1).
df_cluster_distribution_pval = np.log10(df_cluster_distribution_pval + 1)
# The significance threshold p = 0.05 is now log10(-log10(0.05) + 1) ~= 0.36

# Prepend Drug / Dose / Time columns. They are looked up from the source
# columns instead of splitting the condition string on '_', which would give
# wrong fields for any drug name that itself contains an underscore.
condition_meta = (
    df_pooled_time_umap[['condition', 'Drug', 'Conc', 'time']]
    .drop_duplicates('condition')
    .set_index('condition')
    .reindex(df_cluster_distribution_pval.index)
)
df_cluster_distribution_pval.insert(0, 'Time', condition_meta['time'].values)
df_cluster_distribution_pval.insert(0, 'Dose', condition_meta['Conc'].astype(str).values)
df_cluster_distribution_pval.insert(0, 'Drug', condition_meta['Drug'].values)
df_cluster_distribution_pval.to_csv('Cluster distribution hypergeom pval.csv')

In [None]:
# Optional: visual search for good UMAP parameters on a random subsample.
def plot_umap_parameter_grid(data, param_name, param_values, n_neighbors=5, min_dist=0.1):
    """Embed `data` with UMAP once per value of one parameter and plot a 3x3 grid.

    Parameters
    ----------
    data : array-like
        Rows to embed; passed unchanged to `UMAP.fit_transform`.
    param_name : str
        Parameter to vary, either 'n_neighbors' or 'min_dist'.
    param_values : sequence
        Values to try; one panel per value (at most 9 are drawn).
    n_neighbors, min_dist
        Values used for whichever parameter is not being varied.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the grid of scatter plots.
    """
    fig, axes = plt.subplots(ncols=3, nrows=3, figsize=(32, 18))
    for ax, value in zip(axes.ravel(), param_values):
        umap_params = {'n_neighbors': n_neighbors, 'min_dist': min_dist}
        umap_params[param_name] = value
        reducer = UMAP(n_components=2, metric='correlation', random_state=50, **umap_params)
        # Local name on purpose: the notebook-wide df_pooled_time_umap (full
        # embedding plus cluster labels) must not be overwritten by this sweep.
        embedding = reducer.fit_transform(data)
        ax.scatter(embedding[:, 0], embedding[:, 1], s=0.01)
        ax.set_title('{}: {}'.format(param_name, str(value)))
    return fig


# Reproducible subsample. The notebook only seeds the stdlib `random` module,
# which has no effect on numpy, so a dedicated seeded generator is used here.
sample_rng = np.random.RandomState(50)
n_sample = min(10000, len(df_pooled))
random_idx = sample_rng.choice(len(df_pooled), n_sample, replace=False)
# NOTE(review): the subsample previously kept only the first two columns of
# df_pooled. With two features the 'correlation' metric is degenerate (every
# pair of rows has correlation +1 or -1), so all feature columns are used now.
sub_df = df_pooled.iloc[random_idx]

# Read once; the file was previously re-read on every loop iteration although
# the table is not used by the sweep itself.
metadata = pd.read_csv('MCF10A commons metadata.csv',index_col=0)

list_nn = [5,15,25,35,45,55,65,75,85]
list_md = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

# Vary n_neighbors at min_dist = 0.1, then min_dist at n_neighbors = 5.
plot_umap_parameter_grid(sub_df, 'n_neighbors', list_nn, min_dist=0.1)
plot_umap_parameter_grid(sub_df, 'min_dist', list_md, n_neighbors=5);

# # HDBSCAN parameter optimization, optional
# cp_list = []
# mcs_range = range(10,2000)
# for mcs in mcs_range:
#     clustering = HDBSCAN(min_cluster_size = mcs,min_samples=15,memory='d:/temp')
#     clustering.fit(df_pooled_time_umap.iloc[:,:2])
#     cp = np.median(clustering.cluster_persistence_)
#     cp_list.append(cp)
    
# sns.lineplot(mcs_range,cp_list)
# min_cluster_size = cp_list.index(max(cp_list))
# print('minimal cluster size based on maximal cluster persistence: {}'.format(str(min_cluster_size)))