In [1]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
from umap import UMAP
import seaborn as sns
import sys
from hdbscan import HDBSCAN
from sklearn.preprocessing import minmax_scale

# Put the directory two levels up, and its cycif/ subfolder, at the front of the
# module search path so the project-local imports below can be resolved.
# The paths are relative, so they depend on the working directory at import time.
sys.path.insert(0, '../../')
sys.path.insert(0, '../../cycif/')
from get_data import file2frame
# NOTE(review): the wildcard imports below hide which names later cells take
# from these modules; consider importing the needed names explicitly.
from cycif import *
from common_apis import *
import random
# Seeds only Python's built-in `random` generator.
# NOTE(review): presumably meant to make the UMAP run reproducible, but UMAP is
# not given a random_state in this notebook - confirm whether NumPy/UMAP seeding
# is also required.
random.seed(50)

In [2]:
# Build the 2-D UMAP embedding once and cache it as CSV; later runs only load the cache.
# NOTE(review): hard-coded absolute data directory - consider a configurable DATA_DIR.
os.chdir('D:/data')
umap_dist = 'correlation'
umap_nn = 5
umap_md = 0.1
# The cache file name encodes the UMAP hyper-parameters. The pieces are joined with
# single spaces (so there is a space before '.csv'); this is kept unchanged so that
# cache files written by earlier runs are still found.
umap_data_fn = ' '.join(['MCF10A commons 533k cells UMAP data v5', 'distfun_', umap_dist, 'NN_', str(umap_nn),'minDist_', str(umap_md),'.csv'])

if not os.path.exists(umap_data_fn):
    # Collect the per-file normalised frames and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0 and re-copied the growing
    # frame on every iteration.
    normed_frames = []
    # NOTE(review): metadata is attached by row position further down, so the
    # metadata rows must follow the same file order that os.listdir() yields - confirm.
    for file in os.listdir():
        if 'pooled' in file:
            print(file)
            data = pd.read_csv(file,index_col=0)
            # Shift every column so that its 75th percentile sits at zero.
            data_norm = data-data.quantile(q=0.75)
            normed_frames.append(data_norm)
    if not normed_frames:
        raise FileNotFoundError("no file with 'pooled' in its name found in " + os.getcwd())
    pooled_norm_data = pd.concat(normed_frames)

    umap = UMAP(n_neighbors = umap_nn, n_components=2, min_dist=umap_md, metric=umap_dist)
    df_pooled_time_umap = umap.fit_transform(pooled_norm_data)
    metadata = pd.read_csv('MCF10A commons metadata.csv',index_col=0)
    # Fail with a clear message instead of a generic length error on column assignment.
    if len(metadata) != df_pooled_time_umap.shape[0]:
        raise ValueError(
            'metadata has %d rows but the embedding has %d cells'
            % (len(metadata), df_pooled_time_umap.shape[0])
        )
    # Embedding columns are named X0, X1, ...; the metadata columns are appended after them.
    df_pooled_time_umap = pd.DataFrame(df_pooled_time_umap,columns = ['X' + str(i) for i in range(df_pooled_time_umap.shape[1])])
    df_pooled_time_umap['time'] = metadata.time.values
    df_pooled_time_umap['Conc'] = metadata.Conc.values
    df_pooled_time_umap['Drug'] = metadata.DrugName.values
    df_pooled_time_umap.to_csv(umap_data_fn)
else:
    df_pooled_time_umap = pd.read_csv(umap_data_fn,index_col=0)

In [None]:
# UMAP scatter plot: one colour per drug, marker size scaled by the dose rank
# within that drug. DMSO is drawn at the fixed maximum size.
plt.figure(figsize=(32,18))
# Sort by concentration BEFORE deriving labels and coordinates, so every positional
# index used below refers to the same row order. Previously labels/coordinates were
# taken from the unsorted frame while concentrations were read positionally from the
# sorted one, which paired cells with the wrong doses.
df_pooled_time_umap = df_pooled_time_umap.sort_values('Conc')
df_labels = df_pooled_time_umap.Drug.reset_index()
df_low_dim = df_pooled_time_umap.iloc[:,:2].values
conc_values = df_pooled_time_umap.Conc.values
max_marker_size = 0.25
for cluster in sorted(df_labels.iloc[:,1].unique()):
    # Positional indices (into the sorted frame) of the cells treated with this drug.
    cells_idx = df_labels[df_labels.iloc[:,1]==cluster].index.values
    if cluster != 'DMSO':
        # sort=True makes the integer codes follow increasing concentration
        # rather than order of appearance.
        dose_rank = pd.factorize(conc_values[cells_idx], sort=True)[0]
        # Rank 1..k scaled into (0, max_marker_size]: the lowest dose stays visible
        # (it used to get size 0) and a single-dose drug no longer yields 0/0 = NaN.
        sizes_by_dose = max_marker_size*(dose_rank + 1)/(dose_rank.max() + 1)
        plt.scatter(df_low_dim[cells_idx,0],df_low_dim[cells_idx,1],label=cluster, s = sizes_by_dose)
    else:
        plt.scatter(df_low_dim[cells_idx,0],df_low_dim[cells_idx,1],label=cluster, s = max_marker_size)
# Markers are tiny, so enlarge them in the legend to make the colours readable.
plt.legend(markerscale = 50,prop={'size': 32})
plt.savefig('Overall Umap scatterplot sized by factorized doses.png')
plt.close()

In [None]:
# Grid of UMAP scatter plots, one panel per marker, coloured by that marker's expression.
#
# The normalised expression table is read from a saved CSV. An earlier, now removed,
# variant of this cell rebuilt it from every file with 'pooled' in its name by
# subtracting each column's 75th percentile and appending the results.
# NOTE(review): presumably the CSV below is the saved output of that procedure - confirm.
pooled_norm_data = pd.read_csv('533k quantile normed data.csv',index_col=0)
df_pooled_time_umap = pd.read_csv(umap_data_fn,index_col=0)
df_low_dim = df_pooled_time_umap.iloc[:,:2].values

fig,axes = plt.subplots(7,7,figsize = (48,27), sharex=True, sharey=True)
axes = axes.ravel()

umap_x = df_low_dim[:,0]
umap_y = df_low_dim[:,1]
marker_names = sorted(pooled_norm_data.columns)
for panel_no in range(len(marker_names)):
    marker = marker_names[panel_no]
    raw_expr = pooled_norm_data[marker].values
    # Centre on the median, then limit the colour range to [-5, 5].
    centred_expr = np.clip(raw_expr - np.median(raw_expr), -5, 5)
    panel = axes[panel_no]
    panel.scatter(umap_x, umap_y, c=centred_expr, s=0.001, cmap = 'coolwarm', label = marker)
    # The legend entry doubles as the panel title; enlarge the otherwise tiny marker.
    panel.legend(markerscale=500, frameon=False)

plt.savefig('Overall Umap with expr',bbox_inches='tight')
plt.close()

In [None]:
# One FacetGrid figure per drug: a single row for the drug, one column per time point.
# DMSO cells are included in every drug's figure; for non-DMSO drugs the points are
# coloured by concentration.
# NOTE(review): the colour bar is fixed to the range 0-10 while seaborn assigns hue
# colours per distinct Conc value - confirm that the bar matches the plotted colours.
norm = plt.Normalize(0,10)
sm = plt.cm.ScalarMappable(cmap="RdYlGn_r", norm=norm)
sm.set_array([])
sns.set(font_scale=3)
# Rebind instead of sorting in place: same resulting order, no hidden mutation of
# an object other code may still reference.
df_pooled_time_umap = df_pooled_time_umap.sort_values(['Conc','time'])
drugs = df_pooled_time_umap.Drug.unique()
for drug in drugs:
    # Relabel the DMSO rows with the current drug so they land in the same facet row.
    # .assign returns a new frame, which avoids writing into a filtered slice
    # (SettingWithCopyWarning).
    data = df_pooled_time_umap[df_pooled_time_umap.Drug.isin(['DMSO',drug])].assign(Drug=drug)
    if drug=='DMSO':
        g = sns.FacetGrid(data, row='Drug',col = 'time',height = 10)
    else:
        g = sns.FacetGrid(data, row='Drug',col = 'time',hue = 'Conc',palette = 'RdYlGn_r',height = 10)
    g.map(plt.scatter, "X0", "X1", s = 0.1)
    # `sm` is not attached to any Axes, so tell matplotlib explicitly which Axes
    # to take the colour-bar space from; recent matplotlib raises otherwise.
    plt.colorbar(sm, ax=g.axes.ravel().tolist())
    g.savefig(drug + ' overall fig.png')
    # Close the figure inside the loop so figures do not pile up in memory.
    plt.close()