## Non-Negative Matrix Factorization (NMF)
Performs NMF on slide-seq data to assign beads to cell types based on single-cell data. <br>
7/28/19

In [None]:
#packages
import argparse 
import pandas as pd
import numpy as np
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import os
from sklearn.preprocessing import StandardScaler
from IPython.display import display
import scipy.optimize
import scipy.stats
import os
from sklearn.preprocessing import StandardScaler
import matplotlib.patches as mpatches
from sklearn.decomposition import NMF
from tqdm import tqdm
from matplotlib import colors
import collections

In [None]:
#plot settings
%pylab inline
rcParams['axes.spines.right'] = False
rcParams['axes.spines.top'] = False
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams['figure.facecolor']='white'

In [None]:
#Build the command-line parser; every option is copied into a plain variable further down.
parser = argparse.ArgumentParser(description = "handle inputs from sam matlab script to run NMFreg on puck data")

# (flag, type, help text) for each supported option, registered in this order
_cli_options = (
    ("-da", str, "Pass the data path for the atlas here"),
    ("-dp", str, "Pass the data path for the puck here"),
    ("-t", str, "string of tissue type as is in data reference directory eg hippocampus, cerebellum, etc"),
    ("-c", int, "cutoff for the UMI filtering of the DGE for the puck"),
    ("-dge", str, "name of the dge file. No extension assumes csv"),
    ("-bl", str, "name of the bead location file passed here. No extension assumes csv"),
    ("-s", str, "health status of tissue (WT/DKD)"),
    ("-pkn", str, "puck number"),
)
for _flag, _kind, _help in _cli_options:
    parser.add_argument(_flag, type=_kind, help=_help)

print(parser)

### In the block below, you may change the following for your data:
-da   : to file path for your single cell data <br>
 -dp  : to file path for your puck data<br>
 -t   : to the type of tissue you are observing<br>
 -c   : to your preferred UMI cutoff value<br>
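 -dge : to the name of your DGE file, without the `.csv` extension (it is looked up inside the puck data path)<br>
 -bl  : to the name of your bead location file, without the `.csv` extension (it is looked up inside the puck data path)<br>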
 -s   : to the status of the tissue you are working with (for kidney, DKD or WT)<br>
 -pkn : to the puck number

In [None]:
#Parse a fixed argument list (edit the values here) instead of reading sys.argv
args = parser.parse_args([
    '-da', '/broad/macosko/bstickels/data/slideseq/slideseq/NMFreg',
    '-dp', '/broad/thechenlab/Jamie/Kidney/Puck_181206_3',
    '-t', 'kidney_ob',
    '-c', '5',
    '-dge', 'MappedDGEForR',
    '-bl', 'BeadLocationsForR',
    '-s', 'DKD',
    '-pkn', 'Puck_181206_3',
])

### In the block below, you may change the following for your own data:
NMFreg_output: to your file path for plot output<br>
Perm_test_output: to your file path for permutation test data<br>
Interactive_plot_output: to your file path for the interactive plot data

In [None]:
#Initialize file paths for output of data
NMFreg_output = "/broad/thechenlab/breanna/NMFReg_output/"
Perm_test_output = "/broad/thechenlab/breanna/permutation_test_data/"
Interactive_plot_output = "/broad/thechenlab/breanna/interactive_plot_data/"

# exist_ok=True creates each directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
for _out_dir in (NMFreg_output, Perm_test_output, Interactive_plot_output):
    os.makedirs(_out_dir, exist_ok=True)

In [None]:
#Block formalizes variables from parser info

# Copy each parsed option into a plain variable
data_path = args.da           #atlas data path
data_path_puck = args.dp      #puck data path
tissue_name = args.t
UMI_threshold = args.c        #UMI threshold for the cutoff
puck_dge_name = args.dge
bead_locations = args.bl
tissue_status = args.s
pkn = args.pkn

#atlas directory for the selected tissue
tissue_data_path = "{0}/{1}".format(data_path, tissue_name)

#if not os.path.exists(tissue_data_path):
#	os.makedirs(tissue_data_path)

# Echo the configuration (and interpreter version) for the run log
for _setting in (bead_locations, puck_dge_name, data_path, data_path_puck,
                 tissue_name, UMI_threshold, tissue_data_path, sys.version):
    print(_setting)

In [None]:
#Function for saving figures
def save_result(name):
    """Save the current matplotlib figure as ``<NMFreg_output>plots/<name>.eps``.

    The ``plots`` sub-directory is created on demand; it is not created
    anywhere else in this notebook, so saving would otherwise fail.

    Parameters
    ----------
    name : str
        File name (without extension) of the figure to write.
    """
    plot_dir = "{}plots".format(NMFreg_output)
    os.makedirs(plot_dir, exist_ok=True)
    plt.savefig("{}/{}.eps".format(plot_dir, name),
                bbox_inches='tight', transparent=True, dpi=1000)


In [None]:
#Read in count data and reshape it to one row per bead with a 'barcode' column
dge_path = "{}/{}.csv".format(data_path_puck,puck_dge_name)
dge = (
    pd.read_csv(dge_path, header = 0, index_col = 0)
    .T                                     # rows of the file become columns
    .reset_index()                         # former column labels become a column
    .rename(columns={'index':'barcode'})
)

In [None]:
#Load the location of each bead and join it onto the expression table
coords = pd.read_csv("{}/{}.csv".format(data_path_puck,bead_locations), header = 0)
# Accept either capitalisation of the barcode column
coords = coords.rename(columns={'Barcodes':'barcode', 'barcodes':'barcode'})
df_merged = dge.merge(coords, left_on='barcode', right_on='barcode')
# Expression-only view: the merged table without the spatial columns
counts = df_merged.drop(columns=['xcoord', 'ycoord'])

In [None]:
#Read in atlas (single-cell) data
_atlas_dge_file = "{}/dge_hvgs.csv".format(tissue_data_path)
_atlas_cluster_file = "{}/cell_cluster_outcome.csv".format(tissue_data_path)

# Transposed so that the file's columns become rows
atlas_dge = pd.read_csv(_atlas_dge_file, index_col = 0, header = 0).T
cell_clusters = pd.read_csv(_atlas_cluster_file)


In [None]:
#Intersect gene lists for atlas and puck data sets, create new dataframe with only those genes
atlas_genes = atlas_dge.columns.tolist()
puck_genes = counts.columns.tolist()[1:]#1 to skip first column (barcode)
# sorted() makes the column order reproducible; a bare list(set & set) can
# change order from one interpreter run to the next.
gene_intersection = sorted(set(atlas_genes) & set(puck_genes))#only genes that appear in both atlas and puck genes
atlasdge = atlas_dge[gene_intersection] #keeps only genes present in both slide-seq and atlas data

# Same genes for the puck, indexed by bead barcode
puckcounts = counts[['barcode'] + gene_intersection].set_index('barcode')

In [None]:
#Drop genes with no UMIs and keep only beads whose total UMI count exceeds UMI_threshold
# .copy() so that adding 'total_counts' below does not write into a slice of df_merged
sample_info = df_merged[['barcode','xcoord', 'ycoord']].copy()
UMIspergene = np.sum(puckcounts, axis=0)#compute the number of UMIs per gene
gr0_UMIspergene = UMIspergene[UMIspergene > 0]#keeps genes with at least one UMI
counts_gr0 = counts[['barcode'] + gr0_UMIspergene.index.tolist()]
counts_gr0_barcodestotals = np.sum(counts_gr0.drop('barcode', axis=1), axis=1)#UMIs per bead
sample_info['total_counts'] = counts_gr0_barcodestotals
sample_info_grthreshold = sample_info.loc[sample_info['total_counts'] > UMI_threshold]
coords = sample_info_grthreshold.reset_index(drop=True) #reset indices after filtering
df_merged_grthreshold = counts_gr0.merge(sample_info_grthreshold, right_on='barcode', left_on='barcode')
counts = df_merged_grthreshold.drop(['xcoord', 'ycoord', 'total_counts'], axis=1)
# Rebind the large intermediates so their memory can be reclaimed
df_merged_grthreshold = []
counts_gr0 = []
counts_gr0_grthreshold = []

In [None]:
#Plot count/coordinate data
_bead_x, _bead_y = coords['xcoord'], coords['ycoord']

#Where the retained beads sit on the puck
plt.figure(figsize=(10, 10))
plt.scatter(_bead_x, _bead_y, c='k', s=4, alpha=0.6)
plt.axis('equal')
plt.show()
#save_result("filtered_tissue_coverage")
plt.close()

#Same layout, coloured by each bead's total UMI count
plt.figure(figsize=(12, 12))
plt.set_cmap('viridis_r')
plt.scatter(_bead_x, _bead_y, c=coords['total_counts'], s=4, alpha=0.6)
plt.axis('equal')
plt.colorbar()
plt.show()
#save_result("bead_counts")
plt.close()

In [None]:
#Intersect gene lists for atlas and puck data sets again, now that genes and beads have been filtered
atlas_genes = atlas_dge.columns.tolist()
puck_genes = counts.columns.tolist()[1:]#1 to skip first column (barcode)
# sorted() fixes the gene (column) order. The matrices built from it feed the
# seeded NMF below, so an arbitrary set ordering would make runs irreproducible.
gene_intersection = sorted(set(atlas_genes) & set(puck_genes))#only genes that appear in both atlas and puck genes
atlasdge = atlas_dge[gene_intersection]#keeps only genes present in both slide-seq and atlas data

# Puck counts restricted to the shared genes, indexed by bead barcode
pcounts = counts[['barcode'] + gene_intersection].set_index('barcode')


In [None]:
#Normalize UMI count in each bead
cell_totalUMI = pcounts.sum(axis=1)#total UMIs per bead (row sums)
# Row-wise division. DataFrame.div(axis=0) replaces `Series[:, None]`, which
# current pandas no longer supports.
pcounts_cellnorm = pcounts.div(cell_totalUMI, axis=0)
# Scale every gene to unit variance without centering
pcounts_scaled = StandardScaler(with_mean=False).fit_transform(pcounts_cellnorm)


In [None]:
#same as above for atlas data
cell_totalUMIa = atlasdge.sum(axis=1)#total UMIs per atlas cell (row sums)
# Row-wise division. DataFrame.div(axis=0) replaces `Series[:, None]`, which
# current pandas no longer supports.
atlasdge_cellnorm = atlasdge.div(cell_totalUMIa, axis=0)
# Scale every gene to unit variance without centering
atlasdge_scaled = StandardScaler(with_mean=False).fit_transform(atlasdge_cellnorm)

In [None]:
#Count how many rows of cell_cluster_outcome.csv fall in each value of its 'cluster' column
cell_clusters.cluster.value_counts()

In [None]:
#Perform NMF on the atlas data to find basis for projection
#NMF allows us to quantify how likely that a bead is a certain cell type (in the case of a bead being multiple cells,
#NMF tells how much each cell type contributed to a given bead)

K = 23 #Want k to be larger than the number of clusters but smaller than the number of cells for reduced dimensionality
alpha = 0
l1_ratio = 0
random_state = 17
# NOTE(review): newer scikit-learn releases appear to have replaced NMF's `alpha`
# argument with `alpha_W`/`alpha_H` -- confirm against the installed version.
model = NMF(n_components=K, init='random', random_state = random_state, alpha = alpha, l1_ratio = l1_ratio)

# Decomposed matrixes
Ha = model.fit_transform(atlasdge_scaled)#weights
Wa = model.components_ #basis vectors

# Scale each factor column to unit variance (no centering) and tag rows with the atlasdge index
Ha_norm = StandardScaler(with_mean=False).fit_transform(Ha)
Ha_norm = pd.DataFrame(Ha_norm)
Ha_norm['barcode'] = atlasdge.index.tolist()
    

maxloc = Ha_norm.drop('barcode', axis=1).values.argmax(axis=1)#column index of the largest scaled factor loading in each row
# assumes cell_clusters rows are in the same order as the rows of atlasdge -- TODO confirm
cell_clusters['maxloc'] = maxloc


In [None]:
#Assign cell types to each factor from NMF
#plots telling how much each cluster type goes into each component (linear combinations)

num_atlas_clusters = max(cell_clusters['cluster'])+1
bins_n = num_atlas_clusters
# Rows: atlas clusters, columns: NMF factors; filled in by the loop below
factor_to_celltype_df = pd.DataFrame(0, index=range(0, num_atlas_clusters), columns=range(K))

#plot how much each cell type contributes to a factor
for k in range(K):
    print ("cell type assignment for {0} th cluster".format(k))
    # Histogram of the cluster labels of the cells whose strongest factor is k,
    # with one unit-wide bin centred on each cluster id
    n, bins, patches = plt.hist(cell_clusters['cluster'][cell_clusters['maxloc'] == k],
            bins_n, range = (-0.5,(bins_n)-.5), facecolor='green', alpha=0.75)
    # One tick per cluster (previously hard-coded to 20 ticks)
    plt.xticks(np.arange(bins_n))
    plt.show()
    factor_to_celltype_df[k] = n.astype(int)
    print(n)
    print(bins)

In [None]:
#Generates a heatmap (using hierarchical clustering)
# After the transpose: rows are factors, columns are atlas clusters.
# Re-running this cell transposes the table again.
factor_to_celltype_df = factor_to_celltype_df.T

factor_total = factor_to_celltype_df.sum(axis=1)#number of cells counted for each factor

# Row-normalise so every factor's counts sum to 1. DataFrame.div(axis=0)
# replaces `Series[:, None]`, which current pandas no longer supports.
factor_to_celltype_df_norm = factor_to_celltype_df.div(factor_total, axis=0)

plt.figure()

# First pass only serves to obtain the clustered row order
cx = sns.clustermap(factor_to_celltype_df_norm, fmt = 'd',
                cmap="magma_r", linewidth=0.5, col_cluster = False,
                   figsize=(10, 15))
# Second pass annotates the cells with the raw counts.
# NOTE(review): the annotation rows are pre-ordered with the first pass's row
# order; whether seaborn also reorders `annot` itself may depend on the
# installed version -- confirm that the labels line up.
ax = sns.clustermap(factor_to_celltype_df_norm, fmt = 'd',
                cmap="magma_r", linewidth=0.5, col_cluster = False,
                   annot = factor_to_celltype_df.loc[cx.dendrogram_row.reordered_ind],
                   figsize=(10, 20))
plt.show()
plt.close()


In [None]:
#Create factor to cell type and cell type to factor dictionaries
# Each factor is mapped to the cell type holding its largest count
maxloc_fc = factor_to_celltype_df.values.argmax(axis=1)
factor_to_celltype_dict = dict(enumerate(maxloc_fc))

# Inverse lookup: every cell type -> list of factors mapped to it (possibly empty)
celltype_to_factor_dict = {
    c: [k for k, v in factor_to_celltype_dict.items() if v == c]
    for c in range(num_atlas_clusters)
}

In [None]:
#Assign each bead to a cell type

#Perform NNLS with the atlas basis
WaT = Wa.T
XsT = pcounts_scaled.T

# Solve one non-negative least-squares problem per bead (column of XsT).
# The solutions are collected first and stacked once; growing the matrix with
# np.vstack inside the loop re-copied every previous row on each iteration.
Hs_hat = np.asarray([scipy.optimize.nnls(WaT, XsT[:, b])[0]
                     for b in tqdm(range(XsT.shape[1]))])

Ha = pd.DataFrame(Ha)
Ha['cellname'] = atlasdge.index.tolist()
Ha_indexed = Ha.set_index('cellname')

Hs = pd.DataFrame(Hs_hat)
Hs['barcode'] = pcounts.index.tolist()
Hs_indexed = Hs.set_index('barcode')

Hs_indexed.to_csv("{}Hs{}_{}_{}_{}.csv".format(NMFreg_output, K, alpha, l1_ratio, random_state), index=True)

#Scale the Ha and Hs matrices to unit variance
Ha_norm = pd.DataFrame(StandardScaler(with_mean=False).fit_transform(Ha_indexed))
Hs_norm = pd.DataFrame(StandardScaler(with_mean=False).fit_transform(Hs_indexed))

#Re-add indices after scaling
Ha_norm['cellname'] = atlasdge.index.tolist()
Ha_norm_indexed = Ha_norm.set_index('cellname')
Hs_norm['barcode'] = pcounts.index.tolist()
Hs_norm_indexed = Hs_norm.set_index('barcode')

#Assign barcodes a cluster assignment from the atlas data set based on max factors
maxloc_s = Hs_norm_indexed.values.argmax(axis=1)
barcode_clusters = pd.DataFrame()
barcode_clusters['barcode'] = Hs_norm_indexed.index.tolist()
barcode_clusters['max_factor'] = maxloc_s
# Translate each bead's strongest factor into that factor's cell type. A direct
# lookup avoids the chained-indexing assignment
# (`df['atlas_cluster'][mask] = c`) that pandas warns about and that does not
# write through under copy-on-write.
barcode_clusters['atlas_cluster'] = barcode_clusters['max_factor'].map(factor_to_celltype_dict)


In [None]:
#number of beads assigned to each atlas cluster (via the bead's max factor)
barcode_clusters['atlas_cluster'].value_counts()

In [None]:
#Plot the assignments onto the puck: one figure per atlas cluster
for i in range(num_atlas_clusters):
    # True for the beads assigned to cluster i; used directly as the colour value
    in_cluster = barcode_clusters['atlas_cluster']==i
    plt.figure(figsize =(10, 10))
    plt.set_cmap('nipy_spectral_r')
    plt.scatter(coords['xcoord'], coords['ycoord'], c=in_cluster, marker = 's', s = 720/1000,alpha=0.6)
    plt.title('{} atlas cluster{}'.format(tissue_name, i))
    for tick_group in ('xtick', 'ytick'):
        plt.rc(tick_group, labelsize=30)
    plt.xlabel('x (x1e3)',fontsize=32)
    plt.ylabel('y (x1e3)',fontsize=32)
    for axis_name in ('x', 'y'):
        plt.ticklabel_format(style='sci', axis=axis_name, scilimits=(0,0))
    plt.axis('equal')
    #plt.savefig("c5_Puck_181206_3.svg",format="svg")
    plt.show()
    plt.close()


In [None]:
#combining factor contributions into cell type columns via l2 norm
def deconv_factor_to_celltype(row, adict, K=K, nc = num_atlas_clusters):
    """Collapse one row of factor loadings into per-cell-type loadings (L2 norm).

    row   : indexable by factor number 0..K-1
    adict : maps factor number -> cell type index
    K     : number of factors to read from ``row``
    nc    : number of cell types (length of the result)

    Returns a pd.Series of length ``nc``: for each cell type, the square root
    of the summed squared loadings of the factors mapped to it.
    """
    sum_sq = [0]*nc
    for factor in range(K):
        sum_sq[adict[factor]] += row[factor]**2
    return pd.Series(np.sqrt(sum_sq))

In [None]:
#based on the cell types in each factor, we get how much of each cell type is being expressed in each bead
bead_deconv_df = Hs_norm.apply(
    lambda bead: deconv_factor_to_celltype(row=bead, adict=factor_to_celltype_dict),
    axis = 1)
bead_deconv_df.insert(0, 'barcode', Hs_norm['barcode'])

#choosing the max value so that we get the most likely type that cell is in the bead
maxloc_ct = bead_deconv_df.drop('barcode', axis=1).values.argmax(axis=1)
bead_maxct_df = pd.DataFrame({'barcode': bead_deconv_df['barcode'],
                              'max_cell_type': maxloc_ct})

In [None]:
# Number of NMF factors mapped to each cell type
co = collections.Counter(factor_to_celltype_dict.values())
co

In [None]:
# Identity mapping cluster id -> label; metacell_dict is the same dict object, not a copy
celltype_dict = dict((c, c) for c in range(num_atlas_clusters))
metacell_dict = celltype_dict

In [None]:
#different deconvolution methods (get cell type info from factors)
def deconv_factor_to_celltype_sum(row, adict, K=K, nc=num_atlas_clusters):
    """Collapse factor loadings into cell types by plain summation.

    For every factor 0..K-1, ``row[factor]`` is added to the slot of the cell
    type ``adict[factor]``. Returns a pd.Series of length ``nc``.
    """
    totals = [0]*nc
    for factor in range(K):
        celltype = adict[factor]
        totals[celltype] = totals[celltype] + row[factor]
    return pd.Series(totals)

def deconv_factor_to_celltype_l2(row, adict, K=K, nc=num_atlas_clusters):
    """Collapse factor loadings into cell types with an L2 norm.

    Squared loadings of the factors mapped (via ``adict``) to the same cell
    type are summed, and the square root of each sum is returned as a
    pd.Series of length ``nc``.
    """
    squared_sums = [0]*nc
    for factor in range(K):
        celltype = adict[factor]
        squared_sums[celltype] = squared_sums[celltype] + row[factor]**2
    return pd.Series(np.sqrt(squared_sums))

#mean: average loading of the factors mapped to each cell type
def deconv_factor_to_celltype_mean(row, adict, K=K, nc=num_atlas_clusters):
    """Collapse factor loadings into cell types by averaging.

    row   : indexable by factor number 0..K-1
    adict : maps factor number -> cell type index
    K     : number of factors to read from ``row``
    nc    : number of cell types (length of the result)

    Returns a pd.Series of length ``nc`` holding, for every cell type, the mean
    loading of the factors mapped to it, or 0.0 when no factor maps to it.

    The previous version divided by counts taken from a Counter over
    ``adict.values()``. That list only has entries for cell types owning at
    least one factor, so its length did not match ``nc`` whenever some cell
    type had none.
    """
    totals = [0]*nc
    n_factors = [0]*nc
    for key in range(K):
        item = adict[key]
        totals[item] += row[key]
        n_factors[item] += 1
    means = [total / count if count else 0.0
             for total, count in zip(totals, n_factors)]
    return pd.Series(means)



In [None]:
#choosing which way to deconvolute cell info from factors
def cell_deconv(collapse):
    """Collapse the atlas factor loadings (Ha_norm) into cell-type loadings.

    Parameters
    ----------
    collapse : {'l2', 'sum', 'mean'}
        Which deconv_factor_to_celltype_* function combines the factors
        mapped to the same cell type.

    Returns
    -------
    tmp_df : DataFrame
        'cellname' plus one loading column per cell type.
    mismatch_df : DataFrame
        Rows of cell_clusters whose 'cluster' differs from the arg-max cell type.
    cell_maxct_df : DataFrame
        'cellname' and 'max_cell_type' (arg-max cell type) per atlas cell.

    Raises
    ------
    ValueError
        If ``collapse`` is not one of the supported names (previously this
        surfaced as an unbound-variable error further down).
    """
    collapse_funcs = {
        'l2': deconv_factor_to_celltype_l2,
        'sum': deconv_factor_to_celltype_sum,
        'mean': deconv_factor_to_celltype_mean,
    }
    if collapse not in collapse_funcs:
        raise ValueError("collapse must be one of {}, got {!r}".format(
            sorted(collapse_funcs), collapse))
    collapse_func = collapse_funcs[collapse]

    tmp_df = Ha_norm.drop('cellname', axis=1).apply(
        lambda x: collapse_func(row=x, adict=factor_to_celltype_dict), axis = 1)

    tmp_df.insert(0, 'cellname', Ha_norm['cellname'])
    tmp_df.columns = ['cellname'] + (tmp_df.columns[1:]).tolist()
    print(tmp_df.columns)
    tmp_df = pd.DataFrame(tmp_df)
    tmp_df = tmp_df.rename(columns = celltype_dict)

    # Most strongly loaded cell type for every atlas cell
    maxloc_cellt = tmp_df.drop('cellname', axis=1).values.argmax(axis=1)
    cell_maxct_df = pd.DataFrame()
    cell_maxct_df['cellname'] = tmp_df['cellname']
    cell_maxct_df['max_cell_type'] = maxloc_cellt

    # Cells whose arg-max cell type disagrees with the cluster from file
    mismatch_df = cell_clusters[cell_maxct_df['max_cell_type'] != cell_clusters['cluster']]
    print('num mismatched: {}'.format(mismatch_df.shape[0]))

    figsize(4,4)
    plt.hist(mismatch_df['cluster'])
    plt.show()

    return tmp_df, mismatch_df, cell_maxct_df

In [None]:
# Turn the existing index into an 'index' column and rename the columns, then
# deconvolve the atlas cells with the L2 collapse and count the arg-max cell types.
# NOTE(review): the four-name assignment assumes cell_clusters has exactly three
# columns before reset_index -- re-running this cell would likely fail; confirm.
cell_clusters = cell_clusters.reset_index()
cell_clusters.columns = ['index','barcode','cluster','maxloc']
cell_deconv_df, mismatch_dfl2, cell_maxct_df = cell_deconv(collapse='l2')
cell_maxct_df.max_cell_type.value_counts()

In [None]:
#normalizing deconv values (sum rows to one, i.e. the cell type loadings of each atlas cell sum to 1)
cell_totalloading = cell_deconv_df.drop('cellname', axis=1).sum(axis=1)
# Row-wise division. DataFrame.div(axis=0) replaces `Series[:, None]`, which
# current pandas no longer supports.
cell_deconv_df_norm = cell_deconv_df.drop('cellname', axis=1).div(cell_totalloading, axis=0)
cell_deconv_df_norm['cellname'] = cell_deconv_df['cellname']

In [None]:
def plot_bar_cellt(cell_deconv_df_norm, cell_maxct_df,
                metacell_dict=metacell_dict):
    """Draw one bar chart per cell type in ``metacell_dict``.

    For the rows whose 'max_cell_type' equals the key, the normalised loadings
    are summed per column and shown as bars labelled with the dict's values.
    """
    bar_positions = range(num_atlas_clusters)
    bar_labels = list(metacell_dict.values())
    for key, value in metacell_dict.items():
        selected = cell_maxct_df['max_cell_type']==int(key)
        loadings = cell_deconv_df_norm[selected].drop(['cellname'], axis=1)
        figsize(4, 4)
        plt.bar(x=bar_positions, height=np.sum(loadings, axis=0),
               tick_label = bar_labels)
        plt.title(value)
        plt.xticks(rotation=90)
        # save_result
        plt.show()

In [None]:
#for the rows assigned to each cell type, show the summed normalised loading of every cell type
plot_bar_cellt(cell_deconv_df_norm=cell_deconv_df_norm, cell_maxct_df=cell_maxct_df)

In [None]:
def plot_hist_TF(metacell_dict=metacell_dict):
    """Histogram each cell type's normalised loading, split by assignment.

    For every (key, value) in ``metacell_dict``, column ``value`` of
    cell_deconv_df_norm is split into rows whose max cell type is ``key``
    ("pos", green) and all other rows ("neg", red).

    Returns a dict mapping key -> [pos, neg].
    """
    posneg_dict = {}
    for key, value in metacell_dict.items():
        assigned = cell_maxct_df['max_cell_type']==int(key)
        loading = cell_deconv_df_norm[value]
        pos, neg = loading[assigned], loading[~assigned]
        posneg_dict[key] = [pos, neg]
        figsize(4,4)
        for values, colour in ((pos, 'green'), (neg, 'red')):
            plt.hist(values, range=(0,1), color=colour, alpha=0.6, density=True)
        plt.title(value)
        plt.xticks(rotation=90)
        # save_result
        plt.show()
    return posneg_dict
posneg_dict = plot_hist_TF()

In [None]:
#per cell type: the largest loading seen among rows NOT assigned to that type ("negative" rows)
thresh_certainty = [np.max(posneg_dict[c][1]) for c in range(num_atlas_clusters)]
thresh_certainty


In [None]:
def func_thresh_certainty(bead_deconv_df_norm, keep_thresh_df):
    """Zero the 'thresh_ct' value of beads that failed their certainty threshold.

    Parameters
    ----------
    bead_deconv_df_norm : DataFrame
        Needs 'maxval' and 'thresh_ct' columns; modified in place.
    keep_thresh_df : dict
        Maps ``int(key) - 1`` (key taken from metacell_dict, the keying used by
        plot_certainty_perct_thresh) to a boolean Series that is True for the
        beads to keep.

    Returns
    -------
    DataFrame
        The same ``bead_deconv_df_norm`` object.

    For every covered bead 'thresh_ct' becomes 'maxval' when the mask is True
    and 0 otherwise. ``.loc`` replaces the ``.ix`` indexer (removed from
    pandas) and the chained assignment used before.
    """
    for key in metacell_dict:
        keep_mask = keep_thresh_df[int(key)-1]
        ct_indx = list(bead_deconv_df_norm.index[keep_mask.index])
        kept_vals = bead_deconv_df_norm.loc[ct_indx, 'maxval'].where(keep_mask, 0.0)
        bead_deconv_df_norm.loc[ct_indx, 'thresh_ct'] = kept_vals

    return bead_deconv_df_norm

In [None]:
# Scatter marker area (matplotlib `s`) looked up by the `size` argument of the plotting helpers
plot_size_dict = {10:4, 20:8, 40:20, 100:140}

In [None]:
def deconv_factor_to_celltype(row, adict, K=K, nc = num_atlas_clusters):
    """L2-collapse one row of factor loadings into ``nc`` cell-type loadings.

    ``adict`` maps each factor 0..K-1 to a cell type index. The result is a
    pd.Series whose entry for a cell type is the square root of the summed
    squared loadings of that type's factors.
    """
    accum = [0]*nc
    for factor, celltype in ((f, adict[f]) for f in range(K)):
        accum[celltype] += row[factor]**2
    return pd.Series(np.sqrt(accum))

In [None]:
def plot_certainty_thresh(coords, size, bead_deconv_df_norm,
                plot_size_dict=plot_size_dict):
    """Map every bead coloured by 'maxval', greying out beads with thresh_ct == 0.

    Parameters
    ----------
    coords : DataFrame with 'xcoord' and 'ycoord' columns
    size : key into ``plot_size_dict`` (also printed in the title as "um")
    bead_deconv_df_norm : DataFrame with 'maxval' and 'thresh_ct' columns
    plot_size_dict : maps ``size`` to the scatter marker area

    The first scatter previously read ``coords['x']``/``coords['y']`` while
    the rest of the function uses 'xcoord'/'ycoord'; the names now agree.
    """
    bool_col = bead_deconv_df_norm['thresh_ct']==0
    figsize(12, 12)
    plt.set_cmap('Reds')
    plt.scatter(coords['xcoord'], coords['ycoord'], c=bead_deconv_df_norm['maxval'],
                s=plot_size_dict[size], alpha=1)
    plt.colorbar();
    # Overdraw the beads that did not pass their threshold in light gray
    plt.scatter(coords[bool_col]['xcoord'],
                    coords[bool_col]['ycoord'],
                    c='lightgray', s=plot_size_dict[size], alpha=1)
    plt.title('Purity of most prevalent cell type per bead, {}um'.format(size))
    # Percentage of beads whose thresh_ct is non-zero
    plt.xlabel('Single celltype beads: {}%'.format(round(100*np.divide(coords[bead_deconv_df_norm['thresh_ct']!=0].shape[0],coords.shape[0]), 2)))
    plt.axis('equal')
    plt.clim(0,1);
    #save_result("{}ThreshCertaintyofmaxct".format(size))
    plt.show()

In [None]:
# Normalise every bead's cell-type loadings so that they sum to 1
barcode_totalloading = bead_deconv_df.drop('barcode', axis=1).sum(axis=1)
# Row-wise division. DataFrame.div(axis=0) replaces `Series[:, None]`, which
# current pandas no longer supports.
bead_deconv_df_norm = bead_deconv_df.drop('barcode', axis=1).div(
    barcode_totalloading, axis=0)
# Largest normalised loading of each bead
bead_deconv_df_norm['maxval'] = bead_deconv_df_norm.max(axis=1)
bead_deconv_df_norm['barcode'] = pcounts.index.tolist()

# Numeric view (cell-type columns plus 'maxval'), taken before 'max_cell_type' is added
deconv_sub_df = bead_deconv_df_norm.drop('barcode', axis=1)
bead_deconv_df_norm['max_cell_type'] = bead_maxct_df['max_cell_type']

In [None]:
# Number of beads per most-likely cell type
bead_deconv_df_norm.max_cell_type.value_counts()

In [None]:
# Bead coordinates with the most-likely cell type as 'label'
df_clust = pd.DataFrame({'x': coords['xcoord'], 'y': coords['ycoord']})
df_clust['label'] = bead_deconv_df_norm['max_cell_type']

In [None]:
# Number of rows of df_clust per label
df_clust.label.value_counts()

In [None]:
#plots to show how likely a bead is to be classified a given cell type
# One map per column of deconv_sub_df. Iterating the frame yields its column
# labels, which avoids DataFrame.iteritems() (removed in pandas 2.0).
for indx in deconv_sub_df:
    figsize(10, 10)
    plt.set_cmap('viridis_r')
    plt.scatter(coords['xcoord'], coords['ycoord'], c=bead_deconv_df_norm[indx],
                    s=1, alpha=0.6)
    # The title now uses tissue_status; "DKD" was hard-coded and the argument ignored
    plt.title('cluster {0} {1}'.format(indx,tissue_status),fontsize=35)
    plt.axis('equal')
    plt.rc('xtick', labelsize=15)
    plt.rc('ytick', labelsize=15)
    plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))
    plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
    plt.colorbar();
    plt.clim(0,1);
    #save_result("{}loading{}".format(size, indx))
    plt.show()

In [None]:
# Not called anywhere in the visible notebook.
# NOTE(review): this indexes `row` with an entire column of bead_deconv_df_norm,
# selected by a *string* label, while the loading columns appear to be labelled
# with integers -- presumably `row[row['max_cell_type']]` was intended; confirm
# before using.
def maxval_func(row): return row[bead_deconv_df_norm[str(row['max_cell_type'])]]

In [None]:
def plot_boolean_thresh(size, coords, bead_maxct_df, bead_deconv_df_norm,
                 plot_size_dict=plot_size_dict,
                 metacell_dict=metacell_dict):
    """Per cell type, map the beads assigned to it and grey out rejected ones.

    A bead counts as rejected when its 'thresh_ct' is 0. Each figure colours
    all beads by membership of the cell type, then overdraws the rejected
    members in light gray. The boolean masks are printed as the function runs.
    """
    rejected = bead_deconv_df_norm['thresh_ct']==0
    print(rejected)
    marker_area = plot_size_dict[size]
    for key, value in metacell_dict.items():
        is_member = bead_maxct_df['max_cell_type']==int(key)
        rejected_member = rejected & is_member
        print(rejected_member)

        figsize(12, 12)
        plt.set_cmap('copper_r')
        plt.scatter(coords['xcoord'], coords['ycoord'], c=is_member, s=marker_area, alpha=0.6)
        greyed = coords[rejected_member]
        plt.scatter(greyed['xcoord'], greyed['ycoord'],
                    c='lightgray', s=marker_area, alpha=1)
        plt.title('{} {}um'.format(value, size))
        plt.axis('equal')
        #save_result("{}boolean_thresh{}".format(size, value))
        plt.show()

In [None]:
def plot_certainty_perct_thresh(coords, size, bead_deconv_df_norm, df_clust, bead_maxct_df,
                plot_size_dict=plot_size_dict):
    """Plot, per cell type, the purity ('maxval') of the beads assigned to it.

    Beads whose 'maxval' does not exceed the cell type's certainty threshold
    are drawn in gray.

    Parameters
    ----------
    coords : DataFrame with 'xcoord'/'ycoord'
    size : key into ``plot_size_dict`` (also printed in the title as "um")
    bead_deconv_df_norm : DataFrame with a 'maxval' column; a 'max_cell_type'
        column is (re)written on it
    df_clust : DataFrame with 'x', 'y' and 'label'
    bead_maxct_df : DataFrame with 'max_cell_type'
    plot_size_dict : maps ``size`` to the scatter marker area

    Returns
    -------
    keep_thresh_df, remove_thresh_df : dict
        Boolean Series per cell type (maxval > threshold / maxval <= threshold),
        keyed by ``int(key) - 1`` as before, because func_thresh_certainty
        reads them with that key.

    The threshold is now taken from ``thresh_certainty[int(key)]``.
    thresh_certainty is filled by cluster id starting at 0, the same ids
    metacell_dict uses as keys, so the former ``int(key) - 1`` lookup paired
    every cell type with its predecessor's threshold (and cell type 0 with the
    last one).
    """
    keep_thresh_df = {}
    remove_thresh_df = {}
    bead_deconv_df_norm['max_cell_type'] = bead_maxct_df['max_cell_type']

    for key, value in metacell_dict.items():
        ct = int(key)
        cutoff = thresh_certainty[ct]
        bool_col = df_clust['label']==ct
        # .copy(): the 'col' column is added to a private frame, not to a slice
        ct_df = bead_deconv_df_norm[bool_col].copy()
        # 0 marks beads at or below the cutoff; the others keep their maxval
        ct_df['col'] = ct_df['maxval'].mask(ct_df['maxval'] <= cutoff, 0)
        keep_thresh_df[ct-1] = ct_df['maxval'] > cutoff
        remove_thresh_df[ct-1] = ct_df['maxval'] <= cutoff

        figsize(12, 12)
        plt.set_cmap('Reds')
        # Background: every bead as a hollow marker
        plt.scatter(df_clust['x'], df_clust['y'], c='white',
                    edgecolors='gray', linewidths=0.25,
                    s=plot_size_dict[size], alpha=0.6)
        # Beads of this cell type, coloured by purity
        plt.scatter(coords[bool_col]['xcoord'], coords[bool_col]['ycoord'],
                    c=ct_df['col'], s=plot_size_dict[size], alpha=0.9)
        plt.colorbar();
        # Beads of this cell type that failed the cutoff
        plt.scatter(coords[bool_col]['xcoord'][ct_df['col']==0],
                    coords[bool_col]['ycoord'][ct_df['col']==0],
                    c='gray', s=plot_size_dict[size], alpha=0.4)

        plt.title('Purity of {} per bead, {}um'.format(value, size))
        plt.axis('equal')
        plt.clim(0,1);
        #save_result("{}ThreshCertaintyofmaxct{}".format(size, value))
        plt.show()

    return keep_thresh_df, remove_thresh_df

In [None]:
def func_thresh_certainty(bead_deconv_df_norm, keep_thresh_df):
    """Zero the 'thresh_ct' value of beads that failed their certainty threshold.

    Parameters
    ----------
    bead_deconv_df_norm : DataFrame
        Needs 'maxval' and 'thresh_ct' columns; modified in place.
    keep_thresh_df : dict
        Maps ``int(key) - 1`` (key taken from metacell_dict) to a boolean
        Series that is True for the beads to keep.

    Returns
    -------
    DataFrame
        The same ``bead_deconv_df_norm`` object.

    For every covered bead 'thresh_ct' becomes 'maxval' when the mask is True
    and 0 otherwise. ``.loc`` replaces the ``.ix`` indexer (removed from
    pandas) and the chained assignment used before.
    """
    for key in metacell_dict:
        keep_mask = keep_thresh_df[int(key)-1]
        ct_indx = list(bead_deconv_df_norm.index[keep_mask.index])
        kept_vals = bead_deconv_df_norm.loc[ct_indx, 'maxval'].where(keep_mask, 0.0)
        bead_deconv_df_norm.loc[ct_indx, 'thresh_ct'] = kept_vals

    return bead_deconv_df_norm

In [None]:
def plot_allct(coords, size, bead_deconv_df_norm, bead_maxct_df,
                plot_size_dict=plot_size_dict):
    """Scatter all beads coloured by their most-likely cell type.

    Parameters
    ----------
    coords : DataFrame with 'xcoord'/'ycoord'
    size : key into ``plot_size_dict``; markers are drawn at twice that area
    bead_deconv_df_norm : DataFrame; its 'max_cell_type' column is (re)written
    bead_maxct_df : DataFrame with 'max_cell_type'
    plot_size_dict : maps ``size`` to the scatter marker area

    Returns
    -------
    df_clust : DataFrame with 'x', 'y', 'label'
    bead_deconv_df_norm : the input frame, including 'max_cell_type'
    """
    bead_deconv_df_norm['max_cell_type'] = bead_maxct_df['max_cell_type']

    df_clust = pd.DataFrame(columns=['x','y','label'])
    df_clust['x'] = coords['xcoord']
    df_clust['y'] = coords['ycoord']
    df_clust['label'] = bead_deconv_df_norm['max_cell_type']

    # One colour per atlas cluster, shared by the markers and the legend text
    palette = sns.color_palette("tab20", int(num_atlas_clusters))
    # `height` is the current name of lmplot's former `size` argument
    facet = sns.lmplot(data=df_clust, x='x', y='y', hue='label',
                       fit_reg=False, legend=False, legend_out=True,
                       palette = palette,
                       height = 10, scatter_kws={"s": 2*plot_size_dict[size]})
    #add a legend
    leg = facet.ax.legend(bbox_to_anchor=[1, 0.75],
                             title="label", fancybox=True)
    #change colors of labels
    for i, text in enumerate(leg.get_texts()):
        plt.setp(text, color = palette[i])
    #save_result("{}all_celltypes".format(size))
    plt.show()
    return df_clust, bead_deconv_df_norm

In [None]:
# Inspect the row index of df_clust
df_clust.index

In [None]:
# Debug loop: for every cell type, print the boolean mask of rows carrying that label.
# ct_df is overwritten on each pass, so afterwards it only holds the last cell type's rows.
for key, value in metacell_dict.items():
    bool_col = (df_clust['label']==int(key))
    print(bool_col)
    ct_df = bead_deconv_df_norm[bool_col]

In [None]:
# Number of rows of df_clust per label
df_clust.label.value_counts()

In [None]:
# Draw the per-cell-type purity maps (size key 10) and collect the keep/remove masks
keep_thresh_df, remove_thresh_df = plot_certainty_perct_thresh(coords=coords, size=10, bead_deconv_df_norm=bead_deconv_df_norm, df_clust=df_clust, bead_maxct_df = bead_maxct_df)

#need more memory for this (only outputted 17 plots)



In [None]:
# Start 'thresh_ct' as a copy of 'maxval', then let func_thresh_certainty
# rewrite it using the keep masks computed above.
bead_deconv_df_norm['thresh_ct'] = bead_deconv_df_norm['maxval']
bead_deconv_df_norm = func_thresh_certainty(bead_deconv_df_norm=bead_deconv_df_norm, 
                                         keep_thresh_df=keep_thresh_df)

# One map per cell type, with beads whose thresh_ct is 0 greyed out
plot_boolean_thresh(size=10, coords=coords, bead_maxct_df=bead_maxct_df,bead_deconv_df_norm=bead_deconv_df_norm)

In [None]:
# Plot all beads coloured by cell type and keep the returned tables
df_clust, bead_deconv_df_norm = plot_allct(coords=coords, size=10, 
                bead_deconv_df_norm=bead_deconv_df_norm, 
                bead_maxct_df=bead_maxct_df)
                                           
                        
# Write both tables (without the row index) into the NMFreg output directory
df_clust.to_csv("{}/df_clust10.csv".format(NMFreg_output), index=False)
bead_deconv_df_norm.to_csv("{}/bead_deconv_df_norm.csv".format(NMFreg_output), index=False)

In [None]:
# Wrap the scaled puck matrix in a DataFrame labelled with the gene names.
# Passing the labels to the constructor avoids set_axis(..., inplace=True),
# whose `inplace` option was removed in pandas 2.0.
genes=pcounts.columns.tolist()
pcounts_scaled_df=pd.DataFrame(pcounts_scaled, columns=genes)

### Creates csv files for future use by plotting/analysis pipeline

In [None]:
#creates csv for permutation test methods
# NOTE(review): pd.DataFrame(df) may share its data with pcounts_scaled_df
# (depending on the pandas version), in which case the three columns added
# below also show up in pcounts_scaled_df -- confirm, and use .copy() if that
# is not intended.
pcounts_perm_test=pd.DataFrame(pcounts_scaled_df)
pcounts_perm_test['xcoord']=df_clust['x']
pcounts_perm_test['ycoord']=df_clust['y']
pcounts_perm_test['cluster']=df_clust['label']

# Output file: <Perm_test_output><puck number>.csv, row index included
f1 = Perm_test_output + pkn + ".csv"
pcounts_perm_test.to_csv(f1, index=True)

In [None]:
#creates csv for interactive plotting  
# NOTE(review): pd.DataFrame(df) may share its data with pcounts_scaled_df
# (depending on the pandas version), so columns added to one frame can appear
# in the other -- confirm which columns end up in this file.
pcounts_webpage=pd.DataFrame(pcounts_scaled_df)
pcounts_webpage['xcoord']=coords['xcoord']
pcounts_webpage['ycoord']=coords['ycoord']

# Output file: <Interactive_plot_output><puck number>_webpage_data.csv, row index included
f1 = Interactive_plot_output + pkn + "_webpage_data.csv"
pcounts_webpage.to_csv(f1, index=True)

In [None]:
#plot single genes from your sample
def plot_one_gene(gene, out_path="test.svg"):
    """Map one gene's scaled expression across the puck and save it as SVG.

    Parameters
    ----------
    gene : str
        Column of pcounts_scaled_df to use as the colour value.
    out_path : str, optional
        File the figure is written to. Defaults to "test.svg", the previously
        hard-coded name, so existing calls behave as before; pass a per-gene
        path to avoid overwriting when plotting several genes.
    """
    figsize(13, 10)
    # plt is the alias used everywhere else in this notebook; the bare
    # `pyplot` name only existed through the %pylab namespace
    plt.set_cmap('viridis_r')
    #plt.scatter(coords['xcoord'], coords['ycoord'], c=pcounts[gene], s=2, alpha=0.6)
    #testing something
    plt.scatter(coords['xcoord'], coords['ycoord'], c=pcounts_scaled_df[gene], s=2, alpha=0.6)
    # BREANNA: other version uses "counts" instead of pcounts
    plt.axis('equal')
    plt.title('{}'.format(gene),fontsize=35)
    plt.rc('xtick', labelsize=10)
    plt.rc('ytick', labelsize=10)
    plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))
    plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
    plt.colorbar();
    #plt.clim(0,12)
    #save_result(gene)
    plt.savefig(out_path,format="svg")
    plt.show()
    
#interesting_genes = ['Aqp5']
#interesting_genes = ['Ociad2']
# Candidate genes for plotting; only used by the commented-out loop at the end of this cell
interesting_genes = ['Aqp1', 'Aqp2', 'Aqp6', 'Atp11a', 'Atp6vo1b2', 'Bst2', 'C1qa', 'C1qb',
       'Ccl11', 'Ccl17', 'Ccl2', 'Ccl20', 'Ccl22', 'Ccl3', 'Ccl4', 'Ccl5',
       'Ccr7', 'Cd40', 'Cd80', 'Cd86', 'Ctgf', 'Cxcl1', 'Cxcl10', 'Cxcl11',
       'Cxcl13', 'Cxcl5', 'Cxcl9', 'Ehd3', 'Enpp2', 'GAPDH', 'GFP',
       'Gapdh_mouse', 'H2-Ab1', 'H2-Eb1', 'Ifitm3', 'Ifnb1', 'Il10', 'Il12a',
       'Il12b', 'Il13', 'Il17', 'Il1a', 'Il1b', 'Il2', 'Il4', 'Il6', 'Il8',
       'Il9', 'Isg15', 'Itga8', 'KDR', 'LRP2', 'Miox', 'Muc1', 'Mx1', 'Mx2',
       'NM_009735_B2m', 'NM_013556_Hprt', 'Napsa', 'Nphs2', 'Oasl1', 'Pecam1',
       'Plvap', 'Ptn', 'Rpl13a', 'Rps29', 'Rsad2', 'Slc12a1', 'Slc22a7',
       'Slc34a1', 'Synpo', 'Tnf', 'Tnfaip3', 'UMOD', 'Wt1']

# Plot a single gene; the figure is written to plot_one_gene's output file
plot_one_gene(gene='Aqp1')

#for g in interesting_genes:
    #if g in pcounts.columns:# BREANNA: this conditional statement not present in other version
        #print(g) # BREANNA: no print statement in other version
        #plot_one_gene(gene=g)

In [None]:
#barcode vs original cell name.... where?
# Counts the barcodes in coords that contain the substring "Puck"
coords.barcode.str.contains("Puck").sum() #... no??

In [None]:
# The empty pattern matches every string, so this counts the non-missing barcodes
coords.barcode.str.contains("").sum()

In [None]:
# #ok everything here is for DKD -> output them, so that we can later compare
# #beads_deconv for cluster assignment
# #coords for coordinates
# #pcount for individual cell inspection
# pkn = "Puck_181206_3"
# #mydir = "/broad/finucanelab/qingbow/slseq/sumdata/"
# # Breanna changed for her directory
# mydir="/broad/thechenlab/breanna/breanna_slideseq_sumdata/"
# f1 = mydir + pkn + "coords.tsv"
# f2 = mydir + pkn + "pcount.tsv"
# f3 = mydir + pkn + "bead_deconv_df_norm.tsv"
# coords.to_csv(f1, index=True)
# print ("1 done")
# pcounts.to_csv(f2, index=True)
# print ("2 done")
# bead_deconv_df_norm.to_csv(f3, index=True)
# print ("3 done")

# #OK move to the wt and do the same

In [None]:
# Same call as earlier in the notebook with size key 10; the masks are kept under "...10" names
keep_thresh_df10, remove_thresh_df10 = plot_certainty_perct_thresh(coords=coords, 
                                          size=10, 
                                          bead_deconv_df_norm=bead_deconv_df_norm, 
                                          df_clust=df_clust,
                                          bead_maxct_df=bead_maxct_df)