In [None]:
import os
import pandas as pd
import numpy as np
from IPython.display import display
import seaborn as sns
import pandas.util.testing as tm
import matplotlib.pyplot as plt
import scipy.stats
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import MiniBatchKMeans
import matplotlib.patches as mpatches
import glob
from IPython.display import display
from tqdm import tqdm_notebook as tqdm
import scanpy as sc
%matplotlib inline

### Define functions

In [None]:
#convert spatial coordinates into array
def coords_to_arr(bc_loc_df):
    """Return the bead coordinates as a two-column NumPy array.

    Parameters
    ----------
    bc_loc_df : pandas.DataFrame
        Table with one row per bead that has columns named 'x' and 'y'.

    Returns
    -------
    numpy.ndarray
        Array with one row per bead and exactly two columns, in the
        order (x, y).

    Raises
    ------
    KeyError
        If either the 'x' or the 'y' column is missing.
    """
    # Select the two columns by name rather than with the label slice
    # 'x':'y': a slice would also pick up any column that happens to sit
    # between x and y (and would be empty if y came before x).
    return bc_loc_df.loc[:, ['x', 'y']].to_numpy()

#perform nearest neighbor analysis and generate neighbor indices df
def nbrs_df(coords_arr, k):
    """Find the k nearest neighbours of every point in ``coords_arr``.

    Parameters
    ----------
    coords_arr : array-like
        Coordinates, one row per point. The same array is used both to fit
        the neighbour model and as the set of query points.
    k : int
        Number of neighbours to return for every point.

    Returns
    -------
    pandas.DataFrame
        One row per query point and ``k`` columns holding the row positions
        (within ``coords_arr``) of its neighbours.
    """
    model = NearestNeighbors(n_neighbors=k, algorithm='auto')
    model.fit(coords_arr)

    # Only the neighbour positions are needed; the distances are discarded.
    # NOTE(review): because the query points are the fitted points, each
    # point is presumably returned among its own neighbours -- confirm that
    # the window is meant to include the centre bead.
    _, neighbor_positions = model.kneighbors(coords_arr)
    return pd.DataFrame(neighbor_positions)

#create list of windows with cell type counts
def nbr_wind_dfs(nbrs_inds, bc_cell_type):
    """Collect the cell-type labels of every neighbourhood window.

    Parameters
    ----------
    nbrs_inds : pandas.DataFrame
        One row per window; every entry is the row *position* of a
        neighbouring bead (as produced by ``nbrs_df``).
    bc_cell_type : pandas.Series
        Cell-type label of every bead, in the same row order as the
        coordinates the neighbour positions refer to.

    Returns
    -------
    list of pandas.DataFrame
        One single-column DataFrame per window, in the row order of
        ``nbrs_inds``, holding the labels of the beads in that window.
    """
    # The neighbour indices are positions, not index labels, so they must be
    # resolved with .iloc. Using .loc only works when the bead index happens
    # to be exactly 0..n-1 and otherwise raises or selects the wrong beads.
    return [
        pd.DataFrame(bc_cell_type.iloc[row.to_numpy()])
        for _, row in nbrs_inds.iterrows()
    ]

#calculate frequencies of cell types given cell type counts in one window
def calc_freq(cell_type_counts, n_types=None, window_size=None):
    """Return the fraction of a window occupied by each cell type.

    Parameters
    ----------
    cell_type_counts : pandas.DataFrame
        One window (e.g. ``num_arr[i]``): a frame with a 'Sub_cell_type'
        column holding the cell-type code of every bead in the window.
    n_types : int, optional
        Number of cell types; codes ``1..n_types`` are reported. Defaults to
        the notebook-level ``num_cell_types``.
    window_size : int, optional
        Denominator of the frequencies. Defaults to the notebook-level
        window size ``k``.

    Returns
    -------
    list of float
        ``n_types`` frequencies; position ``i`` belongs to cell-type code
        ``i + 1``. Codes that do not occur in the window get 0.
    """
    # Fall back to the notebook globals so existing one-argument calls keep
    # working; passing the values explicitly avoids the hidden-state coupling.
    if n_types is None:
        n_types = num_cell_types
    if window_size is None:
        window_size = k

    # Number of beads per cell-type code in this window.
    counts = cell_type_counts['Sub_cell_type'].value_counts().to_dict()

    return [counts.get(ct, 0) / window_size for ct in range(1, n_types + 1)]

#calculate frequency of cell types within each window
def ct_freq_wind(num_arr):
    """Apply ``calc_freq`` to every window.

    Parameters
    ----------
    num_arr : iterable
        The windows, each in the form ``calc_freq`` accepts.

    Returns
    -------
    list
        The ``calc_freq`` result of every window, in input order.
    """
    return [calc_freq(window) for window in num_arr]

#convert wind_freq to dataframe
def df_convert(wind_freq):
    """Wrap the per-window frequency lists in a labelled DataFrame.

    Parameters
    ----------
    wind_freq : list of list
        One list of ten frequencies per window.

    Returns
    -------
    pandas.DataFrame
        One row per window and one column per cell type, named in the fixed
        order listed below.
    """
    column_names = [
        "ES",
        "RS",
        "Myoid",
        "SPC",
        "Differentiating SPG",
        "Sertoli",
        "Leydig",
        "Endothelial",
        "Macrophage",
        "Undifferentiated SPG",
    ]
    return pd.DataFrame(wind_freq, columns=column_names)

### Import file

In [None]:
# This file contains the cell type (including the undifferentiated and
# differentiating SPGs) and the bead location information of all the beads.
# The first CSV column is used as the row index. Later cells read the
# 'Sub_cell_type', 'barcode', 'x' and 'y' columns of this table.
df_ct_cluster_stage_SPG_cln = pd.read_csv('filename.csv', index_col=0)
# Quick sanity check of the table size, followed by a preview of the first rows.
print(df_ct_cluster_stage_SPG_cln.shape)
df_ct_cluster_stage_SPG_cln.head(3)

### Calculate spatial contact frequency for the SPGs

In [None]:
# Map each cell-type code (as a string, '1'..'10') to its name. The codes are
# assigned by position in the list below.
cell_types = {
    str(code): name
    for code, name in enumerate(
        [
            "ES",
            "RS",
            "Myoid",
            "SPC",
            "Differentiating SPG",
            "Sertoli",
            "Leydig",
            "Endothelial",
            "Macrophage",
            "Undifferentiated SPG",
        ],
        start=1,
    )
}

# Number of distinct cell types; calc_freq reads this as a global.
num_cell_types = len(cell_types)

In [None]:
# Define the window size: the number of nearest neighbours per bead passed to
# nbrs_df. calc_freq also reads k as a global and uses it as the denominator
# of the cell-type frequencies.
k = 5

In [None]:
# For every bead: find its k nearest neighbours, collect their cell types and
# turn them into per-cell-type frequencies.
bc_cell_type = df_ct_cluster_stage_SPG_cln['Sub_cell_type']
coords_arr = coords_to_arr(df_ct_cluster_stage_SPG_cln)
nbrs_inds = nbrs_df(coords_arr, k)
num_arr = nbr_wind_dfs(nbrs_inds, bc_cell_type)
wind_freq = ct_freq_wind(num_arr)
df_wind_freq = df_convert(wind_freq)

# df_convert returns rows in bead order but with a fresh 0..n-1 index, while
# pd.concat(axis=1) aligns rows by index label. Give the frequency table the
# bead table's index so each bead is paired with its own window even when the
# bead index is not 0..n-1 (otherwise the result would contain NaN-padded,
# misaligned rows).
df_wind_freq.index = df_ct_cluster_stage_SPG_cln.index

# Append the neighbourhood frequencies as extra columns of the bead table.
KNN_Cell_Freq = pd.concat([df_ct_cluster_stage_SPG_cln, df_wind_freq], axis=1)
KNN_Cell_Freq.head(3)

### Test if the distribution of cellular neighborhood compositions is the same between the undifferentiated and differentiating SPGs

In [None]:
# SPG_Clustering is another name for KNN_Cell_Freq (no copy is made).
SPG_Clustering = KNN_Cell_Freq
# Keep only the beads whose cell-type code is 5 or 10; in the cell_types
# mapping these codes are "Differentiating SPG" and "Undifferentiated SPG".
SPG_Clustering_sel = SPG_Clustering.loc[SPG_Clustering['Sub_cell_type'].isin([5,10])]

In [None]:
# One violin plot per neighbouring cell type (2 x 5 grid, filled row by row):
# distribution of that cell type's neighbourhood frequency, split by the
# Sub_cell_type of the selected SPG beads.
fig, axs = plt.subplots(2, 5, figsize=(14,8), sharey=True, sharex=True)

for ax, ct in zip(axs.flat, ["ES","RS","Myoid","SPC","Differentiating SPG","Sertoli","Leydig","Endothelial","Macrophage", "Undifferentiated SPG"]):
    sns.violinplot(ax=ax, x="Sub_cell_type", y=ct, bw=1, data=SPG_Clustering_sel)
    # Take the window size from k instead of hard-coding "K5", so the titles
    # stay correct when the window size is changed.
    ax.set_title('{0}_K{1}'.format(ct, k))

In [None]:
# Split the beads by cell-type code: 5 ("Differentiating SPG" in cell_types)
# and 10 ("Undifferentiated SPG" in cell_types).
SPG_Clustering_Diff = SPG_Clustering[SPG_Clustering['Sub_cell_type']==5]
SPG_Clustering_Undiff = SPG_Clustering[SPG_Clustering['Sub_cell_type']==10]

In [None]:
# Subsampling strategy to build the null distribution of the p values.
# For every neighbouring cell type, repeatedly draw (without replacement) as
# many differentiating-SPG beads as there are undifferentiated-SPG beads and
# KS-test that subsample against all differentiating-SPG beads.
Num_of_Permutation = 10000

# Fixed seed so the null distributions are reproducible across re-runs.
RANDOM_SEED = 0
rng = np.random.default_rng(RANDOM_SEED)

# Size of each subsample; must not exceed the number of differentiating-SPG
# beads, otherwise sampling without replacement raises a ValueError.
n_undiff = len(SPG_Clustering_Undiff)

pvals_dic = {}
for ct in tqdm(["ES","RS","Myoid","SPC","Differentiating SPG","Sertoli","Leydig","Endothelial","Macrophage", "Undifferentiated SPG"]):
    # Loop-invariant: the frequencies of this cell type around all
    # differentiating-SPG beads.
    diff_values = SPG_Clustering_Diff[ct].values
    null_pvals = np.zeros(Num_of_Permutation)
    for p in tqdm(range(Num_of_Permutation), desc=ct, leave=False):
        # Sampling the values directly is equivalent to sampling beads by
        # barcode and filtering the table, but always yields exactly n_undiff
        # values and avoids a DataFrame filter on every iteration.
        subsample = rng.choice(diff_values, size=n_undiff, replace=False)
        null_pvals[p] = scipy.stats.ks_2samp(subsample, diff_values).pvalue
    pvals_dic["pvals {0}".format(ct)] = null_pvals

In [None]:
# Compare the real p value (i.e., undifferentiated SPGs vs. differentiating SPGs) with the null distribution of p values
n_bins = 20

fig, axs = plt.subplots(2, 5, figsize=(14,8), sharey=True, sharex=True)

# One panel per neighbouring cell type (grid filled row by row): histogram of
# the null p values, with the observed p value marked by an orange line.
for ax, ct in zip(axs.flat, ["ES","RS","Myoid","SPC","Differentiating SPG","Sertoli","Leydig","Endothelial","Macrophage", "Undifferentiated SPG"]):
    # Observed KS-test p value: undifferentiated vs. differentiating SPGs.
    observed_pval = scipy.stats.ks_2samp(SPG_Clustering_Undiff[ct].values, SPG_Clustering_Diff[ct].values)[1]
    ax.hist(pvals_dic['pvals {0}'.format(ct)], bins=n_bins)
    # Take the window size from k instead of hard-coding "K5".
    ax.set_title('{0}_K{1}'.format(ct, k))
    ax.axvline(observed_pval, color='orange')