In [None]:
import os
import pandas as pd
import numpy as np
from IPython.display import display
import seaborn as sns
import pandas.util.testing as tm
import matplotlib.pyplot as plt
import scipy.stats
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import MiniBatchKMeans
import matplotlib.patches as mpatches
import glob
from IPython.display import display
from tqdm import tqdm_notebook as tqdm
import scanpy as sc
%matplotlib inline

### Define functions

In [None]:
# Convert the spatial coordinates of the beads into an array
def coords_to_arr(bc_loc_df):
    """Return the bead coordinates of ``bc_loc_df`` as a NumPy array.

    Parameters
    ----------
    bc_loc_df : pandas.DataFrame
        Table with one row per bead and columns labelled 'x' and 'y'.

    Returns
    -------
    numpy.ndarray
        Values of the label slice ``'x':'y'`` (column 'x', column 'y' and
        any column stored between them), in the original row order.
    """
    xy_columns = bc_loc_df.loc[:, 'x':'y']
    return xy_columns.to_numpy()

# Run the nearest neighbor search and return the neighbor indices as a dataframe
def nbrs_df(coords_arr, k):
    """Find the ``k`` nearest neighbors of every point in ``coords_arr``.

    Parameters
    ----------
    coords_arr : array-like
        Coordinates, one row per point. The same array is used both to fit
        the neighbor model and as the set of query points.
    k : int
        Number of neighbors requested for each point.

    Returns
    -------
    pandas.DataFrame
        One row per point and ``k`` columns; each entry is the row position
        (within ``coords_arr``) of a neighbor.

    NOTE(review): because the query points are the fitted points themselves,
    each row is expected to contain the point's own position as one of its
    ``k`` neighbors -- confirm against the scikit-learn documentation.
    """
    knn_model = NearestNeighbors(n_neighbors=k, algorithm='auto')
    knn_model.fit(coords_arr)

    # kneighbors returns (distances, indices); only the indices are needed
    _, neighbor_positions = knn_model.kneighbors(coords_arr)
    return pd.DataFrame(neighbor_positions)

# Create the list of windows, each holding the cell types of one bead's neighbors
def nbr_wind_dfs(nbrs_inds, bc_cell_type):
    """Build one cell-type window per row of ``nbrs_inds``.

    Parameters
    ----------
    nbrs_inds : pandas.DataFrame
        One row per bead; every entry is the row position of a neighboring
        bead (the output of ``nbrs_df``).
    bc_cell_type : pandas.Series or pandas.DataFrame
        Cell type assignment of every bead, in the same row order as the
        coordinates that were used to compute ``nbrs_inds``.

    Returns
    -------
    list of pandas.DataFrame
        ``result[i]`` holds the ``bc_cell_type`` entries of the neighbors
        listed in the i-th row of ``nbrs_inds``.
    """
    # The neighbor indices are row positions, not index labels, so rows are
    # selected with .iloc and the windows are collected in row order. This
    # gives the same result as label-based selection when both inputs carry
    # the default 0..n-1 index, and stays correct when they do not.
    return [
        pd.DataFrame(bc_cell_type.iloc[positions])
        for positions in nbrs_inds.to_numpy()
    ]

# Calculate the frequency of every cell type inside one window
# param: df with the cell type assignments of one window -> i.e. num_arr[i]
# return: list of cell type frequencies; entry i-1 belongs to cell type number i
def calc_freq(cell_type_counts, n_cell_types=None, window_size=None):
    """Return the per-cell-type frequencies of a single window.

    Parameters
    ----------
    cell_type_counts : pandas.DataFrame
        Window with a 'Sub_cell_type' column holding one cell type number
        per row.
    n_cell_types : int, optional
        Number of cell types; frequencies are reported for the cell type
        numbers 1..n_cell_types. Defaults to the notebook-level
        ``num_cell_types``.
    window_size : int, optional
        Divisor that turns counts into frequencies. Defaults to the
        notebook-level ``k``.

    Returns
    -------
    list
        ``n_cell_types`` entries; entry ``ct - 1`` is ``count(ct) / window_size``
        when cell type ``ct`` occurs in the window and ``0`` otherwise.
    """
    # Fall back to the notebook-level settings so that existing one-argument
    # calls keep working unchanged.
    if n_cell_types is None:
        n_cell_types = num_cell_types
    if window_size is None:
        window_size = k

    # Tally how often each cell type number occurs in the window.
    occurrences = {}
    for label in cell_type_counts['Sub_cell_type']:
        occurrences[label] = occurrences.get(label, 0) + 1

    # Report the cell types in numeric order; absent types get 0.
    return [
        occurrences[ct] / window_size if ct in occurrences else 0
        for ct in range(1, n_cell_types + 1)
    ]

# Calculate the cell type frequencies of every window
def ct_freq_wind(num_arr):
    """Apply ``calc_freq`` to each window and collect the results.

    Parameters
    ----------
    num_arr : iterable
        Windows, each in the form accepted by ``calc_freq``.

    Returns
    -------
    list
        One frequency list per window, in the order of ``num_arr``.
    """
    return [calc_freq(window) for window in num_arr]

# Convert wind_freq to a dataframe
def df_convert(wind_freq):
    """Wrap the per-window frequency lists in a labelled dataframe.

    Parameters
    ----------
    wind_freq : list of list
        One row per window with ten frequencies each.

    Returns
    -------
    pandas.DataFrame
        ``wind_freq`` with the ten cell type names below as column labels,
        assigned in this order.
    """
    column_names = [
        "ES",
        "RS",
        "Myoid",
        "SPC",
        "Differentiating SPG",
        "Sertoli",
        "Leydig",
        "Endothelial",
        "Macrophage",
        "Undifferentiated SPG",
    ]
    return pd.DataFrame(data=wind_freq, columns=column_names)

### Import file

In [None]:
# Bead table: holds the cell type of every bead (with the undifferentiated and
# differentiating SPGs labelled separately) together with the bead locations.
df_ct_cluster_stage_SPG_cln = pd.read_csv(
    'filename.csv',
    index_col=0,  # the first CSV column becomes the row index
)
print(df_ct_cluster_stage_SPG_cln.shape)
df_ct_cluster_stage_SPG_cln.head(n=3)

### Calculate spatial contact frequency for the SPGs

In [None]:
# Lookup from cell type number (stored as a string key) to cell type name.
cell_types = {
    '1': "ES",
    '2': "RS",
    '3': "Myoid",
    '4': "SPC",
    '5': "Differentiating SPG",
    '6': "Sertoli",
    '7': "Leydig",
    '8': "Endothelial",
    '9': "Macrophage",
    '10': "Undifferentiated SPG",
}

# Number of cell types; calc_freq reports frequencies for 1..num_cell_types.
num_cell_types = len(cell_types)

In [None]:
# Define the window size: k is the number of nearest neighbors requested for every
# bead (it is passed to nbrs_df) and the divisor calc_freq uses to turn cell type
# counts into frequencies. It is also written into the names of the output CSV files.
k = 5

In [None]:
# Neighborhood pipeline: coordinates -> nearest neighbors -> per-bead windows of
# cell types -> per-window cell type frequencies.
bc_cell_type = df_ct_cluster_stage_SPG_cln['Sub_cell_type']
coords_arr = coords_to_arr(df_ct_cluster_stage_SPG_cln)
nbrs_inds = nbrs_df(coords_arr, k)
num_arr = nbr_wind_dfs(nbrs_inds, bc_cell_type)
wind_freq = ct_freq_wind(num_arr)
df_wind_freq = df_convert(wind_freq)

# df_wind_freq has one row per bead in table order but carries a fresh 0..n-1
# index, and pd.concat(axis=1) aligns rows on index labels. Give the frequencies
# the bead table's own index first so every bead is paired with its own window
# even when the table index is not 0..n-1 (the result is unchanged when it is).
KNN_Cell_Freq = pd.concat(
    [df_ct_cluster_stage_SPG_cln, df_wind_freq.set_index(df_ct_cluster_stage_SPG_cln.index)],
    axis=1,
)
KNN_Cell_Freq.head(3)

### Select and output gene expression profiles and meta-data of beads in either the differentiating or undifferentiated SPG neighborhood for downstream differential expression analysis

In [None]:
# Read in the gene expression matrix of the whole Slide-seq array
# (i.e., the MappedDGEF file provided in this link: https://www.dropbox.com/s/ygzpj0d0oh67br0/Testis_Slideseq_Data.zip?dl=0)
# NOTE(review): later cells look rows of Puck_DGE up by bead barcode, so the first
# CSV column (used as the index here) is presumably the barcode -- confirm.
Puck_DGE = pd.read_csv('MappedDGEForR.csv', index_col=0)
# Report the shape of the frame that was just loaded; the previous name
# (Puck_T3_DGE) is not defined anywhere in this notebook and raised a NameError.
print(Puck_DGE.shape)
Puck_DGE.head(3)

In [None]:
# Subsets of the bead table by cell type number:
# 5 = differentiating SPG, 10 = undifferentiated SPG (see cell_types).
df_ct_cluster_stage_SPG_cln_SPG_ONLY = df_ct_cluster_stage_SPG_cln[
    df_ct_cluster_stage_SPG_cln['Sub_cell_type'].isin([5, 10])
]
df_ct_cluster_stage_Undiff_SPG_ONLY = df_ct_cluster_stage_SPG_cln[
    df_ct_cluster_stage_SPG_cln['Sub_cell_type'].isin([10])
]
df_ct_cluster_stage_Diff_SPG_ONLY = df_ct_cluster_stage_SPG_cln[
    df_ct_cluster_stage_SPG_cln['Sub_cell_type'].isin([5])
]

# Index labels of the undifferentiated / differentiating SPG beads.
Undiff_ls = df_ct_cluster_stage_Undiff_SPG_ONLY.index.values
Diff_ls = df_ct_cluster_stage_Diff_SPG_ONLY.index.values

In [None]:
# Build, for every bead, the gene expression profiles of its non-SPG neighbors, then
# keep the neighborhoods that surround either a differentiating or an
# undifferentiated SPG.
# NOTE(review): SPG_set, Diff_ls and Undiff_ls hold index labels of the bead table,
# whereas the neighbor indices and the dictionary keys are row positions; the two
# only coincide when the bead table carries the default 0..n-1 index -- confirm.
SPG_set = set(df_ct_cluster_stage_SPG_cln_SPG_ONLY.index.values)
Neighbor_DGE_Dict = dict()
for i in tqdm(range(len(nbrs_inds))):
    # Neighbors of bead i with every SPG bead removed.
    Neighbor_set = set(nbrs_inds.loc[i].values)
    Difference_set = Neighbor_set - SPG_set
    df_neighbor = df_ct_cluster_stage_SPG_cln.iloc[list(Difference_set), :]
    # Expression rows of those neighbors; reset_index turns the Puck_DGE index
    # back into an ordinary column.
    Neighbor_DGE_Dict[i] = Puck_DGE.loc[df_neighbor['barcode'].tolist(), :].reset_index()

# Lists (one dataframe per SPG bead) of the neighborhoods around each SPG type.
Neighbor_DGE_Dict_Diff = [Neighbor_DGE_Dict[bead] for bead in Diff_ls]
Neighbor_DGE_Dict_Undiff = [Neighbor_DGE_Dict[bead] for bead in Undiff_ls]

In [None]:
# Stack the per-neighborhood dataframes of each list into a single dataframe
# (the original row labels are discarded and replaced by a running index).
Neighbor_DGE_Diff_df = pd.concat(
    Neighbor_DGE_Dict_Diff,
    ignore_index=True,
)
Neighbor_DGE_Undiff_df = pd.concat(
    Neighbor_DGE_Dict_Undiff,
    ignore_index=True,
)

In [None]:
# Deduplicate the dataframes and filter out beads that are shared by both the
# undifferentiated and differentiating SPG neighborhoods.
Neighbor_DGE_Undiff_df_dedup = Neighbor_DGE_Undiff_df.drop_duplicates().set_index('barcode')
Neighbor_DGE_Diff_df_dedup = Neighbor_DGE_Diff_df.drop_duplicates().set_index('barcode')
Undiff_dedup_set = set(Neighbor_DGE_Undiff_df_dedup.index.values)
Diff_dedup_set = set(Neighbor_DGE_Diff_df_dedup.index.values)

# Keep the beads that occur in only one of the two neighborhoods. A boolean mask
# keeps the rows in their existing order; selecting with list(set_difference)
# ordered them by set iteration order, which for string barcodes can change from
# one kernel session to the next and made the exported files non-reproducible.
Neighbor_DGE_Undiff_df_deoverlap = Neighbor_DGE_Undiff_df_dedup.loc[
    ~Neighbor_DGE_Undiff_df_dedup.index.isin(Diff_dedup_set)
]
Neighbor_DGE_Diff_df_deoverlap = Neighbor_DGE_Diff_df_dedup.loc[
    ~Neighbor_DGE_Diff_df_dedup.index.isin(Undiff_dedup_set)
]

In [None]:
# Output the neighboring bead gene expression matrix: both neighborhoods are
# transposed so that beads become columns, the column labels are prefixed with
# the neighborhood name, and the two blocks are placed side by side.
Neighbor_DGE_Comb_df_deoverlap = pd.concat(
    [
        Neighbor_DGE_Undiff_df_deoverlap.transpose().add_prefix('Undiff_'),
        Neighbor_DGE_Diff_df_deoverlap.transpose().add_prefix('Diff_'),
    ],
    axis=1,
)
# The window size k is part of the file name.
Neighbor_DGE_Comb_df_deoverlap.to_csv(
    'DGE_SPG_Neighboring_Beads_K{}'.format(k) + '.csv'
)

In [None]:
# Meta-data (barcode, stage, cell type) of the beads kept in each neighborhood.
# The column subsets are taken with .copy() so that each *_cln frame is an
# independent object: the assignments below then modify that frame directly
# instead of writing to a slice of another frame (SettingWithCopyWarning).
Neighbor_ct_undiff = df_ct_cluster_stage_SPG_cln.loc[df_ct_cluster_stage_SPG_cln['barcode'].isin(Neighbor_DGE_Undiff_df_deoverlap.index.values)]
Neighbor_ct_undiff_cln = Neighbor_ct_undiff[['barcode', 'Stage_assigned', 'Sub_cell_type']].copy()
Neighbor_ct_undiff_cln['Neighborhood'] = 'Undiff'
# Prefix the barcodes with the neighborhood name, as is done for the column
# labels of the exported expression matrix.
Neighbor_ct_undiff_cln['barcode'] = 'Undiff_' + Neighbor_ct_undiff_cln['barcode'].astype(str)

Neighbor_ct_diff = df_ct_cluster_stage_SPG_cln.loc[df_ct_cluster_stage_SPG_cln['barcode'].isin(Neighbor_DGE_Diff_df_deoverlap.index.values)]
Neighbor_ct_diff_cln = Neighbor_ct_diff[['barcode', 'Stage_assigned', 'Sub_cell_type']].copy()
Neighbor_ct_diff_cln['Neighborhood'] = 'Diff'
Neighbor_ct_diff_cln['barcode'] = 'Diff_' + Neighbor_ct_diff_cln['barcode'].astype(str)

In [None]:
# Output the neighboring bead meta-data: both tables are indexed by their
# (prefixed) barcode and stacked, undifferentiated neighborhood first.
Neighbor_ct_comb_cln = pd.concat(
    [
        Neighbor_ct_undiff_cln.set_index('barcode'),
        Neighbor_ct_diff_cln.set_index('barcode'),
    ]
)
# The window size k is part of the file name.
Neighbor_ct_comb_cln.to_csv(
    'Meta_Data_SPG_Neighboring_Beads_K{}'.format(k) + '.csv'
)