# ES purity score

In [None]:
import pandas as pd
import pandas.util.testing as tm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import MiniBatchKMeans
import matplotlib.patches as mpatches
import seaborn as sns
import glob
from IPython.display import display
from scipy.stats import mannwhitneyu
%matplotlib inline

In [None]:
# Map each cell-type label (stored as a string, '1' through '9') to its name.
# The names are listed in label order, so position i (1-based) gets label str(i).
cell_types = {
    str(label): name
    for label, name in enumerate(
        ["ES", "RS", "Myoid", "SPC", "SPG",
         "Sertoli", "Leydig", "Endothelial", "Macrophage"],
        start=1,
    )
}

# Number of distinct cell types; calc_freq uses it as the highest label.
num_cell_types = len(cell_types)

In [None]:
#names of the WT spatial location CSV files
#glob returns matches in arbitrary (filesystem-dependent) order, so the paths
#are sorted to keep the per-tubule results in the same order on every run
path = "WT Tubule Directory/*.csv"
spac_locs_names_wt = sorted(glob.glob(path))

In [None]:
#names of the diabetic spatial location CSV files
#glob returns matches in arbitrary (filesystem-dependent) order, so the paths
#are sorted to keep the per-tubule results in the same order on every run
path_1 = "Diabetic Tubule Directory/*.csv"
spac_locs_names_d = sorted(glob.glob(path_1))

# Define methods

In [None]:
def read_df_list(df_names_list):
    """Read every CSV file named in ``df_names_list`` into a DataFrame.

    Parameters
    ----------
    df_names_list : sequence of file paths; each entry is formatted to a
        string before being handed to ``pd.read_csv``.

    Returns
    -------
    list of DataFrames, one per path, in the same order as the input.
    """
    return [pd.read_csv(f'{csv_name}') for csv_name in df_names_list]

In [None]:
#extract the spatial coordinates of a location df as a numpy array
def coords_to_arr(bc_loc_df):
    """Return the columns 'xcoord' through 'ycoord' of ``bc_loc_df`` as an array.

    The selection is a label slice, so it is inclusive at both ends and also
    keeps any column located between 'xcoord' and 'ycoord' in the frame.
    """
    xy_columns = bc_loc_df.loc[:, 'xcoord':'ycoord']
    return xy_columns.to_numpy()

In [None]:
#run a k-nearest-neighbor search and return the neighbor indices as a df
def nbrs_df(coords_arr, k):
    """Find the ``k`` nearest neighbors of every row of ``coords_arr``.

    The model is fitted on ``coords_arr`` and then queried with the same
    array, so row i of the result lists the positions (rows of
    ``coords_arr``) of the k points closest to point i.
    NOTE(review): because the query points are the fitted points, each
    point is presumably returned as one of its own neighbors -- confirm.

    Returns a DataFrame of shape (number of points, k) holding the indices.
    """
    knn_model = NearestNeighbors(n_neighbors=k, algorithm='auto')
    knn_model.fit(coords_arr)

    #only the indices are needed, so the distances are not requested
    neighbor_indices = knn_model.kneighbors(coords_arr, return_distance=False)
    return pd.DataFrame(neighbor_indices)

In [None]:
#create list of windows holding the cell types of each point's neighbors
def nbr_wind_dfs(nbrs_inds, bc_cell_type):
    """Build one window DataFrame per row of ``nbrs_inds``.

    Parameters
    ----------
    nbrs_inds : DataFrame whose rows hold the integer positions of the
        neighbors of one point (as produced by ``nbrs_df``).
    bc_cell_type : Series (or DataFrame) of cell-type assignments, in the
        same row order as the coordinates the neighbors were computed from.

    Returns
    -------
    list of DataFrames, one per row of ``nbrs_inds`` and in the same order,
    each containing the ``bc_cell_type`` entries of that row's neighbors.

    The neighbor indices are row positions, so the lookup is positional
    (``.iloc``). A label-based lookup only works while ``bc_cell_type``
    carries a default 0..n-1 index and breaks on filtered or re-indexed data.
    """
    neighbor_positions = nbrs_inds.to_numpy()
    return [
        pd.DataFrame(bc_cell_type.iloc[positions])
        for positions in neighbor_positions
    ]

In [None]:
#calculate frequencies of cell types within a single window
def calc_freq(cell_type_counts, n_cell_types=None):
    """Return the fraction of a window occupied by each cell type.

    Parameters
    ----------
    cell_type_counts : DataFrame with a 'max_cell_type' column, one row per
        cell in the window (i.e. one element of the list built by
        ``nbr_wind_dfs``).
    n_cell_types : int, optional
        Highest cell-type label to report. Defaults to the module-level
        ``num_cell_types``.

    Returns
    -------
    list of length ``n_cell_types``; entry ``i`` is the frequency of cell
    type ``i + 1``. Types absent from the window get 0, and labels outside
    1..n_cell_types are ignored.

    The denominator is the number of rows in the window itself rather than
    a global window-size variable, so the function does not depend on state
    defined elsewhere. For windows built from ``nbrs_df(coords, k)`` the two
    are the same, because every window then has k rows.
    """
    if n_cell_types is None:
        n_cell_types = num_cell_types

    window_size = len(cell_type_counts)
    if window_size == 0:
        #an empty window contains no cell type at all
        return [0] * n_cell_types

    #occurrences of every label present in the window
    counts = cell_type_counts['max_cell_type'].value_counts().to_dict()

    return [
        counts[ct] / window_size if ct in counts else 0
        for ct in range(1, n_cell_types + 1)
    ]

In [None]:
#calculate the cell type frequencies of every window
def ct_freq_wind(num_arr):
    """Apply ``calc_freq`` to each window in ``num_arr``.

    Returns a list with one frequency list per window, in input order.
    """
    return [calc_freq(window) for window in num_arr]

In [None]:
#convert wind_freq to a dataframe and keep only the windows that contain ES
def df_convert(wind_freq):
    """Turn per-window frequency lists into a DataFrame filtered on ES.

    Parameters
    ----------
    wind_freq : list of 9-element frequency lists, one per window, ordered
        as ES, RS, Myoid, SPC, SPG, Sertoli, Leydig, Endothelial, Macrophage.

    Returns
    -------
    DataFrame with one named column per cell type, restricted to the rows
    whose "ES" value is non-zero. The original row index is preserved.
    """
    column_names = ["ES","RS","Myoid","SPC","SPG","Sertoli","Leydig","Endothelial","Macrophage"]
    all_windows = pd.DataFrame(data=wind_freq, columns=column_names)

    #windows without any ES cell are dropped
    has_es = all_windows["ES"] != 0
    return all_windows[has_es]

In [None]:
def count_purity(df_wind_freq_filter):
    """Return the mean of the "ES" column of ``df_wind_freq_filter``.

    With the frame produced by ``df_convert`` this is the average ES
    frequency over all windows that contain at least one ES cell.
    """
    es_frequencies = df_wind_freq_filter["ES"]
    return es_frequencies.mean()

In [None]:
def count_ES(df):
    """Count the rows of ``df`` whose 'max_cell_type' equals 1.

    Label 1 is the one listed as "ES" in ``cell_types``.
    """
    is_es = df["max_cell_type"] == 1
    return is_es.sum()

### Calculating ES purity score

In [None]:
#define window size: the number of nearest neighbors collected per window
#(passed to nbrs_df as n_neighbors for both the WT and the diabetic samples)
#NOTE(review): nbrs_df queries the same points it was fitted on, so each
#window presumably includes the cell itself -- confirm
k = 5

In [None]:
#read in file names as dfs: one DataFrame per WT CSV path collected in
#spac_locs_names_wt, kept in the same order as the paths
spac_locs_dfs_wt = read_df_list(spac_locs_names_wt)

In [None]:
#per-file ES purity score and ES count for the WT samples
wt_purity_count = []
wt_ES_count = []

for df_ind, bc_loc in enumerate(spac_locs_dfs_wt):
    #cell type assignment of every row in this file
    bc_cell_type = bc_loc['max_cell_type']

    #number of ES rows in this file
    ES_count = count_ES(bc_loc)
    wt_ES_count.append(ES_count)

    #k-nearest-neighbor window around every position
    coords_arr = coords_to_arr(bc_loc)
    nbrs_inds = nbrs_df(coords_arr, k)
    num_arr = nbr_wind_dfs(nbrs_inds, bc_cell_type)

    #cell type frequencies per window, restricted to windows containing ES
    wind_freq = ct_freq_wind(num_arr)
    df_wind_freq_filter = df_convert(wind_freq)

    #purity = mean ES frequency over the retained windows
    purity = count_purity(df_wind_freq_filter)
    wt_purity_count.append(purity)

wt_purity_count
wt_ES_count

In [None]:
#one row per WT file: its purity score next to its ES count
df_wt_purity = pd.DataFrame(
    {
        "purity_score": wt_purity_count,
        'ES_count': wt_ES_count,
    }
)
df_wt_purity.head(3)

In [None]:
#read in the diabetic file names as dfs: one DataFrame per CSV path collected
#in spac_locs_names_d, kept in the same order as the paths
spac_locs_dfs_d = read_df_list(spac_locs_names_d)

In [None]:
#per-file ES purity score and ES count for the diabetic samples
D_purity_count = []
D_ES_count = []

for df_ind, bc_loc in enumerate(spac_locs_dfs_d):
    #cell type assignment of every row in this file
    bc_cell_type = bc_loc['max_cell_type']

    #number of ES rows in this file
    ES_count = count_ES(bc_loc)
    D_ES_count.append(ES_count)

    #k-nearest-neighbor window around every position
    coords_arr = coords_to_arr(bc_loc)
    nbrs_inds = nbrs_df(coords_arr, k)
    num_arr = nbr_wind_dfs(nbrs_inds, bc_cell_type)

    #cell type frequencies per window, restricted to windows containing ES
    wind_freq = ct_freq_wind(num_arr)
    df_wind_freq_filter = df_convert(wind_freq)

    #purity = mean ES frequency over the retained windows
    purity = count_purity(df_wind_freq_filter)
    D_purity_count.append(purity)

D_purity_count
D_ES_count

In [None]:
#one row per diabetic file: its purity score next to its ES count
df_D_purity = pd.DataFrame(
    {
        "purity_score": D_purity_count,
        'ES_count': D_ES_count,
    }
)
df_D_purity.head(3)

In [None]:
#purity score against ES count, one point per file, both groups overlaid
sns.scatterplot(data=df_wt_purity, x="ES_count", y="purity_score", label = 'WT')
sns.scatterplot(data=df_D_purity, x="ES_count", y="purity_score", label = 'ob/ob')
#the title is built from k so it always states the window size actually used
plt.title(f'K={k}')
#only the range of 50 to 250 ES cells is shown
plt.xlim(50,250)
plt.legend()

In [None]:
#keep only the files with at least 50 ES rows, for both groups
df_wt_purity_select = df_wt_purity.loc[df_wt_purity["ES_count"] >= 50]
df_D_purity_select = df_D_purity.loc[df_D_purity["ES_count"] >= 50]
df_D_purity_select.head(3)

In [None]:
#Mann-Whitney U test comparing the purity scores of the selected WT files
#with those of the selected diabetic files
#NOTE(review): `alternative` is left at the scipy default, which presumably
#differs between scipy versions -- confirm which hypothesis is being tested
Arr_wt_purity_select = df_wt_purity_select['purity_score'].values
Arr_D_purity_select = df_D_purity_select['purity_score'].values
stat, p = mannwhitneyu(x=Arr_wt_purity_select, y=Arr_D_purity_select)
print('Statistics=%.3f, p=%.7f' % (stat, p))