### Import scanpy

In [1]:
import datetime
import math
import os,sys
import time

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as sce
#sc.logging.print_versions()
#sc.logging.print_memory_usage()
#sc.settings.verbosity = 2

In [2]:
# Make the project-local helper modules importable by adding their folder to sys.path.
sys.path.append(os.path.abspath("utility_functions_190403_12h24/"))

# NOTE(review): the star import hides which names later cells rely on — confirm what it provides.
from rz_import_statements import *
import rz_functions as rz
import rz_fig_params # adjusts mpl.rcParams, almost nothing to import; import after scanpy to overwrite rcParams
import rz_utility_spring as srz

python version: 3.8.12


# Load data

In [3]:
# Read the AnnData object from the h5ad backup file.
adata = sc.read_h5ad('backups_JZ_2022/clean_kidney_304_15tr_25c_batch_corrected_50236x2000_220315_12h14.h5ad') 

In [4]:
# Overwrite adata.obs with the most recent per-cell annotation table, stored as an .npz archive.
filename = 'backups_JZ_2022/clustering_304_15tr_25c_obs_info_50236x54_220315_18h11.npz'
encoding = 'latin1'

# allow_pickle=True lets np.load unpickle object arrays, which can execute arbitrary code:
# only load archives that come from a trusted source.
# The archive's entries are passed to pd.DataFrame as keyword arguments
# (presumably data/index/columns — confirm against the code that wrote the file).
with np.load(filename,encoding=encoding, allow_pickle = True) as f:
    obs = pd.DataFrame(**f)
# NOTE(review): rows are assumed to be in the same order as the cells in adata — confirm.
adata.obs = obs

In [5]:
# Display the replaced per-cell annotation table as a visual check.
adata.obs

Unnamed: 0,library,total_counts,pct_counts_mito,library2,sample,patient,pT stage,seq_date,beads,operation,...,pheno_leiden_res_3.5,pheno_leiden_res_4.0,sp_cl_38,sp_cl_39,sp_cl_40,sp_cl_42,sp_cl_43,sp_cl_45,sp_cl_47,sp_cl_50
2,N14,449,0.668151,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,10,9,33,20,18,18,40,25,7,19
19,N14,449,4.23163,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,10,9,35,33,32,33,37,34,40,46
363,N14,1229,10.6591,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,25,11,21,24,19,21,22,29,21,24
433,N14,432,6.94444,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,17,11,31,35,34,37,27,38,28,31
444,N14,502,4.98008,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,8,23,1,37,39,39,28,23,45,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4865536,Tumor0228,683,9.37042,T4_old,T4,P4,pT3a,old,old,Open,...,14,12,35,33,28,13,30,1,1,15
4865584,Tumor0228,1473,6.51731,T4_old,T4,P4,pT3a,old,old,Open,...,14,12,33,20,28,13,30,1,1,15
4865642,Tumor0228,498,9.43775,T4_old,T4,P4,pT3a,old,old,Open,...,14,12,35,33,32,33,37,34,40,15
4865726,Tumor0228,421,19.2399,T4_old,T4,P4,pT3a,old,old,Open,...,17,56,31,35,34,37,27,38,8,44


### Scale (normalize) data

In [6]:
# Replace adata with the AnnData built from adata.raw, so the following steps work on the raw data.
# NOTE(review): not idempotent — presumably the returned object carries no .raw,
# so running this cell a second time would fail; confirm, and reload adata before re-running.
adata = adata.raw.to_adata()

In [8]:
# Turn raw counts into counts per 10k: every cell is scaled in place to a total of 1e4.
# The row sums of the first five cells are printed before and after as a sanity check.
#
# sc.pp.normalize_total replaces the deprecated sc.pp.normalize_per_cell. Unlike the old
# call it never drops zero-count cells (so the number of rows stays aligned with the
# per-cell masks built below) and it does not write an `n_counts` column to adata.obs.
print(adata.X[:5,:].sum(axis=1))
print()
sc.pp.normalize_total(adata, target_sum=1e4)
print(adata.X[:5,:].sum(axis=1))

[[ 449.]
 [ 449.]
 [1229.]
 [ 432.]
 [ 502.]]

[[10000.001]
 [10000.   ]
 [10000.001]
 [10000.   ]
 [10000.   ]]


# Get enriched genes

After interactive exploration of the UMAP in the SPRING application (Weinreb et al., 2018), spectral clustering with k=43 was chosen as the partition that best separates the cellular phenotypes present in the dataset.

In [9]:
# Cell mask: an all-True boolean vector with one entry per cell, i.e. every cell is selected.
cmask = np.ones(adata.shape[0], dtype=bool)
print(cmask.sum())

50236


In [10]:
# Print the names of the obs columns that hold spectral clustering labels (prefix 'sp_cl').
for column in adata.obs.columns:
    if not column.startswith('sp_cl'):
        continue
    print(column)

sp_cl_38
sp_cl_39
sp_cl_40
sp_cl_42
sp_cl_43
sp_cl_45
sp_cl_47
sp_cl_50


In [11]:
# Get centroids for the cluster configuration with k=43 (obs column 'sp_cl_43'),
# computed on the cells selected by cmask.
# NOTE(review): rz.centroids is a project helper; presumably it returns one average
# expression profile per cluster — confirm against its definition.
thelabel = 'sp_cl_43'
centroids = rz.centroids(thelabel,adata[cmask])

In [12]:
# Goal of the next cells: for each cluster, find genes that are statistically significantly
# higher or lower in cluster x compared to all other cells collectively.

# Thresholds for the label-free filter that removes low-abundance genes before testing:
# a gene has to be expressed in at least min_cells cells by at least min_counts.
# NOTE(review): min_counts is presumably in counts per 10k, since the filter runs
# on the normalized matrix — confirm against srz.filter_abund_genes.
min_counts = 15
min_cells = 25

In [13]:
# Gene mask from the abundance filter, evaluated on the selected cells only.
# It is used below to subset genes; the helper itself prints how many genes pass.
gmask = srz.filter_abund_genes(adata.X[cmask], min_counts, min_cells)

12015 genes passing abundance filter


In [20]:
# Cluster-vs-rest differential expression: for every cluster in `thelabel`, run rz.mwu
# on the cells of that cluster against all remaining cells, restricted to the genes
# that passed the abundance filter (gmask). Results are collected per cluster in
# mwu_dict and saved to a timestamped backup at the end.
# This is a long-running cell; the elapsed time in seconds is printed when it finishes.
mwu_dict = {}
start = time.time()

# Take the cell subset once instead of building the view separately for obs and X.
selected_cells = adata[cmask]
meta = selected_cells.obs
E = selected_cells.X
gene_list = adata.var_names

# Loop-invariant work hoisted out of the loop: the gene (column) subset of the matrix,
# the matching gene names and the list of cluster labels are identical for every
# cluster, so they are computed once rather than on every iteration.
E_abundant = E[:, gmask]
genes_abundant = gene_list[gmask]
clusters = meta[thelabel].unique()
n_clusters = len(clusters)

for counter, cluster in enumerate(clusters, start=1):
    mask1 = (meta[thelabel] == cluster).values  # cells in this cluster
    mask2 = ~mask1                              # all other cells

    # rz.mwu is given dense arrays, so densify only the two row subsets.
    cg1 = np.array(E_abundant[mask1, :].todense())
    cg2 = np.array(E_abundant[mask2, :].todense())
    mwu_dict[cluster] = rz.mwu(cg1, cg2, genes=genes_abundant, print_progression=True)
    print("%d/%d"%(counter, n_clusters))
    print(cluster, 'done', cg1.shape[0]+cg2.shape[0])
print(time.time()-start)

# NOTE(review): timestamp and label are concatenated without a separator; kept as is
# because existing backups (see the "continuing from backup" cell) follow this naming.
fname = 'backups_JZ_2022/cluster_vs_rest_MWU_result_dict_%s%s'%(rz.now(), thelabel)
print(fname)
rz.save_stuff(mwu_dict,fname)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
1/43
40 done 50236
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
2/43
37 done 50236
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
3/43
22 done 50236
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
4/43
27 done 50236
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
5/43
28 done 50236
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
6/43
4 done 50236
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
7/43
31 done 50236
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
8/43
30 done 50236
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
9/43
42 done 50236
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
10/43
39 done 50236
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
11/43
1 done 50236
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
12/43
0 done 50236
1000
2000
3000
4

In [14]:
# if continuing from backup
#mwu_dict = rz.load_stuff('backups_JZ_2022/cluster_vs_rest_MWU_result_dict_220317_17h32sp_cl_43.pickle')

In [16]:
# Pseudovalue to add; it is passed as `pseudo` to rz.get_fc_to_all_other below.
pseudo = 1 # 1 count per 10k

In [17]:
# obs column with the cluster labels used for the fold-change analysis.
# Same value as assigned before the centroid computation; presumably repeated so the
# analysis can be resumed from the saved MWU results without re-running earlier cells.
thelabel = 'sp_cl_43'

In [18]:
print(thelabel)

# fcdict - fold-change dictionary, keyed by cluster label.
# Built by rz.get_fc_to_all_other from the selected cells, the cluster labels in
# `thelabel` and the pseudovalue chosen above.
_selected = adata[cmask]
fcdict = rz.get_fc_to_all_other(
    lab=thelabel,
    meta=_selected.obs,
    E=_selected.X,
    pseudo=pseudo,
    gene_list=adata.var_names,
)

sp_cl_43


In [19]:
# Keep, per cluster, only the fold-changes of genes with a significant difference (FDR < 0.05).
# The MWU test was run on abundance-prefiltered genes only, so the same gene mask is
# applied to the fold-changes first; the significance mask is then applied on top of it.
print(gmask.sum())

fcdictsig = {}
for key, value in fcdict.items():
    sigmask = (mwu_dict[key]['fdr'] < 0.05).values
    tested = value[gmask]
    fcdictsig[key] = tested[sigmask]
    print(key, sigmask.sum(), len(fcdictsig[key]))

12015
40 3735 3735
37 8670 8670
22 8999 8999
27 7080 7080
28 6355 6355
4 7476 7476
31 9785 9785
30 9219 9219
42 10434 10434
39 5776 5776
1 7015 7015
0 8978 8978
15 5661 5661
11 6526 6526
14 1070 1070
9 730 730
18 2016 2016
3 424 424
24 5482 5482
13 4815 4815
6 5232 5232
41 8279 8279
8 5101 5101
33 2375 2375
12 2180 2180
21 3857 3857
25 3875 3875
19 6728 6728
2 8455 8455
17 10569 10569
35 4205 4205
38 5883 5883
34 5234 5234
7 3247 3247
5 6518 6518
23 2867 2867
16 3511 3511
32 1039 1039
20 6152 6152
36 1548 1548
29 1122 1122
10 5344 5344
26 3836 3836


In [20]:
# Number of genes to consider per cluster:
upto = 100 # up to 100 genes used to generate Table S1

# Build one table with two columns per cluster: '<cluster>' holds the gene names and
# '<cluster>_FC' the matching fold-changes, sorted from highest to lowest fold-change.
# Each column is wrapped in a pd.Series so that a cluster with fewer than `upto`
# significant genes is padded with NaN; with raw index/array values pd.DataFrame
# raises a ValueError as soon as the column lengths differ.
top_columns = {}
for key, value in fcdictsig.items():
    top = value.sort_values(ascending=False).iloc[:upto]  # positional cut, independent of index type
    top_columns[str(key)] = pd.Series(top.index)
    top_columns[str(key) + '_FC'] = pd.Series(top.values)
frame = pd.DataFrame(top_columns)

# Smallest retained fold-change per cluster, displayed to check that all are above 1.
frame[[col for col in frame.columns if col.endswith('_FC')]].min() #ok, all above 1.

40_FC    2.047777
37_FC    1.612266
22_FC    2.181213
27_FC    1.822957
28_FC    2.247929
4_FC     2.241581
31_FC    1.892141
30_FC    2.109294
42_FC    1.546601
39_FC    1.577787
1_FC     2.847859
0_FC     2.887836
15_FC    2.108335
11_FC    2.203161
14_FC    1.655347
9_FC     1.897239
18_FC    2.617425
3_FC     1.515281
24_FC    2.309084
13_FC    2.008285
6_FC     2.503421
41_FC    1.753608
8_FC     2.352763
33_FC    1.613115
12_FC    2.226049
21_FC    2.618681
25_FC    1.599198
19_FC    2.762614
2_FC     3.505849
17_FC    3.045998
35_FC    2.551918
38_FC    2.065930
34_FC    2.282534
7_FC     2.650777
5_FC     2.422653
23_FC    2.727043
16_FC    2.520060
32_FC    2.152142
20_FC    2.413316
36_FC    2.291444
29_FC    2.564373
10_FC    2.407208
26_FC    2.652274
dtype: float32

In [21]:
# Folder the result tables are written to (used as a path prefix, hence the trailing slash).
outdir = 'outputs_JZ_2022/'

In [32]:
# Write the table of top enriched genes per cluster to a timestamped Excel file.
# The output folder is created first so the export does not fail when it is missing.
os.makedirs(outdir, exist_ok=True)
fname = outdir+'lists_enriched_genes_top_%d_%s_%s.xlsx'%(upto,thelabel,rz.now())
print(fname)
frame.to_excel(fname)

outputs_JZ_2022/lists_enriched_genes_top_100_sp_cl_43_220321_16h11.xlsx
