In [1]:
import scanpy as sc
import scanpy.external as sce
import os,sys
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import matplotlib.ticker as mticker

In [2]:
# add the utility function folder to sys.path (the module search path)
# presumably this folder is where the rz_* modules imported below live — TODO confirm
import sys
sys.path.append(os.path.abspath("utility_functions_190403_12h24/"))

# NOTE(review): wildcard import — the names it brings into the notebook namespace are not visible here
from rz_import_statements import *
import rz_functions as rz
import rz_fig_params # this adjusts mpl.rcParams, almost nothing to import, import after scanpy to overwrite rc.Params
import rz_utility_spring as srz

python version: 3.8.12


# Load data

The annotation is manual and based on an extensive literature review. For each cluster, the top 25 marker genes, with references proving their expression in the annotated cell type, are provided in Supplementary file Table 1.

In [17]:
# SPRING plot path
# Root folder of the SPRING kidney data. It defaults to the original author's local
# directory; set the KIDNEY_SPRING_DIR environment variable to run the notebook on
# another machine without editing this cell (an unset or empty variable falls back
# to the default).
path1 = os.environ.get('KIDNEY_SPRING_DIR') or '/Users/justina/Documents/mokslai/MAGISTRAS/MAGISTRINIS/data_and_spring/SPRING_dev-master/kidney_spring/'
# paths are built by plain string concatenation, so the root must end with a slash
if not path1.endswith('/'):
    path1 += '/'
# folder of this SPRING project, and the name of the plot inside it
project_dir = path1+'kidney_2022/'
plot_name =  'clean_kidney_304_15tr_25c'



In [18]:
#load latest obs
# The table is restored from an .npz backup file.
filename = 'backups_JZ_2022/clustering_304_15tr_25c_obs_info_50236x54_220315_18h11.npz'
encoding = 'latin1'

# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can execute
# arbitrary code — only load backup files that come from a trusted source.
with np.load(filename,encoding=encoding, allow_pickle = True) as f:
    # every array stored in the archive is passed to pd.DataFrame as a keyword argument
    # (presumably data / index / columns as written by rz.save_df — TODO confirm)
    obs = pd.DataFrame(**f)

In [19]:
# Display obs to inspect the loaded table (the notebook renders a truncated preview).
obs

Unnamed: 0,library,total_counts,pct_counts_mito,library2,sample,patient,pT stage,seq_date,beads,operation,...,pheno_leiden_res_3.5,pheno_leiden_res_4.0,sp_cl_38,sp_cl_39,sp_cl_40,sp_cl_42,sp_cl_43,sp_cl_45,sp_cl_47,sp_cl_50
2,N14,449,0.668151,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,10,9,33,20,18,18,40,25,7,19
19,N14,449,4.23163,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,10,9,35,33,32,33,37,34,40,46
363,N14,1229,10.6591,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,25,11,21,24,19,21,22,29,21,24
433,N14,432,6.94444,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,17,11,31,35,34,37,27,38,28,31
444,N14,502,4.98008,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,8,23,1,37,39,39,28,23,45,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4865536,Tumor0228,683,9.37042,T4_old,T4,P4,pT3a,old,old,Open,...,14,12,35,33,28,13,30,1,1,15
4865584,Tumor0228,1473,6.51731,T4_old,T4,P4,pT3a,old,old,Open,...,14,12,33,20,28,13,30,1,1,15
4865642,Tumor0228,498,9.43775,T4_old,T4,P4,pT3a,old,old,Open,...,14,12,35,33,32,33,37,34,40,15
4865726,Tumor0228,421,19.2399,T4_old,T4,P4,pT3a,old,old,Open,...,17,56,31,35,34,37,27,38,8,44


In [20]:
#clusters to annotate are in obs['sp_cl_43']
# Print one empty `"<cluster id>":"",` line for each id 0..42, i.e. a template
# for a dictionary literal that can then be filled in by hand.
for cluster_id in range(43):
    print('"%s":"",' % cluster_id)

"0":"",
"1":"",
"2":"",
"3":"",
"4":"",
"5":"",
"6":"",
"7":"",
"8":"",
"9":"",
"10":"",
"11":"",
"12":"",
"13":"",
"14":"",
"15":"",
"16":"",
"17":"",
"18":"",
"19":"",
"20":"",
"21":"",
"22":"",
"23":"",
"24":"",
"25":"",
"26":"",
"27":"",
"28":"",
"29":"",
"30":"",
"31":"",
"32":"",
"33":"",
"34":"",
"35":"",
"36":"",
"37":"",
"38":"",
"39":"",
"40":"",
"41":"",
"42":"",


In [21]:
# Manual annotation: cluster id (string key, looked up with the values of
# obs['sp_cl_43']) -> cell type label.
# Some labels are used for two clusters (2 and 17, 9 and 12, 23 and 26), so those
# clusters end up under a single cell type.
renamer = {
"0":"Tumor vasculature 1",
"1":"Tumor vasculature 2",
"2":"Proximal tubule",
"3":"Plasma cells",
"4":"TAM 1",
"5":"Principal cells",
"6":"vSMCs",
"7":"Glomerular endothelium",
"8":"Classical monocytes",
"9":"Mast cells",
"10":"DCT/CNT",
"11":"Mesangial/vSMCs",
"12":"Mast cells",
"13":"Cycling",
"14":"IGHG-high plasma cells",
"15":"Non-classical monocytes",
"16":"DVR",
"17":"Proximal tubule",
"18":"Tumor vasculature 3",
"19":"Tumor cells 1",
"20":"OM Type A-ICs",
"21":"Myofibroblasts",
"22":"Tumor cells 2",
"23":"TAL of LOH",
"24":"Tumor AVR-like vasculature",
"25":"Regulatory T cells",
"26":"TAL of LOH",
"27":"Tumor cells 3",
"28":"Tumor vasculature 4",
"29":"Podocytes",
"30":"TAM 2",
"31":"CD8 T cells",
"32":"Type A-ICs",
"33":"B cells",
"34":"tAL of LOH",
"35":"AVR",
"36":"Type B-IC",
"37":"TAM 3",
"38":"Epithelial progenitor-like cells",
"39":"Cytotoxic T cells",
"40":"TAM 4",
"41":"NK cells",
"42":"Resting/memory T cells"
}

# Translate each row's cluster id into its label; an id that is missing from
# renamer raises KeyError rather than being silently skipped.
obs['cell_type'] = [renamer[cluster_id] for cluster_id in obs['sp_cl_43']]
obs['cell_type']

2                           TAM 4
19                          TAM 3
363                 Tumor cells 2
433                 Tumor cells 3
444           Tumor vasculature 4
                    ...          
4865536                     TAM 2
4865584                     TAM 2
4865642                     TAM 3
4865726             Tumor cells 3
4866011    Resting/memory T cells
Name: cell_type, Length: 50236, dtype: object

# Note: 
For proximal tubule, thick ascending limb of the loop of Henle (TAL of LOH), and mast cells, two clusters each (rather than one) were annotated with the same label; e.g., proximal tubule is assigned to clusters 2 and 17. Since our further analyses do not focus on these populations, the subtle differences in expression that were detected by the clustering algorithm were disregarded and the clusters were merged.

In [22]:
#annotate cell-types to broader classification for re-clustering
# Print one empty `"<cell type>":"",` line per distinct value of obs['cell_type'],
# i.e. a template for a dictionary literal that can then be filled in by hand.
for cell_type_label in obs['cell_type'].unique():
    print('"%s":"",' % cell_type_label)

"TAM 4":"",
"TAM 3":"",
"Tumor cells 2":"",
"Tumor cells 3":"",
"Tumor vasculature 4":"",
"TAM 1":"",
"CD8 T cells":"",
"TAM 2":"",
"Resting/memory T cells":"",
"Cytotoxic T cells":"",
"Tumor vasculature 2":"",
"Tumor vasculature 1":"",
"Non-classical monocytes":"",
"Mesangial/vSMCs":"",
"IGHG-high plasma cells":"",
"Mast cells":"",
"Tumor vasculature 3":"",
"Plasma cells":"",
"Tumor AVR-like vasculature":"",
"Cycling":"",
"vSMCs":"",
"NK cells":"",
"Classical monocytes":"",
"B cells":"",
"Myofibroblasts":"",
"Regulatory T cells":"",
"Tumor cells 1":"",
"Proximal tubule":"",
"AVR":"",
"Epithelial progenitor-like cells":"",
"tAL of LOH":"",
"Glomerular endothelium":"",
"Principal cells":"",
"TAL of LOH":"",
"DVR":"",
"Type A-ICs":"",
"OM Type A-ICs":"",
"Type B-IC":"",
"Podocytes":"",
"DCT/CNT":"",


In [23]:
# Cell type label (a value of obs['cell_type']) -> broad class.
# NOTE: this rebinds the name `renamer`, which previously held the
# cluster id -> cell type dictionary.
renamer = {
"TAM 4":"Immune",
"TAM 3":"Immune",
"Tumor cells 2":"Tumor",
"Tumor cells 3":"Tumor",
"Tumor vasculature 4":"Endothelial",
"TAM 1":"Immune",
"CD8 T cells":"Immune",
"TAM 2":"Immune",
"Resting/memory T cells":"Immune",
"Cytotoxic T cells":"Immune",
"Tumor vasculature 2":"Endothelial",
"Tumor vasculature 1":"Endothelial",
"Non-classical monocytes":"Immune",
"Mesangial/vSMCs":"Stromal",
"IGHG-high plasma cells":"Immune",
"Mast cells":"Immune",
"Tumor vasculature 3":"Endothelial",
"Plasma cells":"Immune",
"Tumor AVR-like vasculature":"Endothelial",
"Cycling":"Cycling",
"vSMCs":"Stromal",
"NK cells":"Immune",
"Classical monocytes":"Immune",
"B cells":"Immune",
"Myofibroblasts":"Stromal",
"Regulatory T cells":"Immune",
"Tumor cells 1":"Tumor",
"Proximal tubule":"Epithelial",
"AVR":"Endothelial",
"Epithelial progenitor-like cells":"Epithelial",
"tAL of LOH":"Epithelial",
"Glomerular endothelium":"Endothelial",
"Principal cells":"Epithelial",
"TAL of LOH":"Epithelial",
"DVR":"Endothelial",
"Type A-ICs":"Epithelial",
"OM Type A-ICs":"Epithelial",
"Type B-IC":"Epithelial",
"Podocytes":"Epithelial",
"DCT/CNT":"Epithelial"
}

# Translate each row's cell type into its broad class; a label that is missing
# from renamer raises KeyError. The cell then shows the distinct classes assigned.
obs['broad_cell_type'] = [renamer[cell_type_label] for cell_type_label in obs['cell_type']]
obs['broad_cell_type'].unique()

array(['Immune', 'Tumor', 'Endothelial', 'Stromal', 'Cycling',
       'Epithelial'], dtype=object)

In [24]:
# Display obs again to check the two added columns, cell_type and broad_cell_type.
obs

Unnamed: 0,library,total_counts,pct_counts_mito,library2,sample,patient,pT stage,seq_date,beads,operation,...,sp_cl_38,sp_cl_39,sp_cl_40,sp_cl_42,sp_cl_43,sp_cl_45,sp_cl_47,sp_cl_50,cell_type,broad_cell_type
2,N14,449,0.668151,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,33,20,18,18,40,25,7,19,TAM 4,Immune
19,N14,449,4.23163,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,35,33,32,33,37,34,40,46,TAM 3,Immune
363,N14,1229,10.6591,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,21,24,19,21,22,29,21,24,Tumor cells 2,Tumor
433,N14,432,6.94444,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,31,35,34,37,27,38,28,31,Tumor cells 3,Tumor
444,N14,502,4.98008,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,1,37,39,39,28,23,45,36,Tumor vasculature 4,Endothelial
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4865536,Tumor0228,683,9.37042,T4_old,T4,P4,pT3a,old,old,Open,...,35,33,28,13,30,1,1,15,TAM 2,Immune
4865584,Tumor0228,1473,6.51731,T4_old,T4,P4,pT3a,old,old,Open,...,33,20,28,13,30,1,1,15,TAM 2,Immune
4865642,Tumor0228,498,9.43775,T4_old,T4,P4,pT3a,old,old,Open,...,35,33,32,33,37,34,40,15,TAM 3,Immune
4865726,Tumor0228,421,19.2399,T4_old,T4,P4,pT3a,old,old,Open,...,31,35,34,37,27,38,8,44,Tumor cells 3,Tumor


In [27]:
#remove unused categories

# Collect the matching column names first, then remove each one from obs in place.
for column_name in [name for name in obs.columns if name.startswith('louvain')]:
    obs.pop(column_name)

In [29]:

# Remove every column of obs whose name starts with 'pheno' (in place).
for column_name in [name for name in obs.columns if name.startswith('pheno')]:
    obs.pop(column_name)

In [32]:

# Remove every 'sp_cl*' column from obs (in place) except 'sp_cl_43',
# the clustering the cell_type labels were assigned from.
for column_name in [name for name in obs.columns
                    if name.startswith('sp_cl') and name != 'sp_cl_43']:
    obs.pop(column_name)
        

In [34]:

# Remove every column of obs whose name starts with 'removed' (in place).
for column_name in [name for name in obs.columns if name.startswith('removed')]:
    obs.pop(column_name)

In [36]:
# List the column names currently present in obs.
obs.keys()

Index(['library', 'total_counts', 'pct_counts_mito', 'library2', 'sample',
       'patient', 'pT stage', 'seq_date', 'beads', 'operation', 'sex',
       'tumor size, mm', 'age', 'tissue', 'necrosis', 'doublet_score',
       'potential_doublet', 'top3pct_dbtl_score', 'top5pct_dbtl_score',
       'top10pct_dbtl_score', 'closest_JZ_kidney', 'closest_JZ_kidney_hvg',
       'n_counts', 'no_dblt_no_rbc', 'sp_cl_43', 'cell_type',
       'broad_cell_type'],
      dtype='object')

In [37]:
# load current color dictionary
# (read from categorical_coloring_data.json inside this plot's folder)
cg0 = srz.read_cell_groupings(project_dir+plot_name+'/categorical_coloring_data.json')


# color dictionary of dictionaries:
# for every entry of cg0 keep only what is stored under its 'label_colors' key
cdd = {name: entry['label_colors'] for name, entry in cg0.items()}


In [38]:
# One entry per obs column: the column's values converted to strings, as a list.
cg = {
    column_name: list(np.array(column_values).astype(str))
    for column_name, column_values in obs.items()
}

In [39]:
# append categorical colortrack
# cg and cdd are passed together with the plot folder path; presumably this adds the
# groupings to the plot's categorical colouring data, using cdd for the label
# colours — TODO confirm in rz_utility_spring.
srz.append_cell_groupings(project_dir+plot_name,cg,colordd=cdd)

In [40]:
# Save obs under a file name that records its shape (rows x columns) and the
# value returned by rz.now(); the name is printed so the backup can be found later.
n_rows, n_columns = obs.shape
fname = 'backups_JZ_2022/annotated_obs_info_%dx%d_%s.npz' % (n_rows, n_columns, rz.now())
print(fname)
rz.save_df(obs, fname)

backups_JZ_2022/annotated_obs_info_50236x27_220503_10h45.npz


  d['descr'] = dtype_to_descr(array.dtype)
