In [1]:
import scanpy as sc
import scanpy.external as sce
#sc.logging.print_versions()
#sc.logging.print_memory_usage()
#sc.settings.verbosity = 2
import os,sys
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import matplotlib.ticker as mticker

In [2]:
# Make the project's helper modules importable by adding the utility folder to sys.path.
# The relative folder name is resolved by os.path.abspath, i.e. against the current
# working directory, so the notebook has to be started from the folder that contains it.
import sys
sys.path.append(os.path.abspath("utility_functions_190403_12h24/"))

# NOTE(review): the star import injects an unknown set of names into the notebook
# namespace; which of them later cells rely on cannot be told from this cell.
from rz_import_statements import *
import rz_functions as rz
import rz_fig_params # adjusts mpl.rcParams on import (almost nothing to import from it); import after scanpy so these rcParams win
import rz_utility_spring as srz

python version: 3.8.8


# Load data

In [3]:
# Read the AnnData backup that the rest of the notebook works on.
# (The file name suggests a doublet-filtered, batch-corrected 50693 x 2000 object.)
adata = sc.read_h5ad(
    'backups_JZ_2022/kidney_v1_wo_dblt1_batch_corrected_50693x2000_220127_09h59.h5ad'
)

In [4]:
# Show adata.obs to inspect which annotation columns are already present.
adata.obs

Unnamed: 0,library,total_counts,pct_counts_mito,library2,sample,patient,pT stage,seq_date,beads,operation,...,top10pct_dbtl_score,closest_JZ_kidney,closest_JZ_kidney_hvg,removed_as_RBC,louvain_resolution_30.0,louvain_resolution_50.0,louvain_resolution_60.0,louvain_resolution_80.0,removed_as_dblt1,n_counts
2,N14,449.0,0.668151,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,False,TAM 2,TAM 2,False,253,238,196,213,False,449.0
19,N14,449.0,4.231626,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,False,TAM 2,TAM 2,False,117,239,331,1009,False,449.0
363,N14,1229.0,10.659073,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,False,Tumor cells 2,Tumor cells 2,False,31,68,610,507,False,1229.0
433,N14,432.0,6.944445,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,False,Tumor cells 1,Tumor cells 2,False,8,120,115,112,False,432.0
444,N14,502.0,4.980080,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,False,Tumor vasculature 2,Tumor vasculature 2,False,12,153,265,257,False,502.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4865536,Tumor0228,683.0,9.370424,T4_old,T4,P4,pT3a,old,old,Open,...,False,TAM 1,TAM 4,False,66,284,333,162,False,683.0
4865584,Tumor0228,1473.0,6.517312,T4_old,T4,P4,pT3a,old,old,Open,...,True,TAM 1,TAM 4,False,172,266,228,388,False,1473.0
4865642,Tumor0228,498.0,9.437751,T4_old,T4,P4,pT3a,old,old,Open,...,False,TAM 4,TAM 4,False,20,221,299,203,False,498.0
4865726,Tumor0228,421.0,19.239906,T4_old,T4,P4,pT3a,old,old,Open,...,False,Mito high TAM/tumor cells,Mito high TAM/tumor cells,False,7,92,64,78,False,421.0


In [5]:
# Load the intermediates saved when preparing the SPRING plot.
#
# The SPRING root folder was hard-coded to one machine. It can now be overridden
# with the KIDNEY_SPRING_DIR environment variable; the original location stays
# the default, so existing runs behave exactly as before.
# os.path.join(..., '') guarantees the trailing separator that the string
# concatenations below (and in later cells) rely on.
path1 = os.path.join(
    os.environ.get(
        'KIDNEY_SPRING_DIR',
        '/Users/justina/Documents/mokslai/MAGISTRAS/MAGISTRINIS/data_and_spring/SPRING_dev-master/kidney_spring/',
    ),
    '',
)
project_dir = path1 + 'kidney_2022/'
plot_name = 'kidney_v1_wo_dblt1'

# NOTE(review): rz.load_stuff presumably unpickles this file - only point it at trusted data.
params = rz.load_stuff(project_dir + plot_name + '/params.pickle')
params.keys()

dict_keys(['k', 'cell_mask', 'min_counts', 'min_cells', 'base_ix', 'num_pc', 'plot_name', 'embedding', 'gene_names_excluded', 'abundant_gene_mask', 'v_score_dict', 'nr_var_genes', 'genes_used', 'eigenvectors', 'eigenvalues', 'neighbors', 'min_dist'])

In [6]:
# Pull the 'cell_mask' entry out of the loaded SPRING parameters.
cell_mask = params['cell_mask']

In [7]:
# Check the mask length.
# NOTE(review): the recorded result (51196) differs from the 50693 rows of adata -
# presumably the mask refers to the dataset before some cells were removed; confirm
# before using it to index adata.
len(cell_mask)

51196

In [8]:
# Inspect the 'connectivities' matrix stored in adata.obsp (shape, dtype, sparsity).
adata.obsp['connectivities']

<50693x50693 sparse matrix of type '<class 'numpy.float32'>'
	with 1657848 stored elements in Compressed Sparse Row format>

In [9]:
# Keep a handle on the connectivities matrix; it is passed as `adjacency` to the
# Louvain clustering call further down.
G  = adata.obsp['connectivities']

# Louvain clustering (choose "resolution")

In [10]:
# start a cell grouping dictionary: maps a colortrack name to one label per cell
cg = {}

# Use scanpy's Louvain clustering to get clusters at several resolutions.
# Ultra-high resolution values are used on purpose so that doublets end up in
# their own small clusters.
#
# `mock` is a scratch copy, so clustering can safely run in place on it. The
# earlier version additionally passed copy=True, which deep-copied the whole
# AnnData object again on every iteration without changing the result.
mock = adata.copy()
for resolution in [50, 60]:
    key = 'louvain_resolution_%.1f' % resolution
    # key_added stores the labels under a per-resolution column of mock.obs
    sc.tl.louvain(mock, resolution=resolution, adjacency=G, key_added=key)
    cg[key] = list(mock.obs[key])
    # report how many clusters this resolution produced
    print(len(set(cg[key])))

598
733


# Append result to the same SPRING plot

In [12]:
# Read the categorical colortracks that the SPRING plot currently has.
cg0 = srz.read_cell_groupings(project_dir + plot_name + '/categorical_coloring_data.json')

# Colour dictionary of dictionaries: colortrack name -> that track's 'label_colors'.
cdd = {}
for track_name, track in cg0.items():
    cdd[track_name] = track['label_colors']


In [13]:
# List the names of the cell groupings read from categorical_coloring_data.json.
cg0.keys()

dict_keys(['age', 'beads', 'closest_JZ_kidney', 'closest_JZ_kidney_hvg', 'library', 'library2', 'necrosis', 'operation', 'pT stage', 'patient', 'potential_doublet', 'sample', 'seq_date', 'sex', 'tissue', 'top10pct_dbtl_score', 'top3pct_dbtl_score', 'top5pct_dbtl_score', 'tumor size, mm'])

In [14]:
# Make sure every label in every new colortrack is a string.
cg = {
    track_name: list(np.array(labels).astype(str))
    for track_name, labels in cg.items()
}

In [15]:
# Add the new Louvain colortracks to the SPRING plot, passing along the colours
# of the tracks that were already there.
spring_plot_dir = project_dir + plot_name
srz.append_cell_groupings(spring_plot_dir, cg, colordd=cdd)

In [17]:
# Store each new colortrack as a column of adata.obs (an existing column with the
# same name is overwritten).
for track_name, labels in cg.items():
    adata.obs[track_name] = labels

In [18]:
# Short handle for the annotation table. No copy is made here.
# NOTE(review): if adata.obs hands back the underlying DataFrame, in-place edits of
# `obs` (e.g. the column removals in the next cell) also change adata - confirm that
# this is intended.
obs = adata.obs

In [23]:
# Remove the labels left over from the previous round of clustering.
# NOTE(review): this cell is not re-runnable - a second run raises KeyError because
# the columns are already gone.
del obs['louvain_resolution_30.0']
# pop() returns the removed column, which is what the cell displays
obs.pop('louvain_resolution_80.0')

2           213
19         1009
363         507
433         112
444         257
           ... 
4865536     162
4865584     388
4865642     203
4865726      78
4866011     114
Name: louvain_resolution_80.0, Length: 50693, dtype: category
Categories (1081, object): ['0', '1', '2', '3', ..., '1077', '1078', '1079', '1080']

In [25]:
# Save the annotation table together with the new cluster labels.
# The file name is built from the plot name, the table's shape and rz.now().
n_rows, n_cols = obs.shape
fname = 'backups_JZ_2022/%s_clust_obs_info_%dx%d_%s' % (plot_name, n_rows, n_cols, rz.now())
print(fname)
rz.save_df(obs, fname)

backups_JZ_2022/kidney_v1_wo_dblt1_clust_obs_info_50693x27_220127_10h54


  d['descr'] = dtype_to_descr(array.dtype)


In [26]:
# append categorical colortrack for the spring plot as well
# The suffix is guarded so that re-running this cell does not turn the name into
# '..._spring_spring' and point at a different directory.
if not plot_name.endswith('_spring'):
    plot_name = plot_name + '_spring'
srz.append_cell_groupings(project_dir + plot_name, cg, colordd=cdd)