In [1]:
import scanpy as sc
import scanpy.external as sce
#sc.logging.print_versions()
#sc.logging.print_memory_usage()
#sc.settings.verbosity = 2
import os,sys
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import matplotlib.ticker as mticker

In [2]:
# Make the project's utility-function folder importable by appending its
# absolute path to sys.path (the module search path).
import sys
sys.path.append(os.path.abspath("utility_functions_190403_12h24/"))

# Wildcard import: pulls every public name of rz_import_statements into the
# notebook namespace.
from rz_import_statements import *
import rz_functions as rz
# Imported for its side effect: it adjusts mpl.rcParams and exposes almost
# nothing to import. Keep it after the scanpy import so that its settings
# overwrite the rcParams.
import rz_fig_params
import rz_utility_spring as srz

python version: 3.8.8


# Load data

In [4]:
# Read the saved AnnData object from the backup folder.
adata = sc.read_h5ad(
    'backups_JZ_2022/kidney_v0_156_v2000_batch_corrected_51196x2000_220121_15h23.h5ad'
)

# Replace adata.obs with the most recently saved version of the obs table.
filename = 'backups_JZ_2022/class_obs_info_51196x22_220125_16h56.npz'
encoding = 'latin1'

# CAUTION: allow_pickle=True unpickles arbitrary Python objects, so only load
# .npz files that come from a trusted source.
with np.load(filename, allow_pickle=True, encoding=encoding) as npz_file:
    # The archive's entries are passed as keyword arguments to pd.DataFrame.
    obs = pd.DataFrame(**npz_file)
adata.obs = obs

In [5]:
# Load the intermediates that were saved while preparing the SPRING plot.
# NOTE: path1 is an absolute, machine-specific location; adjust it when
# running this notebook elsewhere.
path1 = '/Users/justina/Documents/mokslai/MAGISTRAS/MAGISTRINIS/data_and_spring/SPRING_dev-master/kidney_spring/'
project_dir = f'{path1}kidney_2022/'
plot_name = 'kidney_v0_156_v2000_spring'

# params.pickle sits inside the plot's own sub-folder of the project directory.
params = rz.load_stuff(f'{project_dir}{plot_name}/params.pickle')
params.keys()

dict_keys(['k', 'cell_mask', 'min_counts', 'min_cells', 'base_ix', 'num_pc', 'plot_name', 'embedding', 'gene_names_excluded', 'abundant_gene_mask', 'v_score_dict', 'nr_var_genes', 'genes_used', 'eigenvectors', 'eigenvalues', 'neighbors', 'min_dist'])

In [6]:
# Cell mask stored alongside the other SPRING plot parameters.
cell_mask = params['cell_mask']

In [7]:
# Apply the same cell mask that was used for the graph. Not strictly necessary
# here, because all cells were used.
cdata = adata[cell_mask]

In [9]:
# The scanpy adjacency matrix is stored under obsp['connectivities']; it is
# passed to the Louvain clustering further down.
G  = cdata.obsp['connectivities']

# Removing RBCs

We assume that any cell with more than 1% of its total raw counts coming from the hemoglobin genes *HBB, HBA1, HBA2, HBD* is a red blood cell (RBC).

In [11]:
# Turn the .raw slot of cdata into its own AnnData object; the hemoglobin
# count fractions below are computed on it.
bdata = cdata.raw.to_adata()

In [12]:
# Sanity check: per-cell count totals of the first five cells. The values are
# not normalized.
print(bdata.X[:5,:].sum(axis=1))

[[ 449.]
 [ 449.]
 [1229.]
 [ 732.]
 [ 432.]]


In [13]:
# Compute, for each cell, the percentage of its total counts that comes from
# the hemoglobin genes.
genes = ['HBB', 'HBA1', 'HBA2', 'HBD']

# Select the hemoglobin gene columns with a boolean mask over var_names.
# np.isin replaces np.in1d, which is deprecated in NumPy 2.0; the redundant
# bdata[:,:] intermediate view has been dropped.
hem_counts = bdata[:, np.isin(bdata.var_names, genes)].X.sum(axis=1)
total_counts = bdata.X.sum(axis=1)

# Both sums are per-cell column vectors, so the division is element-wise.
fraction = (hem_counts/total_counts)*100 #percent

fraction

matrix([[0.        ],
        [0.        ],
        [0.        ],
        ...,
        [0.20080321],
        [0.        ],
        [0.1310616 ]])

In [14]:
# Convert the np.matrix result into a plain ndarray before comparing.
fraction = np.array(fraction)

# Cells whose hemoglobin count fraction exceeds this percentage are flagged.
tresh = 1

over_tresh = tresh < fraction
# Number of cells that will be removed.
over_tresh.sum()

47

In [15]:
# Row positions of the cells flagged as RBCs. `fraction` is a 2-D column, so
# the first element of the returned tuple holds the row indices.
rbc_index = np.nonzero(fraction > tresh)[0]

In [16]:
# Display the positions of the cells flagged as RBCs.
rbc_index

array([  371,   863,  3185,  4287,  7092,  7094,  9183,  9682,  9764,
       12514, 12789, 12946, 13207, 13414, 18275, 18418, 18870, 19181,
       19214, 19517, 19975, 20128, 20951, 21379, 21424, 22309, 22634,
       23054, 23416, 24250, 24693, 24975, 25107, 25504, 25524, 27525,
       29382, 31323, 31822, 32406, 33967, 43906, 46639, 47054, 48281,
       48626, 48631])

In [17]:
# Boolean mask used for RBC removal: True for cells to keep, False for the
# cells listed in rbc_index.
rbc_mask = np.ones(cdata.n_obs, dtype=bool)
rbc_mask[rbc_index] = False
# Number of cells that remain after removal.
rbc_mask.sum()

51149

In [18]:
# Record the inverted keep-mask in obs (True = flagged as RBC). The mask is
# only stored here; no cells are dropped at this point.
bdata.obs['removed_as_RBC'] = ~rbc_mask

In [19]:
# Count the cells flagged as RBCs in the new obs column.
bdata.obs['removed_as_RBC'].sum()

47

In [20]:
# Start the cell-grouping dictionary that will later be appended to the plot
# in the SPRING application. It is seeded with every obs column whose name
# starts with 'removed_as_R', with values converted to strings.
cg = (
    bdata.obs
    .loc[:, [col for col in bdata.obs.columns if col.startswith('removed_as_R')]]
    .astype(str)
    .to_dict(orient='list')
)

In [21]:
# Save the obs table; the file name encodes its row count, column count and
# the value returned by rz.now().
fname = 'backups_JZ_2022/no_RBC_%dx%d_%s' % (*bdata.obs.shape, rz.now())
rz.save_df(bdata.obs, fname)

In [22]:
# Carry the obs table from bdata (now including 'removed_as_RBC') over to cdata.
cdata.obs = bdata.obs

# Louvain clustering (choose "resolution")

In [24]:
# Run scanpy's Louvain clustering at several resolutions.
# The resolutions are deliberately ultra-high: the expectation is that there
# are few doublets and that they will cluster together in the resulting small
# clusters.

mock = cdata.copy()
for resolution in (30, 50, 60, 80):
    # copy=True returns a clustered copy, so `mock` itself is never modified.
    clustered = sc.tl.louvain(mock, resolution=resolution, adjacency=G, copy=True)
    labels = list(clustered.obs['louvain'])
    cg['louvain_resolution_%.1f' % resolution] = labels
    # Report how many clusters this resolution produced.
    print(len(set(labels)))

390
669
804
1081


# Append result to the same SPRING plot

In [25]:
# Read the categorical colorings currently stored with the SPRING plot.
cg0 = srz.read_cell_groupings(f'{project_dir}{plot_name}/categorical_coloring_data.json')


# Color dictionary of dictionaries: one 'label_colors' entry per colortrack.
cdd = {track: info['label_colors'] for track, info in cg0.items()}


In [26]:
# Convert every grouping to a list of string labels.
cg = {
    track: list(np.array(labels).astype(str))
    for track, labels in cg.items()
}

In [27]:
# Append the new categorical colortracks to the SPRING plot, passing the
# color dictionary loaded from that plot.
srz.append_cell_groupings(f'{project_dir}{plot_name}', cg, colordd=cdd)

In [28]:
cg.keys()

dict_keys(['removed_as_RBC', 'louvain_resolution_30.0', 'louvain_resolution_50.0', 'louvain_resolution_60.0', 'louvain_resolution_80.0'])

In [29]:
# Add the same groupings to the other (UMAP) plot too.
# NOTE: this rebinds plot_name, so cells above that use plot_name will point
# at the UMAP plot if they are re-run after this cell.
plot_name =  'kidney_v0_156_v2000'

In [30]:
# Append the categorical colortracks to the plot currently named by plot_name.
# NOTE(review): cdd was built from the SPRING plot's colorings. Presumably the
# colors are shared between the two plots on purpose; confirm.
srz.append_cell_groupings(f'{project_dir}{plot_name}', cg, colordd=cdd)

In [32]:
# Store every grouping as an obs column on cdata.
# The values in cg were converted to strings earlier, so 'removed_as_RBC' is
# written back as 'True'/'False' strings rather than booleans.
for track, labels in cg.items():
    cdata.obs[track] = labels

In [33]:
# Rebind `obs` to the obs table of cdata, which now holds the cluster columns.
obs = cdata.obs

In [35]:
# Save the obs table together with the cluster labels; the file name encodes
# its row count, column count and the value returned by rz.now().

fname = 'backups_JZ_2022/scrub_cl_obs_info_%dx%d_%s' % (*obs.shape, rz.now())
print(fname)
rz.save_df(obs, fname)

backups_JZ_2022/scrub_cl_obs_info_51196x27_220125_18h23


  d['descr'] = dtype_to_descr(array.dtype)
