In [1]:
import scanpy as sc
import scanpy.external as sce
#sc.logging.print_versions()
#sc.logging.print_memory_usage()
#sc.settings.verbosity = 2
import os,sys
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import matplotlib.ticker as mticker

In [2]:
# add the utility function folder to PATH
import sys
sys.path.append(os.path.abspath("utility_functions_190403_12h24/"))

from rz_import_statements import *
import rz_functions as rz
import rz_fig_params # this adjust mpl.rcParams, almost nothing to import, import after scanpy to overwrite rc.Params
import rz_utility_spring as srz

python version: 3.8.12


## Load data

In [3]:
# overwrite obs with the most recent version
filename = 'backups_JZ_2022/umap_cell_mask_obs_info_51196x31_220128_15h38.npz'
encoding = 'latin1'

with np.load(filename,encoding=encoding, allow_pickle = True) as f:
    obs = pd.DataFrame(**f)

In [4]:
obs['no_dblt_no_rbc'].sum()

50236

In [5]:
# load intermediates I saved when preparing the SPRING plot
path1 = '/Users/justina/Documents/mokslai/MAGISTRAS/MAGISTRINIS/data_and_spring/SPRING_dev-master/kidney_spring/' 
project_dir = path1+'kidney_2022/'
plot_name =  'clean_kidney_304_15tr_25c'


params = rz.load_stuff(project_dir+plot_name+'/params.pickle')
params.keys()

dict_keys(['k', 'cell_mask', 'min_counts', 'min_cells', 'base_ix', 'num_pc', 'plot_name', 'embedding', 'gene_names_excluded', 'abundant_gene_mask', 'v_score_dict', 'nr_var_genes', 'genes_used', 'eigenvectors', 'eigenvalues', 'neighbors', 'min_dist'])

In [6]:
cell_mask = params['cell_mask']

In [7]:
sum(cell_mask)

50236

In [15]:
#the umap and graph data is saved in adata file
adatag = sc.read_h5ad('backups_JZ_2022/clean_kidney_304_15tr_25c_batch_corrected_50236x2000_220315_12h14.h5ad')


In [16]:
adatag

AnnData object with n_obs × n_vars = 50236 × 2000
    obs: 'library', 'total_counts', 'pct_counts_mito', 'library2', 'sample', 'patient', 'pT stage', 'seq_date', 'beads', 'operation', 'sex', 'tumor size, mm', 'age', 'tissue', 'necrosis', 'doublet_score', 'potential_doublet', 'top3pct_dbtl_score', 'top5pct_dbtl_score', 'top10pct_dbtl_score', 'closest_JZ_kidney', 'closest_JZ_kidney_hvg', 'removed_as_RBC', 'removed_as_dblt1', 'removed_as_dblt2', 'n_counts', 'no_dblt_no_rbc'
    var: 'mean', 'std'
    uns: 'X_lin_cptt', 'X_log_z', 'beads_colors', 'draw_graph', 'neighbors', 'pca', 'sample_colors', 'seq_date_colors', 'tissue_colors', 'umap'
    obsm: 'X_draw_graph_fa', 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'X_lin_cptt', 'X_log_z'
    obsp: 'connectivities', 'distances'

In [22]:
adatag.X = adatag.layers['X_lin_cptt']

In [23]:
adatag.obs = obs[cell_mask]

In [24]:
adatag.obsp['connectivities']

<50236x50236 sparse matrix of type '<class 'numpy.float32'>'
	with 2433328 stored elements in Compressed Sparse Row format>

In [25]:
G  = adatag.obsp['connectivities'] #selecting the graph

# Louvain clustering

In [26]:
# start a cell grouping dictionary:
cg = {}
# use scanpy's Louvain clustering to get Louvain clusters at various resolution
mock = adatag.copy()
for i in [ 1, 2, 3, 4, 5
]:
    sc.tl.louvain(mock, resolution = i, adjacency = G, copy = False)
    l = list(mock.obs['louvain'])
    cg['louvain_res_%.1f'%i] = l
    print(len(set(l)))

16
25
35
47
55


# Phenograph clustering

In [27]:
# Clustering using PhenoGraph

for i in [0.5, 0.8, 1, 1.5, 2, 2.5, 3, 3.5, 4]:
    sc.external.tl.phenograph(adatag, clustering_algo='leiden', k=30, jaccard=True, primary_metric='euclidean', 
                          resolution_parameter = i)
    l = list(adatag.obs['pheno_leiden'])
    cg['pheno_leiden_res_%.1f'%i] = l

Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 71.14062809944153 seconds
Jaccard graph constructed in 5.018240928649902 seconds
Running Leiden optimization
Leiden completed in 5.001922845840454 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 83.00705409049988 seconds
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs[comm_key] = pd.Categorical(communities)


Neighbors computed in 73.17118883132935 seconds
Jaccard graph constructed in 4.72929573059082 seconds
Running Leiden optimization
Leiden completed in 5.1340508460998535 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 84.9206268787384 seconds
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs[comm_key] = pd.Categorical(communities)


Neighbors computed in 73.06194829940796 seconds
Jaccard graph constructed in 4.882573127746582 seconds
Running Leiden optimization
Leiden completed in 7.755945205688477 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 87.58202123641968 seconds
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs[comm_key] = pd.Categorical(communities)


Neighbors computed in 72.45927810668945 seconds
Jaccard graph constructed in 4.882083892822266 seconds
Running Leiden optimization
Leiden completed in 4.966241121292114 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 84.21661424636841 seconds
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs[comm_key] = pd.Categorical(communities)


Neighbors computed in 71.82721400260925 seconds
Jaccard graph constructed in 4.72184681892395 seconds
Running Leiden optimization
Leiden completed in 5.769881010055542 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 84.1916151046753 seconds
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs[comm_key] = pd.Categorical(communities)


Neighbors computed in 73.88314914703369 seconds
Jaccard graph constructed in 4.7558770179748535 seconds
Running Leiden optimization
Leiden completed in 6.899049997329712 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 87.37063407897949 seconds
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs[comm_key] = pd.Categorical(communities)


Neighbors computed in 72.07780718803406 seconds
Jaccard graph constructed in 4.726381778717041 seconds
Running Leiden optimization
Leiden completed in 7.963429927825928 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 86.55843925476074 seconds
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs[comm_key] = pd.Categorical(communities)


Neighbors computed in 70.97311067581177 seconds
Jaccard graph constructed in 4.783808946609497 seconds
Running Leiden optimization
Leiden completed in 5.747808933258057 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 83.4060606956482 seconds
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs[comm_key] = pd.Categorical(communities)


Neighbors computed in 74.08822393417358 seconds
Jaccard graph constructed in 4.615707159042358 seconds
Running Leiden optimization
Leiden completed in 8.42387580871582 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 89.01926493644714 seconds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs[comm_key] = pd.Categorical(communities)


# Spectral clustering (choose number of clusters)

In [29]:
# this one is much slower
for i in [
    38,39, 40, 42,43, 45,47, 50]:
    key = 'sp_cl_%d'%(i)
    print(key)
    cg[key] = list(srz.spec_clust(G,i).astype(str))
    print(key)

sp_cl_38
sp_cl_38
sp_cl_39
sp_cl_39
sp_cl_40
sp_cl_40
sp_cl_42
sp_cl_42
sp_cl_43
sp_cl_43
sp_cl_45
sp_cl_45
sp_cl_47
sp_cl_47
sp_cl_50
sp_cl_50


# Append result to the same SPRING plot

In [30]:
# load current color dictionary
cg0 = srz.read_cell_groupings(project_dir+plot_name+'/categorical_coloring_data.json')


# color dictionary of dictionaries
cdd = {key:value['label_colors'] for key,value in cg0.items()}


In [31]:
cg = {key:list(np.array(value).astype(str)) for key,value in cg.items()}

In [32]:
# append categorical colortrack
srz.append_cell_groupings(project_dir+plot_name,cg,colordd=cdd)

In [33]:
adatag.obs

Unnamed: 0,library,total_counts,pct_counts_mito,library2,sample,patient,pT stage,seq_date,beads,operation,...,removed_as_RBC,louvain_resolution_30.0,louvain_resolution_50.0,louvain_resolution_60.0,louvain_resolution_80.0,removed_as_dblt1,removed_as_dblt2,n_counts,no_dblt_no_rbc,pheno_leiden
2,N14,449,0.668151,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,False,253,238,196,213,False,False,449,True,9
19,N14,449,4.23163,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,False,117,239,331,1009,False,False,449,True,9
363,N14,1229,10.6591,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,False,31,68,610,507,False,False,1229,True,11
433,N14,432,6.94444,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,False,8,120,115,112,False,False,432,True,11
444,N14,502,4.98008,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,False,12,153,265,257,False,False,502,True,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4865536,Tumor0228,683,9.37042,T4_old,T4,P4,pT3a,old,old,Open,...,False,66,284,333,162,False,False,683,True,12
4865584,Tumor0228,1473,6.51731,T4_old,T4,P4,pT3a,old,old,Open,...,False,172,266,228,388,False,False,1473,True,12
4865642,Tumor0228,498,9.43775,T4_old,T4,P4,pT3a,old,old,Open,...,False,20,221,299,203,False,False,498,True,12
4865726,Tumor0228,421,19.2399,T4_old,T4,P4,pT3a,old,old,Open,...,False,7,92,64,78,False,False,421,True,56


In [34]:
for key, value in cg.items():
    adatag.obs[key] = cg[key]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adatag.obs[key] = cg[key]


In [35]:
obs1 = adatag.obs

In [44]:
#save obs with clusters

fname = 'backups_JZ_2022/clustering_304_15tr_25c_obs_info_%dx%d_%s'%(obs1.shape[0],obs1.shape[1],rz.now())
print(fname)
rz.save_df(obs1,fname)

backups_JZ_2022/clustering_304_15tr_25c_obs_info_50236x54_220315_18h11
