In [3]:
# import dependencies
import os
import numpy as np
import pandas as pd
import scanpy as sc
import loompy as lp

In [4]:
# Scanpy session setup: log level, environment report, figure defaults.
# Verbosity scale: errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 3
sc.logging.print_versions()

# Screen resolution vs. resolution of figures written to disk.
figure_defaults = dict(dpi=150, fontsize=10, dpi_save=600)
sc.set_figure_params(**figure_defaults)

-----
anndata     0.11.3
scanpy      1.10.4
-----
PIL                         11.1.0
anyio                       NA
arrow                       1.3.0
asttokens                   NA
attr                        23.1.0
attrs                       23.1.0
babel                       2.17.0
certifi                     2023.07.22
cffi                        1.17.1
charset_normalizer          3.2.0
cloudpickle                 3.1.1
colorama                    0.4.6
comm                        0.2.2
cycler                      0.12.1
cython_runtime              NA
cytoolz                     1.0.1
dask                        2024.2.1
dateutil                    2.9.0.post0
debugpy                     1.8.12
decorator                   5.1.1
defusedxml                  0.7.1
exceptiongroup              1.2.2
executing                   2.2.0
fastjsonschema              NA
fqdn                        NA
h5py                        3.12.1
idna                        3.4
ipykernel                  

## Preprocessing

In [5]:
# Load the processed NCC AnnData object (.h5ad) from the shared workspace.
processed_h5ad_path = "/lustre/groups/ml01/workspace/weixu.wang/regvelo_revision/10x_shallow_NCC/NCC_analysis_processed.h5ad"
adata = sc.read(processed_h5ad_path)

In [6]:
# Display the AnnData summary (matrix dimensions and the available obs/var/obsm fields).
adata

AnnData object with n_obs × n_vars = 6788 × 30717
    obs: 'nCount_RNA', 'nFeature_RNA', 'cell_id', 'UMI_count', 'gene_count', 'major_trajectory', 'celltype_update', 'UMAP_1', 'UMAP_2', 'UMAP_3', 'UMAP_2d_1', 'UMAP_2d_2', 'terminal_state', 'nCount_intron', 'nFeature_intron'
    var: 'vf_vst_counts_mean', 'vf_vst_counts_variance', 'vf_vst_counts_variance.expected', 'vf_vst_counts_variance.standardized', 'vf_vst_counts_variable', 'vf_vst_counts_rank', 'var.features', 'var.features.rank'
    obsm: 'X_pca', 'X_umap'

In [7]:
# Rebuild a minimal AnnData that carries only X, obs and var; the obsm
# embeddings of the loaded object are not passed along.
adata = sc.AnnData(X=adata.X, obs=adata.obs, var=adata.var)

# Expose cell and gene identifiers as ordinary columns so they are exported as
# loom attributes named "CellID" and "Gene".
# NOTE(review): these look like the attribute names the pySCENIC CLI reads by
# default -- confirm against the installed pySCENIC version.
adata.obs["CellID"] = adata.obs_names
adata.var["Gene"] = adata.var_names

In [6]:
# Export the minimal AnnData as a loom file, the input format used by the
# pySCENIC commands below.
scenic_input_loom = "/lustre/groups/ml01/workspace/weixu.wang/regvelo_revision/10x_shallow_NCC/adata.loom"
adata.write_loom(scenic_input_loom)

## SCENIC step

In [8]:
# Inputs shared by the pySCENIC commands below.
# Transcription-factor list handed to `pyscenic grn`.
f_tfs: str = "/home/icb/weixu.wang/regulatory_velo/pancreas_dataset/allTFs_mm.txt"
# Loom expression matrix exported in the preprocessing section.
f_loom_path_scenic: str = "/lustre/groups/ml01/workspace/weixu.wang/regvelo_revision/10x_shallow_NCC/adata.loom"

In [None]:
# pySCENIC step 1 (`pyscenic grn`): infer the regulatory network from the loom
# expression matrix and the TF list; the adjacency table is written to adj.csv.
# Runs with 24 workers, so this cell is long-running.
!pyscenic grn {f_loom_path_scenic} {f_tfs} -o "/lustre/groups/ml01/workspace/weixu.wang/regvelo_revision/10x_shallow_NCC/adj.csv" --num_workers 24


2025-02-26 22:52:46,146 - pyscenic.cli.pyscenic - INFO - Loading expression matrix.

2025-02-26 22:52:48,844 - pyscenic.cli.pyscenic - INFO - Inferring regulatory networks.
preparing dask client
parsing input
creating dask graph
24 partitions
computing dask graph
This may cause some slowdown.
Consider scattering data ahead of time and using futures.


In [9]:
import glob

# Ranking databases: every *.feather file in the cisTarget directory.
f_db_glob = "/lustre/groups/ml01/workspace/weixu.wang/regvelo_revision/scale_zebrafish/scenic/cisTarget_databases/*feather"
# glob.glob returns matches in arbitrary order; sort them so the command line
# passed to `pyscenic ctx` is the same on every run.
f_db_paths = sorted(glob.glob(f_db_glob))
if not f_db_paths:
    # Fail here rather than letting `pyscenic ctx` start with no databases.
    raise FileNotFoundError(f"No cisTarget ranking databases match {f_db_glob!r}")
# Space-separated list, interpolated into the shell command below.
f_db_names = ' '.join(f_db_paths)

# Motif annotation table passed to `pyscenic ctx --annotations_fname`.
f_motif_path = "/lustre/groups/ml01/workspace/weixu.wang/regvelo_revision/scale_zebrafish/scenic/cisTarget_databases/motifs-v9-nr.mgi-m0.001-o0.0.tbl"

In [None]:
# pySCENIC step 2 (`pyscenic ctx`): takes the adjacencies in adj.csv together
# with the ranking databases, the motif annotation table and the expression
# loom, and writes the result to reg.csv.
# NOTE(review): --all_modules presumably keeps both activating and repressing
# modules -- confirm with `pyscenic ctx --help`.
!pyscenic ctx "/lustre/groups/ml01/workspace/weixu.wang/regvelo_revision/10x_shallow_NCC/adj.csv" \
    {f_db_names} \
    --annotations_fname {f_motif_path} \
    --expression_mtx_fname {f_loom_path_scenic} \
    --output "/lustre/groups/ml01/workspace/weixu.wang/regvelo_revision/10x_shallow_NCC/reg.csv" \
    --all_modules \
    --num_workers 24

In [None]:
# Loom file that `pyscenic aucell` writes and that the cells below read back.
f_pyscenic_output = "/lustre/groups/ml01/workspace/weixu.wang/regvelo_revision/10x_shallow_NCC/pyscenic_output_all_regulon_no_mask.loom"

In [None]:
# pySCENIC step 3 (`pyscenic aucell`): takes the expression loom and the
# regulons in reg.csv, and writes the result to the loom file named by
# f_pyscenic_output (4 workers).
!pyscenic aucell \
    {f_loom_path_scenic} \
    "/lustre/groups/ml01/workspace/weixu.wang/regvelo_revision/10x_shallow_NCC/reg.csv" \
    --output {f_pyscenic_output} \
    --num_workers 4

In [26]:
# `lp` and `pd` come from the import cell at the top of the notebook.
# Open the AUCell output read-only: the cells below only read from it, so
# there is no reason to hold the file in read/write mode ('r+').
lf = lp.connect(f_pyscenic_output, mode='r', validate=False)

# RegulonsAUC column attribute as a table with one row per cell (CellID index).
auc_mtx = pd.DataFrame(lf.ca.RegulonsAUC, index=lf.ca.CellID)
# Structured row attribute; the next cell iterates over its records and uses
# its dtype field names.
regulons = lf.ra.Regulons

In [27]:
# Turn the structured `regulons` array into a DataFrame: one row per field
# name of its dtype, one column per record of the array.
regulon_fields = regulons.dtype.names
record_columns = []
for record in regulons:
    record_columns.append(pd.Series(record.tolist(), index=regulon_fields))
res = pd.concat(record_columns, axis=1)

In [28]:
# Label the columns of `res` with the gene names stored as a loom row attribute.
# Prefer "SYMBOL"; fall back to "Gene", which is the attribute this notebook
# itself creates (adata.var["Gene"]) before exporting the loom file.
gene_attr = next((name for name in ("SYMBOL", "Gene") if name in lf.ra.keys()), None)
if gene_attr is None:
    raise KeyError('loom file has neither a "SYMBOL" nor a "Gene" row attribute')
res.columns = lf.row_attrs[gene_attr]

# This is the last read from the loom file; release the handle. `auc_mtx` and
# `regulons` were already read into memory.
lf.close()

In [31]:
# Persist the regulon table (row index included) as CSV.
regulon_csv_path = "/lustre/groups/ml01/workspace/weixu.wang/regvelo_revision/10x_shallow_NCC/regulon_mat_all_regulons_no_mask.csv"
res.to_csv(regulon_csv_path)

## Preprocessing GRN

In [76]:
# Build a binary, square TF-by-gene matrix (GRN) from the regulon table and
# store it as parquet.
# NOTE(review): this reads "regulon_mat_all_regulons.csv", while the export
# cell earlier in this notebook writes "regulon_mat_all_regulons_no_mask.csv"
# -- confirm which file is intended.
reg = pd.read_csv("/lustre/groups/ml01/workspace/weixu.wang/regvelo_revision/10x_shallow_NCC/regulon_mat_all_regulons.csv",index_col = 0)

# Row labels are regulon names such as "Sox10(+)". Strip only the trailing
# "(+)" / "(-)" tag to recover the TF symbol. Taking the first run of word
# characters instead would cut hyphenated symbols ("Nkx2-1(+)" -> "Nkx2") and
# silently merge different TFs in the groupby below.
reg.index = (
    reg.index.astype(str)
    .str.replace(r"_?\([+-]\)\s*$", "", regex=True)
    .str.strip()
)
# Merge all regulons of the same TF, then binarise: 1 = TF regulates gene.
reg = (reg.groupby(level=0).sum() != 0).astype(int)

TF = np.unique(reg.index.tolist())
genes = np.unique(TF.tolist()+reg.columns.tolist())

# Square matrix over the union of TFs and targets; rows are regulators,
# columns are targets. Rows of `reg` are selected by label so the fill does
# not depend on np.unique and groupby producing the same ordering.
GRN = pd.DataFrame(0,index = genes,columns = genes)
GRN.loc[TF,reg.columns.tolist()] = reg.loc[TF].to_numpy()

# Keep only genes that have at least one incoming or outgoing edge.
mask = (GRN.sum(0) != 0) | (GRN.sum(1) != 0)
GRN = GRN.loc[mask,mask].copy()

GRN.to_parquet("/lustre/groups/ml01/workspace/weixu.wang/regvelo_revision/10x_shallow_NCC/regulon_mat_processed_all_regulons.parquet")
# The counts reported are the dimensions of `reg` (before the mask is applied).
print("Done! processed GRN with "+str(reg.shape[0])+" TF and "+str(reg.shape[1])+" targets")

Done! processed GRN with 543 TF and 30717 targets
