# Regulatory Inference

GLUE-based regulatory inference is used to identify the important cis-regulatory regions (ATAC peaks) for each gene.

In [None]:
import scanpy as sc
import networkx as nx
import numpy as np
import pandas as pd
import scglue

import snapatac2 as snap

In [None]:
# Load the RNA and ATAC AnnData objects (they are expected to carry GLUE
# feature embeddings in `.varm["X_glue"]`, which later cells read) together
# with the guidance graph.
# NOTE: `read_h5ad` accepts no `compression` argument -- compression is a
# write-time option -- so passing it raises a TypeError. Compressed HDF5
# datasets are decompressed transparently when read.
rna = sc.read_h5ad("scrna-emb.h5ad")
atac = sc.read_h5ad("scatac-emb.h5ad")
# The ".gz" suffix is handled by networkx itself; no manual decompression needed.
guidance_hvf = nx.read_graphml("guidance-hvf.graphml.gz")

In [None]:
# Names of the features flagged as highly variable in each modality
# (rows of `.var` where the `highly_variable` column evaluates true).
genes, peaks = (
    adata.var.query("highly_variable").index
    for adata in (rna, atac)
)

In [None]:
# Cis-regulatory inference works on the GLUE feature embeddings, so stack the
# feature names and their embeddings from both modalities (RNA first, then
# ATAC) to keep the two arrays aligned row by row.
modalities = (rna, atac)
features = pd.Index(np.concatenate([adata.var_names for adata in modalities]))
feature_embeddings = np.concatenate([adata.varm["X_glue"] for adata in modalities])

In [None]:
# Build the "skeleton" graph: the subset of guidance-graph edges whose "type"
# attribute is "fwd". Regulatory inference is restricted to these candidate
# pairs, which narrows the search space and helps reduce false positives
# caused by spurious correlations.
forward_edges = [
    edge
    for edge, attrs in dict(guidance_hvf.edges).items()
    if attrs["type"] == "fwd"
]
# .copy() detaches the skeleton from the guidance graph it was derived from.
skeleton = guidance_hvf.edge_subgraph(forward_edges).copy()

In [None]:
# Run scglue.genomics.regulatory_inference on the merged feature names and
# embeddings, restricted to the candidate edges of the skeleton graph built above.
# A fixed random_state keeps the result reproducible across runs.
reginf = scglue.genomics.regulatory_inference(
    features,
    feature_embeddings,
    skeleton=skeleton,
    random_state=0,
)

In [None]:
# Keep only the significant regulatory links: edges of the inferred graph
# whose "qval" attribute (Q-value) is below 0.05.
significant_edges = [
    edge
    for edge, attrs in dict(reginf.edges).items()
    if attrs["qval"] < 0.05
]
gene2peak = reginf.edge_subgraph(significant_edges)

In [None]:
# Export the significant gene-peak links to a links file, keeping the "score"
# edge attribute. Genes are represented by their strand-specific start sites
# and peaks by their full genomic intervals.
link_file = "gene2peak.links"

gene_tss_bed = scglue.genomics.Bed(rna.var).strand_specific_start_site()
all_peaks_bed = scglue.genomics.Bed(atac.var)
scglue.genomics.write_links(
    gene2peak, gene_tss_bed, all_peaks_bed, link_file,
    keep_attrs=["score"],
)


## Construct a TF-gene regulatory network based on the inferred cis-regulatory regions.

Note: In this tutorial, the guidance graph is constructed solely based on genomic overlaps (see Stage 1), so the inferred regulatory links are limited to proximal promoters and gene body regions.
In practical analyses, it is beneficial to expand the genomic range (e.g., using a distance-decay weighting within 150 kb around the TSS) or incorporate additional information such as Hi-C or eQTL data (see our case study for reference).

In [None]:
# TF binding sites in BED format (CISBP motifs scanned with FIMO).
motif_bed_path = "ath_cisbp_all.bed"
motif_bed = scglue.genomics.read_bed(motif_bed_path)

In [None]:

# Build lookup tables between gene IDs and gene symbols from a tab-separated
# file with exactly two columns per line: gene_id<TAB>symbol.
# If an ID or symbol occurs more than once, the last occurrence wins.
symbol2gene = {}
gene2symbol = {}
with open("./gene2symbol.txt") as f:
    for line in f:
        record = line.strip()
        if not record:
            # Tolerate blank lines (e.g. a trailing empty line); unpacking
            # them would otherwise fail with a ValueError.
            continue
        gene_id, symbol = record.split("\t")
        symbol2gene[symbol] = gene_id
        gene2symbol[gene_id] = symbol

In [None]:
# Replace the motif names (symbols) with gene IDs via the symbol2gene lookup.
# NOTE(review): pandas maps names that are absent from the dict to NaN.
motif_gene_ids = motif_bed["name"].map(symbol2gene)
motif_bed["name"] = motif_gene_ids

In [None]:
# Select expressed TFs: motif names that also occur among the RNA feature names.
tfs = pd.Index(motif_bed["name"]).intersection(rna.var_names)
# Report the TF count explicitly; a bare `tfs.size` in the middle of a cell
# is evaluated and silently discarded.
print(f"{tfs.size} TFs retained")

# One TF per line; this file is passed to `pyscenic grn` as the TF list.
np.savetxt("tfs.txt", tfs, fmt="%s")

In [None]:
# Writing loom files requires the `loompy` package (pip install loompy).
# Expose the cell names as an explicit obs column so they are exported as a
# loom attribute named "cells".
rna.obs["cells"] = rna.obs_names

# Export only the highly variable genes plus the TFs (sorted union, no duplicates).
loom_features = np.union1d(genes, tfs)
rna[:, loom_features].write_loom("rna.loom")


In [None]:
# Infer a draft TF-gene network with `pyscenic grn`, run inside the
# `pyscenic` conda environment, using rna.loom and tfs.txt as inputs.
# The result is written to draft_grn.csv.
# --gene_attribute    : name of the loom attribute holding gene names
#                       (here "name" -- presumably a column of rna.var; confirm it exists)
# --cell_id_attribute : name of the loom attribute holding cell IDs
#                       (here "cells", the rna.obs column added before export)
!conda run -n pyscenic pyscenic grn rna.loom tfs.txt \
    -o draft_grn.csv --seed 0 --num_workers 20 \
    --cell_id_attribute cells --gene_attribute name

In [None]:
# Link the highly variable ATAC peaks to TF motif hits by genomic overlap
# (window size 0) using scglue.genomics.window_graph.
# NOTE(review): right_sorted=True presumably requires motif_bed to be sorted
# already -- confirm for the input BED file.
peak_bed = scglue.genomics.Bed(atac.var.loc[peaks])
peak2motif = scglue.genomics.window_graph(peak_bed, motif_bed, 0, right_sorted=True)

# Keep only the edges whose target node is one of the expressed TFs.
expressed_tf_edges = [edge for edge in peak2motif.edges if edge[1] in tfs]
peak2tf = peak2motif.edge_subgraph(expressed_tf_edges)

In [None]:
# Combine the gene-peak and peak-TF links into gene-TF cis-regulatory rankings
# with scglue.genomics.cis_regulatory_ranking.
# Genes are linked to varying numbers and lengths of ATAC peaks, so raw
# gene-peak-TF connection counts are not directly comparable. The function
# therefore evaluates enrichment against randomly sampled connections
# (stratified by peak length, hence `region_lens`) and ranks the target genes
# of each TF by that enrichment.
hv_peak_var = atac.var.loc[peaks]
peak_lengths = hv_peak_var["chromEnd"] - hv_peak_var["chromStart"]

gene2tf_rank_glue = scglue.genomics.cis_regulatory_ranking(
    gene2peak, peak2tf, genes, peaks, tfs,
    region_lens=peak_lengths,
    random_state=0,
)
# Preview the top-left corner of the ranking table.
gene2tf_rank_glue.iloc[:5, :5]

In [None]:
# Optional: TF cis-regulatory rankings based on proximal promoters, similar to
# the original pySCENIC approach of scanning a window around the TSS.
# NOTE(review): the window here is expand(1000, 500) -- per the scglue docs
# that is 1000 bp upstream and 500 bp downstream of the strand-specific TSS,
# not a symmetric +/-500 bp window. Confirm which one is intended.
hvg_tss_bed = scglue.genomics.Bed(rna.var.loc[genes]).strand_specific_start_site()
flank_bed = hvg_tss_bed.expand(1000, 500)
flank2tf = scglue.genomics.window_graph(flank_bed, motif_bed, 0, right_sorted=True)

In [None]:
# Each gene is linked to its own flanking region, which carries the same name
# as the gene, so the gene-flank graph consists of one self-loop per gene.
gene2flank = nx.Graph()
gene2flank.add_edges_from((gene, gene) for gene in genes)

# n_samples=0 is passed here (unlike the peak-based ranking, no region_lens
# or random_state is supplied).
gene2tf_rank_supp = scglue.genomics.cis_regulatory_ranking(
    gene2flank, flank2tf, genes, genes, tfs, n_samples=0
)
# Preview the top-left corner of the ranking table.
gene2tf_rank_supp.iloc[:5, :5]

In [None]:
# Tag the columns of each ranking table with its source so the two sets stay
# distinguishable ("<TF>_glue" vs "<TF>_supp").
# NOTE: re-running this cell appends the suffix again.
for ranking, suffix in (
    (gene2tf_rank_glue, "_glue"),
    (gene2tf_rank_supp, "_supp"),
):
    ranking.columns = ranking.columns + suffix

In [None]:
# Save both ranking tables as feather files; these are the ranking databases
# passed to `pyscenic ctx`.
for ranking, feather_path in (
    (gene2tf_rank_glue, "glue.genes_vs_tracks.rankings.feather"),
    (gene2tf_rank_supp, "supp.genes_vs_tracks.rankings.feather"),
):
    scglue.genomics.write_scenic_feather(ranking, feather_path)

In [None]:
# Write the annotation table passed to `pyscenic ctx`: every suffixed ID
# ("<TF>_glue", "<TF>_supp") is listed as a "#motif_id" whose "gene_name" is
# the TF itself. The remaining columns are filled with constant values.
annotation_frames = [
    pd.DataFrame({"#motif_id": tfs + suffix, "gene_name": tfs})
    for suffix in ("_glue", "_supp")
]
ctx_annotation = pd.concat(annotation_frames).assign(
    motif_similarity_qvalue=0.0,
    orthologous_identity=1.0,
    description="placeholder",
)
ctx_annotation.to_csv("ctx_annotation.tsv", sep="\t", index=False)

In [None]:
# Prune the draft network (draft_grn.csv) with `pyscenic ctx`, run inside the
# `pyscenic` conda environment, using the two ranking feather files and the
# annotation table ctx_annotation.tsv.
# --gene_attribute    : name of the loom attribute holding gene names
#                       (here "name" -- presumably a column of rna.var; confirm it exists)
# --cell_id_attribute : name of the loom attribute holding cell IDs (here "cells")
# NOTE: stderr is discarded (`2> /dev/null`); drop the redirection when
# debugging a failing run, otherwise error messages are hidden.


!conda run -n pyscenic pyscenic ctx draft_grn.csv \
    glue.genes_vs_tracks.rankings.feather \
    supp.genes_vs_tracks.rankings.feather \
    --annotations_fname ctx_annotation.tsv \
    --expression_mtx_fname rna.loom \
    --output pruned_grn.csv \
    --rank_threshold 500 --min_genes 1 \
    --num_workers 20 \
    --cell_id_attribute cells --gene_attribute name 2> /dev/null

# The pruned network is written to pruned_grn.csv.