In [1]:
import anndata as ad
import networkx as nx
import numpy as np
import pandas as pd
import scglue
import seaborn as sns
from IPython import display
from matplotlib import rcParams
from networkx.algorithms.bipartite import biadjacency_matrix
from networkx.drawing.nx_agraph import graphviz_layout

# Call scglue's plotting setup first, then set the default figure size to 4 x 4
# so the override is applied after whatever scglue configures.
scglue.plot.set_publication_params()
rcParams.update({"figure.figsize": (4, 4)})


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the RNA and ATAC AnnData files, then the guidance graph, from ./data.
rna, atac = (
    ad.read_h5ad(path)
    for path in ("./data/rna-emb.h5ad", "./data/atac-emb.h5ad")
)
guidance_hvf = nx.read_graphml("./data/guidance-hvf.graphml.gz")

In [4]:
# Mirror each modality's feature names into a regular `name` column of `.var`.
for modality in (rna, atac):
    modality.var["name"] = modality.var_names

# Index labels of the rows selected by the `highly_variable` column, per modality.
genes, peaks = (
    modality.var.query("highly_variable").index
    for modality in (rna, atac)
)

In [5]:
def _edges_where(graph, predicate):
    """Yield each edge key of ``graph`` whose attribute dict satisfies ``predicate``."""
    for edge, attrs in dict(graph.edges).items():
        if predicate(attrs):
            yield edge


# Stack RNA features first, then ATAC features, so that names and embeddings
# stay aligned row by row.
modalities = (rna, atac)
features = pd.Index(np.concatenate([m.var_names for m in modalities]))
feature_embeddings = np.concatenate([m.varm["X_glue"] for m in modalities])

# Keep only the guidance edges whose `type` attribute is "fwd"; `.copy()`
# turns the subgraph view into an independent graph.
skeleton = guidance_hvf.edge_subgraph(
    _edges_where(guidance_hvf, lambda attrs: attrs["type"] == "fwd")
).copy()

reginf = scglue.genomics.regulatory_inference(
    features,
    feature_embeddings,
    skeleton=skeleton,
    random_state=0,
)

# Retain the inferred edges with q-value below 0.05 (left as a subgraph view).
gene2peak = reginf.edge_subgraph(
    _edges_where(reginf, lambda attrs: attrs["qval"] < 0.05)
)

regulatory_inference: 100%|██████████| 25565/25565 [00:00<00:00, 25890.94it/s]


In [6]:
scglue.genomics.Bed(atac.var).write_bed("peaks.bed", ncols=3)
scglue.genomics.write_links(
    gene2peak,
    scglue.genomics.Bed(rna.var).strand_specific_start_site(),
    scglue.genomics.Bed(atac.var),
    "gene2peak.links", keep_attrs=["score"]
)
%conda install -c bioconda pygenometracks

In [8]:
%%writefile tracks.ini

[Score]
file = gene2peak.links
title = Score
height = 2
color = YlGnBu
compact_arcs_level = 2
use_middle = True
file_type = links

[ATAC]
file = peaks.bed
title = ATAC
display = collapsed
border_color = none
labels = False
file_type = bed

[Genes]
file = ./data/gencode.vM25.chr_patch_hapl_scaff.annotation.gtf.gz
title = Genes
prefered_name = gene_name
height = 4
merge_transcripts = True
labels = True
max_labels = 100
all_labels_inside = True
style = UCSC
file_type = gtf

[x-axis]
fontsize = 12

Overwriting tracks.ini


In [12]:
loc = rna.var.loc["Gad2"]
chrom = loc["chrom"]
chromLen = loc["chromEnd"] - loc["chromStart"]
chromStart = loc["chromStart"] - chromLen
chromEnd = loc["chromEnd"] + chromLen
# !pyGenomeTracks --tracks tracks.ini \
#     --region {chrom}:{chromStart}-{chromEnd} \
#     --outFileName tracks.png 2> /dev/null
!pyGenomeTracks --tracks tracks.ini \
    --region {chrom}:{chromStart}-{chromEnd} \
    --outFileName tracks.png 2> ./error
display.Image("tracks.png")

In [13]:
# Read the JASPAR motif BED file (path relative to the working directory)
# and preview its first five rows.
motif_bed = scglue.genomics.read_bed("JASPAR2022-mm10.bed.gz")
motif_bed.head(n=5)

Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts
0,GL456210.1,159,171,Zbtb6,.,.,.,.,.,.,.,.
1,GL456210.1,242,253,Osr2,.,.,.,.,.,.,.,.
2,GL456210.1,266,278,Pou2f3,.,.,.,.,.,.,.,.
3,GL456210.1,505,517,Eomes,.,.,.,.,.,.,.,.
4,GL456210.1,507,516,Tbr1,.,.,.,.,.,.,.,.


In [14]:
# Motif names that also appear among the RNA feature names.
motif_names = pd.Index(motif_bed["name"])
tfs = motif_names.intersection(rna.var_names)
tfs.size

532

In [16]:
# Export the RNA data restricted to the union of the highly variable genes and
# the TFs as a loom file, and write the TF names one per line to tfs.txt.
# NOTE(review): the saved output of this cell is
# "AttributeError: Can only use .cat accessor with a 'category' dtype".
# No traceback is preserved, so which call raised it is unconfirmed — TODO
# investigate before relying on rna.loom / tfs.txt.
loom_features = np.union1d(genes, tfs)
rna[:, loom_features].write_loom("rna.loom")
np.savetxt("tfs.txt", tfs, fmt="%s")

AttributeError: Can only use .cat accessor with a 'category' dtype