In [None]:
import anndata as ad
import scanpy as sc
import networkx as nx
import numpy as np
import pandas as pd
import scglue
import seaborn as sns
from IPython import display
from matplotlib import rcParams
from networkx.algorithms.bipartite import biadjacency_matrix
from networkx.drawing.nx_agraph import graphviz_layout

import snapatac2 as snap

In [None]:
# Apply scglue's publication plotting preset first, then set the default
# figure size on top of it.
scglue.plot.set_publication_params()
rcParams.update({"figure.figsize": (4, 4)})

## Build the guidance graph

`scrna.h5ad` and `scatac_peak_mat.h5ad` are created by `01.scatac_preprocess.ipynb`.



In [None]:
# Read both modalities with the same reader; the unpacking order follows the
# order of the file names (RNA first, ATAC second).
rna, atac = (
    sc.read_h5ad(h5ad_path)
    for h5ad_path in ("scrna.h5ad", "scatac_peak_mat.h5ad")
)

In [None]:
# The GTF records must carry a `gene_id` attribute, since that is the field
# passed as `gtf_by` below.
gtf_path = "at-shoot-scATAC/at_genes.gtf.gz"

# Annotate the genes in `rna` from the GTF file.
scglue.data.get_gene_annotation(rna, gtf=gtf_path, gtf_by="gene_id")

# Preview the genomic coordinate columns of the gene table.
rna.var[["chrom", "chromStart", "chromEnd"]].head()

In [None]:
# Spectral embedding of the ATAC data. `features=None` is passed explicitly
# instead of relying on the function's default feature selection.
# NOTE(review): presumably this makes snapatac2 use every peak - confirm.
snap.tl.spectral(
    atac,
    features=None,
)

In [None]:
# Compute a UMAP layout for the ATAC cells with snapatac2.
# NOTE(review): presumably built on the spectral embedding from the previous
# cell and stored where `sc.pl.umap` can find it - confirm.
snap.tl.umap(atac)

In [None]:
# Show the ATAC UMAP coloured by the `leiden` annotation, with cluster labels
# drawn on the plot itself.
# NOTE(review): `leiden` is not computed in this notebook; it is assumed to
# come from the preprocessing notebook - confirm.
sc.pl.umap(
    atac,
    color="leiden",
    legend_loc="on data",
)

In [None]:
# Parse peak names of the form "<chrom>:<start>-<end>" (or
# "<chrom>-<start>-<end>") into genomic coordinate columns on `atac.var`.
# The pattern anchors on the two trailing integers, so a ":" or "-" inside the
# chromosome/contig name is not mistaken for a field separator.
peak_fields = atac.var_names.to_series().str.extract(r"^(.+)[:-](\d+)-(\d+)$")

# Fail loudly on names that do not match, rather than storing wrong coordinates.
malformed = peak_fields.isna().any(axis=1)
if malformed.any():
    raise ValueError(
        "Cannot parse genomic coordinates from peak names: "
        f"{malformed.index[malformed].tolist()[:5]}"
    )

# `.to_numpy()` assigns by position, matching the order of `atac.var_names`.
atac.var["chrom"] = peak_fields[0].to_numpy()
atac.var["chromStart"] = peak_fields[1].astype(int).to_numpy()
atac.var["chromEnd"] = peak_fields[2].astype(int).to_numpy()
atac.var.head()

In [None]:
# Keep only the genes (columns of `rna`, i.e. rows of `rna.var`) whose
# annotated strand is "+" or "-"; any other value, including a missing one,
# is dropped.
new_loc = rna.var["strand"].isin(["+", "-"])
# `.copy()` materialises the subset, so `rna` is a standalone AnnData rather
# than a view that keeps referring to the unfiltered object.
rna = rna[:, new_loc].copy()

In [None]:
# Build the guidance graph from the gene and peak coordinates stored in the
# `var` tables of the two datasets.
guidance = scglue.genomics.rna_anchored_guidance_graph(
    rna,
    atac,
)
# Display the resulting graph object.
guidance

In [None]:
# Validate the guidance graph against both datasets before it is saved.
# NOTE(review): the individual checks are defined by scglue.graph.check_graph;
# see its documentation for what is verified.
scglue.graph.check_graph(guidance, [rna, atac])

In [None]:
# Persist the preprocessed inputs so the training section below can start
# from these files.
nx.write_graphml(guidance, "guidance.graphml.gz")
rna.write_h5ad("scrna-pp.h5ad", compression="gzip")
atac.write_h5ad("scatac-pp.h5ad", compression="gzip")

## Train the model

In [None]:

# Reload the preprocessed datasets and the guidance graph written by the
# previous section.
guidance = nx.read_graphml("guidance.graphml.gz")
rna, atac = (
    sc.read_h5ad(h5ad_path)
    for h5ad_path in ("scrna-pp.h5ad", "scatac-pp.h5ad")
)



In [None]:
# Column names in `rna.obs` handed to scglue.
# `use_batch` is only meaningful when the data come from different batches.
use_batch = "batch"
cell_type = "cell_type"

# Collect the RNA-specific options in one place before the call.
rna_config = dict(
    use_highly_variable=True,
    use_layer="counts",
    use_batch=use_batch,
    use_rep="X_pca",
    use_cell_type=cell_type,
)
scglue.models.configure_dataset(rna, "NB", **rna_config)



In [None]:
# Configure the ATAC dataset: same "NB" setting as RNA, restricted to highly
# variable features, with the spectral embedding as the representation.
scglue.models.configure_dataset(
    atac,
    "NB",
    use_rep="X_spectral",
    use_highly_variable=True,
)

In [None]:
from itertools import chain

# Names of the features flagged as highly variable in each modality.
hvf_genes = rna.var.query("highly_variable").index
hvf_peaks = atac.var.query("highly_variable").index

# Restrict the guidance graph to those features. `.copy()` detaches the
# result from the full graph.
guidance_hvf = guidance.subgraph(chain(hvf_genes, hvf_peaks)).copy()

In [None]:
# Train the GLUE model on both modalities using the highly-variable-feature
# guidance graph. `fit_kws` passes "glue" as the `directory` option.
glue = scglue.models.fit_SCGLUE(
    dict(rna=rna, atac=atac),
    guidance_hvf,
    fit_kws=dict(directory="glue"),
)

In [None]:
# Compute the integration-consistency diagnostic for the trained model.
modalities = {"rna": rna, "atac": atac}
dx = scglue.models.integration_consistency(glue, modalities, guidance_hvf)
dx

In [None]:
# The higher the curve, the more reliable the integration is considered to
# be. Empirically, a curve that rises above the 0.05 line can generally be
# regarded as a reliable integration.
consistency_ax = sns.lineplot(x="n_meta", y="consistency", data=dx)
# Dashed reference line at the 0.05 threshold; assigning to `_` hides its repr.
_ = consistency_ax.axhline(y=0.05, c="darkred", ls="--")

In [None]:
# Project each dataset into the shared cell embedding space.
# `encode_data(domain, adata)` takes the domain name used at training time
# ("rna" / "atac") and the dataset to encode. By convention the result is
# stored in `obsm` under "X_glue".
rna.obsm["X_glue"] = glue.encode_data(
    "rna",
    rna,
)
atac.obsm["X_glue"] = glue.encode_data(
    "atac",
    atac,
)

In [None]:
# Tag every cell with its modality so the two datasets can be told apart
# after they are combined.
rna.obs["modal"] = "scRNA-seq"
atac.obs["modal"] = "scATAC-seq"

In [None]:
# Report whether the RNA reference carries the `cell_type` annotation.
# This only prints a message; it does not stop the notebook.
if "cell_type" in rna.obs.columns:
    print("'cell_type' column found in rna.obs")
else:
    print("Warning: 'cell_type' column not found in rna.obs")
    print("Available columns:", list(rna.obs.columns))


In [None]:
# Give the ATAC cells a placeholder `cell_type` if they have none.
if "cell_type" in atac.obs.columns:
    print("'cell_type' column already exists in atac.obs")
else:
    atac.obs["cell_type"] = "unknown"
    print("Added 'cell_type' column to atac.obs with default value 'unknown'")


In [None]:
# Stack the RNA and ATAC cells into one AnnData using anndata's default
# concat settings.
# NOTE(review): the default join is assumed to keep only annotations present
# in both inputs (e.g. `X_glue`, `modal`, `cell_type`) - confirm.
combined = ad.concat([rna, atac])

In [None]:
# Neighbour graph on the joint GLUE embedding (cosine distance), followed by
# a UMAP layout computed from that graph.
sc.pp.neighbors(
    combined,
    use_rep="X_glue",
    metric="cosine",
)
sc.tl.umap(combined)

In [None]:
# Joint UMAP, one panel per annotation; `wspace` widens the gap between panels.
sc.pl.umap(
    combined,
    color=["cell_type", "modal"],
    wspace=0.65,
)

In [None]:
# Persist the combined object; the label-transfer section reads this file.
combined.write_h5ad("combined.h5ad", compression="gzip")


In [None]:
# Encode the guidance-graph vertices and label each row with the model's
# vertex name.
feature_embeddings = pd.DataFrame(
    glue.encode_graph(guidance_hvf),
    index=glue.vertices,
)
# Preview the top-left corner only.
feature_embeddings.iloc[:5, :5]

In [None]:
# Align the feature embeddings to each dataset's feature order.
# `reindex` leaves NaN rows for features that have no embedding.
rna.varm["X_glue"] = feature_embeddings.reindex(index=rna.var_names).to_numpy()
atac.varm["X_glue"] = feature_embeddings.reindex(index=atac.var_names).to_numpy()

In [None]:
# Save the datasets with their embeddings, and the guidance graph that was
# used for training.
nx.write_graphml(guidance_hvf, "guidance-hvf.graphml.gz")
rna.write_h5ad("scrna-emb.h5ad", compression="gzip")
atac.write_h5ad("scatac-emb.h5ad", compression="gzip")

## Transfer the scRNA-seq labels to scATAC-seq by KNN

In [None]:
import scanpy as sc
import scglue


In [None]:
# Reload the combined RNA + ATAC object that the training section wrote to
# "combined.h5ad".
adata = sc.read_h5ad("combined.h5ad")

In [None]:
# Split the combined object back into its two modalities using the `modal`
# tag. `.copy()` makes each subset an independent object rather than a view.
is_rna = adata.obs["modal"] == "scRNA-seq"
is_atac = adata.obs["modal"] == "scATAC-seq"
rna = adata[is_rna].copy()
atac = adata[is_atac].copy()

In [None]:
# Transfer `cell_type` from the RNA reference to the ATAC query, using the
# shared "X_glue" embedding as the representation.
# NOTE(review): with no output key given, the prediction presumably overwrites
# the placeholder `cell_type` in `atac.obs` - confirm.
scglue.data.transfer_labels(rna, atac, "cell_type", use_rep="X_glue")



In [None]:
# ATAC cells on the joint UMAP, coloured by modality and by `cell_type`.
sc.pl.umap(
    atac,
    color=["modal", "cell_type"],
)

In [None]:
# Export the ATAC cell metadata table (`atac.obs`) as CSV.
atac.obs.to_csv(path_or_buf="scatac-predicted-cell-type.csv")
