# scATAC-seq preprocess



In [None]:
import snapatac2 as snap
snap.__version__


## Load Fragments

In [None]:
# Example fragment file, generated by cellranger.
# Placeholder path: point it at your own fragments.tsv.gz.
fragment_file = "/path/to/fragments.tsv.gz"

In [None]:
# Chromosome sizes for arabidopsis, keyed by chromosome name.
# ChrM and ChrC are listed too; they are passed as `chrM` on import and
# excluded from the TSSE and tile-matrix steps further down.
chrom_size = {
    "Chr1": 30_427_671,
    "Chr2": 19_698_289,
    "Chr3": 23_459_830,
    "Chr4": 18_585_056,
    "Chr5": 26_975_502,
    "ChrM": 366_924,
    "ChrC": 154_478,
}

In [None]:
# Import the fragments. Because a backing file is given, all later changes
# made to `data` are recorded in that file.
data = snap.pp.import_data(
    fragment_file,
    file="scatac.h5ad",  # Optional
    chrom_sizes=chrom_size,
    chrM=["ChrC", "ChrM"],
    sorted_by_barcode=False,
)
data

## Data Quality Control


In [None]:
# Plot the fragment size distribution as a static (non-interactive) figure
snap.pl.frag_size_distr(data, interactive=False) 

In [None]:
# GTF file for arabidopsis
# Placeholder path: point it at your own gene annotation.
gtf_file = "/path/to/at_genes.gtf.gz"

# Compute the TSS enrichment (TSSE) metric from the gene annotation,
# leaving out the ChrC and ChrM chromosomes
snap.metrics.tsse(data, gtf_file, exclude_chroms=["ChrC","ChrM"])

In [None]:
# Plot the TSSE metric as a static figure; used to pick the filter thresholds below
snap.pl.tsse(data, interactive=False)

In [None]:
# Filter cells using thresholds read off the TSSE plot:
# a count window plus a minimum TSSE value.
snap.pp.filter_cells(
    data,
    min_counts=1500,
    max_counts=50000,
    min_tsse=3.2,
)


In [None]:
# Fragment information: shape of the 'fragment_paired' entry stored in obsm
data.obsm['fragment_paired'].shape

In [None]:
# Close the data; it is re-opened from "scatac.h5ad" in the next section
data.close()

## Tile X Cell matrix

In [None]:
# Re-open the file written during import ("scatac.h5ad")
data = snap.read("scatac.h5ad")

In [None]:
# Build the tile x cell matrix with bin_size=100, leaving out ChrC and ChrM
snap.pp.add_tile_matrix(data, bin_size=100, exclude_chroms=["ChrC","ChrM"])

In [None]:
# Select features (n_features=250000).
# This creates a new column named "selected" in var.
snap.pp.select_features(data, n_features=250000)

In [None]:
# Tally the values of the "selected" column to check the feature selection
data.var['selected'].value_counts()

In [None]:
# Dimensionality reduction and clustering; run the steps in this order
snap.tl.spectral(data)  # spectral embedding
snap.tl.umap(data)      # UMAP coordinates for plotting
snap.pp.knn(data)       # k-nearest-neighbour graph
snap.tl.leiden(data)    # Leiden clustering; labels are read back below under the 'leiden' key

In [None]:
# UMAP coloured by Leiden cluster, drawn as a static figure
snap.pl.umap(data, color='leiden', interactive=False, height=500)

In [None]:
# Close the data; it is re-opened in the next section
data.close()

## Gene activity X Cell Matrix

Load the cell-by-bin matrix of the single-cell ATAC-seq data and use `snap.pp.make_gene_matrix` to generate the gene activity matrix.

In [None]:
# Re-open the file written during import. The name must include the ".h5ad"
# extension so that it matches the file used in every other section.
data = snap.read("scatac.h5ad")

In [None]:
# Same placeholder path as the GTF file used for the TSSE step, so that
# editing the annotation path is not silently overridden here.
gtf_file = "/path/to/at_genes.gtf.gz"

# Build the gene activity matrix from the annotation, with upstream=2000
# and the GTF "gene_id" attribute used as the gene name.
gene_matrix = snap.pp.make_gene_matrix(
    data,
    gtf_file,
    upstream=2000,
    gene_name_key="gene_id",
)
gene_matrix

In [None]:
# Write the gene activity matrix
gene_matrix.write("gene_mat.h5ad", compression='gzip')

# Close the data, as at the end of the other sections, before
# "scatac.h5ad" is opened again for peak calling
data.close()

## Peak X Cell Matrix

In [None]:
# Re-open the clustered data; the 'leiden' labels are needed for peak calling
data = snap.read("scatac.h5ad")

In [None]:
# Call peaks with MACS3, grouping cells by their Leiden cluster
snap.tl.macs3(data, groupby='leiden')


In [None]:
# Merge the per-group peaks stored in data.uns['macs3'] into one peak set
peaks = snap.tl.merge_peaks(data.uns['macs3'], chrom_size)

In [None]:
# Size of the merged peak table
peaks.shape

In [None]:
# Build the peak x cell matrix from the 'Peaks' column of the merged peak table
peak_mat = snap.pp.make_peak_matrix(data, use_rep=peaks['Peaks'])
peak_mat

In [None]:
# Write the peak x cell matrix to disk (gzip-compressed)
peak_mat.write("scatac_peak_mat.h5ad", compression='gzip')


In [None]:
# Close the data now that the scATAC-seq part is finished
data.close()

## scRNA-seq preprocess

This is merely a basic analysis workflow, not necessarily the optimal one. The key step is annotating the cell types: the result must contain a column named `cell_type`, which is essential for the subsequent annotation of the scATAC-seq data.

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Set global figure parameters (figure size and dpi) and the scanpy verbosity level
sc.set_figure_params(figsize=(6, 6), dpi=100)
sc.settings.verbosity = 3


In [None]:
# Load the expression matrix.
# Usually from 10x Genomics format (.h5 or matrix directory)
adata = sc.read_10x_h5("filtered_feature_bc_matrix.h5")  # or sc.read_10x_mtx()
# Make the variable (gene) names unique
adata.var_names_make_unique()


In [None]:
# Add QC metrics
# NOTE: `.A1` assumes adata.X is a scipy sparse matrix, whose row sums come
# back as a numpy matrix; confirm this if the data are loaded another way.
adata.obs['n_counts'] = adata.X.sum(axis=1).A1  # Total transcript count per cell
adata.obs['n_genes'] = (adata.X > 0).sum(axis=1).A1  # Number of genes expressed per cell

# Genes whose names start with 'ATMG' are counted as mitochondrial
mito_genes = adata.var_names.str.startswith('ATMG')
adata.obs['pct_mito'] = (
    adata[:, mito_genes].X.sum(axis=1).A1 / adata.obs['n_counts']
) * 100  # Mitochondrial gene percentage

# Visualize QC metrics
sc.pl.violin(adata, ['n_counts', 'n_genes', 'pct_mito'], jitter=0.4, multi_panel=True)

# Filter cells (no gene filtering is applied here).
# Both thresholds go into one mask, and the subset is copied so that `adata`
# is a standalone AnnData rather than a view; the next steps write to it.
keep_cells = (adata.obs['n_genes'] > 200) & (adata.obs['pct_mito'] < 10)
adata = adata[keep_cells, :].copy()


In [None]:
# Keep a copy of the current X in the "counts" layer before normalization changes it
adata.layers["counts"] = adata.X.copy()


In [None]:
# Normalize each cell to the median total counts
sc.pp.normalize_total(adata)
# Log-transform the data
sc.pp.log1p(adata)

In [None]:
# Identify highly variable genes (n_top_genes=2000)
sc.pp.highly_variable_genes(adata, n_top_genes=2000)

In [None]:
# Plot the highly-variable-gene selection
sc.pl.highly_variable_genes(adata)

In [None]:
# Dimensionality reduction and embedding; run the steps in this order
sc.tl.pca(adata)        # PCA
sc.pp.neighbors(adata)  # neighbourhood graph
sc.tl.umap(adata)       # UMAP embedding

In [None]:
# Leiden clustering.
# Using the igraph implementation and a fixed number of iterations can be significantly faster, especially for larger datasets
sc.tl.leiden(adata, flavor="igraph", n_iterations=2)

In [None]:
# One UMAP panel each for the Leiden clusters and two of the QC metrics
sc.pl.umap(adata, color=['leiden', 'n_counts', 'pct_mito'])  # Visualize clusters and QC metrics


In [None]:
# Marker genes per Leiden cluster; these are the basis for manual cell type annotation
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')  # Differential expression per cluster
sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False)  # Plot top marker genes


In [None]:
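# Cell type annotation based on marker genes.
# This step should be done by human experts; record the labels in a column
# named cell_type, as required by the note at the start of this section.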

In [None]:
# Save the processed scRNA-seq data
adata.write("scrna.h5ad")
