# Vitessce Widget Tutorial

# Visualization of SNARE-seq dataset

## 1. Import dependencies

We need to import the classes and functions that we will be using from the corresponding packages.

In [None]:
import os
from os.path import join
from urllib.request import urlretrieve
import scipy.io as sio
import pandas as pd
import numpy as np
import scanpy as sc
from anndata import AnnData
import pyBigWig
import bbi
import negspy.coordinates as nc

from vitessce import (
    VitessceConfig,
    Component as cm,
    CoordinationType as ct,
    AnnDataWrapper,
    MultiBigWigWrapper,
)

## 2. Download data

Download the SNARE-seq dataset published by Chen et al., Nat Biotechnol 2019 (http://doi.org/10.1038/s41587-019-0290-0). The files are available from GEO under accession GSE126074: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE126074

In [None]:
# All downloaded files are stored under ./data (created on first run).
os.makedirs("data", exist_ok=True)

# Local paths of the cDNA (gene expression) files.
cdna_barcodes_filepath = join("data", "GSE126074_AdBrainCortex_SNAREseq_cDNA.barcodes.tsv.gz")
cdna_counts_filepath = join("data", "GSE126074_AdBrainCortex_SNAREseq_cDNA.counts.mtx.gz")
cdna_genes_filepath = join("data", "GSE126074_AdBrainCortex_SNAREseq_cDNA.genes.tsv.gz")

# Local paths of the chromatin (accessibility) files.
chromatin_barcodes_filepath = join("data", "GSE126074_AdBrainCortex_SNAREseq_chromatin.barcodes.tsv.gz")
chromatin_counts_filepath = join("data", "GSE126074_AdBrainCortex_SNAREseq_chromatin.counts.mtx.gz")
chromatin_peaks_filepath = join("data", "GSE126074_AdBrainCortex_SNAREseq_chromatin.peaks.tsv.gz")

geo_base_url = 'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE126074&format=file&file='

# A tuple (rather than a set) keeps the download order deterministic.
geo_filepaths = (
    cdna_barcodes_filepath,
    cdna_counts_filepath,
    cdna_genes_filepath,
    chromatin_barcodes_filepath,
    chromatin_counts_filepath,
    chromatin_peaks_filepath,
)
for filepath in geo_filepaths:
    # Re-running the notebook should not fetch files that are already present.
    if os.path.exists(filepath):
        continue
    # Download under a temporary name and rename on success, so an
    # interrupted download never leaves a truncated file at the final path.
    partial_filepath = f'{filepath}.part'
    urlretrieve(f'{geo_base_url}{os.path.basename(filepath)}', partial_filepath)
    os.replace(partial_filepath, filepath)

## 3. Read data

In [None]:
# Options shared by every label table: tab-separated, no header row.
_tsv_opts = {"sep": '\t', "header": None}

# Label tables for the cDNA matrix; the first column becomes the index.
cdna_barcodes_df = pd.read_csv(cdna_barcodes_filepath, index_col=0, **_tsv_opts)
cdna_genes_df = pd.read_csv(cdna_genes_filepath, index_col=0, **_tsv_opts)

# Label tables for the chromatin matrix; these keep the default integer
# index, so the labels stay in column 0.
chromatin_barcodes_df = pd.read_csv(chromatin_barcodes_filepath, **_tsv_opts)
chromatin_peaks_df = pd.read_csv(chromatin_peaks_filepath, **_tsv_opts)

# Count matrices, read from Matrix Market files and converted to dense arrays.
cdna_counts_sparse = sio.mmread(cdna_counts_filepath)
cdna_counts_mtx = cdna_counts_sparse.toarray()
chromatin_counts_sparse = sio.mmread(chromatin_counts_filepath)
chromatin_counts_mtx = chromatin_counts_sparse.toarray()

In [None]:
# Fetch the precomputed cisTopic-based t-SNE and UMAP coordinates.
# Both tables are tab-separated and use their first column as the index.
_embedding_read_opts = {"sep": '\t', "index_col": 0}
chromatin_tsne_df = pd.read_csv(
    "https://keller-mark.github.io/vitessce-demo-hosting-temporary/snare-seq-cistopic/chromatin.tsne.tsv",
    **_embedding_read_opts,
)
chromatin_umap_df = pd.read_csv(
    "https://keller-mark.github.io/vitessce-demo-hosting-temporary/snare-seq-cistopic/chromatin.umap.tsv",
    **_embedding_read_opts,
)

## 4. Use transcriptomes to cluster

In [None]:
# Column-less annotation tables that only carry the labels:
# genes for the variables, barcodes for the observations.
var_df = pd.DataFrame(index=cdna_genes_df.index)
obs_df = pd.DataFrame(index=cdna_barcodes_df.index)

# The count matrix is transposed so that its rows line up with obs_df
# and its columns with var_df.
adata = AnnData(
    X=cdna_counts_mtx.transpose(),
    obs=obs_df,
    var=var_df,
)

In [None]:
# Filter out genes not detected in at least 3 cells.
# The return value is not assigned, so this relies on scanpy updating
# `adata` in place.
sc.pp.filter_genes(adata, min_cells=3)

In [None]:
# Count-normalize to 10,000 reads per cell (target_sum=1e4), so that
# cells with different total counts become comparable.
sc.pp.normalize_total(adata, target_sum=1e4)

In [None]:
# Log-transform the normalized counts (log1p, i.e. log(1 + x)).
sc.pp.log1p(adata)

In [None]:
# Run the PCA first: the neighbor graph below is built on the first 40
# principal components (n_pcs=40), so they must already exist and come from
# the solver configured here rather than from an implicit fallback.
sc.tl.pca(adata, svd_solver='arpack')
# k-nearest-neighbor graph (k=10) on the PCA representation.
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
# UMAP embedding computed from the neighbor graph; used later as "X_umap".
sc.tl.umap(adata)

In [None]:
# Cluster the cells with the Leiden algorithm; the cluster labels are read
# back from adata.obs['leiden'] further below.
sc.tl.leiden(adata)

In [None]:
# Attach the chromatin-based embeddings to adata, with rows reordered to
# follow the order of the observations (barcodes) in adata.
_obs_barcodes = adata.obs.index.values.tolist()
_tsne_coords = chromatin_tsne_df.loc[_obs_barcodes]
_umap_coords = chromatin_umap_df.loc[_obs_barcodes]
adata.obsm["chromatin_tsne"] = _tsne_coords.values
adata.obsm["chromatin_umap"] = _umap_coords.values

## 5. Compute chromatin accessibility profiles of clusters

In [None]:
def convert_bin_name_to_chr_name(bin_name):
    """Return the chromosome part of a ``<chr>:<start>-<end>`` bin name.

    Everything before the first ``:`` is returned. If the name contains
    no ``:``, ``np.nan`` is returned instead.
    """
    chr_name, colon, _ = bin_name.partition(':')
    if not colon:
        return np.nan
    return chr_name
def convert_bin_name_to_chr_start(bin_name):
    """Return the integer start position of a ``<chr>:<start>-<end>`` bin name.

    The start is the text between the first ``:`` and the first ``-`` that
    follows it. ``np.nan`` is returned when the ``:`` or the ``-`` is
    missing, or when the text between them is not an integer.
    """
    try:
        colon_pos = bin_name.index(':')
        # Search for the dash only after the colon, so that a dash inside
        # the chromosome/contig name is not mistaken for the range separator.
        dash_pos = bin_name.index('-', colon_pos + 1)
        return int(bin_name[colon_pos + 1:dash_pos])
    except ValueError:
        return np.nan
def convert_bin_name_to_chr_end(bin_name):
    """Return the integer end position of a ``<chr>:<start>-<end>`` bin name.

    The end is the text after the first ``-`` that follows the ``:``
    (or after the first ``-`` overall when the name has no ``:``).
    ``np.nan`` is returned when no such ``-`` exists or when the text after
    it is not an integer.
    """
    try:
        # str.find returns -1 when there is no colon, so the search then
        # starts at position 0. Starting after the colon keeps a dash inside
        # the chromosome/contig name from being taken as the range separator.
        dash_pos = bin_name.index('-', bin_name.find(':') + 1)
        return int(bin_name[dash_pos + 1:])
    except ValueError:
        return np.nan

# Split the peak names (column 0) into chromosome, start and end columns.
_peak_names = chromatin_peaks_df[0]
for _column, _parser in (
    ("chr_name", convert_bin_name_to_chr_name),
    ("chr_start", convert_bin_name_to_chr_start),
    ("chr_end", convert_bin_name_to_chr_end),
):
    chromatin_peaks_df[_column] = _peak_names.apply(_parser)

# Discard peaks for which any of the three parts could not be parsed.
_parsed_columns = ["chr_name", "chr_start", "chr_end"]
chromatin_peaks_df = chromatin_peaks_df.dropna(subset=_parsed_columns).copy()

# Ensure that the columns have the expected types.
chromatin_peaks_df = chromatin_peaks_df.astype(
    {"chr_name": str, "chr_start": int, "chr_end": int}
)

In [None]:
# Preview the first rows to check the parsed chr_name / chr_start / chr_end columns.
chromatin_peaks_df.head()

In [None]:
# NOTE(review): this value is not referenced anywhere else in the visible
# notebook. Presumably a base resolution (in bp) for the genomic profiles --
# confirm whether it is still needed.
starting_resolution = 200

In [None]:
# Chromosome names of the mm10 assembly, in negspy's chromosome order.
chromosomes = nc.get_chromorder('mm10')
# Look up the assembly info once instead of once per chromosome.
chrom_info = nc.get_chrominfo('mm10')
# Chromosome lengths as 64-bit integers, in the same order as `chromosomes`.
chroms_length_arr = np.array([ chrom_info.chrom_lengths[x] for x in chromosomes ], dtype="i8")
# Chromosome names as fixed-width byte strings (up to 23 bytes each).
chroms_name_arr = np.array(chromosomes, dtype="S23")
# (name, length) pairs, used below as the BigWig header.
chrom_name_to_length = list(zip(chromosomes, chroms_length_arr))

In [None]:
# Distinct Leiden cluster labels, ordered by their integer value
# rather than alphabetically.
_leiden_labels = adata.obs['leiden'].unique().tolist()
cluster_ids = sorted(_leiden_labels, key=int)

## Convert cluster profiles to BigWig files for convenience

In [None]:
# Output directory for the per-cluster BigWig files. It has to exist before
# the files are opened for writing.
bw_dir = join("data", "bw")
os.makedirs(bw_dir, exist_ok=True)

# Barcodes of the chromatin matrix columns (column 0 of the barcodes table).
# NOTE(review): assumes the chromatin barcodes use the same strings as the
# cDNA barcodes in adata.obs -- the chromatin embeddings above are looked up
# by those same labels; confirm for other datasets.
chromatin_barcodes = chromatin_barcodes_df[0]

for cluster_id in cluster_ids:
    # Cells (barcodes) assigned to this cluster by the transcriptome clustering.
    cell_tf = adata.obs['leiden'] == cluster_id
    cluster_cell_ids = adata.obs.loc[cell_tf].index.values.tolist()
    # Select chromatin columns by barcode instead of by position, since the
    # column order of the chromatin matrix need not match the cDNA order.
    chromatin_cell_tf = chromatin_barcodes.isin(cluster_cell_ids).values

    bw = pyBigWig.open(join(bw_dir, f"{cluster_id}.bw"), "w")
    try:
        bw.addHeader(chrom_name_to_length)

        for chr_name in chromosomes:
            chr_tf = chromatin_peaks_df["chr_name"] == str(chr_name)
            chr_peaks_df = chromatin_peaks_df.loc[chr_tf]
            if chr_peaks_df.shape[0] > 0:
                chr_names = chr_peaks_df["chr_name"].values.tolist()
                chr_starts = chr_peaks_df["chr_start"].values.tolist()
                chr_ends = chr_peaks_df["chr_end"].values.tolist()
                # The peaks table keeps its original integer index, so the
                # index labels are the row positions in the count matrix even
                # if unparseable peaks were dropped from the table.
                peak_rows = chr_peaks_df.index.values
                # Per-peak accessibility summed over the cells of this cluster.
                chr_values = chromatin_counts_mtx[peak_rows, :][:, chromatin_cell_tf].sum(axis=1).astype(np.double).tolist()
                # NOTE(review): the entries are written in the order of the
                # peaks file -- confirm they are sorted by start position as
                # the BigWig writer presumably expects.
                bw.addEntries(chr_names, chr_starts, ends=chr_ends, values=chr_values)
    finally:
        # Always close the file, even if writing one of the chromosomes fails.
        bw.close()

In [None]:
# For every cluster, open its BigWig file for reading and record the
# matching cell-set path (["leiden", <cluster id>]).
bws = []
cluster_paths = []
for cluster_id in cluster_ids:
    bw_path = join("data", "bw", f"{cluster_id}.bw")
    bws.append(bbi.open(bw_path))
    cluster_paths.append(["leiden", cluster_id])

In [None]:
# Display the AnnData summary as the cell output, as a check before
# building the Vitessce configuration.
adata

In [None]:
# Build the Vitessce configuration: one dataset holding the AnnData object
# (with the Leiden clusters as cell sets) and the per-cluster BigWig profiles.
vc = VitessceConfig("SNARE-seq")
dataset = vc.add_dataset("Chen et al.")
dataset = dataset.add_object(AnnDataWrapper(adata, cell_set_obs_cols=['leiden']))
dataset = dataset.add_object(MultiBigWigWrapper(bws, cluster_paths, assembly='mm10'))

# Scatterplots, one per embedding stored in adata.obsm.
X_umap = vc.add_view(dataset, cm.SCATTERPLOT, mapping="X_umap")
# No t-SNE is computed on the transcriptome in this notebook, so adata.obsm
# has no "X_tsne" entry. Show the chromatin-based t-SNE that was attached
# to adata instead.
chr_tsne = vc.add_view(dataset, cm.SCATTERPLOT, mapping="chromatin_tsne")
chr_umap = vc.add_view(dataset, cm.SCATTERPLOT, mapping="chromatin_umap")

cell_sets = vc.add_view(dataset, cm.CELL_SETS)
genome_tracks = vc.add_view(dataset, cm.GENOMIC_PROFILES).set_props(assembly='mm10')

# Layout: genome tracks on the left, a 2x2 grid of the other views on the right.
# The trailing semicolon suppresses the cell output in the notebook.
vc.layout(genome_tracks | ((X_umap | chr_umap) / (chr_tsne | cell_sets)));

In [None]:
# Render the configuration built above as an interactive widget in the notebook.
vc.widget()