# Vitessce Widget Tutorial

# Visualization of SNARE-seq dataset

## 1. Import dependencies

We need to import the classes and functions that we will be using from the corresponding packages.

In [15]:
import os
from os.path import join
from urllib.request import urlretrieve
import scipy.io as sio
import pandas as pd
import numpy as np
import scanpy as sc
from anndata import AnnData
import pyBigWig
import bbi
import negspy.coordinates as nc

from vitessce import (
    VitessceConfig,
    Component as cm,
    CoordinationType as ct,
)

## 2. Download data

Download the SNARE-seq dataset from Chen et al., Nat Biotechnol 2019 (http://doi.org/10.1038/s41587-019-0290-0). The files are hosted on GEO under accession GSE126074: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE126074

In [2]:
# Create the local data directory if it does not already exist.
os.makedirs("data", exist_ok=True)

# Local destinations for the cDNA files.
cdna_barcodes_filepath = join("data", "GSE126074_AdBrainCortex_SNAREseq_cDNA.barcodes.tsv.gz")
cdna_counts_filepath = join("data", "GSE126074_AdBrainCortex_SNAREseq_cDNA.counts.mtx.gz")
cdna_genes_filepath = join("data", "GSE126074_AdBrainCortex_SNAREseq_cDNA.genes.tsv.gz")

# Local destinations for the chromatin files.
chromatin_barcodes_filepath = join("data", "GSE126074_AdBrainCortex_SNAREseq_chromatin.barcodes.tsv.gz")
chromatin_counts_filepath = join("data", "GSE126074_AdBrainCortex_SNAREseq_chromatin.counts.mtx.gz")
chromatin_peaks_filepath = join("data", "GSE126074_AdBrainCortex_SNAREseq_chromatin.peaks.tsv.gz")

# Each remote file is addressed by appending its base name to this URL.
geo_base_url = 'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE126074&format=file&file='

# A tuple (rather than a set) keeps the download order deterministic.
for filepath in (
    cdna_barcodes_filepath,
    cdna_counts_filepath,
    cdna_genes_filepath,
    chromatin_barcodes_filepath,
    chromatin_counts_filepath,
    chromatin_peaks_filepath,
):
    # Skip files that are already present so re-running the notebook
    # does not download them again.
    if os.path.exists(filepath):
        continue
    # Download to a temporary name and rename on success, so that an
    # interrupted transfer never leaves a truncated file at `filepath`.
    partial_filepath = f'{filepath}.part'
    urlretrieve(f'{geo_base_url}{os.path.basename(filepath)}', partial_filepath)
    os.replace(partial_filepath, filepath)

## 3. Read data

In [86]:
# Barcode and gene tables for the cDNA files; the first column becomes the index.
cdna_barcodes_df = pd.read_table(cdna_barcodes_filepath, header=None, index_col=0)
cdna_genes_df = pd.read_table(cdna_genes_filepath, header=None, index_col=0)

# Barcode and peak tables for the chromatin files, keeping the default integer index.
chromatin_barcodes_df = pd.read_table(chromatin_barcodes_filepath, header=None)
chromatin_peaks_df = pd.read_table(chromatin_peaks_filepath, header=None)

# Count matrices are read with scipy's Matrix Market reader and
# converted to dense NumPy arrays.
cdna_counts_mtx = sio.mmread(cdna_counts_filepath).toarray()
chromatin_counts_mtx = sio.mmread(chromatin_counts_filepath).toarray()

In [4]:
# Fetch the cisTopic-based t-SNE and UMAP coordinates.
# Both tables are tab-separated and use their first column as the index.
chromatin_tsne_df = pd.read_table(
    "https://keller-mark.github.io/vitessce-demo-hosting-temporary/snare-seq-cistopic/chromatin.tsne.tsv",
    index_col=0,
)
chromatin_umap_df = pd.read_table(
    "https://keller-mark.github.io/vitessce-demo-hosting-temporary/snare-seq-cistopic/chromatin.umap.tsv",
    index_col=0,
)

In [87]:
# Preview the first rows of the cDNA barcodes table (the barcodes are its index).
cdna_barcodes_df.head()

09A_CAGCCCCGCCTT
09A_CGCCTACCATGA
09A_GATGCGCGGCTA
09A_GGTCCGAGTCCT
09A_TCTCCCGGCACC


In [6]:
# Preview the first rows of the chromatin peaks table; column 0 holds
# peak names of the form "chr:start-end".
chromatin_peaks_df.head()

Unnamed: 0,0
0,chr1:3005833-3005982
1,chr1:3094772-3095489
2,chr1:3119556-3120739
3,chr1:3121334-3121696
4,chr1:3134637-3135032


## 4. Use transcriptomes to cluster

In [90]:
# Empty annotation tables: observations are indexed by cDNA barcode,
# variables by gene.
obs_df = pd.DataFrame(index=cdna_barcodes_df.index)
var_df = pd.DataFrame(index=cdna_genes_df.index)

# The counts matrix is transposed so that its rows line up with `obs`
# (barcodes) and its columns with `var` (genes).
adata = AnnData(
    X=cdna_counts_mtx.transpose(),
    obs=obs_df,
    var=var_df,
)

In [91]:
# Display the AnnData summary (number of observations and variables).
adata

AnnData object with n_obs × n_vars = 10309 × 33160

## 5. Compute chromatin accessibility profiles of clusters

In [84]:
def convert_bin_name_to_chr_name(bin_name):
    """Extract the chromosome name from a peak name of the form ``chr:start-end``.

    Parameters
    ----------
    bin_name : str
        Peak identifier such as ``"chr1:3005833-3005982"``.

    Returns
    -------
    str or float
        The text before the first ``:``. ``np.nan`` is returned when
        ``bin_name`` is not a string (e.g. a missing value) or contains
        no ``:``.
    """
    # Missing cells arrive from pandas as float NaN, which has no string
    # methods; treat every non-string input as unparseable.
    if not isinstance(bin_name, str):
        return np.nan
    chr_name, separator, _ = bin_name.partition(':')
    # An empty separator means no ":" was found.
    return chr_name if separator else np.nan
def convert_bin_name_to_chr_start(bin_name):
    """Extract the start coordinate from a peak name of the form ``chr:start-end``.

    Parameters
    ----------
    bin_name : str
        Peak identifier such as ``"chr1:3005833-3005982"``.

    Returns
    -------
    int or float
        The integer between the first ``:`` and the first ``-``.
        ``np.nan`` is returned when ``bin_name`` is not a string (e.g. a
        missing value), lacks either delimiter, or the text between them
        is not an integer.
    """
    # Missing cells arrive from pandas as float NaN, which has no string
    # methods; treat every non-string input as unparseable.
    if not isinstance(bin_name, str):
        return np.nan
    try:
        # str.index raises ValueError when a delimiter is absent, and
        # int() raises ValueError on non-numeric text.
        return int(bin_name[bin_name.index(':') + 1:bin_name.index('-')])
    except ValueError:
        return np.nan
def convert_bin_name_to_chr_end(bin_name):
    """Extract the end coordinate from a peak name of the form ``chr:start-end``.

    Parameters
    ----------
    bin_name : str
        Peak identifier such as ``"chr1:3005833-3005982"``.

    Returns
    -------
    int or float
        The integer following the first ``-``. ``np.nan`` is returned
        when ``bin_name`` is not a string (e.g. a missing value), has no
        ``-``, or the trailing text is not an integer.
    """
    # Missing cells arrive from pandas as float NaN, which has no string
    # methods; treat every non-string input as unparseable.
    if not isinstance(bin_name, str):
        return np.nan
    try:
        # str.index raises ValueError when "-" is absent, and int()
        # raises ValueError on non-numeric text.
        return int(bin_name[bin_name.index('-') + 1:])
    except ValueError:
        return np.nan

# Parse the "chr:start-end" peak names in column 0 into separate
# coordinate columns, drop the rows that could not be parsed, and
# ensure that the new columns have the expected types.
chromatin_peaks_df = (
    chromatin_peaks_df
    .assign(
        chr_name=lambda peaks: peaks[0].apply(convert_bin_name_to_chr_name),
        chr_start=lambda peaks: peaks[0].apply(convert_bin_name_to_chr_start),
        chr_end=lambda peaks: peaks[0].apply(convert_bin_name_to_chr_end),
    )
    .dropna(subset=["chr_name", "chr_start", "chr_end"])
    .astype({"chr_name": str, "chr_start": int, "chr_end": int})
)

In [25]:
# Preview the peaks table, which now carries the parsed chr_name,
# chr_start and chr_end columns.
chromatin_peaks_df.head()

Unnamed: 0,0,chr_name,chr_start,chr_end
0,chr1:3005833-3005982,chr1,3005833,3005982
1,chr1:3094772-3095489,chr1,3094772,3095489
2,chr1:3119556-3120739,chr1,3119556,3120739
3,chr1:3121334-3121696,chr1,3121334,3121696
4,chr1:3134637-3135032,chr1,3134637,3135032


In [10]:
# NOTE(review): no use of this value is visible in this part of the
# notebook; presumably a base resolution in base pairs — TODO confirm.
starting_resolution = 200

In [81]:
# Chromosome names for the mm10 assembly, in the order returned by negspy.
chromosomes = nc.get_chromorder('mm10')
# Look up the mm10 chromosome info once rather than once per chromosome.
mm10_chrominfo = nc.get_chrominfo('mm10')
# Chromosome lengths as 64-bit integers, aligned with `chromosomes`.
chroms_length_arr = np.array([ mm10_chrominfo.chrom_lengths[x] for x in chromosomes ], dtype="i8")
# Chromosome names as fixed-width byte strings (at most 23 bytes each).
chroms_name_arr = np.array(chromosomes, dtype="S23")
# (name, length) pairs, used as the bigWig header.
chrom_name_to_length = list(zip(chromosomes, chroms_length_arr))

In [82]:
# Write a bigWig track containing, for every peak, the count from the
# first column of the chromatin counts matrix.
# NOTE(review): only column 0 of `chromatin_counts_mtx` is written —
# presumably a placeholder for per-cluster profiles; confirm.

# Only "data" is created earlier in the notebook, so make sure the
# "bw" subdirectory exists before opening the output file for writing.
bw_dir = join("data", "bw")
os.makedirs(bw_dir, exist_ok=True)

bw = pyBigWig.open(join(bw_dir, "output.bw"), "w")
try:
    bw.addHeader(chrom_name_to_length)

    # Add entries chromosome by chromosome, in the same order as the header.
    for chr_name in chromosomes:
        chr_tf = chromatin_peaks_df["chr_name"] == str(chr_name)
        chr_peaks_df = chromatin_peaks_df.loc[chr_tf]
        if chr_peaks_df.shape[0] > 0:
            chr_names = chr_peaks_df["chr_name"].values.tolist()
            chr_starts = chr_peaks_df["chr_start"].values.tolist()
            chr_ends = chr_peaks_df["chr_end"].values.tolist()
            # Select matrix rows by the peaks' original row labels, not by
            # the boolean mask: the peaks frame may have lost rows to
            # dropna(), which would make the mask shorter than the matrix.
            # This relies on the frame keeping the 0-based row labels it
            # was given when the peaks file was read.
            chr_rows = chr_peaks_df.index.to_numpy()
            chr_values = chromatin_counts_mtx[chr_rows, 0].astype(np.double).tolist()
            bw.addEntries(chr_names, chr_starts, ends=chr_ends, values=chr_values)
finally:
    # Always close the handle, even if writing failed part-way.
    bw.close()