Data from https://www.10xgenomics.com/resources/datasets/frozen-human-healthy-brain-tissue-3-k-1-standard-2-0-0

Using the following outputs from Cell Ranger ARC:
- [Filtered feature barcode matrix MEX (DIR)](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/output/matrices): `data/human_brain_3k_filtered_feature_bc_matrix.tar.gz`
- [ATAC smoothed transposition site track (BIGWIG)](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/output/bigwig): `data/human_brain_3k_atac_cut_sites.bigwig`
- [Secondary analysis outputs (DIR)](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/output/analysis): `data/human_brain_3k_analysis.tar.gz`

In [None]:
# Unpack the filtered feature-barcode matrix archive into the data/ directory.
!tar -xvzf data/human_brain_3k_filtered_feature_bc_matrix.tar.gz -C data

In [None]:
# Unpack the secondary analysis outputs (including the UMAP projections read below) into data/.
!tar -xvzf data/human_brain_3k_analysis.tar.gz -C data

In [14]:
import scipy.io
import pandas as pd
import numpy as np
from anndata import AnnData
from mudata import MuData
import scanpy as sc
import muon as mu
from os.path import join
from vitessce.data_utils import VAR_CHUNK_SIZE

In [3]:
# Locations of the extracted MEX files (feature and barcode tables).
matrix_dir = "./data/filtered_feature_bc_matrix"
features_path, barcodes_path = (
    join(matrix_dir, file_name)
    for file_name in ("features.tsv.gz", "barcodes.tsv.gz")
)

In [4]:
# Read the Matrix Market file from the extracted matrix directory.
# (The feature and barcode tables are loaded separately with pandas.)
matrix_path = join(matrix_dir, "matrix.mtx.gz")
mat = scipy.io.mmread(matrix_path)

In [5]:
# The file stores features x barcodes; transpose to cells x features.
# `toarray()` yields a plain ndarray, whereas `todense()` returns the legacy
# `np.matrix` class that NumPy no longer recommends.
mat_dense = mat.toarray().T

In [6]:
# features.tsv.gz has no header row; give the positional columns descriptive names.
feature_column_names = {
    0: "feature_id",
    1: "gene_name",
    2: "feature_type",
    3: "chr",
    4: "chr_start",
    5: "chr_end",
}
feature_ids = (
    pd.read_csv(features_path, delimiter="\t", header=None)
    .rename(columns=feature_column_names)
)

In [7]:
# Preview the feature table: gene expression rows and peak rows share one table.
feature_ids

Unnamed: 0,feature_id,gene_name,feature_type,chr,chr_start,chr_end
0,ENSG00000243485,MIR1302-2HG,Gene Expression,chr1,29553,30267
1,ENSG00000237613,FAM138A,Gene Expression,chr1,36080,36081
2,ENSG00000186092,OR4F5,Gene Expression,chr1,65418,69055
3,ENSG00000238009,AL627309.1,Gene Expression,chr1,120931,133723
4,ENSG00000239945,AL627309.3,Gene Expression,chr1,91104,91105
...,...,...,...,...,...,...
170626,KI270713.1:21467-22401,KI270713.1:21467-22401,Peaks,KI270713.1,21467,22401
170627,KI270713.1:25956-26766,KI270713.1:25956-26766,Peaks,KI270713.1,25956,26766
170628,KI270713.1:29714-30467,KI270713.1:29714-30467,Peaks,KI270713.1,29714,30467
170629,KI270713.1:31270-32183,KI270713.1:31270-32183,Peaks,KI270713.1,31270,32183


In [8]:
# barcodes.tsv.gz is a single header-less column; use it as the index of an
# otherwise empty per-cell table.
barcodes = (
    pd.read_csv(barcodes_path, delimiter="\t", header=None)
    .rename(columns={0: "barcode_id"})
    .set_index("barcode_id")
)
barcodes

AAACAGCCATAGACTT-1
AAACAGCCATTATGCG-1
AAACCAACATAGACCC-1
AAACCGAAGATGCCTG-1
AAACCGAAGTTAGCTA-1
...
TTTGTGGCAAGGAATC-1
TTTGTGGCATGCTTAG-1
TTTGTGTTCGTTACAA-1
TTTGTTGGTGATCAGC-1
TTTGTTGGTGATTTGG-1


In [9]:
# Number of features (genes + peaks) and number of annotation columns.
feature_ids.shape

(170631, 6)

In [10]:
# Number of cell barcodes; zero columns because the barcode itself is the index.
barcodes.shape

(3233, 0)

In [11]:
# The transposed matrix must be cells x features so that its rows and columns
# line up with the barcode and feature tables used to build the AnnData objects.
assert mat_dense.shape == (barcodes.shape[0], feature_ids.shape[0]), \
    "matrix dimensions do not match the barcode/feature tables"
mat_dense.shape

(3233, 170631)

In [12]:
# Root of the extracted secondary analysis outputs.
analysis_dir = "./data/analysis"

In [13]:
# UMAP projection computed from gene expression, indexed by barcode.
rna_umap_path = join(analysis_dir, "dimensionality_reduction", "gex", "umap_projection.csv")
rna_umap_df = pd.read_csv(rna_umap_path, index_col=0)
rna_umap_df

Unnamed: 0_level_0,UMAP-1,UMAP-2
Barcode,Unnamed: 1_level_1,Unnamed: 2_level_1
AAACAGCCATAGACTT-1,-0.273889,9.657905
AAACAGCCATTATGCG-1,3.748773,0.629261
AAACCAACATAGACCC-1,4.531679,1.283160
AAACCGAAGATGCCTG-1,-4.961755,-6.963626
AAACCGAAGTTAGCTA-1,8.089037,-4.640206
...,...,...
TTTGTGGCAAGGAATC-1,-11.057120,1.212038
TTTGTGGCATGCTTAG-1,4.799424,0.946893
TTTGTGTTCGTTACAA-1,-4.835887,0.916090
TTTGTTGGTGATCAGC-1,8.860142,-6.243071


In [16]:
# UMAP projection computed from the ATAC data, indexed by barcode.
atac_umap_path = join(analysis_dir, "dimensionality_reduction", "atac", "umap_projection.csv")
atac_umap_df = pd.read_csv(atac_umap_path, index_col=0)
atac_umap_df

Unnamed: 0_level_0,UMAP-1,UMAP-2
Barcode,Unnamed: 1_level_1,Unnamed: 2_level_1
AAACAGCCATAGACTT-1,9.978963,-2.076166
AAACAGCCATTATGCG-1,7.497211,-0.688258
AAACCAACATAGACCC-1,8.021842,0.946582
AAACCGAAGATGCCTG-1,-6.601482,-1.949825
AAACCGAAGTTAGCTA-1,8.294034,-0.011516
...,...,...
TTTGTGGCAAGGAATC-1,-9.765430,2.593243
TTTGTGGCATGCTTAG-1,10.152254,-2.931405
TTTGTGTTCGTTACAA-1,-10.551735,5.741568
TTTGTTGGTGATCAGC-1,7.540642,1.111266


In [17]:
# Boolean masks over the feature table: gene expression rows vs. everything else
# (the remaining rows are treated as ATAC peaks).
feature_type = feature_ids["feature_type"]
is_rna = feature_type.eq("Gene Expression")
is_atac = ~is_rna

In [18]:
# Give each modality its own copy of the barcode table so the two AnnData
# objects never share (and possibly mutate) the same DataFrame.
rna_obs_df = barcodes.copy()
atac_obs_df = barcodes.copy()

# Per-modality feature tables, indexed by the name column.
rna_var_df = feature_ids.loc[is_rna].set_index("gene_name")
atac_var_df = feature_ids.loc[is_atac].set_index("gene_name")

# Align the UMAP rows to the barcode order of the matrix by label rather than
# relying on the CSV happening to be in the same order. `.loc` raises a
# KeyError if any barcode has no projection.
rna_obsm = { "X_umap": rna_umap_df.loc[barcodes.index].values }
atac_obsm = { "X_umap": atac_umap_df.loc[barcodes.index].values }

In [19]:
# Build the cell-by-gene AnnData object.
rna = AnnData(X=mat_dense[:, is_rna], obs=rna_obs_df, obsm=rna_obsm, var=rna_var_df)
# Gene symbols are not unique (AnnData warns about duplicate var names), so
# de-duplicate them; otherwise selecting a feature by name is ambiguous.
rna.var_names_make_unique()
rna

  rna = AnnData(X=mat_dense[:, is_rna], obs=rna_obs_df, obsm=rna_obsm, var=rna_var_df)
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 3233 × 36601
    var: 'feature_id', 'feature_type', 'chr', 'chr_start', 'chr_end'
    obsm: 'X_umap'

In [20]:
# Build the cell-by-peak AnnData object from the non-gene-expression columns.
atac = AnnData(
    X=mat_dense[:, is_atac],
    obs=atac_obs_df,
    var=atac_var_df,
    obsm=atac_obsm,
)
atac

  atac = AnnData(X=mat_dense[:, is_atac], obs=atac_obs_df, obsm=atac_obsm, var=atac_var_df)


AnnData object with n_obs × n_vars = 3233 × 134030
    var: 'feature_id', 'feature_type', 'chr', 'chr_start', 'chr_end'
    obsm: 'X_umap'

In [21]:
# Output Zarr store paths, one per modality.
rna_zarr, atac_zarr = (
    join("data", f"multiome_example.{modality}.h5ad.zarr")
    for modality in ("rna", "atac")
)

In [22]:
# Write each modality to its Zarr store. Each chunk spans every cell (row) and
# VAR_CHUNK_SIZE features (columns).
for adata, zarr_path in ((rna, rna_zarr), (atac, atac_zarr)):
    n_obs = adata.shape[0]
    adata.write_zarr(zarr_path, chunks=(n_obs, VAR_CHUNK_SIZE))

In [23]:
from vitessce import (
    VitessceConfig,
    ViewType as vt,
    CoordinationType as ct,
    FileType as ft,
    AnnDataWrapper,
    OmeTiffWrapper,
)

In [24]:
# Create an empty Vitessce configuration for the multiome dataset.
vc = VitessceConfig(
    schema_version="1.0.15",
    name="Multiome data",
    description="RNA+ATAC",
)

In [25]:
dataset = vc.add_dataset(name='RNA+ATAC').add_object(AnnDataWrapper(
    # We run add_object with adata_path=rna_zarr first to add the cell-by-gene matrix and associated metadata.
    adata_path=rna_zarr,
    obs_embedding_paths=["obsm/X_umap"],
    obs_embedding_names=["UMAP"],
    #obs_set_paths=["obs/seurat_clusters"],
    #obs_set_names=["Seurat Clusters"],
    obs_feature_matrix_path="X",
    # To be explicit that the features represent genes and gene expression, we specify that here.
    coordination_values={
        "featureType": "gene",
        "featureValueType": "expression"
    }
)).add_object(AnnDataWrapper(
    # We next run add_object with adata_path=adt_zarr to add the cell-by-ADT matrix and associated metadata.
    adata_path=atac_zarr,
    obs_embedding_paths=["obsm/X_umap"],
    obs_embedding_names=["UMAP"],
    #obs_set_paths=["obs/seurat_clusters"],
    #obs_set_names=["Seurat Clusters"],
    obs_feature_matrix_path="X",
    # If the features do not represent genes and gene expression, we specify alternate values here.
    coordination_values={
        "featureType": "peak",
        "featureValueType": "count"
    }
))

In [26]:
umap_scatterplot_by_rna = vc.add_view(vt.SCATTERPLOT, dataset=dataset, mapping="UMAP")
umap_scatterplot_by_atac = vc.add_view(vt.SCATTERPLOT, dataset=dataset, mapping="UMAP")

gene_list = vc.add_view(vt.FEATURE_LIST, dataset=dataset)
peak_list = vc.add_view(vt.FEATURE_LIST, dataset=dataset)

rna_heatmap = vc.add_view(vt.HEATMAP, dataset=dataset).set_props(transpose=True)
atac_heatmap = vc.add_view(vt.HEATMAP, dataset=dataset).set_props(transpose=True)

In [27]:
# Each plot must be told which feature type (genes or peaks) it displays.
# Feature selections are scoped per modality so that picking a gene does not
# affect the peak views (and vice versa), and each modality keeps its own
# color encoding and colormap range.
coordination_types = [
    ct.FEATURE_TYPE,
    ct.FEATURE_VALUE_TYPE,
    ct.FEATURE_SELECTION,
    ct.OBS_COLOR_ENCODING,
    ct.FEATURE_VALUE_COLORMAP_RANGE,
]
rna_views = [umap_scatterplot_by_rna, gene_list, rna_heatmap]
rna_coordination_values = ["gene", "expression", None, 'cellSetSelection', [0.0, 0.3]]
vc.link_views(rna_views, coordination_types, rna_coordination_values)

atac_views = [umap_scatterplot_by_atac, peak_list, atac_heatmap]
atac_coordination_values = ["peak", "count", None, 'cellSetSelection', [0.0, 1.0]]
vc.link_views(atac_views, coordination_types, atac_coordination_values)

# Link the two scatterplots on zoom level and (X, Y) center so that zooming/panning is coordinated.
vc.link_views(
    [umap_scatterplot_by_rna, umap_scatterplot_by_atac],
    [ct.EMBEDDING_ZOOM, ct.EMBEDDING_TARGET_X, ct.EMBEDDING_TARGET_Y],
    [3, 0, 0],
)

<vitessce.config.VitessceConfig at 0x7ffc383ed790>

In [28]:
# Two-row layout: the first row holds the three gene-related views,
# the second row holds the three peak (ATAC)-related views.
rna_row = rna_heatmap | (umap_scatterplot_by_rna | gene_list)
atac_row = atac_heatmap | (umap_scatterplot_by_atac | peak_list)
vc.layout(rna_row / atac_row);

In [29]:
# Render the configured views as an interactive widget in the notebook.
vw = vc.widget()
vw

VitessceWidget(config={'version': '1.0.15', 'name': 'Multiome data', 'description': 'RNA+ATAC', 'datasets': [{…