In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
from os.path import join
from anndata import AnnData
from mudata import MuData
from muon import prot as pt

In [None]:
# Print the kernel's working directory: every data path in this notebook is
# built relative to it (join("..", "data", ...)).
!pwd

In [None]:
rna_df = pd.read_csv(join("..", "data", "raw", "GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz"))

In [None]:
rna_df = rna_df.rename(columns={'Unnamed: 0': "gene_id"})

In [None]:
rna_df = rna_df.loc[rna_df["gene_id"].str.startswith("HUMAN_")]

In [None]:
rna_df["gene_id"] = rna_df["gene_id"].apply(lambda gene_name: gene_name[len("HUMAN_"):])
rna_df = rna_df.set_index("gene_id")

In [None]:
adt_df = pd.read_csv(join("..", "data", "raw", "GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv.gz"), index_col=0)

In [None]:
rna_adata = AnnData(X=rna_df.T.values, obs=pd.DataFrame(index=rna_df.columns.values, data=[]), var=pd.DataFrame(index=rna_df.index.values, data=[]))
rna_adata

In [None]:
adt_adata = AnnData(X=adt_df.T.values, obs=pd.DataFrame(index=adt_df.columns.values, data=[]), var=pd.DataFrame(index=adt_df.index.values, data=[]))
adt_adata

In [None]:
mdata = MuData({ "rna": rna_adata, "adt": adt_adata })
mdata

In [None]:
rna = mdata.mod["rna"]
adt = mdata.mod["adt"]

In [None]:
sc.pp.normalize_total(rna, target_sum=1e4)

In [None]:
sc.pp.log1p(rna)

In [None]:
sc.pp.highly_variable_genes(rna)

In [None]:
rna.raw = rna

In [None]:
rna = rna[:, rna.var.highly_variable]

In [None]:
sc.pp.scale(rna, max_value=10)

In [None]:
# Principal component analysis of the RNA matrix using the ARPACK solver.
sc.tl.pca(rna, svd_solver='arpack')

In [None]:
# Quick visual check of the PCA, colored by the CST3 gene.
sc.pl.pca(rna, color='CST3')

In [None]:
# Neighborhood graph from the first 30 principal components, 10 neighbors per cell.
sc.pp.neighbors(rna, n_neighbors=10, n_pcs=30)

In [None]:
# UMAP embedding; its result is read back later from rna.obsm["X_umap"].
sc.tl.umap(rna)

In [None]:
# Leiden clustering; its labels are read back later from rna.obs["leiden"].
sc.tl.leiden(rna)

In [None]:
mdata.mod["rna"] = rna

In [None]:
pt.pp.clr(adt)

In [None]:
adt.obsm["X_umap"] = rna.obsm["X_umap"]
adt.obs["leiden"] = rna.obs["leiden"]

In [None]:
mdata

In [None]:
# Persist the whole MuData container as a Zarr store under data/processed.
output_zarr_path = join("..", "data", "processed", "GSE100866_CBMC_8K_13AB_10X.mdata.zarr")
mdata.write_zarr(output_zarr_path)

In [None]:
from vitessce import (
    VitessceConfig,
    ViewType as vt,
    CoordinationType as ct,
    FileType as ft,
    AnnDataWrapper,
)

In [None]:
vc = VitessceConfig(schema_version="1.0.15", name='CITE-seq example', description='RNA+ADT')

In [None]:
rna_zarr = join("..", "data", "processed", "GSE100866_CBMC_8K_13AB_10X.mdata.zarr", "mod", "rna")
adt_zarr = join("..", "data", "processed", "GSE100866_CBMC_8K_13AB_10X.mdata.zarr", "mod", "adt")

In [None]:
# Register the dataset, then attach one AnnDataWrapper per modality.
dataset = vc.add_dataset(name='CBMC 8K')

# First object: the cell-by-gene matrix and its metadata (adata_path=rna_zarr).
# The coordination values state explicitly that these features are genes
# and that the values are gene expression.
rna_wrapper = AnnDataWrapper(
    adata_path=rna_zarr,
    obs_embedding_paths=["obsm/X_umap", "obsm/X_pca"],
    obs_embedding_names=["UMAP", "PCA"],
    obs_set_paths=["obs/leiden"],
    obs_set_names=["Leiden Clusters"],
    obs_feature_matrix_path="X",
    coordination_values=dict(featureType="gene", featureValueType="expression"),
)
dataset = dataset.add_object(rna_wrapper)

# Second object: the cell-by-ADT matrix and its metadata (adata_path=adt_zarr).
# These features are not genes, so alternate coordination values are given.
adt_wrapper = AnnDataWrapper(
    adata_path=adt_zarr,
    obs_embedding_paths=["obsm/X_umap"],
    obs_embedding_names=["UMAP"],
    obs_set_paths=["obs/leiden"],
    obs_set_names=["Leiden Clusters"],
    obs_feature_matrix_path="X",
    coordination_values=dict(featureType="tag", featureValueType="count"),
)
dataset = dataset.add_object(adt_wrapper)

In [None]:
# Every view type is created twice, first for the RNA modality and then for
# the ADT modality; the two are told apart later through coordination.
umap_scatterplot_by_rna, umap_scatterplot_by_adt = [
    vc.add_view(vt.SCATTERPLOT, dataset=dataset, mapping="UMAP")
    for _ in range(2)
]

gene_list, protein_list = [
    vc.add_view(vt.FEATURE_LIST, dataset=dataset)
    for _ in range(2)
]

rna_heatmap, adt_heatmap = [
    vc.add_view(vt.HEATMAP, dataset=dataset).set_props(transpose=True)
    for _ in range(2)
]

In [None]:
# Each plot must be told which feature kind (genes or tags) it shows.
# Feature selection is scoped to the plots of one modality only, and the
# color mapping is kept independent per modality.
coordination_types = [
    ct.FEATURE_TYPE,
    ct.FEATURE_VALUE_TYPE,
    ct.FEATURE_SELECTION,
    ct.OBS_COLOR_ENCODING,
    ct.FEATURE_VALUE_COLORMAP_RANGE,
]
# One value per coordination type above, in the same order.
rna_coordination_values = ["gene", "expression", None, 'cellSetSelection', [0.0, 0.3]]
adt_coordination_values = ["tag", "count", None, 'cellSetSelection', [0.0, 1.0]]

rna_views = [umap_scatterplot_by_rna, gene_list, rna_heatmap]
adt_views = [umap_scatterplot_by_adt, protein_list, adt_heatmap]
vc.link_views(rna_views, coordination_types, rna_coordination_values)
vc.link_views(adt_views, coordination_types, adt_coordination_values)

# Share zoom level and (X, Y) center between the two scatterplots so that
# zooming and panning stay coordinated.
scatterplots = [umap_scatterplot_by_rna, umap_scatterplot_by_adt]
embedding_types = [ct.EMBEDDING_ZOOM, ct.EMBEDDING_TARGET_X, ct.EMBEDDING_TARGET_Y]
vc.link_views(scatterplots, embedding_types, [3, 0, 0])

In [None]:
# Two-row layout: the top row holds the three gene-related views and the
# bottom row holds the three ADT-related views.
rna_row = rna_heatmap | (umap_scatterplot_by_rna | gene_list)
adt_row = adt_heatmap | (umap_scatterplot_by_adt | protein_list)
vc.layout(rna_row / adt_row);

In [None]:
# Build the widget from the configuration and display it as the cell output.
vw = vc.widget()
vw

In [None]:
# NOTE(review): presumably serves the same configuration through the Vitessce
# web application rather than the inline widget; confirm against the vitessce docs.
vc.web_app()