### Visualize the UMAP embeddings across the different integration methods and save a metadata file for calculating the LISI integration metric

In [1]:
import scanpy as sc
from collections import Counter
import anndata as ad
import numpy as np

import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
import gc

In [2]:
# Switch on scanpy's `_vector_friendly` plotting flag through the public `sc.settings` alias.
# NOTE(review): presumably this rasterizes dense scatter layers so exported PDF/PS figures
# stay light while text remains editable — confirm against the scanpy settings docs.
sc.settings._vector_friendly = True

### scVI

In [3]:
%%time
# Read the scVI result file and expose the first two columns of its stored
# `X_umap` array as plain `.obs` columns, so they are carried along when the
# metadata table is exported.
scvi_adata = sc.read_h5ad("04A_scvi_sc_sn_combined.h5ad")
scvi_umap = scvi_adata.obsm['X_umap']
for umap_axis, obs_column in enumerate(('scvi_UMAP1', 'scvi_UMAP2')):
    scvi_adata.obs[obs_column] = scvi_umap[:, umap_axis]

CPU times: user 1.38 s, sys: 14.4 s, total: 15.8 s
Wall time: 15.9 s


In [4]:
# Keep every `.obs` column from the scVI object and turn the observation index
# into a regular column; this table is the left-hand side of the later merges.
scvi_adata_metadata = scvi_adata.obs.reset_index(drop=False)

### CellANOVA

In [5]:
# Load the CellANOVA output file; its 'denoised' layer, `.obs` and `.var_names`
# are used further down to build the object that gets embedded.
CellANOVA_adata = sc.read_h5ad("04B_post_CellANOVA.h5ad")

In [6]:
%%time
# Build a fresh AnnData whose X is CellANOVA's 'denoised' layer, then compute
# a neighbor graph and UMAP on it.
# The matrix is cast to float32 up front rather than passing `dtype=` to the
# AnnData constructor, because anndata has deprecated that constructor argument.
denoised_matrix = CellANOVA_adata.layers['denoised'].astype(np.float32)
integrated = ad.AnnData(denoised_matrix)
# Carry over the cell annotations and gene names from the source object.
integrated.obs = CellANOVA_adata.obs.copy()
integrated.var_names = CellANOVA_adata.var_names
# No PCA is stored on this new object, so scanpy falls back to `sc.pp.pca`
# with default parameters (see the message printed below this cell).
sc.pp.neighbors(integrated, n_neighbors=15, n_pcs=30)
sc.tl.umap(integrated)

         Falling back to preprocessing with `sc.pp.pca` and default params.


CPU times: user 16min 5s, sys: 31min 36s, total: 47min 41s
Wall time: 4min 15s


In [7]:
# Copy the first two columns of the freshly computed UMAP into `.obs` under
# CellANOVA-specific names.
cellanova_umap = integrated.obsm['X_umap']
for umap_axis, obs_column in enumerate(('CellANOVA_UMAP1', 'CellANOVA_UMAP2')):
    integrated.obs[obs_column] = cellanova_umap[:, umap_axis]

In [8]:
# Reduce the CellANOVA annotations to the merge key plus the two UMAP columns;
# the remaining metadata comes from the scVI table.
cellanova_keep = ["index", "CellANOVA_UMAP1", "CellANOVA_UMAP2"]
CellANOVA_metadata = integrated.obs.reset_index()[cellanova_keep]

### Harmony 

In [9]:
# Read the Harmony result file and store the first two columns of its
# `X_umap` array as `.obs` columns.
harmony_adata = sc.read_h5ad("04C_harmony_integrated_adata.h5ad")
harmony_umap = harmony_adata.obsm['X_umap']
harmony_adata.obs['harmony_UMAP1'] = harmony_umap[:, 0]
harmony_adata.obs['harmony_UMAP2'] = harmony_umap[:, 1]

In [10]:
# Keep only the merge key and the Harmony UMAP coordinates.
harmony_keep = ["index", "harmony_UMAP1", "harmony_UMAP2"]
harmony_metadata = harmony_adata.obs.reset_index()[harmony_keep]

### No integration

In [11]:
# Read the file saved without batch correction and store the first two columns
# of its `X_umap` array as `.obs` columns.
no_integration_adata = sc.read_h5ad("04D_adata_without_batch_correction.h5ad")
no_integration_umap = no_integration_adata.obsm['X_umap']
no_integration_adata.obs['no_integration_UMAP1'] = no_integration_umap[:, 0]
no_integration_adata.obs['no_integration_UMAP2'] = no_integration_umap[:, 1]
# Show the object summary as the cell output.
no_integration_adata

AnnData object with n_obs × n_vars = 121628 × 16115
    obs: 'age', 'donor_id', 'sex', 'region', 'cell_type', 'consistent_cell_type', 'study', 'technology', 'cell_or_nuclei', 'tech_plus_study', '_scvi_batch', '_scvi_labels', 'leiden_scVI', 'scvi_cell_type', 'no_integration_UMAP1', 'no_integration_UMAP2'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'
    uns: 'cell_or_nuclei_colors', 'consistent_cell_type_colors', 'hvg', 'log1p', 'neighbors', 'pca', 'study_colors', 'technology_colors', 'umap'
    obsm: 'X_pca', 'X_scVI', 'X_umap', '_scvi_extra_categorical_covs'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [12]:
# Keep only the merge key and the un-integrated UMAP coordinates.
no_integration_keep = ["index", "no_integration_UMAP1", "no_integration_UMAP2"]
no_integration_metadata = no_integration_adata.obs.reset_index()[no_integration_keep]

### Combine the UMAP coordinates from all methods and save the metadata

In [13]:
# Attach each method's UMAP coordinates to the scVI metadata, matching rows on
# the "index" column (the former observation index).
# `validate="one_to_one"` makes pandas raise a MergeError if "index" is
# duplicated in either table, instead of silently multiplying rows.
# The joins are inner joins (pandas' default), so a cell missing from any of
# the tables is dropped from the result.
all_metadata = (
    scvi_adata_metadata
    .merge(CellANOVA_metadata, on="index", validate="one_to_one")
    .merge(harmony_metadata, on="index", validate="one_to_one")
    .merge(no_integration_metadata, on="index", validate="one_to_one")
)

In [14]:
# Write the combined table to CSV for the LISI calculation. The DataFrame's own
# row index is written as the first (unnamed) column, as with pandas' defaults.
lisi_metadata_csv = "05_adata_metadata_for_LISI.csv"
all_metadata.to_csv(lisi_metadata_csv)

In [15]:
# Display the column names of the combined table to confirm that the UMAP
# coordinate columns of all four methods are present.
all_metadata.columns

Index(['index', 'age', 'donor_id', 'sex', 'region', 'cell_type',
       'consistent_cell_type', 'study', 'technology', 'cell_or_nuclei',
       'tech_plus_study', '_scvi_batch', '_scvi_labels', 'leiden_scVI',
       'scvi_cell_type', 'scvi_UMAP1', 'scvi_UMAP2', 'CellANOVA_UMAP1',
       'CellANOVA_UMAP2', 'harmony_UMAP1', 'harmony_UMAP2',
       'no_integration_UMAP1', 'no_integration_UMAP2'],
      dtype='object')