In [1]:
import glob
import time

import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as sce
import pynndescent
import seaborn as sns
from ALLCools.clustering import significant_pc_test, tsne
from ALLCools.integration import SeuratIntegration
from scipy.sparse import csr_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from wmb import aibs, brain, broad, cemba, mm10

In [2]:
# Label for the cell group handled by this notebook. No other cell visible in
# this notebook reads it. NOTE(review): "HB" is presumably a brain-region
# abbreviation — confirm, or drop the variable if it is truly unused.
group_name = "HB"

In [3]:
def dump_embedding(adata, name, n_dim=2):
    """Copy the leading columns of an embedding from ``obsm`` into ``obs``.

    Parameters
    ----------
    adata
        Object exposing ``obs`` (column-assignable table) and ``obsm``
        (mapping of embedding name to a 2-D array).
    name
        Embedding name; coordinates are read from ``adata.obsm[f"X_{name}"]``
        and written to ``adata.obs[f"{name}_{i}"]``.
    n_dim
        Number of leading embedding columns to copy (default 2).

    Returns
    -------
    The same ``adata`` object, modified in place.
    """
    obsm_key = f"X_{name}"
    for axis in range(n_dim):
        obs_column = f"{name}_{axis}"
        # One obs column per embedding axis: "<name>_0", "<name>_1", ...
        adata.obs[obs_column] = adata.obsm[obsm_key][:, axis]
    return adata

In [4]:
# Load the reference dataset and drop near-constant genes.
ref_adata = anndata.read_h5ad("aibs_10x.h5ad")

# Column-wise (axis=0) first and second moments of X; `.A1` flattens the
# matrix-shaped result into a 1-D array so it can be stored in `.var`.
ref_gene_mean = ref_adata.X.mean(axis=0).A1
ref_gene_sq_mean = ref_adata.X.multiply(ref_adata.X).mean(axis=0).A1

ref_adata.var['mean'] = ref_gene_mean
# NOTE: despite the column name, this is the variance E[x^2] - E[x]^2,
# not its square root.
ref_adata.var['std'] = ref_gene_sq_mean - (ref_adata.var['mean'].values ** 2)
print(ref_adata.var['std'].min())

# Keep only genes whose variance exceeds 1e-5.
ref_keep_gene = ref_adata.var['std'] > 1e-5
ref_adata = ref_adata[:, ref_keep_gene].copy()


1.3148029e-05


In [5]:
# Load the query dataset and drop near-constant genes.
qry_adata = anndata.read_h5ad("merfish.h5ad")

# Column-wise (axis=0) first and second moments of X; `.A1` flattens the
# matrix-shaped result into a 1-D array so it can be stored in `.var`.
qry_gene_mean = qry_adata.X.mean(axis=0).A1
qry_gene_sq_mean = qry_adata.X.multiply(qry_adata.X).mean(axis=0).A1

qry_adata.var['mean'] = qry_gene_mean
# NOTE: despite the column name, this is the variance E[x^2] - E[x]^2,
# not its square root.
qry_adata.var['std'] = qry_gene_sq_mean - (qry_adata.var['mean'].values ** 2)
print(qry_adata.var['std'].min())

# Keep only genes whose variance exceeds 1e-5.
qry_keep_gene = qry_adata.var['std'] > 1e-5
qry_adata = qry_adata[:, qry_keep_gene].copy()


0.004044262575437655


In [6]:
# Total number of cells across both datasets (used to size the merged object).
ncell = sum(dataset.shape[0] for dataset in (ref_adata, qry_adata))

# Significance test on the reference "X_pca"; update=False is passed so the
# call is used only for its return value here.
n_significant = significant_pc_test(ref_adata, p_cutoff=0.1, update=False, obsm="X_pca")

n_ref_cells, n_ref_genes = ref_adata.shape
n_qry_cells = qry_adata.shape[0]

# Number of canonical components: capped by 50, the significant-PC count,
# both dataset sizes and a fifth of the reference gene count; never below 5.
cc_upper_bounds = (50, n_significant, n_ref_cells - 1, n_qry_cells - 1, n_ref_genes // 5)
ncc = max(min(cc_upper_bounds), 5)

# Number of PCs kept for integration: ncc + 10, capped by 50, the reference
# size and the number of PCs actually available in the reference.
npc = min(50, ncc + 10, n_ref_cells - 1, ref_adata.obsm["X_pca"].shape[1])
print(npc, ncc, n_ref_cells, n_qry_cells)


Downsample PC matrix to 50000 cells to calculate significant PC components
33 components passed P cutoff of 0.1.
43 33 67701 8410


In [7]:
# Truncate each dataset's PCA matrix to the first `npc` columns and rescale
# every row with sklearn's `normalize` (axis=1, default norm).
for dataset in (ref_adata, qry_adata):
    leading_pcs = dataset.obsm["X_pca"][:, :npc]
    dataset.obsm["X_pca"] = normalize(leading_pcs, axis=1)
    

In [8]:
# Tag every cell with the dataset it came from; later cells select rows by
# this "Study" column.
for dataset, study_label in ((qry_adata, "MERFISH"), (ref_adata, "AIBS_10x")):
    dataset.obs["Study"] = study_label

In [9]:
# Merged container: obs rows are reference cells followed by query cells.
# X is a placeholder column of ones; only obs/obsm are filled in later cells.
merged_placeholder_x = np.ones((ncell, 1))
merged_obs = pd.concat([ref_adata.obs, qry_adata.obs], axis=0)
adata_merge = anndata.AnnData(X=merged_placeholder_x, obs=merged_obs)

In [10]:
# --- Step 1: find anchors between reference (index 0) and query (index 1) ---
integrator = SeuratIntegration()
adata_list = [ref_adata, qry_adata]

# Settings previously tried and left disabled in this notebook:
#   k_filter=min(200, ref_adata.shape[0] // 10)
#   scale=[False, True]
anchor_kwargs = dict(
    k_local=None,
    key_local="X_pca",
    k_anchor=5,
    key_anchor="X",
    dim_red="cca",
    max_cc_cells=50000,
    k_score=30,
    k_filter=None,
    scale1=True,
    scale2=True,
    n_components=ncc,
    n_features=min(200, ncc * 10),
    alignments=[[[0], [1]]],
)
start_time = time.time()
integrator.find_anchor(adata_list, **anchor_kwargs)
print(time.time() - start_time)  # wall-clock seconds spent finding anchors

# --- Step 2: correct the PCA embeddings using the anchors ---
# k_weight may not exceed the number of rows in the (0, 1) anchor table.
n_anchor_rows = integrator.anchor[(0, 1)].shape[0]
integrate_kwargs = dict(
    key_correct="X_pca",
    row_normalize=True,
    n_components=npc,
    k_weight=min(100, n_anchor_rows),
    sd=1,
    alignments=[[[0], [1]]],
)
start_time = time.time()
corrected = integrator.integrate(**integrate_kwargs)
print(time.time() - start_time)  # wall-clock seconds spent integrating

# --- Step 3: persist the integrator and store the corrected embedding ---
integrator.save("integration_aibs10x_merfish")
# `corrected` is concatenated along rows; presumably one array per dataset in
# the same order as `adata_list`, matching the row order of adata_merge.obs
# — TODO confirm against SeuratIntegration.integrate.
adata_merge.obsm["X_pca_corrected"] = np.concatenate(corrected, axis=0)


Find anchors across datasets.
Run CCA
non zero dims 33
Find Anchors using k=30
Score Anchors
Identified 26280 anchors between datasets 0 and 1.
150.28015446662903
Merge datasets
[[0], [1]]
Initialize
Find nearest anchors. k_weight:  100
Normalize graph
Transform data
18.42306637763977


In [11]:
# Run t-SNE on the integrated PCs and keep the coordinates in obs and obsm.
start_time = time.time()

tsne_options = {
    "obsm": "X_pca_corrected",
    "metric": "euclidean",
    "exaggeration": -1,
    "perplexity": 50,
    "n_jobs": -1,
}
tsne(adata_merge, **tsne_options)

# Expose the first two t-SNE axes as obs columns "tsne_0" / "tsne_1".
dump_embedding(adata_merge, name="tsne")

# Keep a copy under a key that records the number of PCs used, so a later
# t-SNE run overwriting "X_tsne" does not lose this result.
snapshot_key = f"u{npc}seurat_tsne"
adata_merge.obsm[snapshot_key] = adata_merge.obsm["X_tsne"].copy()

print(time.time() - start_time)  # wall-clock seconds for the whole cell


331.5852816104889


In [12]:
# Make the L1-L3 label columns plain strings. For AIBS_10x rows the labels are
# additionally passed through float -> int so that values such as "12.0"
# become "12" (presumably they were stored as floats — confirm upstream).
is_aibs_cell = adata_merge.obs["Study"] == "AIBS_10x"
for level in ("L1", "L2", "L3"):
    adata_merge.obs[level] = adata_merge.obs[level].astype(str)
    aibs_labels = adata_merge.obs.loc[is_aibs_cell, level]
    adata_merge.obs.loc[is_aibs_cell, level] = (
        aibs_labels.astype(float).astype(int).astype(str)
    )

In [13]:
# Cast every object-dtype obs column to str before writing the merged result
# to disk.
is_object_column = adata_merge.obs.dtypes == "object"
stringified_columns = adata_merge.obs.loc[:, is_object_column].astype(str)
adata_merge.obs.loc[:, is_object_column] = stringified_columns

adata_merge.write_h5ad("aibs10xmerfish.h5ad")
