In [1]:
import anndata
import numpy as np
from sklearn.decomposition import TruncatedSVD


In [2]:
# Group label for this run. It is not referenced by any other cell shown in this
# notebook -- presumably kept for bookkeeping (TODO confirm before removing).
group_name = "HB"

In [3]:
# Read the reference AnnData object from disk, then show its summary as the
# cell output.
ref_h5ad_path = 'aibs_10x.h5ad'
ref_adata = anndata.read_h5ad(ref_h5ad_path)
ref_adata

AnnData object with n_obs × n_vars = 67701 × 487
    obs: 'count', 'umi_count', 'L1', 'L2', 'L3', 'L1_annot', 'L2_annot', 'DissectionRegion', 'SubRegion', 'MajorRegion', 'Train', 'cocluster'
    var: 'name'
    uns: 'log1p'
    obsm: 'X_pca'

In [4]:
# Fix the global NumPy seed so the random subsample below is repeatable.
np.random.seed(0)

# Upper bound on the number of reference cells used to fit the model.
n_train_cell = 100000
n_ref_cell = ref_adata.shape[0]

# Boolean mask over reference cells: True marks a cell used for fitting.
train_cell = np.zeros(n_ref_cell, dtype=bool)
if n_ref_cell <= n_train_cell:
    # Not more cells than the cap: every cell is used.
    train_cell[:] = True
else:
    # More cells than the cap: draw n_train_cell distinct cells at random.
    picked = np.random.choice(np.arange(n_ref_cell), n_train_cell, False)
    train_cell[picked] = True

# Store the mask on the AnnData object and display how many cells were selected.
ref_adata.obs['Train'] = train_cell.copy()
ref_adata.obs['Train'].sum()


67701

In [5]:
# Number of components: at most 100, and strictly smaller than both the number
# of training cells and the number of features.
n_train = ref_adata.obs['Train'].sum()
n_feature = ref_adata.shape[1]
ndim = min(100, n_train - 1, n_feature - 1)

# Fit a truncated SVD on the training cells only.
model = TruncatedSVD(n_components=ndim, algorithm='arpack', random_state=0)
train_mask = ref_adata.obs['Train'].values
model.fit(ref_adata.X[train_mask])

# Keep the components whose singular value is non-zero; later cells divide the
# projected coordinates by these singular values.
sel_dim = model.singular_values_ != 0
print(sel_dim.sum())


100


In [6]:
## Transform 10x

# Project the reference matrix through the fitted model in row chunks of
# chunk_size, then stitch the pieces back together.
chunk_size = 50000
chunks = [
    model.transform(ref_adata.X[row_start:row_start + chunk_size])
    for row_start in range(0, ref_adata.shape[0], chunk_size)
]

# Keep the selected components and divide each one by its singular value
# (done in place on the freshly built array) before storing it.
ref_pca = np.concatenate(chunks, axis=0)[:, sel_dim]
ref_pca /= model.singular_values_[sel_dim]
ref_adata.obsm['X_pca'] = ref_pca


In [7]:
# Save the reference AnnData, which now carries the 'Train' column and the
# rescaled 'X_pca' embedding. NOTE: this is the same path the object was read
# from, so the input file is overwritten.
ref_adata.write_h5ad('aibs_10x.h5ad')


In [8]:
# Read the query AnnData object from disk, then show its summary as the cell
# output.
qry_h5ad_path = 'merfish.h5ad'
qry_adata = anndata.read_h5ad(qry_h5ad_path)
qry_adata


AnnData object with n_obs × n_vars = 8410 × 487
    obs: 'blank_count', 'n_counts', 'n_genes', 'Study'
    var: 'id', 'mean', 'std'
    uns: 'log1p'
    obsm: 'X_pca', 'X_pca_corrected'

In [9]:
# Project the query matrix through the model fitted on the reference data,
# chunk by chunk (chunk_size is the value set in the reference-transform cell).
chunks = []
n_qry_cell = qry_adata.shape[0]
for chunk_start in range(0, n_qry_cell, chunk_size):
    chunk_stop = chunk_start + chunk_size
    # qry_adata.X is passed as stored. A disabled alternative standardised each
    # chunk first:
    # (qry_adata.X[chunk_start:chunk_stop].toarray() - qry_adata.var['mean'].values) / qry_adata.var['std'].values
    chunks.append(model.transform(qry_adata.X[chunk_start:chunk_stop]))
    print(chunk_start)

# Keep the selected components and divide each one by its singular value
# (done in place on the freshly built array) before storing it.
qry_pca = np.concatenate(chunks, axis=0)[:, sel_dim]
qry_pca /= model.singular_values_[sel_dim]
qry_adata.obsm['X_pca'] = qry_pca


0


In [10]:
# Save the query AnnData with its recomputed 'X_pca' embedding. NOTE: this is
# the same path the object was read from, so the input file is overwritten.
qry_adata.write_h5ad('merfish.h5ad')
