In [1]:
import time
import glob
import numpy as np
from sklearn.preprocessing import normalize

import anndata
from ALLCools.integration.seurat_class import SeuratIntegration


In [2]:
# Parameters
# Group identifier; it is aliased to `t` in the next cell and used as the
# prefix of every input glob pattern and output file name in this notebook.
group_name = "STR"


In [3]:
# Short alias for group_name, interpolated into the file-name f-strings below.
t = group_name

In [4]:
# npc: number of leading 'cef_pca' columns kept per cell, and n_components for
#      integrate() and the rPCA anchor search.
# ncc: n_components for the CCA anchor search.
npc, ncc = 50, 50

# Integrate RS1 mC and RNA


In [5]:
# Integrator instance used for the RS1 mC (reference) vs RNA (query) run.
integrator = SeuratIntegration()

In [6]:
# Locate the reference (RS1 mCH) and query (RNA) AnnData files for this group.
# glob.glob() returns matches in arbitrary order, so the candidates are sorted
# to make the chosen file deterministic when several match, and a missing
# input raises a descriptive error instead of a bare IndexError from `[0]`.
ref_pattern = f'{t}_*_rs1_mch.h5ad'
qry_pattern = f'{t}_*_rna.h5ad'
ref_paths = sorted(glob.glob(ref_pattern))
qry_paths = sorted(glob.glob(qry_pattern))
if not ref_paths:
    raise FileNotFoundError(f'No reference file matches {ref_pattern!r}')
if not qry_paths:
    raise FileNotFoundError(f'No query file matches {qry_pattern!r}')

ref_adata = anndata.read_h5ad(ref_paths[0])
qry_adata = anndata.read_h5ad(qry_paths[0])

# Order matters: index 0 is the reference and index 1 the query, matching the
# alignments=[[[0], [1]]] argument used in the following cells.
adata_list = [ref_adata, qry_adata]
for adata in adata_list:
    # Keep the first `npc` columns of the stored 'cef_pca' embedding and
    # normalize each row (sklearn's default norm is L2); saved as 'X_pca'.
    adata.obsm['X_pca'] = normalize(adata.obsm['cef_pca'][:, :npc], axis=1)

adata_list

[AnnData object with n_obs × n_vars = 12546 × 3402
     obs: 'mCCCFrac', 'mCGFrac', 'mCHFrac', 'FinalmCReads', 'DissectionRegion', 'Plate', 'Col384', 'Row384', 'Slice', 'Sample', 'Technology', 'InputReads', 'PassBasicQC', 'PlateNormCov', 'CEMBARegion', 'MajorRegion', 'SubRegion', 'L1', 'L1_annot', 'L4', 'L4Region', 'L2_annot', 'Train'
     var: 'chrom', 'cov_mean', 'end', 'start', 'L4_enriched_features'
     uns: 'L4_feature_enrichment', 'log'
     obsm: 'cef_pca', 'X_pca',
 AnnData object with n_obs × n_vars = 41012 × 3402
     obs: 'count', 'umi_count', 'L1', 'L2', 'L3', 'L1_annot', 'L2_annot', 'DissectionRegion', 'SubRegion', 'MajorRegion'
     var: 'name', 'mean', 'std'
     uns: 'log1p'
     obsm: 'cef_pca', 'X_pca']

In [7]:
# Anchor search between dataset 0 (RS1 mC reference) and dataset 1 (RNA query).
anchor = integrator.find_anchor(
    adata_list,
    alignments=[[[0], [1]]],
    # Dimension reduction used for the anchor search: CCA on 'X' with `ncc`
    # components.
    key_anchor='X',
    dim_red='cca',
    n_components=ncc,
    n_features=200,
    max_cc_cells=50000,
    # NOTE(review): scale1/scale2 presumably toggle scaling of the first and
    # second dataset respectively -- confirm against the ALLCools API.
    scale1=False,
    scale2=True,
    # Neighbourhood sizes for the anchor / scoring steps; k_local and k_filter
    # are passed as None (presumably skipping those steps -- TODO confirm).
    k_anchor=5,
    k_score=30,
    k_local=None,
    key_local='X_pca',
    k_filter=None,
)


Find anchors across datasets.
Run CCA
non zero dims 50
Find Anchors using k=30
Score Anchors
Identified 28506 anchors between datasets 0 and 1.


In [8]:
# Correct the 'X_pca' embeddings with the anchors found above and print the
# elapsed time in seconds. perf_counter() is a monotonic clock, so the
# measurement cannot be skewed by system clock adjustments as time.time() can.
start_time = time.perf_counter()
corrected = integrator.integrate(key_correct='X_pca',
                                 row_normalize=True,
                                 n_components=npc,
                                 k_weight=100,
                                 sd=1,
                                 alignments=[[[0], [1]]])

print(time.perf_counter() - start_time)


Merge datasets
[[0], [1]]
Initialize
Find nearest anchors. k_weight:  100
Normalize graph
Transform data
47.89653944969177


In [9]:
# Index 1 is the query's position in adata_list; its corrected embedding is
# row-normalized and stored next to the uncorrected 'X_pca'.
corrected_qry = corrected[1]
qry_adata.obsm['X_pca_corrected'] = normalize(corrected_qry, axis=1)

# Output name embeds the number of cells (rows) of the query AnnData.
rna_out_path = f'{t}_{qry_adata.shape[0]}_rna.h5ad'
qry_adata.write_h5ad(rna_out_path)


In [10]:
# Save the integrator from the RS1 mC / RNA run under a group-specific name.
integrator.save(f'{t}_integration_rs1_rna')


# Integrate RS1 mC and RS2 EpiRetro

In [11]:
# Rebind `integrator` to a new instance for the RS1 mC vs RS2 run, so the
# object used for the RNA integration above is not reused.
integrator = SeuratIntegration()

In [12]:
# Locate the reference (RS1 mCH) and query (RS2 mCH) AnnData files for this
# group. glob.glob() returns matches in arbitrary order, so the candidates are
# sorted to make the chosen file deterministic when several match, and a
# missing input raises a descriptive error instead of a bare IndexError.
ref_pattern = f'{t}_*_rs1_mch.h5ad'
qry_pattern = f'{t}_*_rs2_mch.h5ad'
ref_paths = sorted(glob.glob(ref_pattern))
qry_paths = sorted(glob.glob(qry_pattern))
if not ref_paths:
    raise FileNotFoundError(f'No reference file matches {ref_pattern!r}')
if not qry_paths:
    raise FileNotFoundError(f'No query file matches {qry_pattern!r}')

ref_adata = anndata.read_h5ad(ref_paths[0])
qry_adata = anndata.read_h5ad(qry_paths[0])

# Order matters: index 0 is the reference and index 1 the query, matching the
# alignments=[[[0], [1]]] argument used in the following cells.
adata_list = [ref_adata, qry_adata]
for adata in adata_list:
    # Keep the first `npc` columns of the stored 'cef_pca' embedding and
    # normalize each row (sklearn's default norm is L2); saved as 'X_pca'.
    adata.obsm['X_pca'] = normalize(adata.obsm['cef_pca'][:, :npc], axis=1)

adata_list

[AnnData object with n_obs × n_vars = 12546 × 3402
     obs: 'mCCCFrac', 'mCGFrac', 'mCHFrac', 'FinalmCReads', 'DissectionRegion', 'Plate', 'Col384', 'Row384', 'Slice', 'Sample', 'Technology', 'InputReads', 'PassBasicQC', 'PlateNormCov', 'CEMBARegion', 'MajorRegion', 'SubRegion', 'L1', 'L1_annot', 'L4', 'L4Region', 'L2_annot', 'Train'
     var: 'chrom', 'cov_mean', 'end', 'start', 'L4_enriched_features'
     uns: 'L4_feature_enrichment', 'log'
     obsm: 'cef_pca', 'X_pca',
 AnnData object with n_obs × n_vars = 198 × 3402
     obs: 'mCCCFrac', 'mCGFrac', 'mCHFrac', 'FinalmCReads', 'Plate', 'Exp', 'Source', 'Slice', 'Target', 'Gender', 'L1', 'L2', 'L3', 'L4', 'L1_annot', 'PlateNormReads'
     var: 'chrom', 'end', 'start', '_feature_select'
     uns: 'log'
     obsm: 'cef_pca', 'X_pca']

In [13]:
# Anchor search between dataset 0 (RS1 mC reference) and dataset 1 (RS2 query)
# using reciprocal PCA ('rpca') with `npc` components, timed in seconds.
# perf_counter() is a monotonic clock, so the measurement cannot be skewed by
# system clock adjustments as time.time() can.
start_time = time.perf_counter()
anchor = integrator.find_anchor(adata_list,
                                k_local=None,
                                key_local='X_pca',
                                k_anchor=5,
                                key_anchor='X',
                                dim_red='rpca',
                                max_cc_cells=100000,
                                k_score=30,
                                scale1=False,
                                scale2=False,
                                n_components=npc,
                                n_features=200,
                                alignments=[[[0], [1]]])
print(time.perf_counter() - start_time)


Find anchors across datasets.
Run rPCA
Score Anchors
Identified 336 anchors between datasets 0 and 1.
10.138026237487793


In [14]:
# Correct the 'X_pca' embeddings with the RS1/RS2 anchors found above and
# print the elapsed time in seconds. perf_counter() is a monotonic clock, so
# the measurement cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
corrected = integrator.integrate(key_correct='X_pca',
                                 row_normalize=True,
                                 n_components=npc,
                                 k_weight=100,
                                 sd=1,
                                 alignments=[[[0], [1]]])

print(time.perf_counter() - start_time)


Merge datasets
[[0], [1]]
Initialize
Find nearest anchors. k_weight:  100
Normalize graph
Transform data
3.5855557918548584


In [15]:
# Index 1 is the query's position in adata_list; its corrected embedding is
# row-normalized and stored next to the uncorrected 'X_pca'.
corrected_qry = corrected[1]
qry_adata.obsm['X_pca_corrected'] = normalize(corrected_qry, axis=1)

# Output name embeds the number of cells (rows) of the query AnnData.
rs2_out_path = f'{t}_{qry_adata.shape[0]}_rs2_mch.h5ad'
qry_adata.write_h5ad(rs2_out_path)


In [16]:
# Save the integrator from the RS1 mC / RS2 run under a group-specific name.
integrator.save(f'{t}_integration_rs1_rs2')
