In [1]:
import scanpy as sc
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score, cohen_kappa_score

import scBalance as sb

  from .autonotebook import tqdm as notebook_tqdm


### 1. Load dataset


In [2]:
# 10Xv2 PBMC data: ground-truth labels as a DataFrame, expression matrix as AnnData.
v2_label_data = pd.read_csv("Inter-dataset/PbmcBench/10Xv2/10Xv2_pbmc1Labels.csv")
v2_adata = sc.read_csv("Inter-dataset/PbmcBench/10Xv2/10Xv2_pbmc1.csv")

In [3]:
# 10Xv3 PBMC data: ground-truth labels as a DataFrame, expression matrix as AnnData.
v3_label_data = pd.read_csv("Inter-dataset/PbmcBench/10Xv3/10Xv3_pbmc1Labels.csv")
v3_adata = sc.read_csv("Inter-dataset/PbmcBench/10Xv3/10Xv3_pbmc1.csv")

### 2. Data normalization

This step follows the standard Scanpy tutorial. Most users will already have normalized their data before annotation.

In [4]:
# Apply the same preprocessing to both datasets: total-count normalisation
# (target_sum=1e4) followed by a log1p transform. The return values are not
# reassigned, so the AnnData objects themselves are updated.
for dataset in (v2_adata, v3_adata):
    sc.pp.normalize_total(dataset, target_sum=1e4)
    sc.pp.log1p(dataset)

### 3. Find the shared gene set and integrate the datasets

In [5]:
# Genes present in both datasets; every later step works on this shared feature space.
gene_names = v2_adata.var_names.intersection(v3_adata.var_names)

# Fail early with a clear message instead of silently producing zero-gene matrices
# (typically caused by mismatched gene identifiers between the two datasets).
if len(gene_names) == 0:
    raise ValueError(
        "v2_adata and v3_adata share no gene names; "
        "check that both datasets use the same gene identifier convention."
    )

# Restrict both datasets to the shared genes, in the same column order.
v2_adata = v2_adata[:, gene_names]
v3_adata = v3_adata[:, gene_names]

In [6]:
# Stack the two datasets along the cell axis and record each cell's origin in
# obs['batch'] ('v2' or 'v3').
# `AnnData.concatenate` is deprecated in favour of `anndata.concat`, which scanpy
# re-exports as `sc.concat`. The explicit arguments below reproduce the defaults
# of the old call: inner join on genes and obs names suffixed with "-<batch>".
adata_concat = sc.concat(
    [v2_adata, v3_adata],
    join='inner',
    label='batch',
    keys=['v2', 'v3'],
    index_unique='-',
)

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


### 4. Use ComBat to correct batch effects

ComBat is included in the Scanpy package (`sc.pp.combat`). It produces a batch-corrected scRNA-seq expression matrix.

In [7]:
# Batch-correct the combined matrix using obs['batch'] as the batch key.
# inplace=True: the result is written back into adata_concat rather than returned.
sc.pp.combat(
    adata_concat,
    key='batch',
    covariates=None,
    inplace=True,
)

In [8]:
# Split the batch-corrected data back into its two sources:
# the 'v2' cells become the query set, the 'v3' cells the reference set.
batch_of_cell = adata_concat.obs['batch']
adata_query = adata_concat[batch_of_cell == 'v2']
adata_ref = adata_concat[batch_of_cell == 'v3']

### 5. Train scBalance and perform cell type annotation

In [9]:
# Train on the batch-corrected 10Xv3 reference (with its labels) and annotate
# the batch-corrected 10Xv2 query. Both matrices are cast to float32 first.
query_expr = adata_query.to_df().astype('float32')
ref_expr = adata_ref.to_df().astype('float32')
v2_pred_label = sb.scBalance(query_expr, ref_expr, v3_label_data, processing_unit='cpu')

--------Start annotating----------
Computational unit be used is: cpu
--------Annotation Finished----------


The predicted cell-type labels can be used for visualization; see the PBMC 3k tutorial for more details.

In [10]:
# Evaluation: Cohen's kappa between the true and predicted 10Xv2 labels.
# NOTE(review): cohen_kappa_score is already imported in the first cell and
# confusion_matrix is not used in the visible cells; this import looks redundant.
from sklearn.metrics import confusion_matrix, cohen_kappa_score
cohen_kappa_score(y1=v2_label_data, y2=v2_pred_label)

0.9701451670500495

In [11]:
# Macro-averaged F1: every cell type contributes equally, regardless of its size.
f1_score(y_true=v2_label_data, y_pred=v2_pred_label, average='macro')

0.8597287300186334

### 6. Comparison without batch-effect removal

In most cases, scBalance is robust to batch effects. Users can decide whether they need ComBat before annotation. We usually recommend using scBalance directly, especially when working with an atlas-scale reference dataset.

In [12]:
# Reload both label tables from disk so the second run starts from fresh copies.
# NOTE(review): presumably needed because the earlier scBalance call may modify
# the label DataFrame it receives — confirm before removing this cell.
v3_label_data = pd.read_csv("Inter-dataset/PbmcBench/10Xv3/10Xv3_pbmc1Labels.csv")
v2_label_data = pd.read_csv("Inter-dataset/PbmcBench/10Xv2/10Xv2_pbmc1Labels.csv")

In [13]:
# Same setup as before (10Xv3 reference, 10Xv2 query), but using the normalised
# matrices that were NOT passed through ComBat.
query_expr_raw = v2_adata.to_df()
ref_expr_raw = v3_adata.to_df()
v2_pred_label = sb.scBalance(query_expr_raw, ref_expr_raw, v3_label_data, processing_unit='cpu')

--------Start annotating----------
Computational unit be used is: cpu
--------Annotation Finished----------


The results are essentially the same with or without batch correction.

In [14]:
# Evaluation without batch correction: Cohen's kappa on the 10Xv2 labels.
# NOTE(review): cohen_kappa_score is already imported in the first cell and
# confusion_matrix is not used in the visible cells; this import looks redundant.
from sklearn.metrics import confusion_matrix, cohen_kappa_score
cohen_kappa_score(y1=v2_label_data, y2=v2_pred_label)

0.9709676782417723

In [15]:
# Macro-averaged F1 for the run without batch correction.
f1_score(y_true=v2_label_data, y_pred=v2_pred_label, average='macro')

0.8614133291153494