In [2]:
import scanpy as sc
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, cohen_kappa_score
import os

In [3]:
import scBalance as sb

  from .autonotebook import tqdm as notebook_tqdm


## 1. Model saving and reusing

Load data and preprocess

In [4]:
# Location of the filtered Baron human pancreas expression table (CSV).
expression_csv = "../scBalance_dataset/Baron Human/Filtered_Baron_HumanPancreas_data.csv"
# Read the table with scanpy; the cells below refer to the result as `adata`.
adata = sc.read_csv(expression_csv)

In [5]:
# Location of the cell-type label file that accompanies the expression table.
labels_csv = '../scBalance_dataset/Baron Human/Labels.csv'
label_data = pd.read_csv(labels_csv)

In [6]:
# Apply scanpy's normalize_total (target_sum=1e4) followed by log1p.
# Both calls discard their return value, so the cells below rely on `adata`
# itself having been updated by them.
# NOTE(review): re-running this cell would apply both steps a second time to
# the already-transformed `adata`; reload the data before re-executing it.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Export the processed `adata` via to_df(); this is what gets split into
# train/test sets below.
data = adata.to_df()

In [6]:
# Hold out a test set for evaluating the annotation.
# `random_state` is fixed so the split, and therefore every score computed from
# it, is reproducible after a kernel restart. Stratifying on the labels keeps
# the cell-type proportions similar in both partitions, which matters for an
# imbalanced dataset. These are the same settings used by the splits in the
# sampling section of this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    label_data,
    random_state=5,
    stratify=label_data,
)

Running scBalance

In [7]:
# Annotate the held-out cells using the training split and its labels.
# A GPU is requested through `processing_unit`; `save_model=True` asks scBalance
# to write the trained model to disk so it can be reloaded later.
pred_label = sb.scBalance(
    X_test,
    X_train,
    y_train,
    processing_unit='gpu',
    save_model=True,
    weighted_sampling=True,
)

--------Start annotating----------
No GPUs are available on your server.
Computational unit be used is: cpu
--------Annotation Finished----------
Model is saved at:/net/csefiles/xzhanglab/ycheng430/scATAC_classification_project/scbalance.pkl

Dict file is saved at:/net/csefiles/xzhanglab/ycheng430/scATAC_classification_project/dict_file.pkl


You can see that the model and the cell-type dictionary are saved in your root path; the exact file paths are printed above.

Metrics calculation

In [10]:
# `confusion_matrix` is already imported in the notebook's first cell, so it is
# not re-imported here.
#
# Build the label list from BOTH the true and the predicted labels, sorted:
# - `set(pred_label)` alone omits any cell type the model never predicted, and
#   cells whose true label is missing from `labels` are left out of the matrix;
# - a set of strings has no stable iteration order, so the row/column order
#   could differ from one kernel session to the next.
# `cm_labels[i]` names row i / column i of `cm`.
cm_labels = sorted(set(np.ravel(y_test)) | set(np.ravel(pred_label)))
cm = confusion_matrix(y_test, pred_label, labels=cm_labels)

In [11]:
# Cohen's kappa between the held-out labels and the predicted labels.
kappa = cohen_kappa_score(y_test, pred_label)
kappa

0.9883866366659296

Load model

In [12]:
# Export the full, already processed `adata` as the prediction input,
# i.e. every cell rather than only the held-out test split.
whole_data = adata.to_df()

Pass the paths of the saved files as the values of the "load_model" and "load_dict" parameters.

Here, as an example, we use the pre-trained model to predict the whole Baron Human dataset.

In [13]:
# Files written by the earlier `save_model=True` run (paths as printed in its output).
saved_model_path = "/net/csefiles/xzhanglab/ycheng430/scATAC_classification_project/scbalance.pkl"
saved_dict_path = "/net/csefiles/xzhanglab/ycheng430/scATAC_classification_project/dict_file.pkl"

# No training inputs are passed here: only the data to annotate plus the saved
# model and label dictionary.
pred_label_2 = sb.scBalance(
    test=whole_data,
    load_model=saved_model_path,
    load_dict=saved_dict_path,
)

Loading model
Prediction finished


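You can see that the pre-trained model successfully predicts the cell types of the whole Baron Human dataset. Note that this dataset also contains the cells the model was trained on, so the score below is optimistic.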

In [14]:
# Micro-averaged F1 of the reloaded model's predictions against all labels.
micro_f1 = f1_score(label_data, pred_label_2, average='micro')
micro_f1

0.9976660053681876

In [15]:
# Show the ground-truth label table (the notebook display elides the middle rows).
label_data

Unnamed: 0,x
0,acinar
1,acinar
2,acinar
3,acinar
4,acinar
...,...
8564,activated_stellate
8565,alpha
8566,beta
8567,beta


In this way, for a large dataset, you do not need to re-train the model; you only need to reuse the trained model for further predictions.

## 2. Sampling technique choosing

In this section, you can see how to choose a sampling method for an imbalanced dataset. We added a "weighted_sampling" parameter for users who would like to use an external sampling method instead of our weighted sampling.

Load data

In [7]:
# Seeded, label-stratified split, so that the sampling settings compared in this
# section are evaluated on a reproducible partition.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    label_data,
    stratify=label_data,
    random_state=5,
)

When "weighted_sampling" is set to True, the internal weighted sampling technique is used:

In [8]:
# Run with scBalance's internal weighted sampling switched on; the model is not
# saved for this comparison.
pred_label = sb.scBalance(
    X_test,
    X_train,
    y_train,
    processing_unit='gpu',
    save_model=False,
    weighted_sampling=True,
)

--------Start annotating----------
No GPUs are available on your server.
Computational unit be used is: cpu
--------Annotation Finished----------


In [9]:
# Macro-averaged F1 for the run with weighted sampling enabled.
macro_f1 = f1_score(y_test, pred_label, average='macro')
macro_f1

0.9716602004456567

When this parameter is set to False, no sampling method is currently applied. You can see the difference in the F1 score:

In [11]:
# Same inputs, seed and stratification as the split made earlier in this
# section, so this run is evaluated on an identical train/test partition.
X_train, X_test, y_train, y_test = train_test_split(data, label_data, random_state=5, stratify=label_data)

In [12]:
# Same run as before but with weighted sampling switched off.
# Note that this overwrites the previous `pred_label`.
pred_label = sb.scBalance(
    X_test,
    X_train,
    y_train,
    processing_unit='gpu',
    save_model=False,
    weighted_sampling=False,
)

--------Start annotating----------
No GPUs are available on your server.
Computational unit be used is: cpu
--------Annotation Finished----------


In [13]:
# Macro-averaged F1 for the run without weighted sampling.
macro_f1 = f1_score(y_test, pred_label, average='macro')
macro_f1

0.8656051742738423