# Label transfer

Here, we show how SuperSCC performs marker-gene-based label transfer.

In [61]:
import SuperSCC as scc
import pandas as pd
import scanpy as sc
import os
from sklearn.metrics import confusion_matrix, cohen_kappa_score, matthews_corrcoef, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

In [37]:
# Load the reference dataset: one CSV with the per-cell measurements and one
# with the matching annotations (a `cell_type` column is read from it later).
# Both files use their first column as the row index.
reference_counts_csv = '/mnt/disk5/zhongmin/superscc/师兄整理的肺数据/未去批次效应couns数据/没有去除批次效应_Banovich_Kropski_2020数据.csv'
reference_labels_csv = '/home/fengtang/jupyter_notebooks/working_script/evulate_clustering/cell_type_info/finest/Banovich_Kropski_2020_finest_celltype.csv'

data = pd.read_csv(reference_counts_csv, index_col=0)
cell_type = pd.read_csv(reference_labels_csv, index_col=0)

In [None]:
# Split the reference into a training set (70 %) and a held-out test set (30 %).
# A fixed random_state makes the split, and therefore every downstream metric,
# reproducible across "Restart & Run All".
# NOTE(review): assumes the rows of `data` and `cell_type` are in the same
# order (they are paired positionally) -- confirm against the input files.
SPLIT_SEED = 42
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    data,
    cell_type,
    test_size=0.3,
    random_state=SPLIT_SEED,
)

In [43]:
# Log-normalize the training and testing data with one shared recipe.
def log_normalize_with_labels(counts, labels, target_sum=1e4, label_column="cell_type"):
    """Return a log-normalized copy of `counts` with the labels attached.

    Parameters
    ----------
    counts : pandas.DataFrame
        Cells x genes table; only its numeric columns are kept.
    labels : pandas.DataFrame
        Annotation table holding `label_column`, row-aligned with `counts`
        (values are attached positionally, not by index).
    target_sum : float
        Per-cell total passed to `sc.pp.normalize_total`.
    label_column : str
        Column read from `labels` and written to the result.

    Returns
    -------
    pandas.DataFrame
        Normalized, log1p-transformed values indexed by cell, with genes as
        columns plus one extra `label_column` column.
    """
    adata = sc.AnnData(counts.select_dtypes("number"))
    sc.pp.normalize_total(adata, target_sum=target_sum)
    sc.pp.log1p(adata)

    normalized = pd.DataFrame(adata.X, index=adata.obs_names, columns=adata.var_names)
    normalized.loc[:, label_column] = labels[label_column].values
    return normalized


# Xtrain / Xtest are deliberately NOT rebound here, so this cell can be re-run
# without failing on an already-converted object.
Xtrain_norm = log_normalize_with_labels(Xtrain, Ytrain)
Xtest_norm = log_normalize_with_labels(Xtest, Ytest)


In [None]:
# Select informative genes on the training set only.
my_logger = scc.log_file("logger", "a")
selection_result = scc.feature_selection(
    Xtrain_norm.copy(),  # pass a copy so the normalized training frame itself is not handed over
    label_column="cell_type",
    model="svm",
    normalization_method="Min-Max",
    save=True,
    logger=my_logger,
)
# Keep the first element of every entry in the ensemble-selected list
# (use ensemble-selection features).
info_features = [entry[0] for entry in selection_result["final_feature_selection_by_ensemble"]]

In [49]:
# Fit the classifier on the training set, restricted to the selected features.
model = scc.model_training(
    Xtrain_norm.copy(),  # pass a copy so the normalized training frame itself is not handed over
    label_column="cell_type",
    features=info_features,
    model="svm",
    normalization_method="Min-Max",
    save=True,
    logger=my_logger,
)

2025-02-07 15:41:12 start model training
2025-02-07 15:41:12 model traning based on svm algorithm
2025-02-07 15:41:13 doing Min-Max normalization
2025-02-07 15:41:13 doing label encoding
2025-02-07 15:41:13 grid search below paramters getting the best model
* C: [0.01 0.12 0.23 0.34 0.45 0.56 0.67 0.78 0.89 1.  ]
* kernel: ['rbf', 'poly', 'sigmoid', 'linear']




2025-02-07 15:55:10 finish model training


In [None]:
# Transfer labels onto the held-out cells using the saved model file(s) whose
# name matches the pattern below, looked up in the current working directory.
# NOTE(review): the pattern is not tied to this run, so presumably any older
# matching .pkl in the directory is picked up as well -- confirm.
# NOTE(review): Xtest_norm still carries the ground-truth `cell_type` column;
# presumably predict_label restricts itself to the model's features -- confirm.
pred = scc.predict_label(
    Xtest_norm,
    models=".+training_model.+pkl$",
    wk_dir=os.getcwd(),
    save=True,
    logger=my_logger,
)

2025-02-07 16:06:59 start label prediction based on svm_training_model_2025-02-07 15:55:10.pkl model
2025-02-07 16:07:28 finish label prediction based on svm_training_model_2025-02-07 15:55:10.pkl


In [55]:
# Glance at the first few predicted labels.
# The saved model's file name embeds its training timestamp, so a hard-coded
# key raises KeyError on every rerun; look the key up from `pred` instead.
# Assumes `pred` is keyed by model file name and that the newest timestamp
# sorts last -- TODO confirm if several model types are present.
latest_model = sorted(pred)[-1]
pred[latest_model]["prediction"][0:5]

['Alveolar macrophages',
 'EC venous pulmonary',
 'Monocyte-derived Mph',
 'Non-classical monocytes',
 'AT2']

In [56]:
# Compare predicted labels with ground-truth labels.
# The model key is looked up from `pred` because the saved file name embeds the
# training timestamp and therefore changes on every rerun.
# Assumes `pred` is keyed by model file name and that the newest timestamp
# sorts last -- TODO confirm if several model types are present.
latest_model = sorted(pred)[-1]
# Pass the label column explicitly rather than the whole annotation frame.
confusion_matrix(Ytest["cell_type"], pred[latest_model]["prediction"])

array([[  19,    1,    7, ...,    0,    0,    4],
       [   1,  195,    5, ...,    0,    0,    0],
       [   4,    7, 1061, ...,    0,    0,    0],
       ...,
       [   0,    0,    0, ...,    7,    0,    0],
       [   0,    0,    0, ...,    0,   11,    0],
       [   3,    0,    1, ...,    2,    0,  109]])

In [66]:
# Evaluate the prediction against the held-out ground truth.
# The model key is looked up from `pred` because the saved file name embeds the
# training timestamp and therefore changes on every rerun.
# Assumes `pred` is keyed by model file name and that the newest timestamp
# sorts last -- TODO confirm if several model types are present.
latest_model = sorted(pred)[-1]
true_labels = Ytest["cell_type"]  # the label column, not the whole annotation frame
predicted_labels = pred[latest_model]["prediction"]
{
    "accuracy_score": accuracy_score(true_labels, predicted_labels),
    # "weighted" averages per-class F1 by class support
    "f1_score": f1_score(true_labels, predicted_labels, average="weighted"),
    "cohen_kappa_score": cohen_kappa_score(true_labels, predicted_labels),
    "matthews_corrcoef": matthews_corrcoef(true_labels, predicted_labels),
}

{'accuracy_score': 0.9083333333333333,
 'f1_score': 0.9055509253043628,
 'cohen_kappa_score': 0.9001176613948768,
 'matthews_corrcoef': 0.9001806199319379}