In [1]:
import re
from datetime import datetime
from os.path import basename

import pandas as pd
from sklearn.metrics import accuracy_score, homogeneity_score, f1_score, precision_score, recall_score, completeness_score, v_measure_score, balanced_accuracy_score, matthews_corrcoef

from SuperSCC import list_files

In [None]:
# define scoring function

def _metric_name(method):
    """Return a readable name for a metric callable."""
    name = getattr(method, "__name__", None)
    if name is None:
        # functools.partial objects carry no __name__; use the wrapped callable's.
        name = getattr(getattr(method, "func", None), "__name__", None)
    if name is None:
        name = type(method).__name__
    return name


def scoring(data, methods, **kwargs):
    """Score every dataset in ``data`` with every metric in ``methods``.

    Parameters
    ----------
    data : mapping
        Maps a dataset name to an object exposing ``y_true`` and ``y_pred``
        attributes, each providing ``.values`` (e.g. a DataFrame with those
        two columns).
    methods : iterable of callables
        Each metric is called as ``method(y_true, y_pred, **kwargs)``.
    **kwargs
        Extra keyword arguments forwarded to every metric, e.g.
        ``average="weighted"``. If a metric raises when called with them, it
        is retried without them and the first error is printed.

    Returns
    -------
    pandas.DataFrame
        Long-format table with the columns ``Dataset``, ``score`` (metric
        name) and ``value``; one row per (dataset, metric) pair. An empty
        frame with those columns is returned when there is nothing to score.

    Raises
    ------
    Exception
        Whatever the metric raises when the retry without ``kwargs`` fails too.
    """
    columns = ["Dataset", "score", "value"]
    methods = list(methods)
    records = []

    for dataset, frame in data.items():
        # The labels do not depend on the metric, so fetch them once per dataset.
        y_pred = frame.y_pred.values
        y_true = frame.y_true.values

        for method in methods:
            method_name = _metric_name(method)
            try:
                score = method(y_true, y_pred, **kwargs)
            except Exception as e:
                # Not every metric accepts the extra keywords; fall back to the
                # plain two-argument call and report why the first call failed.
                score = method(y_true, y_pred)
                print(f"Error with method {method_name}: {e}")
            records.append({"Dataset": dataset, "score": method_name, "value": score})

    if not records:
        return pd.DataFrame(columns=columns)

    # Every row keeps index label 0, as when single-row frames were concatenated,
    # so CSV files written from the result keep the same index column.
    return pd.DataFrame(records, columns=columns, index=[0] * len(records))

# Self-validation experiment

In [None]:
# Load the ground-truth test cell labels that were saved when running SuperSCC.
cell_label_loc = list_files(path="/mnt/disk5/zhongmin/superscc/label_transfer/代码/SuperSCC/finest_cell_label_res/", pattern=".+test_dataset.+")

cell_label_dict = {}
for label_path in cell_label_loc:
    # The dictionary key is the part of the file name captured before "_test_dataset".
    dataset = re.findall("(.+)_test_dataset.+", basename(label_path))[0]
    cell_label_dict[dataset] = pd.read_csv(label_path, index_col=0)

In [None]:
# Load the pickled SuperSCC prediction results, keyed by dataset name.
pred_res_loc = list_files(path="/mnt/disk5/zhongmin/superscc/label_transfer/代码/SuperSCC/finest_cell_label_res/", pattern=".+prediction_result.+")

pred_dict = {}
for pickle_path in pred_res_loc:
    # The key is the part of the file name captured before "_svm".
    dataset = re.findall("(.+)_svm.+", basename(pickle_path))[0]
    pred_dict[dataset] = pd.read_pickle(pickle_path)

# Pair each prediction with the ground-truth labels loaded for the same dataset.
superscc_dict = {}
for dataset, result in pred_dict.items():
    # Only the first entry of each loaded result is used.
    first_key = list(result.keys())[0]
    predicted = result[first_key]["prediction"]
    truth = cell_label_dict[dataset].cell_type.values
    superscc_dict[dataset] = pd.DataFrame({"y_true": truth, "y_pred": predicted})

In [6]:
# Load the SingleCellNet prediction tables, one CSV per dataset.
scn_pred_res_los = list_files(path = "/mnt/disk5/zhongmin/superscc/label_transfer/代码/SingleCellNet/finest_cell_label_res/", pattern = ".+cell_type.csv")

# Key = the part of the file name captured before "_SingleCellNet".
scn_dict = {
    re.findall("(.+)_SingleCellNet.+", basename(csv_path))[0]: pd.read_csv(csv_path)
    for csv_path in scn_pred_res_los
}

In [4]:
# Load the SingleR prediction tables, one CSV per dataset.
singler_pred_res_los = list_files(path = "/mnt/disk5/zhongmin/superscc/label_transfer/代码/SingleR/finest_cell_label_res/", pattern = ".+prediction.csv")

singler_dict = {}
for csv_path in singler_pred_res_los:
    # Key = the part of the file name captured before "_SingleR".
    dataset = re.findall("(.+)_SingleR.+", basename(csv_path))[0]
    singler_dict[dataset] = pd.read_csv(csv_path)

In [50]:
# get the scANVI prediction result
scANVI_pred_res_loc = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/label_transfer/scANVI", pattern = ".+csv$", full_name = True, recursive=False)

# Lookup table with one row per `filename` and the `test_data` / `ref_data`
# paths used for the evaluation.
file = pd.read_csv("/home/fengtang/jupyter_notebooks/working_script/label_transfer/SingleCellNet/label_transfer_evulate_data_loc.csv", index_col=0)
# Redirect the stored paths into the SuperSCC finest-label result folder.
file["test_data"] = [re.sub("代码/", "代码/SuperSCC/finest_cell_label_res/",i) for i in file["test_data"].tolist()]
file["ref_data"] = [re.sub("代码/", "代码/SuperSCC/finest_cell_label_res/", i) for i in file["ref_data"].tolist()]


scANVI_dict = dict()

for i in scANVI_pred_res_loc:
    # Dataset name = file name up to its first ".". The pattern is written as a
    # raw string because "\." in a plain literal is an invalid escape sequence
    # (SyntaxWarning on recent Python versions); the regex itself is unchanged.
    name = re.findall(r"[^\.]+", basename(i))[0]
    prediction = pd.read_csv(i, index_col = 0)
    y_true = pd.read_csv(file.loc[file.filename == name, "test_data"].values[0], index_col=0)
    # Keep the rows flagged as test cells and only the predicted-label column.
    prediction = prediction.loc[prediction.ref_or_not == "test", ["C_scANVI"]]
    # Left join on the index keeps every test-cell prediction row.
    # NOTE(review): the positional renaming below assumes the label file
    # contributes exactly one column - confirm.
    labelled = prediction.join(y_true, how="left")
    labelled.columns = ["y_pred", "y_true"]
    scANVI_dict[name] = labelled.loc[:, ["y_true", "y_pred"]]

In [45]:
# Load the scmap predictions and attach the ground-truth labels of each test set.
scmap_pred_res_loc = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/label_transfer/scmap", pattern = ".+csv$", full_name = True, recursive=False)

# Lookup table with the `test_data` / `ref_data` paths per `filename`; both path
# columns are redirected into the SuperSCC finest-label result folder.
file = pd.read_csv("/home/fengtang/jupyter_notebooks/working_script/label_transfer/SingleCellNet/label_transfer_evulate_data_loc.csv", index_col=0)
for path_column in ("test_data", "ref_data"):
    file[path_column] = [
        re.sub("代码/", "代码/SuperSCC/finest_cell_label_res/", stored_path)
        for stored_path in file[path_column].tolist()
    ]

scmap_dict = {}
for pred_path in scmap_pred_res_loc:
    # Dataset name = file name with the "_scmap_prediction.csv" suffix removed.
    name = re.sub("_scmap_prediction.csv", "", basename(pred_path))
    prediction = pd.read_csv(pred_path, index_col=0)
    truth_path = file.loc[file.filename == name, "test_data"].values[0]
    truth = pd.read_csv(truth_path, index_col=0)
    # Labels are attached by row position, not by index.
    prediction.loc[:, "y_true"] = truth.cell_type.tolist()
    scmap_dict[name] = prediction.loc[:, ["y_true", "y_pred"]]

In [58]:
# Load the Seurat predictions and attach the ground-truth labels of each test set.
seurat_pred_res_loc = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/label_transfer/Seurat", pattern = ".+csv$", full_name = True, recursive=False)

# Lookup table with the `test_data` / `ref_data` paths per `filename`.
file = pd.read_csv("/home/fengtang/jupyter_notebooks/working_script/label_transfer/SingleCellNet/label_transfer_evulate_data_loc.csv", index_col=0)
# Redirect both path columns into the SuperSCC finest-label result folder.
file["test_data"] = [re.sub("代码/", "代码/SuperSCC/finest_cell_label_res/", stored_path) for stored_path in file["test_data"].tolist()]
file["ref_data"] = [re.sub("代码/", "代码/SuperSCC/finest_cell_label_res/", stored_path) for stored_path in file["ref_data"].tolist()]

seurat_dict = {}
for pred_path in seurat_pred_res_loc:
    # Dataset name = file name with the "_seurat_prediction.csv" suffix removed.
    name = re.sub("_seurat_prediction.csv", "", basename(pred_path))
    prediction = pd.read_csv(pred_path, index_col=0)
    is_dataset = file.filename == name
    truth = pd.read_csv(file.loc[is_dataset, "test_data"].values[0], index_col=0)
    # Labels are attached by row position, not by index.
    prediction.loc[:, "y_true"] = truth.cell_type.tolist()
    seurat_dict[name] = prediction.loc[:, ["y_true", "y_pred"]]

In [None]:
# scoring on different methods
# One metric list shared by all six tools so they are scored identically.
# `average="weighted"` is forwarded to every metric; `scoring` retries a metric
# without it when the first call raises.
finest_label_metrics = [accuracy_score, homogeneity_score, f1_score, precision_score, recall_score, completeness_score, v_measure_score, matthews_corrcoef]

# `record_time()` is neither defined nor imported in the visible part of this
# notebook, so the timestamped exports below raised NameError on a fresh kernel.
# A stdlib timestamp is taken once so the stamped files of one run share a suffix.
# NOTE(review): the suffix format may differ from what `record_time()` produced;
# adjust the format string if file names must match earlier runs.
run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

superscc_score = scoring(superscc_dict, methods=finest_label_metrics, average = "weighted")
superscc_score.loc[:, "method"] = "SuperSCC"
superscc_score.to_csv("superscc_score_on_finest_cell_label.csv")

singler_score = scoring(singler_dict, methods=finest_label_metrics, average = "weighted")
singler_score.loc[:, "method"] = "SingleR"
singler_score.to_csv("singler_score_on_finest_cell_label.csv")

scn_score = scoring(scn_dict, methods=finest_label_metrics, average = "weighted")
scn_score.loc[:, "method"] = "SingleCellNet"
scn_score.to_csv("scn_score_on_finest_cell_label.csv")


scANVI_score = scoring(scANVI_dict, methods=finest_label_metrics, average = "weighted")
scANVI_score.loc[:, "method"] = "scANVI"
scANVI_score.to_csv(f"scANVI_score_on_finest_cell_label_{run_stamp}.csv")

scmap_score = scoring(scmap_dict, methods=finest_label_metrics, average = "weighted")
scmap_score.loc[:, "method"] = "scmap"
scmap_score.to_csv(f"scmap_score_on_finest_cell_label_{run_stamp}.csv")

seurat_score = scoring(seurat_dict, methods=finest_label_metrics, average = "weighted")
seurat_score.loc[:, "method"] = "seurat"
seurat_score.to_csv(f"seurat_score_on_finest_cell_label_{run_stamp}.csv")

# Cross-validation experiment

In [None]:
# get the ground truth test cell label when running SuperSCC
# get file loc
file_loc = pd.read_csv("/mnt/disk5/zhongmin/superscc/结果位置/结果位置_3.csv", encoding = "GBK")

# Keep only the nine datasets used here, with their name and subset-file columns.
file_loc = pd.DataFrame(file_loc.loc[file_loc.数据集.isin(['Banovich_Kropski_2020', 'Barbry_Leroy_2020', 'Krasnow_2020', 'Lafyatis_Rojas_2019', 'Meyer_2019', 'Misharin_2021', 'Misharin_Budinger_2018', 'Nawijn_2021', 'Teichmann_Meyer_2019']), ["数据集", "取到的2w子集文件的位置"]])

raw_cell_type_file_loc = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/evulate_clustering/cell_type_info/raw", pattern=".+csv$", full_name=True)

finest_cell_type_file_loc = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/evulate_clustering/cell_type_info/finest", pattern=".+finest.+csv$", full_name=True)

# NOTE(review): the label files are matched to the rows of `file_loc` purely by
# position in the `list_files` output. This presumably relies on a stable listing
# order and an unchanged directory content - verify before re-running.
file_loc.loc[:, "raw_cell_type"] = [raw_cell_type_file_loc[2], raw_cell_type_file_loc[4], raw_cell_type_file_loc[6], raw_cell_type_file_loc[0], raw_cell_type_file_loc[5], raw_cell_type_file_loc[-2], raw_cell_type_file_loc[1], raw_cell_type_file_loc[-1], raw_cell_type_file_loc[3]]
file_loc.loc[:, "finest_cell_type"] = [finest_cell_type_file_loc[-2], finest_cell_type_file_loc[3], finest_cell_type_file_loc[-3], finest_cell_type_file_loc[1], finest_cell_type_file_loc[0], finest_cell_type_file_loc[-4], finest_cell_type_file_loc[2], finest_cell_type_file_loc[-1], finest_cell_type_file_loc[5]]

# Load the finest-level labels for every dataset listed in `file_loc`.
# This used to loop over `pred_dict.keys()`, but the `pred_dict` for this
# experiment is only built in a later cell, so a top-to-bottom run saw the keys
# of the previous experiment. Looping over `file_loc` removes that dependency;
# later lookups by dataset name keep working.
cell_label_dict = dict()
for dataset in dict.fromkeys(file_loc["数据集"]):
    # The label column is named explicitly instead of relying on it being last.
    loc = file_loc.loc[file_loc["数据集"] == dataset, "finest_cell_type"].values[0]
    cell_label_dict[dataset] = pd.read_csv(loc, index_col=0)

In [None]:
# Load the pickled SuperSCC results of the multi-dataset training run.
pred_res_loc = list_files(path="/home/fengtang/jupyter_notebooks/working_script/label_transfer/SuperSCC/train_on_multi_datasets", pattern=".+prediction_result.+")

# Key = the part of the file name captured before "_svm".
pred_dict = {
    re.findall("(.+)_svm.+", basename(pickle_path))[0]: pd.read_pickle(pickle_path)
    for pickle_path in pred_res_loc
}

# Combine each prediction with the ground-truth labels of the same dataset.
superscc_dict = {}
for dataset in pred_dict:
    result = pred_dict[dataset]
    # Only the first entry of each loaded result is used.
    first_key = list(result.keys())[0]
    predicted_labels = result[first_key]["prediction"]
    true_labels = cell_label_dict[dataset].cell_type.values
    superscc_dict[dataset] = pd.DataFrame({"y_true": true_labels, "y_pred": predicted_labels})

In [9]:
# Load the SingleR predictions of the multi-dataset run, then add the true labels.
singler_pred_res_los = list_files(path = "/mnt/disk5/zhongmin/superscc/label_transfer/代码/train_on_multi_datasets/SingleR-train-on-multi-datasets", pattern = ".+prediction.csv")

singler_dict = {}
for pred_path in singler_pred_res_los:
    # Key = the part of the file name captured before "_SingleR".
    dataset = re.findall("(.+)_SingleR.+", basename(pred_path))[0]
    singler_dict[dataset] = pd.read_csv(pred_path)

for dataset, prediction in singler_dict.items():
    is_dataset = file_loc.数据集 == dataset
    label_path = file_loc.loc[is_dataset, "finest_cell_type"].values[0]
    truth = pd.read_csv(label_path, index_col = 0)
    truth.columns = ["y_true"]
    # Column-wise concat aligns the two tables on their index.
    singler_dict[dataset] = pd.concat([prediction, truth], axis = 1)

In [24]:
# Load the SingleCellNet score tables of the multi-dataset run, turn them into
# predicted labels and add the true labels.
scn_pred_res_loc = list_files(path = "/mnt/disk5/zhongmin/superscc/label_transfer/代码/train_on_multi_datasets/SingleCellNet-train-on-multi-datasets/", pattern = ".+raw.csv")

scn_dict = {}
for score_path in scn_pred_res_loc:
    # Key = the part of the file name captured before "_SingleCellNet".
    dataset = re.findall("(.+)_SingleCellNet.+", basename(score_path))[0]
    score_table = pd.read_csv(score_path, index_col=0)
    # The predicted label of a row is the column holding its largest value.
    predicted = pd.DataFrame(score_table.idxmax(axis = 1), columns=["y_pred"])
    # Replace the index by 0..n-1 before the column-wise concat below.
    predicted.index = range(predicted.shape[0])
    scn_dict[dataset] = predicted

for dataset, predicted in scn_dict.items():
    is_dataset = file_loc.数据集 == dataset
    label_path = file_loc.loc[is_dataset, "finest_cell_type"].values[0]
    truth = pd.read_csv(label_path, index_col = 0)
    truth.columns = ["y_true"]
    scn_dict[dataset] = pd.concat([predicted, truth], axis = 1)

In [None]:
# Load the Seurat predictions of the multi-dataset run and attach the true labels.
seurat_res_loc = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/label_transfer/Seurat", pattern = ".+on_multi_datasets.csv")

seurat_dict = {}
for pred_path in seurat_res_loc:
    # Dataset name = the part of the file name captured before "_seurat".
    name = re.findall("(.+)_seurat.+", basename(pred_path))[0]
    print(name)
    prediction = pd.read_csv(pred_path, index_col=0).rename(columns={"predicted.id":"y_pred"})
    truth_path = file_loc.loc[file_loc.数据集 == name, "finest_cell_type"].values[0]
    truth = pd.read_csv(truth_path)
    # Labels are attached by row position, not by index.
    prediction.loc[:, "y_true"] = truth.cell_type.tolist()
    seurat_dict[name] = prediction

In [None]:
# get the scmap prediction result
scmap_res_loc = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/label_transfer/scmap", pattern = ".+on_multi_datasets.csv")

scmap_dict = dict()

for i in scmap_res_loc:
    # The pattern has no "_" before "scmap" (unlike the "(.+)_seurat.+" and
    # "(.+)_scANVI.+" patterns used for the other tools), so the greedy group keeps
    # the separator, e.g. "Meyer_2019_". No dataset name in `file_loc` ends with an
    # underscore, so stripping it is safe and makes the lookup below find its row.
    name = re.findall("(.+)scmap.+", basename(i))[0].rstrip("_")
    print(name)
    intermediate = pd.read_csv(i, index_col=0).rename(columns={"prediction":"y_pred"})
    y_true_loc = file_loc.loc[file_loc.数据集 == name, "finest_cell_type"].values[0]
    y_true = pd.read_csv(y_true_loc)
    # Labels are attached by row position, not by index.
    intermediate.loc[:, "y_true"] = y_true.cell_type.tolist()
    scmap_dict[name] = intermediate

In [None]:
# Load the scANVI results of the multi-dataset run and attach the true labels.
scANVI_res_loc = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/label_transfer/scANVI/", pattern = ".+on_multi_datasets.pkl")

scANVI_dict = {}
for pickle_path in scANVI_res_loc:
    # Dataset name = the part of the file name captured before "_scANVI".
    name = re.findall("(.+)_scANVI.+", basename(pickle_path))[0]
    print(name)
    # The pickled object exposes its per-cell table as `.obs`.
    cell_table = pd.read_pickle(pickle_path).obs
    # Keep the rows flagged as test cells and only the predicted-label column.
    test_cells = cell_table.ref_or_not == "test"
    prediction = cell_table.loc[test_cells, ["C_scANVI"]].rename(columns={"C_scANVI":"y_pred"})
    truth_path = file_loc.loc[file_loc.数据集 == name, "finest_cell_type"].values[0]
    truth = pd.read_csv(truth_path)
    # Labels are attached by row position, not by index.
    prediction.loc[:, "y_true"] = truth.cell_type.tolist()
    scANVI_dict[name] = prediction

In [None]:
# Score every tool's multi-dataset predictions with the same metrics, tag each
# table with the tool name and write it to its own CSV file.
multi_dataset_metrics = [accuracy_score, homogeneity_score, f1_score, precision_score, recall_score, completeness_score, v_measure_score, matthews_corrcoef]

superscc_score = scoring(superscc_dict, methods=multi_dataset_metrics, average="weighted").assign(method="SuperSCC")
superscc_score.to_csv("superscc_score_on_multi_datasets_and_finest_cell_label.csv")

singler_score = scoring(singler_dict, methods=multi_dataset_metrics, average="weighted").assign(method="SingleR")
singler_score.to_csv("singler_score_on_multi_datasets_and_finest_cell_label.csv")

scn_score = scoring(scn_dict, methods=multi_dataset_metrics, average="weighted").assign(method="SingleCellNet")
scn_score.to_csv("scn_score_on_multi_datasets_and_finest_cell_label.csv")

scmap_score = scoring(scmap_dict, methods=multi_dataset_metrics, average="weighted").assign(method="Scmap")
scmap_score.to_csv("scmap_score_on_multi_datasets_and_finest_cell_label.csv")

seurat_score = scoring(seurat_dict, methods=multi_dataset_metrics, average="weighted").assign(method="Seurat")
seurat_score.to_csv("seurat_score_on_multi_datasets_and_finest_cell_label.csv")

scANVI_score = scoring(scANVI_dict, methods=multi_dataset_metrics, average="weighted").assign(method="scANVI")
scANVI_score.to_csv("scANVI_score_on_multi_datasets_and_finest_cell_label.csv")