In [1]:
import os
import pandas as pd
from SuperSCC import *
import numpy as np

from sklearn.metrics import adjusted_mutual_info_score, rand_score, adjusted_rand_score, normalized_mutual_info_score, accuracy_score, fowlkes_mallows_score
from os.path import basename

from scipy.stats import gmean, rankdata

In [2]:
def weighted_geometric_mean_numpy(values, weight_num = 0.5):
    # max_value = np.max(values)
    # max_index = np.where(np.array(values) == max_value)[0][0]
    # values = np.array(values)
    # weights = [(1 - weight_num)/(len(values)-1) for i in range(len(values))]
    # weights[max_index] = weight_num
    weights = rankdata(values) / np.sum(rankdata(values))
    return gmean(values, weights = weights)

In [31]:
def evaulate_clustering(data, scoring_method, sc3_level, seurat_level):

    df_list = list()
    
    dataset_keys = data[0].keys()

    for dataset in dataset_keys:
        print(dataset)

        y_true = data[0][dataset]
        superscc_pred = [str(i) for i in data[1][dataset]]
        sc3_pred = data[2][dataset].iloc[:, sc3_level].astype(str).tolist()
        seurat_pred = data[3][dataset].loc[:, f"Cluster_res_{seurat_level}"].astype(str).tolist()
        scanpy_pred = data[4][dataset]
        cidr_pred = data[5][dataset].Cluster.astype(str).tolist()
        sccaf_pred = data[6][dataset]["sccaf"].astype(str).tolist()
        scshc_pred = data[7][dataset].clusters.astype(str).tolist()
        sclca_pred = data[8][dataset].cluster.astype(str).tolist()
        monocle_pred = data[9][dataset].monocle3_cluster.astype(str).tolist()

        y_pred = [superscc_pred, sc3_pred, seurat_pred, scanpy_pred, cidr_pred, sccaf_pred, scshc_pred, sclca_pred, monocle_pred]
        for key, method in scoring_method.items():
            df = pd.DataFrame(columns = ["dataset", "score_method", "score", "method"])
            method_original = [key for i in range(len(data) - 1)]

            score_original = list()
            score_list = list()
            for i in y_pred:
                score = method(y_true, i)
                score_list.append(score)
            score_original.extend(score_list)

            df.score = score_original
            df.method = ["SuperSCC", "SC3", "Seurat", "Scanpy", "CIDR", "Sccaf", "Scshc", "Sclca", "Monocle"]
            df.score_method = method_original
            df.dataset = [dataset for i in range(len(data) - 1)]
            df_list.append(df)
            
    return pd.concat(df_list)
        

In [4]:
def evaulate_clustering2(data, scoring_method, scanpy_level, seurat_level):

    df_list = list()
    
    dataset_keys = data[0].keys()

    for dataset in dataset_keys:
        print(dataset)

        y_true = data[0][dataset]
        superscc_pred = [str(i) for i in data[1][dataset]]
        seurat_pred = data[2][dataset].loc[:, f"Cluster_res_{seurat_level}"].astype(str).tolist()
        scanpy_pred = data[3][dataset].loc[:, f"leiden_{scanpy_level}"].astype(str).tolist()
      

        y_pred = [superscc_pred, seurat_pred, scanpy_pred]
        for key, method in scoring_method.items():
            df = pd.DataFrame(columns = ["dataset", "score_method", "score", "method"])
            method_original = [key for i in range(3)]

            score_original = list()
            score_list = list()
            for i in y_pred:
                score = method(y_true, i)
                score_list.append(score)
            score_original.extend(score_list)

            df.score = score_original
            df.method = ["SuperSCC",  f"Seurat_{seurat_level}", f"Scanpy_{scanpy_level}"]
            df.score_method = method_original
            df.dataset = [dataset for i in range(3)]

            df_list.append(df)
            
    return pd.concat(df_list)
        

In [5]:
def get_loc(string, num):
    return re.split("/", string)[num]

In [6]:
# read the data location file in
data_loc = pd.read_csv("/mnt/disk5/zhongmin/superscc/结果位置/结果位置_3.csv", index_col=0, encoding = "GBK")

In [7]:
# get the SC3 results
# SC3_res_loc = list_files(pattern="col_data.csv", path = "/mnt/disk5/zhongmin/superscc/师兄整理的肺数据/SC3结果/", full_name=True, recursive=True)
# SC3_res_loc_sub = np.array(sorted(SC3_res_loc))[[ True if i % 2 != 0 else False for i in range(len(SC3_res_loc))]]
SC3_res = [*map(lambda x: pd.read_csv(x, index_col=0), data_loc.SC3)]

SC3_dict = dict()
for idx, i in enumerate(data_loc.SC3):
    if re.search(".+CIDR_结果.+", i) and re.search(".+49万拆出的.+", i):
        i = get_loc(i, num = 8)
    elif re.search(".+胚胎.+", i ):
        i = get_loc(i, num = 6)
    elif re.search(".+食管.癌.+", i):
        i = get_loc(i, num = 5)
    else:
        i = get_loc(i, num = 7)
    SC3_dict[i] = SC3_res[idx]

In [8]:
# get the Seurat results 
# seurat_res_loc = list_files(pattern=".+csv$", path = "/mnt/disk5/zhongmin/superscc/师兄整理的肺数据/seurat结果/", full_name=True)
# seurat_res_loc = sorted(seurat_res_loc)
# seurat_res_loc.pop(12)
# seurat_res_loc_sub = np.array(seurat_res_loc)[[False if i % 2 == 0 else True for i in range(len(seurat_res_loc))]]

seurat_res_loc = data_loc.seurat
seurat_res_loc[3] = "/mnt/disk5/zhongmin/superscc/师兄整理的肺数据/seurat结果/Barbry_Leroy_2020/2024-09-02/task1/Barbry_Leroy_metadata.csv"
seurat_res = [*map(lambda x: pd.read_csv(x, index_col=0) if x.endswith("csv") else pd.read_csv(x + ".csv"), seurat_res_loc)]

seurat_dict = dict()
for idx, i in enumerate(seurat_res_loc):
    if re.search(".+seurat_结果.+", i) and re.search(".+49万拆出的.+", i):
        i = get_loc(seurat_res_loc[idx], num = 8)
    elif re.search(".+胚胎.+", i ):
        i = get_loc(seurat_res_loc[idx], num = 6)
    elif re.search(".+食管.癌.+", i):
        i = get_loc(seurat_res_loc[idx], num = 5)
    else:
        i = get_loc(seurat_res_loc[idx], num = 7)
    seurat_dict[i] = seurat_res[idx]    

  seurat_res_loc[3] = "/mnt/disk5/zhongmin/superscc/师兄整理的肺数据/seurat结果/Barbry_Leroy_2020/2024-09-02/task1/Barbry_Leroy_metadata.csv"
  i = get_loc(seurat_res_loc[idx], num = 8)
  i = get_loc(seurat_res_loc[idx], num = 7)
  i = get_loc(seurat_res_loc[idx], num = 6)
  i = get_loc(seurat_res_loc[idx], num = 5)


In [9]:
# get the scanpy results
scanpy_res_loc = data_loc.scanpy
scanpy_res = [*map(lambda x: pd.read_csv(x, index_col=0) if x.endswith("csv") else pd.read_csv(x + ".csv"), scanpy_res_loc)]
names = scanpy_res_loc.index.to_list()

scanpy_dict = dict()
for idx, i in enumerate(scanpy_res_loc):
    scanpy_dict[names[idx]] = scanpy_res[idx]  


ls = list()
for i in scanpy_dict:
    if i == "49万分的pbmc":
        scanpy_dict["血液"] = scanpy_dict[i]
        ls.append(i)
        break

for i in scanpy_dict:
    if i == "49万分的大肠":
        scanpy_dict["大肠"] = scanpy_dict[i]
        ls.append(i)
        break

for i in scanpy_dict:    
    if i == "食管鳞癌":
        scanpy_dict["GSE160269_食管鳞癌"] = scanpy_dict[i]
        ls.append(i)
        break
        
for i in scanpy_dict:
    if i == "肺数据集全部取的两万个":
        scanpy_dict["所有细胞分2万"] = scanpy_dict[i]
        ls.append(i)
        break

for i in ls:
    scanpy_dict.pop(i)

In [10]:
# get the CIDR results
# cidr_res_loc = list_files(pattern=".+csv$", path = "/mnt/disk5/zhongmin/superscc/师兄整理的肺数据/CIDR结果", full_name=True)
# cidr_res_loc = np.array(cidr_res_loc)[[1, 3, 5, 6, 8, 10, 13, 16, 18, 20]]

cidr_res_loc = data_loc.CIDR
cidr_res = [*map(lambda x: pd.read_csv(x, index_col=0), cidr_res_loc)]

cidr_dict = dict()
for idx, i in enumerate(cidr_res_loc):
    if re.search(".+CIDR_结果.+", i) and re.search(".+49万拆出的.+", i):
        i = get_loc(i, num = 8)
    elif re.search(".+胚胎.+", i ):
        i = get_loc(i, num = 6)
    elif re.search(".+食管.癌.+", i):
        i = get_loc(i, num = 5)
    else:
        i = get_loc(i, num = 7)
    cidr_dict[i] = cidr_res[idx] 

In [11]:
# get the SuperSCC results
superscc_res_pre_loc = data_loc.superscc
superscc_res_loc = list()
for i in superscc_res_pre_loc:
    files = list_files(pattern=".+pkl$", path = i, full_name = True, recursive = False)
    superscc_res_loc.extend(files)

superscc_res = [*map(lambda x: pd.read_pickle(x), sorted(superscc_res_loc))]

R_dict = dict()
M_dict = dict()
F_dict = dict()
feature_dict = dict()
cluster_dict = dict()

select = list()
intermediate = dict()
for idx, i in enumerate(sorted(superscc_res_loc)):
    if re.search("师兄整理的肺数据", i):
        i = get_loc(i, num = 7)
    elif re.search(".+胚胎.+", i ):
        i = get_loc(i, num = 6)
    elif re.search(".+食管.癌.+", i):
        i = get_loc(i, num = 5)
    elif re.search("49万拆出的", i):
        i = get_loc(i, num = 8)

    if i not in select:
        select.append(i)

    if idx % 2 == 0:
        i = "consensus_" + i
    else:
        i = "sub_consensus_" + i

    intermediate[i] = superscc_res[idx]


for i in select:
    key1 = "consensus_" + i
    key2 = "sub_consensus_" + i

    R_dict[i] = intermediate[key1]["global_cluster_before_merging"]
    M_dict[i] = intermediate[key1]["global_cluster_after_merging"]["labels"]
    F_dict[i] = intermediate[key2]["merge_labels"]
    feature_dict[i] = pd.DataFrame({"V1": [i[0] for i in intermediate[key1]["global_features_before_merging"]["features"]["final_feature_selection_by_ensemble"]]})
    cluster_dict[i] = pd.DataFrame({"V1": intermediate[key1]["global_cluster_after_merging"]["labels"]})

In [12]:
# get the scLCA results
sclca_files = list_files(path = "/mnt/disk5/zhongmin/superscc/结果位置/scLCA_results", pattern = ".+csv$", recursive = False)
file_name = [re.sub("_scLCA_results.csv", "", basename(i)) for i in sclca_files]
sclca_res = list()

for i in sclca_files:
    sclca_res.append(pd.read_csv(i))


sclca_dict = dict()
for idx, i in enumerate(sclca_res):
    sclca_dict[file_name[idx]] = i


ls = list()
for i in sclca_dict:
    if i == "49万分的pbmc":
        sclca_dict["血液"] = sclca_dict[i]
        ls.append(i)
        break

for i in sclca_dict:
    if i == "49万分的大肠":
        sclca_dict["大肠"] = sclca_dict[i]
        ls.append(i)
        break

for i in sclca_dict:    
    if i == "食管鳞癌":
        sclca_dict["GSE160269_食管鳞癌"] = sclca_dict[i]
        ls.append(i)
        break
        
for i in sclca_dict:
    if i == "肺数据集全部取的两万个":
        sclca_dict["所有细胞分2万"] = sclca_dict[i]
        ls.append(i)
        break

for i in ls:
    sclca_dict.pop(i)

In [25]:
# get the monocle results
monocle_files = list_files(path = "/mnt/disk5/zhongmin/superscc/结果位置/monocle3/monocle3_results/csv", pattern = ".+csv$", recursive = False)
file_name = [re.sub("_clusters.csv", "", basename(i)) for i in monocle_files]
monocle_res = list()

for i in monocle_files:
    monocle_res.append(pd.read_csv(i))


monocle_dict = dict()
for idx, i in enumerate(monocle_res):
    monocle_dict[file_name[idx]] = i


ls = list()
for i in monocle_dict:
    if i == "49万分的pbmc":
        monocle_dict["血液"] = monocle_dict[i]
        ls.append(i)
        break

for i in monocle_dict:
    if i == "49万分的大肠":
        monocle_dict["大肠"] = monocle_dict[i]
        ls.append(i)
        break

for i in monocle_dict:    
    if i == "食管鳞癌":
        monocle_dict["GSE160269_食管鳞癌"] = monocle_dict[i]
        ls.append(i)
        break
        
for i in monocle_dict:
    if i == "肺数据集全部取的两万个":
        monocle_dict["所有细胞分2万"] = monocle_dict[i]
        ls.append(i)
        break

for i in ls:
    monocle_dict.pop(i)

In [26]:
# get the sscaf result
data_loc.iloc[2, 21] = '/mnt/disk5/zhongmin/superscc/结果位置/sccaf/Banovich_Kropski_2020数据_sccaf.csv'
sccaf_res_loc = data_loc.sccaf


sccaf_res = [*map(lambda x: pd.read_csv(x, index_col=0), sccaf_res_loc)]

sccaf_dict = dict()
for idx, i in enumerate(sccaf_res_loc):
    sccaf_dict[sccaf_res_loc.index.values[idx]] = sccaf_res[idx] 

ls = list()
for i in sccaf_dict:
    if i == "49万分的pbmc":
        sccaf_dict["血液"] = sccaf_dict[i]
        ls.append(i)
        break

for i in sccaf_dict:
    if i == "49万分的大肠":
        sccaf_dict["大肠"] = sccaf_dict[i]
        ls.append(i)
        break

for i in sccaf_dict:    
    if i == "食管鳞癌":
        sccaf_dict["GSE160269_食管鳞癌"] = sccaf_dict[i]
        ls.append(i)
        break
        
for i in sccaf_dict:
    if i == "肺数据集全部取的两万个":
        sccaf_dict["所有细胞分2万"] = sccaf_dict[i]
        ls.append(i)
        break

for i in ls:
    sccaf_dict.pop(i)

In [27]:
# get the sc-SHC result
scshc_res_loc = data_loc.sc_SHC
scshc_res = [*map(lambda x: pd.read_csv(x, index_col=0), scshc_res_loc)]
scshc_dict = dict()

for idx, i in enumerate(scshc_res_loc):
    data = scshc_res[idx] 
    data.columns = ["clusters"]
    scshc_dict[scshc_res_loc.index.values[idx]] = data


ls = list()
for i in scshc_dict:
    if i == "49万分的pbmc":
        scshc_dict["血液"] = scshc_dict[i]
        ls.append(i)
        break

for i in scshc_dict:
    if i == "49万分的大肠":
        scshc_dict["大肠"] = scshc_dict[i]
        ls.append(i)
        break

for i in scshc_dict:    
    if i == "食管鳞癌":
        scshc_dict["GSE160269_食管鳞癌"] = scshc_dict[i]
        ls.append(i)
        break
        
for i in scshc_dict:
    if i == "肺数据集全部取的两万个":
        scshc_dict["所有细胞分2万"] = scshc_dict[i]
        ls.append(i)
        break

for i in ls:
    scshc_dict.pop(i)

In [28]:
# get the original annotation for each dataset
orig_finest = dict()
orig_lv1 = dict()
orig_lv2 = dict()
orig_lv3 = dict()
orig_lv4 = dict()

for i in SC3_dict.keys():
    try:
        orig_finest[i] = seurat_dict[i].ann_finest_level.tolist()
        orig_lv1[i] = seurat_dict[i].ann_level_1.tolist()
        orig_lv2[i] = seurat_dict[i].ann_level_2.tolist()
        orig_lv3[i] = seurat_dict[i].ann_level_3.tolist()
        orig_lv4[i] = seurat_dict[i].ann_level_4.tolist()
    except:
        if i == "血液": 
            file = pd.read_csv("/mnt/disk5/zhongmin/superscc/49万拆出的/sup/未去批次效应数据/正常血液组织数据2万_metadata_整理细胞类型.csv", index_col=0)
            orig_lv2[i] = file.loc[:, "level1_celltype"].tolist()
        elif i == "大肠":
            file = pd.read_csv("/mnt/disk5/zhongmin/superscc/49万拆出的/sup/未去批次效应数据/正常大肠组织数据2万_metadata_整理细胞类型.csv", index_col=0)
            orig_lv2[i] = file.loc[:, "level1_celltype"].tolist()
        elif i == "D034":
            file = pd.read_csv("/mnt/disk5/zhongmin/superscc/胚胎数据/D034/未去批次效应数据/没有去除批次效应_2万所有数据metadata_2_整理细胞类型.csv", index_col=0)
            orig_lv2[i] = file.loc[:, "level1_celltype"].tolist()
        elif i == "GSE160269_食管鳞癌":
            orig_lv2[i] = seurat_dict[i].annotated_type.tolist()
        elif i == "D009":
            orig_lv2[i] = seurat_dict[i].cell_type.tolist()
        elif i == "D022":
            orig_lv2[i] = seurat_dict[i].broad_celltype.tolist()
        elif i == "GSE136831_Kaminski_2020":
            orig_lv2[i] = seurat_dict[i].CellType_Category.tolist()
        elif i == "GSE161382_Sun_2020":
            orig_lv2[i] = seurat_dict[i].lineage.tolist()

In [29]:
# check all results generated by different methods on different datasets are read in 
for i in (R_dict.keys(), M_dict.keys(), F_dict.keys(), seurat_dict.keys(), SC3_dict.keys(), cidr_dict.keys(), sccaf_dict.keys(), orig_lv2.keys(), scshc_dict.keys(), sclca_dict.keys(), monocle_dict.keys()):
    print(len(i))

18
18
18
18
18
18
18
18
18
18
18


In [None]:
# get the ARI, AMI, FMI, NMI scores per dataset per method
res = evaulate_clustering([orig_lv2, M_dict, SC3_dict, seurat_dict, R_dict, cidr_dict, sccaf_dict, scshc_dict, sclca_dict, monocle_dict], scoring_method = {"ARI": adjusted_rand_score, "NMI": normalized_mutual_info_score, "AMI": adjusted_mutual_info_score, "FMI": fowlkes_mallows_score} , sc3_level = 1, seurat_level = 0.8)
res.to_csv(f"/home/fengtang/jupyter_notebooks/working_script/evulate_clustering/3rd_submission/lv2_clustering_performance_3nd_submission_{record_time()}.csv")

In [52]:
# get the ARI, AMI, FMI, NMI scores per dataset for Seurat and Scanpy clusterings under resolution rang from 0.1 to 1
res_list = list()
for resolution in np.linspace(0.1, 1, 10):
    if np.isclose(resolution, 1) == False:
        resolution = np.around(resolution, 1)
        df = evaulate_clustering2([orig_lv2, M_dict,  seurat_dict, scanpy_dict], scoring_method = {"ARI": adjusted_rand_score, "NMI": normalized_mutual_info_score, "AMI": adjusted_mutual_info_score, "FMI": fowlkes_mallows_score} , scanpy_level = resolution, seurat_level = resolution)
        df.loc[:, "resolution"] = resolution
        res_list.append(df)
    else:
        resolution_1 = 1
        resolution_2 = 1.0
        df = evaulate_clustering2([orig_lv2, M_dict,  seurat_dict, scanpy_dict], scoring_method = {"ARI": adjusted_rand_score, "NMI": normalized_mutual_info_score, "AMI": adjusted_mutual_info_score, "FMI": fowlkes_mallows_score} , scanpy_level = resolution_2, seurat_level = resolution_1)
        df.loc[:, "resolution"] = resolution
        res_list.append(df)

combined_res = pd.concat(res_list, axis = 0)
# combined_res_sub = combined_res.loc[combined_res.method != "SuperSCC", :]
# combined_res_sub.loc[:, "method"] = [re.findall("[^_]+", i)[0] for i in combined_res_sub.method.tolist()]
# combined_res_group = combined_res_sub.groupby(["dataset", "score_method", "method"])

# gmean_res = list()
# for group in combined_res_group.groups.keys():
#     data = combined_res_group.get_group(group)
#     score = data.score.tolist()
#     average = weighted_geometric_mean_numpy(score)

#     dataset = data.dataset.unique()[0]
#     score_method = data.score_method.unique()[0]
#     method = data.method.unique()[0]

#     gmean_res.append(pd.DataFrame({"dataset": dataset, "score_method": score_method, "method": method, "score": average}, index=[0]))

# gmean_res = pd.concat(gmean_res, axis = 0)
# gmean_res.index = range(gmean_res.shape[0])

# superscc_output = combined_res.loc[combined_res.method == "SuperSCC", :].sort_values(["dataset", "score_method"]).drop_duplicates(keep = "first")
# gmean_res = pd.concat([gmean_res, superscc_output], axis = 0)

# gmean_res.to_csv(f"/home/fengtang/jupyter_notebooks/working_script/evulate_clustering/3rd_submission/Aggregate_mean_scanpy_seurat_under_different_resolution_{record_time()}.csv")

combined_res.to_csv(f"/home/fengtang/jupyter_notebooks/working_script/evulate_clustering/3rd_submission/Aggregate_mean_scanpy_seurat_under_different_resolution_{record_time()}.csv")

血液
大肠
Banovich_Kropski_2020
Barbry_Leroy_2020
D009
D022
D034
GSE136831_Kaminski_2020
GSE161382_Sun_2020
Krasnow_2020
Lafyatis_Rojas_2019
Meyer_2019
Misharin_2021
Misharin_Budinger_2018
Nawijn_2021
Teichmann_Meyer_2019
所有细胞分2万
GSE160269_食管鳞癌
血液
大肠
Banovich_Kropski_2020
Barbry_Leroy_2020
D009
D022
D034
GSE136831_Kaminski_2020
GSE161382_Sun_2020
Krasnow_2020
Lafyatis_Rojas_2019
Meyer_2019
Misharin_2021
Misharin_Budinger_2018
Nawijn_2021
Teichmann_Meyer_2019
所有细胞分2万
GSE160269_食管鳞癌
血液
大肠
Banovich_Kropski_2020
Barbry_Leroy_2020
D009
D022
D034
GSE136831_Kaminski_2020
GSE161382_Sun_2020
Krasnow_2020
Lafyatis_Rojas_2019
Meyer_2019
Misharin_2021
Misharin_Budinger_2018
Nawijn_2021
Teichmann_Meyer_2019
所有细胞分2万
GSE160269_食管鳞癌
血液
大肠
Banovich_Kropski_2020
Barbry_Leroy_2020
D009
D022
D034
GSE136831_Kaminski_2020
GSE161382_Sun_2020
Krasnow_2020
Lafyatis_Rojas_2019
Meyer_2019
Misharin_2021
Misharin_Budinger_2018
Nawijn_2021
Teichmann_Meyer_2019
所有细胞分2万
GSE160269_食管鳞癌
血液
大肠
Banovich_Kropski_2020
Barbry_L

In [55]:
combined_res.method.unique()

array(['SuperSCC', 'Seurat_0.1', 'Scanpy_0.1', 'Seurat_0.2', 'Scanpy_0.2',
       'Seurat_0.3', 'Scanpy_0.3', 'Seurat_0.4', 'Scanpy_0.4',
       'Seurat_0.5', 'Scanpy_0.5', 'Seurat_0.6', 'Scanpy_0.6',
       'Seurat_0.7', 'Scanpy_0.7', 'Seurat_0.8', 'Scanpy_0.8',
       'Seurat_0.9', 'Scanpy_0.9', 'Seurat_1', 'Scanpy_1.0'], dtype=object)