In [1]:
import pandas as pd
from SuperSCC import *
import numpy as np
import mygene
import scanpy as sc

from os.path import basename
import re
import os
from copy import deepcopy

In [2]:
os.chdir("/home/fengtang/jupyter_notebooks/working_script/evulate_feature_selection/3rd_submission/")

In [35]:
def ensemblID2Symbol(id_list: list[str])-> list:
    """
    A function to convert Ensembl gene ids to gene symbols.

    Parameters
    ----------
    id_list:
        A list containing ensembl ids. e.g. ['ENSG00000139618', 'ENSG00000225972', 'ENSG00000186092']

    Return
    ------
        gene symbols for the query gene ids. e.g. ['BRCA2', 'MTND1P23', 'OR4F5']
    """
    mg = mygene.MyGeneInfo()
    symbols = mg.querymany(id_list, scopes='ensembl.gene', fields='symbol', species='human')
   
    res = list()
    for i in symbols:
        try:
            res.append(i["symbol"])
        except:
            res.append(i["query"])
    return res


In [None]:
# # get the SuperSCC markers for each cell type on dataset Habermann_2020

file =  pd.read_csv("/mnt/disk5/zhongmin/superscc/结果位置/结果位置_3.csv", encoding = "GBK", index_col = 0)

# get the expression matrix
expression = pd.read_csv(file.iloc[:, 9][2])

# get the cell type annotation meta
meta = pd.read_csv(file.iloc[:, 6][2])

# insert cell type column into expression matrix
expression.loc[:, "cell_type"] = meta.ann_level_4.tolist()

# remove cells with NaN cell labels
expression = expression.loc[expression.cell_type.isna() == False, :]

cell_type = expression.cell_type.tolist()
expression = expression.select_dtypes("number")

# do normalization 
norm_exp = pre_processing(expression)

# insert cell type column into normalized expression matrix
norm_exp.loc[:, "cell_type"] = cell_type

# run SuperSCC to get markers of cell type
res = find_markers_ovr(norm_exp, label_column = "cell_type", filename = "Habermann_2020_2025_06_25", save = True, n_jobs = 20)

# load SuperSCC markers
# res = pd.read_pickle("../2nd_submssion/compare_with_cell_marker_db/Habermann_2020_2025_06_25_2025-06-26 14:56:33.pkl")

In [5]:
# retrieve the SuperSCC positive markers 
superscc_data = {"superscc": deepcopy(res)}

superscc_data2 = dict()

for i in superscc_data:
    
    for j in superscc_data[i]:
        info_features = [z[0] for z in superscc_data[i][j]["features"]["final_feature_selection_by_ensemble"]]
        expression = superscc_data[i][j]["sub_high_expression_genes"][j]
        expression = expression.loc[expression.feature.isin(info_features), :]
        
        expression.loc[:, "log2_fold_change"] = np.log2(expression.expression1 / expression.expression2)
        expression = expression.loc[((expression.expression1 > 1) & (1 - (expression.pct2/expression.pct1) > 0.5)) | (np.isnan(expression.pct2)) ].sort_values("score", ascending = False)
        
        try:
            superscc_data2[i].update({j: ensemblID2Symbol(expression.feature[0:20].tolist())})
        except:
            try:
                superscc_data2[i] = {j: ensemblID2Symbol(expression.feature[0:20].tolist())}
            except:
                pass

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  expression.loc[:, "log2_fold_change"] = np.log2(expression.expression1 / expression.expression2)
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  expression.loc[:, "log2_fold_change"] = np.log2(expression.expression1 / expression.expression2)
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string for

In [None]:
# get the Scanpy markers
file = pd.read_csv("/mnt/disk5/zhongmin/superscc/结果位置/结果位置_3.csv", encoding = "GBK", index_col = 0)

# get the expression matrix
adata = sc.read_csv(file.iloc[:, 9][2])
adata.obs.loc[:, "cell_type"] = pd.read_csv(file.iloc[:, 6][2]).ann_level_4.tolist()

# do normalization
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# detect DE genes
sc.tl.rank_genes_groups(adata, groupby = "cell_type")

# tidy up DE genes
scanpy_markers = dict()

for i in adata.obs.cell_type.unique().tolist():
    try:
        scanpy_markers[i] = ensemblID2Symbol(sc.get.rank_genes_groups_df(adata, pval_cutoff = 0.05, log2fc_min = 1, group = i).names[0:20].tolist())
    except:
        continue


In [None]:
# get the Seurat markers
file = pd.read_csv("/home/fengtang/jupyter_notebooks/working_script/evulate_feature_selection/3rd_submission/seurat_de_genes_on_dataset_Habermann_2020.csv", encoding = "GBK")

group_file = file.groupby("cluster")

seurat_markers = dict()
for i in group_file.groups.keys():
    data = group_file.get_group(i)
    data = data.loc[(data.p_val_adj < 0.05) & (data.avg_log2FC >= 0.5), :]
    data = data.sort_values("p_val_adj", ascending = True).head(20).gene.tolist()
    data = ensemblID2Symbol(data)
    seurat_markers[i] = data

In [41]:
#  get the cell marker database 
cellmarker_db = pd.read_excel("/home/fengtang/jupyter_notebooks/working_script/evulate_feature_selection/2nd_submssion/compare_with_cell_marker_db/Cell_marker_Human.xlsx")

# only retain lung marker sets
cellmarker_db_lung = cellmarker_db.loc[cellmarker_db.tissue_type == "Lung"]

In [83]:
# match labels between dataset Habermann_2020 and cellmarker db
cell_type_match = {
                   "CD8 T cells": "CD8+ T cell",
                   "CD4 T cells": "CD4+ T cell",
                   "Interstitial macrophages": "Interstitial macrophage",
                   "Alveolar macrophages": "Alveolar macrophage",
                   "Club": "Club cell (Clara cell)",
                   "Goblet": 'Goblet cell',
                   "DC2": "Dendritic cell",
                   "DC1": "Dendritic cell",
                   "Ionocyte": "Ionocyte",
                   "Plasma cells": "Plasma cell",
                   "B cells": 'B cell',
                   "Basal resting": "Basal cell", 
                   "Pericytes": "Pericyte",
                   "Transitional Club-AT2": "Alveolar cell Type 2",
                   "NK cells": "Natural killer cell",
                   "Neuroendocrine": "Neuroendocrine cell",
                   'Non-classical monocytes': "Monocyte",
                   "Adventitial fibroblasts": "Fibroblast",
                   "Alveolar fibroblasts": "Fibroblast-like cell",
                   "EC general capillary": "Capillary cell"
                   }

In [99]:
# get the insersection for cell type markers between dataset Habermann_2020 and cellmarker db

ls = list()
for i in cell_type_match:
    
    superscc_markers = superscc_data2["superscc"][i]
    scanpy_res = scanpy_markers[i]
    if i == "Neuroendocrine":
        seurat_res = ["NULL"]
    else: 
        seurat_res = seurat_markers[i]

    cell_type = cell_type_match[i]
    cell_db_markers = cellmarker_db_lung.loc[cellmarker_db_lung.cell_name == cell_type].marker.unique()

    intersection = set(superscc_markers).intersection(cell_db_markers)

    scanpy_intersection = set(cell_db_markers).intersection(scanpy_res)
    scanpy_intersection = ", ".join(scanpy_intersection)

    seurat_intersection = set(cell_db_markers).intersection(seurat_res)

    if len(seurat_intersection) == 0:
        seurat_intersection = "NULL"
    else:
        seurat_intersection = ", ".join(seurat_intersection)


    if(len(intersection) == 0):
        intersection = "NULL"
    else:
        intersection = ", ".join(intersection)
    
    cell_db_markers = ", ".join(set(cell_db_markers))
    cell_db_cell_type = cell_type_match[i]

    superscc_markers = ", ".join(superscc_markers)
    superscc_cell_type = i

    df = pd.DataFrame({
        "Cell_marker_db_cell_label": cell_db_cell_type,
        "Cell_marker_db_markers": cell_db_markers,
        "Cell_label_on_the_test_dataset": superscc_cell_type,
        "SuperSCC_markers": superscc_markers,
        "SuperSCC_intersection": intersection,
        "Scanpy_markers": ", ".join(scanpy_res),
        "Scanpy_intersection": scanpy_intersection,
        "Seurat_markers": ", ".join(seurat_res),
        "Seurat_intersection": seurat_intersection
    },
    index=[0]
    )

    ls.append(df)
    

In [100]:
res2 = pd.concat(ls)
res2.index = range(res2.shape[0])
res2

Unnamed: 0,Cell_marker_db_cell_label,Cell_marker_db_markers,Cell_label_on_the_test_dataset,SuperSCC_markers,SuperSCC_intersection,Scanpy_markers,Scanpy_intersection,Seurat_markers,Seurat_intersection
0,CD8+ T cell,"AGMAT, GPRASP1, EPHX2, CD69, AMIGO1, CXCR6, CD...",CD8 T cells,"NKG7, CCL4, CCL5, GNLY, IL32, DUSP2, CST7, CD3...","CD3D, CD3E, CCL5, CD8A","CCL5, NKG7, IL32, CD3E, CST7, B2M, IFITM1, HLA...","CD3E, CD3D, CCL5, GZMA","IL32, GZMM, CD8B, ZAP70, TRGV10, GNG2, FGFBP2,...","CD3D, CD8B"
1,CD4+ T cell,"IL7R, IL2RA, CD4, FOXP3, CD3E, CD3D, CD8A, CCR...",CD4 T cells,"IL32, IL7R, CCL5, CD69, CORO1A, CD3E, CXCR4, C...","CD3E, CD3D, CD8A, IL7R","CD3E, IL32, ZFP36L2, BTG1, CD69, IL7R, SARAF, ...","CD3E, CD3D, IL7R","RPS27A, RPS3, CD3D, CD2, ACAP1, SPOCK2, KLRB1,...",CD3D
2,Interstitial macrophage,"CCL4, ISG20, ISG15, CXCL10, CXCL11",Interstitial macrophages,"CD68, C1QA, CAPG, MARCO, C1QB, MRC1, PLAUR, MC...",,"TYROBP, CD68, FTH1, FCER1G, VIM, CTSD, FTL, CT...",,"TNFSF13, CXCL3, PPT1, NICOL1, CD68, MRC1, MS4A...",
3,Alveolar macrophage,"CD68, APOE, APOC1, FBP1, INHBA, MCEMP1, LYZ, G...",Alveolar macrophages,"C1QB, C1QA, APOC1, FABP4, ALOX5AP, HLA-DQB1, H...","APOC1, MCEMP1, LYZ, ALOX5AP, FABP4, C1QA","C1QA, C1QB, HLA-DRA, TYROBP, APOC1, ALOX5AP, C...","CD68, APOE, APOC1, ALOX5AP, C1QA","ACOT4, FAM89A, CXCL5, APOC4-APOC2, TNNI2, S100...",
4,Club cell (Clara cell),"HP, MUC4, CP, SCGB3A2, TFF3, MUC5B, SCGB1A1, S...",Club,"SCGB3A1, SCGB1A1, SLPI, WFDC2, CLU, NR4A1, PRS...","SCGB1A1, SCGB3A1, TFF3","SCGB3A1, WFDC2, SLPI, SCGB1A1, BPIFB1, LCN2, K...","SCGB1A1, SCGB3A1","BPIFB1, MSMB, TSPAN8, CYP2F1, CXCL6, SERPINB3,...",MUC5B
5,Goblet cell,"XBP1, VMO1, MUC5AC, MUC5B, PIGR, SCGB1A1",Goblet,"SCGB1A1, SCGB3A1, PRSS23, LCN2, TSPAN1, NUCB2,...","SCGB1A1, PIGR","SLPI, WFDC2, SCGB1A1, SCGB3A1, BPIFB1, CXCL17,...","SCGB1A1, XBP1, PIGR","MUC5AC, ENSG00000229401, ENSG00000267353, ENSG...","MUC5B, MUC5AC"
6,Dendritic cell,"CD11c, CCL17, CD137, MHC Class II, PFLT3, FCER...",DC2,"HLA-DQB1, HLA-DQA1, AREG, SGK1, MS4A6A, IGSF6,...",CD83,"HLA-DRA, CD74, HLA-DPB1, HLA-DRB1, HLA-DPA1, H...","CD74, HLA-DRA, FCER1G, TYROBP","FCGR2B, CLEC10A, FCER1A, PLD4, CD1E, CD1C, GPR...","CD1C, CD207, CD1A, CCL17"
7,Dendritic cell,"CD11c, CCL17, CD137, MHC Class II, PFLT3, FCER...",DC1,"HLA-DQB1, HLA-DQA1, C1orf162, HLA-DMA, TUBA1A,...",CD83,"CD74, HLA-DRA, HLA-DPA1, HLA-DPB1, HLA-DQB1, C...","CLEC9A, CD74, HLA-DRA","CLEC9A, CLNK, XCR1, LOC105377043, LINC01976, G...",CLEC9A
8,Ionocyte,"ASCL3, FOXI1, CFTR, CF",Ionocyte,"CD9, CHCHD10, GSN, MAP1LC3B, ID3, ATP6AP2, NDU...",,"RARRES2, KRT18, ELF3, PRDX5, CD9, LGALS3, NDUF...",ASCL3,"ASCL3, CLCNKB, TMPRSS11E, LINC01187, THRSP, FO...","ASCL3, FOXI1"
9,Plasma cell,"HSP90B1, IGHM, IGHGP, SUSD3, IGLC7, XBP1, SDC1...",Plasma cells,"XBP1, MYDGF, PDIA6, SSR3, FKBP2, HM13, ISG20, ...",XBP1,"MZB1, JCHAIN, HSP90B1, FKBP11, CYBA, DERL3, UB...","XBP1, JCHAIN, HSP90B1, DERL3, FKBP11, MZB1","DERL3, POU2AF1, CPNE5, ABCB9, IGLC2, MZB1, SYN...","IGHA1, JCHAIN, IGLL5, DERL3, IGHA2, MZB1, IGKC"


In [103]:
res2.to_csv("supple_table2.csv")

In [107]:
sorted(cellmarker_db_lung.loc[cellmarker_db_lung.cell_name == "Basal cell"].marker.unique())

['ACKR3',
 'ADH7',
 'ANXA8',
 'ANXA8L1',
 'AQP3',
 'ARL4D',
 'ATP1B3',
 'BMP7',
 'C10orf99',
 'C16orf74',
 'CAPG',
 'CAV1',
 'CCND2',
 'CD109',
 'CLCA2',
 'CLCA4',
 'CLDN1',
 'COL17A1',
 'CSTA',
 'DAPL1',
 'DSC3',
 'DSG3',
 'DUSP7',
 'FABP5',
 'FABP5P3',
 'FAT2',
 'FBLN1',
 'FGFBP1',
 'FYB',
 'GJB3',
 'GLTP',
 'GM2A',
 'GPC1',
 'GPNMB',
 'GPX2',
 'HSPB1',
 'IFI27L2',
 'IGFBP6',
 'IL1RN',
 'IL20RB',
 'ITGA6',
 'JUP',
 'KRT13',
 'KRT14',
 'KRT15',
 'KRT16',
 'KRT17',
 'KRT4',
 'KRT5',
 'KRT6A',
 'KRT6B',
 'KRT6C',
 'Krt5',
 'LAD1',
 'LGALS7',
 'LGALS7B',
 'LOXL4',
 'LY6D',
 'LYPD3',
 'MMP28',
 'MT1X',
 'NGFR',
 'NOTCH1',
 'NRARP',
 'NXN',
 'P63',
 'PGM2',
 'PHLDA3',
 'PKP1',
 'PKP3',
 'PLAT',
 'PLP2',
 'POLR2J3',
 'PPP1R13L',
 'PTTG1',
 'PVRL1',
 'Pan-ΔNp63',
 'RAB38',
 'RARG',
 'S100A10',
 'S100A14',
 'S100A16',
 'S100A2',
 'S100A8',
 'SDC1',
 'SERPINB1',
 'SERPINB13',
 'SERPINB2',
 'SERPINB5',
 'SFN',
 'SH3BGRL3',
 'SOX15',
 'SPINK5',
 'SPRR1A',
 'SPRR1B',
 'THBD',
 'TIMP1',
 'TMEM43',