In [1]:
import pandas as pd
from SuperSCC import *
import numpy as np
import mygene

from os.path import basename
import re
import os
from copy import deepcopy

In [3]:
os.chdir("/home/fengtang/jupyter_notebooks/working_script/evulate_feature_selection/2nd_submssion/compare_with_cell_marker_db")

In [2]:
def ensemblID2Symbol(id_list: list[str])-> list:
    """
    A function to convert Ensembl gene ids to gene symbols.

    Parameters
    ----------
    id_list:
        A list containing ensembl ids. e.g. ['ENSG00000139618', 'ENSG00000225972', 'ENSG00000186092']

    Return
    ------
        gene symbols for the query gene ids. e.g. ['BRCA2', 'MTND1P23', 'OR4F5']
    """
    mg = mygene.MyGeneInfo()
    symbols = mg.querymany(id_list, scopes='ensembl.gene', fields='symbol', species='human')
    symbols = [i["symbol"] for i in symbols]
    return symbols

In [None]:
# get the SuperSCC markers for each cell type on dataset Habermann_2020

file =  pd.read_csv("/mnt/disk5/zhongmin/superscc/结果位置/结果位置_3.csv", encoding = "GBK", index_col = 0)

# get the expression matrix
expression = pd.read_csv(file.iloc[:, 9][2])

# get the cell type annotation meta
meta = pd.read_csv(file.iloc[:, 6][2])

# insert cell type column into expression matrix
expression.loc[:, "cell_type"] = meta.ann_level_4.tolist()

# remove cells with NaN cell labels
expression = expression.loc[expression.cell_type.isna() == False, :]

cell_type = expression.cell_type.tolist()
expression = expression.select_dtypes("number")

# do normalization 
norm_exp = pre_processing(expression)

# insert cell type column into normalized expression matrix
norm_exp.loc[:, "cell_type"] = cell_type

# run SuperSCC to get markers of cell type
res = find_markers_ovr(norm_exp, label_column = "cell_type", filename = "Habermann_2020_2025_06_25", save = True, n_jobs = 20)

In [None]:
# retrieve the SuperSCC positive markers 
superscc_data = {"superscc": deepcopy(res)}

superscc_data2 = dict()

for i in superscc_data:
    
    for j in superscc_data[i]:
        info_features = [z[0] for z in superscc_data[i][j]["features"]["final_feature_selection_by_ensemble"]]
        expression = superscc_data[i][j]["sub_high_expression_genes"][j]
        expression = expression.loc[expression.feature.isin(info_features), :]
        
        expression.loc[:, "log2_fold_change"] = np.log2(expression.expression1 / expression.expression2)
        expression = expression.loc[((expression.expression1 > 1) & (1 - (expression.pct2/expression.pct1) > 0.5)) | (np.isnan(expression.pct2)) ].sort_values("score", ascending = False)
        
        try:
            superscc_data2[i].update({j: ensemblID2Symbol(expression.feature[0:20].tolist())})
        except:
            try:
                superscc_data2[i] = {j: ensemblID2Symbol(expression.feature[0:20].tolist())}
            except:
                pass

In [None]:
#  get the cell marker database 
cellmarker_db = pd.read_excel("Cell_marker_Human.xlsx")

# only retain lung marker sets
cellmarker_db_lung = cellmarker_db.loc[cellmarker_db.tissue_type == "Lung"]

In [None]:
# match labels between dataset Habermann_2020 and cellmarker db
cell_type_match = {
                   "CD8 T cells": "CD8+ T cell",
                   "CD4 T cells": "CD4+ T cell",
                   "Interstitial macrophages": "Interstitial macrophage",
                   "Alveolar macrophages": "Alveolar macrophage",
                   "Club": "Club cell (Clara cell)",
                   "Goblet": 'Goblet cell',
                   "DC2": "Dendritic cell",
                   "DC1": "Dendritic cell",
                   "Ionocyte": "Ionocyte",
                   "Plasma cells": "Plasma cell",
                   "B cells": 'B cell',
                   "Basal resting": "Basal cell", 
                   "Pericytes": "Pericyte",
                   "Transitional Club-AT2": "Alveolar cell Type 2",
                   "NK cells": "Natural killer cell",
                   "Neuroendocrine": "Neuroendocrine cell",
                   'Non-classical monocytes': "Monocyte",
                   "Adventitial fibroblasts": "Fibroblast",
                   "Alveolar fibroblasts": "Fibroblast-like cell",
                   "EC general capillary": "Capillary cell"
                   }

In [None]:
# get the insersection for cell type markers between dataset Habermann_2020 and cellmarker db

ls = list()
for i in cell_type_match:
    superscc_markers = superscc_data2["place_holder"][i]
    cell_type = cell_type_match[i]
    cell_db_markers = cellmarker_db_lung.loc[cellmarker_db_lung.cell_name == cell_type].marker.unique()

    intersection = set(superscc_markers).intersection(cell_db_markers)

    if(len(intersection) == 0):
        intersection = "NULL"
    else:
        intersection = ", ".join(intersection)
    
    cell_db_markers = ", ".join(cell_db_markers)
    cell_db_cell_type = cell_type_match[i]

    superscc_markers = ", ".join(superscc_markers)
    superscc_cell_type = i

    df = pd.DataFrame({
        "Cell_marker_db_cell_label": cell_db_cell_type,
        "Cell_marker_db_markers": cell_db_markers,
        "SupeSCC's_working_dataset_cell_label": superscc_cell_type,
        "SuperSCC_markers": superscc_markers,
        "Intersection": intersection
    },
    index=[0]
    )

    ls.append(df)
    

In [None]:
res2 = pd.concat(ls)
res2.index = range(res.shape[0])

In [None]:
res2

Unnamed: 0,Cell_marker_db_cell_label,Cell_marker_db_markers,SupeSCC's_working_dataset_cell_label,SuperSCC_markers,Intersection
0,CD8+ T cell,"CD103, CD4, CD8A, CD3E, FASLG, ITGA1, GZMK, JA...",CD8 T cells,"NKG7, CCL4, CCL5, GNLY, IL32, DUSP2, CST7, CD3...","CD3D, CD8A, CD3E, CCL5"
1,CD4+ T cell,"CD8A, CD4, CD3E, Mini80, CCR7, IL7R, FOXP3, IL...",CD4 T cells,"IL32, IL7R, CCL5, CD69, CORO1A, CD3E, CXCR4, C...","CD3D, CD8A, CD3E, IL7R"
2,Interstitial macrophage,"CXCL11, CXCL10, CCL4, ISG20, ISG15",Interstitial macrophages,"CD68, C1QA, CAPG, MARCO, C1QB, MRC1, PLAUR, MC...",
3,Alveolar macrophage,"CD68, FBP1, MSR1, MARCO, APOE, GPNMB, INHBA, A...",Alveolar macrophages,"C1QB, C1QA, APOC1, FABP4, ALOX5AP, HLA-DQB1, H...","MCEMP1, C1QA, FABP4, ALOX5AP, LYZ, APOC1"
4,Club cell (Clara cell),"SCGB1A1, SCGB1A1 , SCGB3A1, SCGB3A2, MUC4, TFF...",Club,"SCGB3A1, SCGB1A1, SLPI, WFDC2, CLU, NR4A1, PRS...","SCGB1A1, TFF3, SCGB3A1"
5,Goblet cell,"MUC5AC, XBP1, PIGR, SCGB1A1, VMO1, MUC5B",Goblet,"SCGB1A1, SCGB3A1, PRSS23, LCN2, TSPAN1, NUCB2,...","SCGB1A1, PIGR"
6,Dendritic cell,"CD1a, MHC Class II, CD83, CD303, CD141, CD1c, ...",DC2,"HLA-DQB1, HLA-DQA1, AREG, SGK1, MS4A6A, IGSF6,...",CD83
7,Dendritic cell,"CD1a, MHC Class II, CD83, CD303, CD141, CD1c, ...",DC1,"HLA-DQB1, HLA-DQA1, C1orf162, HLA-DMA, TUBA1A,...",CD83
8,Ionocyte,"FOXI1, CFTR, CF, ASCL3",Ionocyte,"CD9, CHCHD10, GSN, MAP1LC3B, ID3, ATP6AP2, NDU...",
9,Plasma cell,"MZB1, XBP1, SDC1, JCHAIN, TNFRSF17, CD38, IGHG...",Plasma cells,"XBP1, MYDGF, PDIA6, SSR3, FKBP2, HM13, ISG20, ...",XBP1
