In [1]:
import os 
from os.path import basename 
from typing import List
import warnings
from io import StringIO

import pandas as pd
import numpy as np
from pydantic import BaseModel, Field
# from SuperSCC import list_files
import re

In [2]:
# Make the 2nd-submission folder the working directory: the loading cells in this
# notebook write their CSV summaries with relative file names, so they are created here.
# NOTE(review): hard-coded absolute path - the notebook only runs where this layout exists.
os.chdir("/home/fengtang/jupyter_notebooks/working_script/evulate_feature_selection/2nd_submssion")

In [3]:
def get_loc(string, num):
    """
    Return one "/"-separated component of a string.

    Parameters
    -----------
    string:
        The text (typically a file path) to split on "/".
    num:
        Index of the wanted component in the split result. For an absolute path
        index 0 is the empty string in front of the leading "/".
    """
    components = re.split("/", string)
    return components[num]

In [4]:
def list_files(path, pattern, recursive = True, full_name = True):
    """
    Collect the names of directory entries whose name matches a regular expression.

    Parameters
    -----------
    path:
        A string giving the directory where the search starts.
    pattern:
        A regular expression searched (re.search) in each entry name, not in the full path.
    recursive:
        A Bool value; when true, sub-directories are descended into instead of being
        matched themselves. Default is True.
    full_name:
        A Bool value; when equal to True the directory path is prepended to each
        returned name. Default is True.

    Returns
    -----------
    A list of matching names, in the order os.listdir reports them.
    """
    def _collect(directory):
        # Gather the matches found in `directory` (and below it when recursing).
        matches = []
        for entry in os.listdir(directory):
            entry_path = os.path.join(directory, entry)

            if recursive and os.path.isdir(entry_path):
                matches.extend(_collect(entry_path))
                continue

            if not re.search(pattern, entry):
                continue

            matches.append(entry_path if full_name == True else entry)
        return matches

    return _collect(path)

In [5]:
# Schema of one LLM evaluation result: every field is a list with a required value (Field(...)).
# The loading cells of this notebook read the attributes GeneSetName, RelevantGeneRatio and
# BiologicalRelevanceScore from objects returned by pd.read_pickle.
# NOTE(review): presumably those pickles hold Output instances, in which case this class must
# stay defined under the same name for unpickling to work - confirm.
# NOTE(review): the GeneList description says "Comma-separated list" while the annotation is
# List[list] - confirm which one the model is expected to return.
class Output(BaseModel):
    GeneSetName: List[str] = Field(..., description = "The name of the gene set being evaluated")
    GeneList: List[list] = Field(..., description = "Comma-separated list of genes in the set")
    RelevantGeneRatio: List[float] = Field(..., description = "The proportion of relevant genes in the set")
    BiologicalRelevanceScore: List[float] = Field(..., description = "Derived from Gene Ontology and KEGG pathways, reflecting the biological function of the gene set")
    Pvalue: List[list] = Field(..., description = "A value from the statistical comparison")
    SetvsSet: List[list] = Field(..., description = "Gene set names for comparison")
    Summary: List[str] = Field(..., description = "A brief summary of the gene functions or pathway associations for each gene in the gene set")
    Conclusion: List[str] = Field(..., description = "A clear conclusion to indicate the gene set (e.g gene set1) as a better representative of that specific cell type")


In [None]:
# qwen 0.1 temperature
# Flatten the pickled qwen-max evaluations (temperature 0.1) into one CSV with one row
# per gene set, labelled with the cell type and dataset each file belongs to.
N_GENE_SETS = 5      # rows expected per result; other sizes fail the frame construction and are reported
DATASET_LEVEL = 10   # position of the dataset directory once the absolute path is split on "/"
# Only these cell types are collected for GSE136831_Kaminski_2020.
# NOTE(review): the reason for this restriction is not visible in this notebook - confirm.
KAMINSKI_CELLTYPES = ["Endothelial", "Lymphoid", "Multiplet"]

qwen_temp_one_tenth = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/evulate_feature_selection/2nd_submssion/evaluation_res/qwen-max/tempeature_0.1", pattern = ".+pkl$", full_name = True, recursive=True)

# Group the files by their dataset path component. Comparing the component itself (rather
# than regex-searching the dataset name anywhere in the path) keeps datasets with
# overlapping names or regex metacharacters apart and gives a stable group order.
groups = dict()
for pkl_path in qwen_temp_one_tenth:
    groups.setdefault(get_loc(pkl_path, DATASET_LEVEL), []).append(pkl_path)
datasets = set(groups)

frames = list()
for group, pkl_paths in groups.items():
    for pkl_path in pkl_paths:
        # NOTE(review): unpickling can execute arbitrary code - only load files you produced.
        res = pd.read_pickle(pkl_path)
        celltype = re.findall("[^_]+", basename(pkl_path))[0]  # file-name text before the first "_"

        if group == "GSE136831_Kaminski_2020":
            # The original printed the undefined name `cell_type`; the NameError was swallowed
            # by a bare except, so no row of this dataset ever reached the CSV.
            print(celltype)
            if celltype not in KAMINSKI_CELLTYPES:
                continue

        try:
            frames.append(pd.DataFrame({"GeneSet": res.GeneSetName, "RelevantGeneRatio": res.RelevantGeneRatio, "BiologicalRelevanceScore": res.BiologicalRelevanceScore, "CellType": [celltype] * N_GENE_SETS, "Dataset": [group] * N_GENE_SETS}))
        except Exception as err:
            # Report instead of silently skipping (e.g. a missing attribute or list lengths
            # that differ from N_GENE_SETS).
            print(f"{group}_{celltype}", repr(err))

data = pd.concat(frames)
data.to_csv("qwen_temp_one_tenth_evaulation_res.csv")

In [None]:
# qwen 0.5 temperature
# Flatten the pickled qwen-max evaluations (temperature 0.5) into one CSV with one row
# per gene set, labelled with the cell type and dataset each file belongs to.
N_GENE_SETS = 5      # rows expected per result; other sizes fail the frame construction and are reported
DATASET_LEVEL = 10   # position of the dataset directory once the absolute path is split on "/"

qwen_temp_five_tenth = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/evulate_feature_selection/2nd_submssion/evaluation_res/qwen-max/tempeature_0.5", pattern = ".+pkl$", full_name = True, recursive=True)

# Group the files by their dataset path component. Comparing the component itself (rather
# than regex-searching the dataset name anywhere in the path) keeps datasets with
# overlapping names or regex metacharacters apart and gives a stable group order.
groups = dict()
for pkl_path in qwen_temp_five_tenth:
    groups.setdefault(get_loc(pkl_path, DATASET_LEVEL), []).append(pkl_path)
datasets = set(groups)

frames = list()
for group, pkl_paths in groups.items():
    for pkl_path in pkl_paths:
        # NOTE(review): unpickling can execute arbitrary code - only load files you produced.
        res = pd.read_pickle(pkl_path)
        celltype = re.findall("[^_]+", basename(pkl_path))[0]  # file-name text before the first "_"
        try:
            # The score lists themselves are stored; the original passed the bound methods
            # `res.RelevantGeneRatio.append` / `res.BiologicalRelevanceScore.append` instead
            # of the values.
            frames.append(pd.DataFrame({"GeneSet": res.GeneSetName, "RelevantGeneRatio": res.RelevantGeneRatio, "BiologicalRelevanceScore": res.BiologicalRelevanceScore, "CellType": [celltype] * N_GENE_SETS, "Dataset": [group] * N_GENE_SETS}))
        except Exception as err:
            # Report the file and the reason (e.g. a missing attribute or list lengths that
            # differ from N_GENE_SETS).
            print(f"{group}_{celltype}", repr(err))

data = pd.concat(frames)
data.to_csv("qwen_temp_five_tenth_evaulation_res.csv")

In [None]:
# qwen 0.9 temperature
# Flatten the pickled qwen-max evaluations (temperature 0.9) into one CSV with one row
# per gene set, labelled with the cell type and dataset each file belongs to.
N_GENE_SETS = 5      # rows expected per result; other sizes fail the frame construction and are reported
DATASET_LEVEL = 10   # position of the dataset directory once the absolute path is split on "/"

qwen_temp_nine_tenth = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/evulate_feature_selection/2nd_submssion/evaluation_res/qwen-max/tempeature_0.9", pattern = ".+pkl$", full_name = True, recursive=True)

# Group the files by their dataset path component. Comparing the component itself (rather
# than regex-searching the dataset name anywhere in the path) keeps datasets with
# overlapping names or regex metacharacters apart and gives a stable group order.
groups = dict()
for pkl_path in qwen_temp_nine_tenth:
    groups.setdefault(get_loc(pkl_path, DATASET_LEVEL), []).append(pkl_path)
datasets = set(groups)

frames = list()
for group, pkl_paths in groups.items():
    for pkl_path in pkl_paths:
        # NOTE(review): unpickling can execute arbitrary code - only load files you produced.
        res = pd.read_pickle(pkl_path)
        celltype = re.findall("[^_]+", basename(pkl_path))[0]  # file-name text before the first "_"
        try:
            frames.append(pd.DataFrame({"GeneSet": res.GeneSetName, "RelevantGeneRatio": res.RelevantGeneRatio, "BiologicalRelevanceScore": res.BiologicalRelevanceScore, "CellType": [celltype] * N_GENE_SETS, "Dataset": [group] * N_GENE_SETS}))
        except Exception as err:
            # Report the file and the reason (e.g. a missing attribute or list lengths that
            # differ from N_GENE_SETS); Exception leaves KeyboardInterrupt untouched.
            print(f"{group}_{celltype}", repr(err))

data = pd.concat(frames)
data.to_csv("qwen_temp_nine_tenth_evaulation_res.csv")

In [None]:
# gpt4.1-mini 0.1 temperature
# Parse the pickled gpt4.1-mini replies (20 markers, temperature 0.1) into one CSV with one
# row per gene set. Each pickle exposes the raw reply text as `.content`.
N_GENE_SETS = 5      # rows expected per reply; other sizes raise and are handled as failures
DATASET_LEVEL = 11   # position of the dataset directory once the absolute path is split on "/"
# Fallback extractor: captures the GeneSetName, RelevantGeneRatio and BiologicalRelevanceScore
# entries (each up to the key that follows it) when the whole reply cannot be parsed.
FALLBACK_PATTERN = re.compile('(\"GeneSetName\".*(?=\"GeneList\"))|(\"RelevantGeneRatio\".*(?=\"BiologicalRelevanceScore\"))|(\"BiologicalRelevanceScore\".*(?=\"Pvalue\"))')

gpt_4_1_mini_temp_one_tenth = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/evulate_feature_selection/2nd_submssion/evaluation_res/gpt4.1-mini/20_markers/tempeature_0.1", pattern = ".+pkl$", full_name = True, recursive=True)

# Group the files by their dataset path component. Comparing the component itself (rather
# than regex-searching the dataset name anywhere in the path) keeps datasets with
# overlapping names or regex metacharacters apart and gives a stable group order.
groups = dict()
for pkl_path in gpt_4_1_mini_temp_one_tenth:
    groups.setdefault(get_loc(pkl_path, DATASET_LEVEL), []).append(pkl_path)
datasets = set(groups)

test = list()  # (celltype, extracted text) of every reply that could not be parsed

frames = list()
for group, pkl_paths in groups.items():
    for pkl_path in pkl_paths:
        # NOTE(review): unpickling can execute arbitrary code - only load files you produced.
        output = pd.read_pickle(pkl_path)
        output = output.content
        # Drop backticks, the "json" tag and newlines.
        # NOTE(review): this also removes any other occurrence of "json" inside the reply.
        output = output.replace("`", "").replace("json", "").replace("\n", "")
        celltype = re.findall("[^_]+", basename(pkl_path))[0]  # file-name text before the first "_"

        try:
            # StringIO: passing a literal JSON string to pd.read_json is deprecated.
            res = pd.read_json(StringIO(output))
            # NOTE(review): positional selection assumes the reply lists its keys in the order
            # GeneSetName, GeneList, RelevantGeneRatio, BiologicalRelevanceScore - confirm.
            res = res.iloc[:, [0, 2, 3]].assign(cell_type = [celltype] * N_GENE_SETS, dataset = [group] * N_GENE_SETS)
            frames.append(res)
        except Exception:
            search_res = None  # reset so a failure never logs the previous file's text
            try:
                search_res = re.findall(FALLBACK_PATTERN, output)
                search_res = "".join([search_res[0][0], search_res[1][1], search_res[2][2]])
                # The last captured entry still ends with the comma that preceded "Pvalue";
                # strip it so the rebuilt object is valid JSON.
                search_res = "{" + search_res.rstrip().rstrip(",") + "}"

                res = pd.read_json(StringIO(search_res))
                res = res.assign(cell_type = [celltype] * N_GENE_SETS, dataset = [group] * N_GENE_SETS)
                frames.append(res)
            except Exception:
                test.append((celltype, search_res))
                print(f"Fail on {group}_{celltype}")

data = pd.concat(frames)
# NOTE(review): "cell_type" keeps its name here, whereas the pickle-based cells of this
# notebook write that column as "CellType" - confirm which one downstream code expects.
data = data.rename(columns = {"GeneSetName": "GeneSet", "dataset": "Dataset"})
data.to_csv("gpt_4_1_mini_temp_one_tenth_evaulation_res.csv")

In [None]:
# gpt4.1-mini 0.5 temperature
# Parse the pickled gpt4.1-mini replies (temperature 0.5) into one CSV with one row per
# gene set. Each pickle exposes the raw reply text as `.content`.
N_GENE_SETS = 5      # rows expected per reply; other sizes raise and are handled as failures
DATASET_LEVEL = 10   # position of the dataset directory once the absolute path is split on "/"
# Fallback extractor: captures the GeneSetName, RelevantGeneRatio and BiologicalRelevanceScore
# entries (each up to the key that follows it) when the whole reply cannot be parsed.
FALLBACK_PATTERN = re.compile('(\"GeneSetName\".*(?=\"GeneList\"))|(\"RelevantGeneRatio\".*(?=\"BiologicalRelevanceScore\"))|(\"BiologicalRelevanceScore\".*(?=\"Pvalue\"))')

gpt_4_1_mini_temp_five_tenth = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/evulate_feature_selection/2nd_submssion/evaluation_res/gpt4.1-mini/tempeature_0.5", pattern = ".+pkl$", full_name = True, recursive=True)

# Group the files by their dataset path component. Comparing the component itself (rather
# than regex-searching the dataset name anywhere in the path) keeps datasets with
# overlapping names or regex metacharacters apart and gives a stable group order.
groups = dict()
for pkl_path in gpt_4_1_mini_temp_five_tenth:
    groups.setdefault(get_loc(pkl_path, DATASET_LEVEL), []).append(pkl_path)
datasets = set(groups)

test = list()  # (celltype, extracted text) of every reply that could not be parsed

frames = list()
for group, pkl_paths in groups.items():
    for pkl_path in pkl_paths:
        # NOTE(review): unpickling can execute arbitrary code - only load files you produced.
        output = pd.read_pickle(pkl_path)
        output = output.content
        # Drop backticks, the "json" tag and newlines.
        # NOTE(review): this also removes any other occurrence of "json" inside the reply.
        output = output.replace("`", "").replace("json", "").replace("\n", "")
        celltype = re.findall("[^_]+", basename(pkl_path))[0]  # file-name text before the first "_"

        try:
            # StringIO: passing a literal JSON string to pd.read_json is deprecated.
            res = pd.read_json(StringIO(output))
            # NOTE(review): positional selection assumes the reply lists its keys in the order
            # GeneSetName, GeneList, RelevantGeneRatio, BiologicalRelevanceScore - confirm.
            res = res.iloc[:, [0, 2, 3]].assign(cell_type = [celltype] * N_GENE_SETS, dataset = [group] * N_GENE_SETS)
            frames.append(res)
        except Exception:
            search_res = None  # reset so a failure never logs the previous file's text
            try:
                search_res = re.findall(FALLBACK_PATTERN, output)
                search_res = "".join([search_res[0][0], search_res[1][1], search_res[2][2]])
                # The last captured entry still ends with the comma that preceded "Pvalue";
                # strip it so the rebuilt object is valid JSON.
                search_res = "{" + search_res.rstrip().rstrip(",") + "}"

                res = pd.read_json(StringIO(search_res))
                res = res.assign(cell_type = [celltype] * N_GENE_SETS, dataset = [group] * N_GENE_SETS)
                frames.append(res)
            except Exception:
                test.append((celltype, search_res))
                print(f"Fail on {group}_{celltype}")

data = pd.concat(frames)
# NOTE(review): "cell_type" keeps its name here, whereas the pickle-based cells of this
# notebook write that column as "CellType" - confirm which one downstream code expects.
data = data.rename(columns = {"GeneSetName": "GeneSet",  "dataset": "Dataset"})
data.to_csv("gpt_4_1_mini_temp_five_tenth_evaulation_res.csv")

In [None]:
# gpt4.1-mini 0.9 temperature
# Parse the pickled gpt4.1-mini replies (temperature 0.9) into one CSV with one row per
# gene set. Each pickle exposes the raw reply text as `.content`.
N_GENE_SETS = 5      # rows expected per reply; other sizes raise and are handled as failures
DATASET_LEVEL = 10   # position of the dataset directory once the absolute path is split on "/"
# Fallback extractor: captures the GeneSetName, RelevantGeneRatio and BiologicalRelevanceScore
# entries (each up to the key that follows it) when the whole reply cannot be parsed.
FALLBACK_PATTERN = re.compile('(\"GeneSetName\".*(?=\"GeneList\"))|(\"RelevantGeneRatio\".*(?=\"BiologicalRelevanceScore\"))|(\"BiologicalRelevanceScore\".*(?=\"Pvalue\"))')

gpt_4_1_mini_temp_nine_tenth = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/evulate_feature_selection/2nd_submssion/evaluation_res/gpt4.1-mini/tempeature_0.9", pattern = ".+pkl$", full_name = True, recursive=True)

# Group the files by their dataset path component. Comparing the component itself (rather
# than regex-searching the dataset name anywhere in the path) keeps datasets with
# overlapping names or regex metacharacters apart and gives a stable group order.
groups = dict()
for pkl_path in gpt_4_1_mini_temp_nine_tenth:
    groups.setdefault(get_loc(pkl_path, DATASET_LEVEL), []).append(pkl_path)
datasets = set(groups)

# Failure log for this cell. The original never created it here, so the cell either raised
# NameError on a fresh kernel or appended to a list left over from a previously run cell.
test = list()

frames = list()
for group, pkl_paths in groups.items():
    for pkl_path in pkl_paths:
        # NOTE(review): unpickling can execute arbitrary code - only load files you produced.
        output = pd.read_pickle(pkl_path)
        output = output.content
        # Drop backticks, the "json" tag and newlines.
        # NOTE(review): this also removes any other occurrence of "json" inside the reply.
        output = output.replace("`", "").replace("json", "").replace("\n", "")
        celltype = re.findall("[^_]+", basename(pkl_path))[0]  # file-name text before the first "_"

        try:
            # StringIO: passing a literal JSON string to pd.read_json is deprecated.
            res = pd.read_json(StringIO(output))
            # NOTE(review): positional selection assumes the reply lists its keys in the order
            # GeneSetName, GeneList, RelevantGeneRatio, BiologicalRelevanceScore - confirm.
            res = res.iloc[:, [0, 2, 3]].assign(cell_type = [celltype] * N_GENE_SETS, dataset = [group] * N_GENE_SETS)
            frames.append(res)
        except Exception:
            search_res = None  # reset so a failure never logs the previous file's text
            try:
                search_res = re.findall(FALLBACK_PATTERN, output)
                search_res = "".join([search_res[0][0], search_res[1][1], search_res[2][2]])
                # The last captured entry still ends with the comma that preceded "Pvalue";
                # strip it so the rebuilt object is valid JSON.
                search_res = "{" + search_res.rstrip().rstrip(",") + "}"

                res = pd.read_json(StringIO(search_res))
                res = res.assign(cell_type = [celltype] * N_GENE_SETS, dataset = [group] * N_GENE_SETS)
                frames.append(res)
            except Exception:
                test.append((celltype, search_res))
                print(f"Fail on {group}_{celltype}")

data = pd.concat(frames)
# NOTE(review): "cell_type" keeps its name here, whereas the pickle-based cells of this
# notebook write that column as "CellType" - confirm which one downstream code expects.
data = data.rename(columns = {"GeneSetName": "GeneSet", "dataset": "Dataset"})
data.to_csv("gpt_4_1_mini_temp_nine_tenth_evaulation_res.csv")

In [None]:
# deepseek v3 0.1 temperature
# Flatten the pickled DeepSeek-V3 evaluations (temperature 0.1) into one CSV with one row
# per gene set, labelled with the cell type and dataset each file belongs to.
N_GENE_SETS = 5      # rows expected per result; other sizes fail the frame construction and are reported
DATASET_LEVEL = 10   # position of the dataset directory once the absolute path is split on "/"

deepseek_v3_temp_one_tenth = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/evulate_feature_selection/2nd_submssion/evaluation_res/DeepSeek-V3/tempeature_0.1", pattern = ".+pkl$", full_name = True, recursive=True)

# Group the files by their dataset path component. Comparing the component itself (rather
# than regex-searching the dataset name anywhere in the path) keeps datasets with
# overlapping names or regex metacharacters apart and gives a stable group order.
groups = dict()
for pkl_path in deepseek_v3_temp_one_tenth:
    groups.setdefault(get_loc(pkl_path, DATASET_LEVEL), []).append(pkl_path)
datasets = set(groups)

frames = list()
for group, pkl_paths in groups.items():
    for pkl_path in pkl_paths:
        # NOTE(review): unpickling can execute arbitrary code - only load files you produced.
        res = pd.read_pickle(pkl_path)
        celltype = re.findall("[^_]+", basename(pkl_path))[0]  # file-name text before the first "_"
        try:
            frames.append(pd.DataFrame({"GeneSet": res.GeneSetName, "RelevantGeneRatio": res.RelevantGeneRatio, "BiologicalRelevanceScore": res.BiologicalRelevanceScore, "CellType": [celltype] * N_GENE_SETS, "Dataset": [group] * N_GENE_SETS}))
        except Exception as err:
            # Report the file and the reason (e.g. a missing attribute or list lengths that
            # differ from N_GENE_SETS); Exception leaves KeyboardInterrupt untouched.
            print(f"{group}_{celltype}", repr(err))

data = pd.concat(frames)
data.to_csv("deepseek_v3_temp_one_tenth_evaulation_res.csv")

In [None]:
# deepseek v3 0.5 temperature
# Flatten the pickled DeepSeek-V3 evaluations (temperature 0.5) into one CSV with one row
# per gene set, labelled with the cell type and dataset each file belongs to.
N_GENE_SETS = 5      # rows expected per result; other sizes fail the frame construction and are reported
DATASET_LEVEL = 10   # position of the dataset directory once the absolute path is split on "/"

deepseek_v3_temp_five_tenth = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/evulate_feature_selection/2nd_submssion/evaluation_res/DeepSeek-V3/tempeature_0.5", pattern = ".+pkl$", full_name = True, recursive=True)

# Group the files by their dataset path component. Comparing the component itself (rather
# than regex-searching the dataset name anywhere in the path) keeps datasets with
# overlapping names or regex metacharacters apart and gives a stable group order.
groups = dict()
for pkl_path in deepseek_v3_temp_five_tenth:
    groups.setdefault(get_loc(pkl_path, DATASET_LEVEL), []).append(pkl_path)
datasets = set(groups)

frames = list()
for group, pkl_paths in groups.items():
    for pkl_path in pkl_paths:
        # NOTE(review): unpickling can execute arbitrary code - only load files you produced.
        res = pd.read_pickle(pkl_path)
        celltype = re.findall("[^_]+", basename(pkl_path))[0]  # file-name text before the first "_"
        try:
            frames.append(pd.DataFrame({"GeneSet": res.GeneSetName, "RelevantGeneRatio": res.RelevantGeneRatio, "BiologicalRelevanceScore": res.BiologicalRelevanceScore, "CellType": [celltype] * N_GENE_SETS, "Dataset": [group] * N_GENE_SETS}))
        except Exception as err:
            # Report the file and the reason (e.g. a missing attribute or list lengths that
            # differ from N_GENE_SETS); Exception leaves KeyboardInterrupt untouched.
            print(f"{group}_{celltype}", repr(err))

data = pd.concat(frames)
data.to_csv("deepseek_v3_temp_five_tenth_evaulation_res.csv")

In [None]:
# deepseek v3 0.9 temperature
# Flatten the pickled DeepSeek-V3 evaluations (temperature 0.9) into one CSV with one row
# per gene set, labelled with the cell type and dataset each file belongs to.
N_GENE_SETS = 5      # rows expected per result; other sizes fail the frame construction and are reported
DATASET_LEVEL = 10   # position of the dataset directory once the absolute path is split on "/"

deepseek_v3_temp_nine_tenth = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/evulate_feature_selection/2nd_submssion/evaluation_res/DeepSeek-V3/tempeature_0.9", pattern = ".+pkl$", full_name = True, recursive=True)

# Group the files by their dataset path component. Comparing the component itself (rather
# than regex-searching the dataset name anywhere in the path) keeps datasets with
# overlapping names or regex metacharacters apart and gives a stable group order.
groups = dict()
for pkl_path in deepseek_v3_temp_nine_tenth:
    groups.setdefault(get_loc(pkl_path, DATASET_LEVEL), []).append(pkl_path)
datasets = set(groups)

frames = list()
for group, pkl_paths in groups.items():
    for pkl_path in pkl_paths:
        # NOTE(review): unpickling can execute arbitrary code - only load files you produced.
        res = pd.read_pickle(pkl_path)
        celltype = re.findall("[^_]+", basename(pkl_path))[0]  # file-name text before the first "_"
        try:
            # The original dict literal contained a doubled comma (", ,") before "CellType",
            # a SyntaxError that kept the whole cell from running.
            frames.append(pd.DataFrame({"GeneSet": res.GeneSetName, "RelevantGeneRatio": res.RelevantGeneRatio, "BiologicalRelevanceScore": res.BiologicalRelevanceScore, "CellType": [celltype] * N_GENE_SETS, "Dataset": [group] * N_GENE_SETS}))
        except Exception as err:
            # Report the file and the reason (e.g. a missing attribute or list lengths that
            # differ from N_GENE_SETS); Exception leaves KeyboardInterrupt untouched.
            print(f"{group}_{celltype}", repr(err))

data = pd.concat(frames)
data.to_csv("deepseek_v3_temp_nine_tenth_evaulation_res.csv")

In [None]:
# original prompt qwen-max 0.5 temperature with only top 10 markers being evaluated
# Flatten the pickled evaluations into one CSV with one row per gene set, labelled with the
# cell type and dataset each file belongs to.
N_GENE_SETS = 5      # rows expected per result; other sizes fail the frame construction and are reported
DATASET_LEVEL = 12   # position of the dataset directory once the absolute path is split on "/"

top10_markers_qwen_temp_five_tenth = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/evulate_feature_selection/2nd_submssion/evaluation_res/original_prompt/qwen-max/10_markers/tempeature_0.5", pattern = ".+pkl$", full_name = True, recursive=True)

# Group the files by their dataset path component. Comparing the component itself (rather
# than regex-searching the dataset name anywhere in the path) keeps datasets with
# overlapping names or regex metacharacters apart and gives a stable group order.
groups = dict()
for pkl_path in top10_markers_qwen_temp_five_tenth:
    groups.setdefault(get_loc(pkl_path, DATASET_LEVEL), []).append(pkl_path)
datasets = set(groups)

frames = list()
for group, pkl_paths in groups.items():
    for pkl_path in pkl_paths:
        # NOTE(review): unpickling can execute arbitrary code - only load files you produced.
        res = pd.read_pickle(pkl_path)
        celltype = re.findall("[^_]+", basename(pkl_path))[0]  # file-name text before the first "_"
        try:
            frames.append(pd.DataFrame({"GeneSet": res.GeneSetName, "RelevantGeneRatio": res.RelevantGeneRatio, "BiologicalRelevanceScore": res.BiologicalRelevanceScore, "CellType": [celltype] * N_GENE_SETS, "Dataset": [group] * N_GENE_SETS}))
        except Exception as err:
            # Report the file and the reason (e.g. a missing attribute or list lengths that
            # differ from N_GENE_SETS); Exception leaves KeyboardInterrupt untouched.
            print(f"{group}_{celltype}", repr(err))

data = pd.concat(frames)
data.to_csv("top10_markers_qwen_temp_five_tenth_evaulation_res.csv")

In [None]:
# original prompt deepseek-v3 0.5 temperature with only top 10 markers being evaluated
# Flatten the pickled evaluations into one CSV with one row per gene set, labelled with the
# cell type and dataset each file belongs to.
N_GENE_SETS = 5      # rows expected per result; other sizes fail the frame construction and are reported
DATASET_LEVEL = 12   # position of the dataset directory once the absolute path is split on "/"

top10_markers_deepseek_temp_five_tenth = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/evulate_feature_selection/2nd_submssion/evaluation_res/original_prompt/DeepSeek-V3/10_markers/tempeature_0.5", pattern = ".+pkl$", full_name = True, recursive=True)

# Group the files by their dataset path component. Comparing the component itself (rather
# than regex-searching the dataset name anywhere in the path) keeps datasets with
# overlapping names or regex metacharacters apart and gives a stable group order.
groups = dict()
for pkl_path in top10_markers_deepseek_temp_five_tenth:
    groups.setdefault(get_loc(pkl_path, DATASET_LEVEL), []).append(pkl_path)
datasets = set(groups)

frames = list()
for group, pkl_paths in groups.items():
    for pkl_path in pkl_paths:
        # NOTE(review): unpickling can execute arbitrary code - only load files you produced.
        res = pd.read_pickle(pkl_path)
        celltype = re.findall("[^_]+", basename(pkl_path))[0]  # file-name text before the first "_"
        try:
            frames.append(pd.DataFrame({"GeneSet": res.GeneSetName, "RelevantGeneRatio": res.RelevantGeneRatio, "BiologicalRelevanceScore": res.BiologicalRelevanceScore, "CellType": [celltype] * N_GENE_SETS, "Dataset": [group] * N_GENE_SETS}))
        except Exception as err:
            # Report the file and the reason (e.g. a missing attribute or list lengths that
            # differ from N_GENE_SETS); Exception leaves KeyboardInterrupt untouched.
            print(f"{group}_{celltype}", repr(err))

data = pd.concat(frames)
data.to_csv("top10_markers_deepseek_temp_five_tenth_evaulation_res.csv")

In [None]:
# original prompt gpt-4.1-mini 0.5 temperature with only top 10 markers being evaluated
# Parse the pickled gpt-4.1-mini replies into one CSV with one row per gene set. Each pickle
# exposes the raw reply text as `.content`.
N_GENE_SETS = 5      # rows expected per reply; other sizes raise and are handled as failures
DATASET_LEVEL = 13   # position of the dataset directory once the absolute path is split on "/"
# Fallback extractor: captures the GeneSetName, RelevantGeneRatio and BiologicalRelevanceScore
# entries (each up to the key that follows it) when the whole reply cannot be parsed.
FALLBACK_PATTERN = re.compile('(\"GeneSetName\".*(?=\"GeneList\"))|(\"RelevantGeneRatio\".*(?=\"BiologicalRelevanceScore\"))|(\"BiologicalRelevanceScore\".*(?=\"Pvalue\"))')

# NOTE(review): this variable name is also used by the "gpt4.1-mini 0.5 temperature" cell for
# a different file list; it is kept unchanged here in case later code refers to it.
gpt_4_1_mini_temp_five_tenth = list_files(path = "/home/fengtang/jupyter_notebooks/working_script/evulate_feature_selection/2nd_submssion/evaluation_res/original_prompt/gpt4.1-mini/10_markers/gpt-4.1-mini/tempeature_0.5/", pattern = ".+pkl$", full_name = True, recursive=True)

# Group the files by their dataset path component. Comparing the component itself (rather
# than regex-searching the dataset name anywhere in the path) keeps datasets with
# overlapping names or regex metacharacters apart and gives a stable group order.
groups = dict()
for pkl_path in gpt_4_1_mini_temp_five_tenth:
    groups.setdefault(get_loc(pkl_path, DATASET_LEVEL), []).append(pkl_path)
datasets = set(groups)

test = list()  # (celltype, extracted text) of every reply that could not be parsed

frames = list()
for group, pkl_paths in groups.items():
    for pkl_path in pkl_paths:
        # NOTE(review): unpickling can execute arbitrary code - only load files you produced.
        output = pd.read_pickle(pkl_path)
        output = output.content
        # Drop backticks, the "json" tag and newlines.
        # NOTE(review): this also removes any other occurrence of "json" inside the reply.
        output = output.replace("`", "").replace("json", "").replace("\n", "")
        celltype = re.findall("[^_]+", basename(pkl_path))[0]  # file-name text before the first "_"

        try:
            # StringIO: passing a literal JSON string to pd.read_json is deprecated.
            res = pd.read_json(StringIO(output))
            # NOTE(review): positional selection assumes the reply lists its keys in the order
            # GeneSetName, GeneList, RelevantGeneRatio, BiologicalRelevanceScore - confirm.
            res = res.iloc[:, [0, 2, 3]].assign(cell_type = [celltype] * N_GENE_SETS, dataset = [group] * N_GENE_SETS)
            frames.append(res)
        except Exception:
            search_res = None  # reset so a failure never logs the previous file's text
            try:
                search_res = re.findall(FALLBACK_PATTERN, output)
                search_res = "".join([search_res[0][0], search_res[1][1], search_res[2][2]])
                # The last captured entry still ends with the comma that preceded "Pvalue";
                # strip it so the rebuilt object is valid JSON.
                search_res = "{" + search_res.rstrip().rstrip(",") + "}"

                res = pd.read_json(StringIO(search_res))
                res = res.assign(cell_type = [celltype] * N_GENE_SETS, dataset = [group] * N_GENE_SETS)
                frames.append(res)
            except Exception:
                test.append((celltype, search_res))
                print(f"Fail on {group}_{celltype}")

data = pd.concat(frames)
# NOTE(review): "cell_type" keeps its name here, whereas the pickle-based cells of this
# notebook write that column as "CellType" - confirm which one downstream code expects.
data = data.rename(columns = {"GeneSetName": "GeneSet", "dataset": "Dataset"})
data.to_csv("top10_markers_gpt_4_1_mini_temp_five_tenth_evaulation_res.csv")