In [None]:
import pandas as pd
import scanpy as sc
import numpy as np
import re
from pydantic import BaseModel, ValidationError

import os 
from collections import Counter
from os.path import basename

In [2]:
# Work from the project folder so that the relative file names passed to
# pd.read_pickle / pd.read_csv in the following cells resolve against it.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
os.chdir("C:/Users/76361/OneDrive/桌面/GPT-based feature evulation")

In [90]:
def list_files(path, pattern, recursive = True, full_name = True):
    """
    List the entries under ``path`` whose base name matches ``pattern``.

    Parameters
    ----------
    path : str
        Directory to scan.
    pattern : str
        Regular expression tested against each entry's base name with ``re.search``.
    recursive : bool, default True
        If True, descend into sub-directories. If False, sub-directories are not
        entered and their own names are tested against ``pattern`` like any other entry.
    full_name : bool, default True
        If True, each match is returned joined with the directory it was found in;
        otherwise only the bare entry name is returned.

    Returns
    -------
    list of str
        Matches in a reproducible order: the entries of every directory are visited
        in sorted order, and the matches of a sub-directory appear at the position
        of that sub-directory.
    """
    matcher = re.compile(pattern)
    output = []

    def list_files_core(current_path):
        # os.listdir gives no ordering guarantee; sorting makes repeated runs
        # (and runs on different machines) return the same list.
        for entry in sorted(os.listdir(current_path)):
            entry_path = os.path.join(current_path, entry)

            if recursive and os.path.isdir(entry_path):
                list_files_core(entry_path)
            elif matcher.search(entry):
                output.append(entry_path if full_name else entry)

    list_files_core(path)
    return output

In [3]:
# Load the pre-computed top-20 marker results, one pickle file per method.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading, so only
# use pickle files from a trusted source.
# Judging by how the conversion cells index these objects, each one presumably maps
# dataset -> cell type -> DataFrame of markers — confirm against the files.

# SuperSCC markers
superscc_markers = pd.read_pickle("SuperSCC_default_retrieving_method_top_20_markers.pkl")

# scanpy wilcox test markers
scanpy_wilcox_markers = pd.read_pickle("scanpy_wilicox_test_top_20_markers.pkl")

# scanpy t test markers
scanpy_t_markers = pd.read_pickle("scanpy_t_test_top_20_markers.pkl")

In [109]:
# get the id2symbol file
# id2symbol() looks IDs up in the "gene_id" column and returns values from the
# "gene_name" column, so this table must provide both columns.
reference = pd.read_csv("human_id2symbol.csv")

In [110]:
def id2symbol(reference, query, multi_select = "first"):
    """
    Translate gene IDs into gene symbols using a lookup table.

    Parameters
    ----------
    reference : pandas.DataFrame
        Lookup table with a ``gene_id`` column and a ``gene_name`` column.
    query : sequence of str
        Gene IDs to translate (list, numpy array, ...).
    multi_select : {"first", "last"}, default "first"
        Which row to use when a ``gene_id`` occurs more than once in ``reference``.

    Returns
    -------
    list
        One entry per element of ``query``, in the same order. IDs that are not
        present in ``reference`` are returned as NaN.

    Raises
    ------
    ValueError
        If ``multi_select`` is neither "first" nor "last".
    """
    if multi_select not in ("first", "last"):
        raise ValueError(f"multi_select must be 'first' or 'last', got {multi_select!r}")

    # Collapse duplicated IDs up front: a plain join would emit one row per duplicate
    # and the result would no longer line up with the query.
    lookup = (
        reference
        .drop_duplicates(subset = "gene_id", keep = multi_select)
        .set_index("gene_id")["gene_name"]
    )

    # reindex keeps the query order (repeats included) and fills unknown IDs with NaN.
    return lookup.reindex(list(query)).tolist()

In [45]:
# convert IDs to gene symbols for SuperSCC markers
# Result layout: superscc_symbols[dataset][cell type] -> list of gene symbols.
superscc_symbols = dict()
for dataset in superscc_markers:
    for cell_type in superscc_markers[dataset]:
        query = superscc_markers[dataset][cell_type]["feature"].values

        # An empty marker table has nothing to convert: report it and move on.
        if len(query) == 0:
            print(f"No satified markers with default cutoff in Dataset '{dataset}' for cell type '{cell_type}'")
            continue

        # The first entry decides whether the list holds Ensembl IDs or symbols already.
        if query[0].startswith("ENSG"):
            symbol = id2symbol(reference, query)
        else:
            symbol = list(query)

        # setdefault creates the per-dataset dict on first use, so a dataset whose
        # first cell type was skipped above does not raise KeyError on the next one.
        superscc_symbols.setdefault(dataset, {})[cell_type] = symbol

No satified markers with default cutoff in Dataset 'D034' for cell type 'regulatory T cell'
No satified markers with default cutoff in Dataset 'D034' for cell type 'mast cell'


In [None]:
# convert IDs to gene symbols for scanpy markers
# Result layout: scanpy_wilcox_symbols[dataset][cell type] -> list of gene symbols.
scanpy_wilcox_symbols = dict()
for dataset in scanpy_wilcox_markers:
    for cell_type in scanpy_wilcox_markers[dataset]:
        query = scanpy_wilcox_markers[dataset][cell_type]["names"].values

        # An empty marker table has nothing to convert: report it and move on.
        if len(query) == 0:
            print(f"No satified markers with default cutoff in Dataset '{dataset}' for cell type '{cell_type}'")
            continue

        # The first entry decides whether the list holds Ensembl IDs or symbols already.
        if query[0].startswith("ENSG"):
            symbol = id2symbol(reference, query)
        else:
            symbol = list(query)

        # setdefault creates the per-dataset dict on first use, so a dataset whose
        # first cell type was skipped above does not raise KeyError on the next one.
        scanpy_wilcox_symbols.setdefault(dataset, {})[cell_type] = symbol

In [91]:
# get the seurat markers
# Result layout: seurat_wilcox_markers[dataset][cluster] -> up to 20 gene names,
# taken from the rows with adjusted p-value < 0.05 and avg_log2FC > 1, smallest
# adjusted p-value first.
files = list_files(path="C:/Users/76361/OneDrive/桌面/GPT-based feature evulation/seurat_res", pattern=".+csv$")

seurat_wilcox_markers = dict()

for file in files:
    seurat_table = pd.read_csv(file)
    significant = seurat_table.loc[
        (seurat_table.p_val_adj < 0.05) & (seurat_table.avg_log2FC > 1)
    ].sort_values("p_val_adj")

    # Dataset name = file name without the "_seurat_feature.csv" suffix. The dot is
    # escaped and the pattern anchored so only that literal suffix is removed.
    name = re.sub(r"_seurat_feature\.csv$", "", basename(file))

    top_markers = {
        cluster: cluster_rows.head(20).gene.values.tolist()
        for cluster, cluster_rows in significant.groupby("cluster")
    }

    # Files without any significant marker contribute no entry at all.
    if top_markers:
        seurat_wilcox_markers[name] = top_markers


In [None]:
# convert IDs to gene symbols for seurat markers
# Result layout: seurat_wilcox_symbols[dataset][cluster] -> list of gene symbols.
seurat_wilcox_symbols = dict()

for dataset in seurat_wilcox_markers:
    for cell_type in seurat_wilcox_markers[dataset]:
        query = seurat_wilcox_markers[dataset][cell_type]

        # An empty marker list has nothing to convert: report it and move on.
        if len(query) == 0:
            print(f"No satified markers with default cutoff in Dataset '{dataset}' for cell type '{cell_type}'")
            continue

        # The first entry decides whether the list holds Ensembl IDs or symbols already.
        if query[0].startswith("ENSG"):
            symbol = id2symbol(reference, query)
        else:
            symbol = query

        # setdefault creates the per-dataset dict on first use, so a dataset whose
        # first cell type was skipped above does not raise KeyError on the next one.
        seurat_wilcox_symbols.setdefault(dataset, {})[cell_type] = symbol

In [None]:
# only keep shared keys between all algorithem
# shared_keys[dataset][cell type] = 0 for every cell type that has a non-empty
# SuperSCC gene list and is also present in both the scanpy and the Seurat results.
shared_keys = dict()

for dataset, superscc_cell_types in superscc_symbols.items():
    # A dataset missing from another method simply shares nothing with it.
    scanpy_cell_types = scanpy_wilcox_symbols.get(dataset, {})
    seurat_cell_types = seurat_wilcox_symbols.get(dataset, {})

    for cell_type, genes in superscc_cell_types.items():
        if len(genes) != 0 and cell_type in scanpy_cell_types and cell_type in seurat_cell_types:
            # setdefault instead of "first iteration creates the dict": the first
            # cell type of a dataset is not necessarily the first one that qualifies.
            shared_keys.setdefault(dataset, {})[cell_type] = 0

# Seurat gene lists restricted to the shared dataset / cell type pairs.
new_seurat = {
    dataset: {cell_type: seurat_wilcox_symbols[dataset][cell_type] for cell_type in cell_types}
    for dataset, cell_types in shared_keys.items()
}

In [None]:
# set the home dir
# Project root; the request loop further down uses it as the base location for
# its per-dataset output.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
home = "C:/Users/76361/OneDrive/桌面/GPT-based feature evulation"

In [None]:
# Structure the output class
# Schema handed to the extraction request as `response_format`, so the free-text
# evaluation comes back with exactly these fields. The field names follow the
# columns requested in the system prompt (gene set name, relevant gene ratio,
# p-value, biological relevance score, summary).
# NOTE(review): documented with comments rather than a class docstring on purpose;
# a docstring may end up in the JSON schema generated from the model — confirm
# before adding one.
class Output(BaseModel):
    GenesetName: list[str]  # presumably one entry per evaluated gene set — confirm
    RelevantGeneRatio: list[float]
    Pvalue: list[float]
    # NOTE(review): typed as strings although it is called a score — confirm this is intended
    BiologicalRelevanceScore: list[str]
    Summary: list[str]

In [None]:
from openai import OpenAI

# The key is read from the OPENAI_API_KEY environment variable so that it never
# has to be written into the notebook. One client serves all requests.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# System prompt shared by every comparison request. It never changes, so it is
# defined once here; the leading spaces inside the literal are part of the prompt text.
SYSTEM_CONTENT = \
"""
        Suppose you are an insightful biologist tasked with evaluating two gene sets to determine which one better reflects the underlying biological function. 
        You will use both Gene Ontology and KEGG databases to design scoring metrics. 
        If cell type labels are provided, evaluate which gene set is a better representative of that specific cell type. 
        Gene Set Format: Input gene sets can be in gene symbol or Ensembl ID format. If Ensembl IDs are provided, automatically convert them to gene symbols, ensuring the accuracy of the conversion. 
        Evaluation Method: Measure the ratio of relevant genes (genes associated with the cell type) in each gene set. 
        For this comparison, use a statistical test like Fisher’s exact test (or chi-squared test if applicable), 
        ensuring that the calculation detail is shown and accuracy is guaranteed (e.g. make sure 2x2 contingency table is used for Fisher’s exact test ). 
        Also the evaluation should be independent of gene set order. Normalize the ratio to account for any differences in gene set size. 
        Scoring Metrics: Relevant Gene Ratio: The proportion of relevant genes in each gene set. 
        Biological Relevance Score: Derived from Gene Ontology and KEGG pathways, reflecting the biological function of the gene set. 
        Statistical Test Result: Provide the p-value from the test comparing the relevant gene ratios between the two gene sets. 
        Output: Present the results in a CSV file with the following columns: Gene Set Name: The name of the gene set being evaluated. 
        Gene List: Comma-separated list of genes in the set. Relevant Gene Ratio: The proportion of relevant genes in the set. 
        P-value: From the statistical comparison. Biological Relevance Score: Based on Gene Ontology and KEGG database associations. 
        Summary: A brief summary of the gene functions or pathway associations for each gene in the gene set. 
        Cutoff Determination: Determine the cutoff for gene relevance based on the data distribution (e.g., by using the median or 75th percentile of relevant genes across the dataset). 

        """

for key1 in shared_keys:

    # Results go to <home>/<key1>. Explicit paths are used instead of os.chdir so a
    # failure in the middle of a dataset cannot leave the notebook in another folder.
    out_dir = os.path.join(home, key1)
    os.makedirs(out_dir, exist_ok=True)

    for key2 in shared_keys[key1]:

        print(f"Processing with {key1}_{key2}")

        messages = [{'role': 'system', 'content': SYSTEM_CONTENT}]

        USER_CONTENT = \
        f"""
        cell type: {key2}

        gene set1: {superscc_symbols[key1][key2]}

        gene set2: {new_seurat[key1][key2]} 
        """
        # alternatively, gene set2:{scanpy_wilcox_symbols[key1][key2]}

        messages.append({'role': 'user', 'content': USER_CONTENT})

        # Request 1: free-text evaluation of the two gene sets.
        response = client.beta.chat.completions.parse(
            model = "gpt-4o-2024-11-20",
            messages = messages,
            # max_tokens = 10000,
            # top_p = 0.8,
            )

        detail = response.choices[0].message.content

        # Request 2: convert the free text into the fields declared by Output.
        completion = client.beta.chat.completions.parse(
            model="gpt-4o-2024-11-20",
            messages=[
                {"role": "system", "content": "You are an expert at structured data extraction. You will be given unstructured text from the input and should convert it into the given structure. "},
                {"role": "user", "content": detail}
                    ],
            response_format=Output,
            )

        summary =  completion.choices[0].message.content

        # "/" cannot appear in a file name; the cell type itself is left untouched.
        file_stem = f"{key1}_{key2.replace('/', '-')}"

        try:
            # Explicit UTF-8: with the platform default encoding, characters the
            # locale code page cannot represent make write() fail and the result is lost.
            with open(os.path.join(out_dir, f"{file_stem}_detail_result.txt"), "w", encoding="utf-8") as handle:
                handle.write(detail)

            with open(os.path.join(out_dir, f"{file_stem}_summary_result.txt"), "w", encoding="utf-8") as handle:
                handle.write(summary)
        except (OSError, TypeError) as err:
            # Still best effort (one failed save must not stop the run), but the
            # failure is reported instead of being dropped silently.
            print(f"Could not save results for {key1}_{key2}: {err}")
            continue