In [None]:
import gzip
import pandas as pd
from functools import reduce

In [None]:
# Load the annotation table and build a gene ID -> gene symbol lookup.
annotation_file = 'Human.GRCh38.p13.annot.tsv.gz'

# GeneID is read as text so that IDs never pass through a float dtype
# (which would turn "123" into "123.0" if the column has missing values).
with gzip.open(annotation_file, 'rt') as f:
    annotations = pd.read_csv(f, sep='\t', dtype={'GeneID': str})

# Normalise both columns: string gene IDs, surrounding whitespace removed.
annotations['GeneID'] = annotations['GeneID'].astype(str).str.strip()
annotations['Symbol'] = annotations['Symbol'].str.strip()

# Only rows that actually carry a symbol go into the lookup. Without this
# filter a missing symbol would map its gene ID to NaN instead of letting the
# original ID fall through unchanged.
has_symbol = annotations['Symbol'].notna() & annotations['Symbol'].ne('')
id_to_symbol = dict(zip(annotations.loc[has_symbol, 'GeneID'],
                        annotations.loc[has_symbol, 'Symbol']))

# GSE dataset files to combine
gse_files = ["GSE114691_DEG.csv", "GSE148241_DEG.csv", "GSE190971_DEG.csv", "GSE234729_DEG.csv"]  # Add your file paths here

# One set of column labels (gene symbols, or original IDs when unmapped) per dataset
gene_symbol_lists = []

for gse_file in gse_files:
    df = pd.read_csv(gse_file)

    # The first column becomes an unnamed index; the remaining column labels
    # are the gene IDs that get translated below.
    df = df.set_index(df.columns[0])
    df.index.name = None

    # Translate gene IDs to symbols, keeping the original ID when there is no match
    df.columns = [id_to_symbol.get(gene, gene) for gene in df.columns]

    gene_symbol_lists.append(set(df.columns))

# NOTE: despite its name, `common_genes` is the UNION of the per-dataset sets
# (a gene present in any dataset is included). The name is kept so that other
# cells relying on it keep working.
common_genes = reduce(set.union, gene_symbol_lists, set())

# Sort before writing: set iteration order changes between kernel runs, which
# would otherwise make the CSV differ on every "Restart & Run All".
output_file = "common_DEGs.csv"
common_genes_df = pd.DataFrame(sorted(common_genes), columns=['CommonGenes'])
common_genes_df.to_csv(output_file, index=False)

print(f"Common genes saved to '{output_file}'")

### Union of genes from feature selection

In [None]:
# Paths to the CSV files containing the top features
file_paths = [
    "GSE114691_top50_list.csv",
    "GSE148241_top40_list.csv",
    "GSE190971_top40_list.csv",
    "GSE234729_top50_list.csv"
]

# Load the "feature" column of each file as a set.
# The column is read as text so a file holding only numeric gene IDs yields
# the same string values as the other files, and blank rows are dropped
# rather than entering the set as NaN.
feature_sets = [
    set(pd.read_csv(file, dtype={"feature": str})["feature"].dropna())
    for file in file_paths
]

# Union of the features across all files (an empty path list gives an empty set)
union_features = set().union(*feature_sets)

# Sort before saving: set iteration order differs between kernel runs, so an
# unsorted dump would produce a differently ordered CSV on every run.
union_features_list = sorted(union_features)
union_features_df = pd.DataFrame(union_features_list, columns=["feature"])
union_features_df.to_csv("union_features.csv", index=False)

# Printing results
print("Union of features saved to union_features.csv")
print("Number of unique features:", len(union_features_list))
print("Union of features:", union_features_list)

### Genes common to the DEGs and the feature-selection genes

In [None]:
# Load the union of selected features written to "union_features.csv".
# Both files are read as text: with inferred dtypes a numeric gene ID could be
# an int in one set and a str in the other and would then never intersect.
union_features_df = pd.read_csv("union_features.csv", dtype=str)
union_features_set = set(union_features_df["feature"].dropna())

# Load the DEG gene list from its "CommonGenes" column
common_genes_df = pd.read_csv("common_DEGs.csv", dtype=str)  # Replace with your file path
common_genes_set = set(common_genes_df["CommonGenes"].dropna())

# Genes present in both the selected features and the DEG list
intersection_features = union_features_set.intersection(common_genes_set)

# Sort before saving so the CSV row order is the same on every run
# (set iteration order is not stable across kernel restarts).
intersection_features_list = sorted(intersection_features)
intersection_features_df = pd.DataFrame(intersection_features_list, columns=["Common Genes"])
intersection_features_df.to_csv("intersection_features.csv", index=False)

# Printing results
print("Intersection of features saved to intersection_features.csv")
print("Number of common genes:", len(intersection_features_list))
print("Common genes:", intersection_features_list)

### Feature ranking

In [None]:
# Paths to the per-dataset feature-importance ranking CSV files
file_paths = [
    "GSE114691_feature_importance_ranking.csv",
    "GSE148241_feature_importance_ranking.csv",
    "GSE190971_feature_importance_ranking.csv",
    "GSE234729_feature_importance_ranking.csv"
]

# Load the "feature" column of each ranking file as a set.
# Reading the column as text keeps numeric-looking gene IDs comparable across
# files; blank rows are dropped instead of entering the set as NaN.
feature_sets = [
    set(pd.read_csv(file, dtype={"feature": str})["feature"].dropna())
    for file in file_paths
]

# Union of the ranked features across all datasets (empty input -> empty set)
union_features = set().union(*feature_sets)

# Sort before saving so the output CSV is identical from run to run; raw set
# iteration order changes between kernel restarts.
union_features_list = sorted(union_features)
union_features_df = pd.DataFrame(union_features_list, columns=["feature"])
union_features_df.to_csv("combined_genes_afterRF.csv", index=False)

# Printing results
print("Union of features saved to combined_genes_afterRF.csv")
print("Number of unique features:", len(union_features_list))
print("Union of features:", union_features_list)