# Compare gene list to data-driven list

1. SoM score: DOI 10.1016/j.immuni.2025.05.020; https://www.cell.com/immunity/fulltext/S1074-7613(21)00114-X
2. IHM score (immune health metric): https://pubmed.ncbi.nlm.nih.gov/38961223/

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [1]:
# Reference gene sets used throughout this notebook: SoM modules, IHM genes,
# and a list of sepsis candidate genes. All entries are gene symbols stored as
# plain strings, so every comparison below is an exact, case-sensitive match.
# NOTE(review): the expansion "Sepsis-associated Modules" for SoM is not
# confirmed here; the cited SoM paper may define the acronym differently
# (possibly "Severe-or-Mild") -- TODO confirm against the source.

# SoM genes: dict mapping a module label ("Module_1" .. "Module_4") to the
# list of gene symbols assigned to that module.
SoM_genes = {
    "Module_1": [
        "NQO2", "SLPI", "ORM1", "KLHL2", "ANXA3", "TXN", "AQP9", 
        "BCL6", "DOK3", "PFKFB4", "TYK2"
    ],
    "Module_2": [
        "BCL2L11", "BCAT1", "BTBD7", "CEP55", "HMMR", "PRC1", "KIF15", 
        "CAMP", "CEACAM8", "DEFA4", "LCN2", "CTSG", "AZU1"
    ],
    "Module_3": [
        "MAFB", "OASL", "UBE2L6", "VAMP5", "CCL2", "NAPA", "ATG3", 
        "VRK2", "TMEM123", "CASP7"
    ],
    "Module_4": [
        "DOK2", "HLA-DPB1", "BUB3", "SMYD2", "SIDT1", "EXOC2", 
        "TRIB2", "KLRB1"
    ]
}

# IHM genes (Immune Health Metric): a single flat list of gene symbols.
# NOTE(review): a few entries look like older/alias symbols (e.g. "SEPT1",
# "SGK223", "FAM175A", "MGC57346"). Because matching is by exact string, these
# would be missed if the lists they are compared with use current symbols --
# TODO confirm which nomenclature the other lists follow.
IHM_genes = [
    "SLC16A10", "NDRG2", "AK5", "CD7", "RNFT2", "PHC1", "MAN1C1", 
    "FAM102A", "RCAN3", "EIF3H", "RPS17", "RPS5", "PLXDC1", "TESPA1", 
    "SGK223", "EIF3L", "ANAPC16", "CD27", "NPAT", "ID3", "RACK1", 
    "APEX1", "GCNT4", "FCMR", "TCF7", "KLHL3", "AXIN2", "LY9", 
    "RPS25", "LDHB", "PKIA", "RPL3", "N6AMT1", "GAL3ST4", "SSBP2", 
    "CD1C", "LEF1", "RPL7", "PIK3IP1", "GPRASP1", "ABI2", "APBB1", 
    "SPTBN1", "GPA33", "CCR9", "BCKDHB", "SCAI", "RPL4", "NOG", 
    "TCEA3", "ETS1", "LDLRAP1", "GPR183", "ZNF548", "ZNF91", "NPM1", 
    "MSANTD2", "KAT6B", "SLC7A6", "DCHS1", "OXNAD1", "RPS2", "RPL7A", 
    "GRAP", "RPL23A", "RPL10A", "RPS3", "FAM175A", "RPL29", "EEF2", 
    "EDAR", "ABLIM1", "MBLAC2", "CCR7", "ZNF573", "CAMK4", "LRRN3", 
    "MAGI3", "RPLP2", "ZIK1", "NT5E", "FUT8", "ZNF101", "RPL34", 
    "RPS20", "FOXP1", "ZNF550", "TSPYL2", "ATP6V0E2-AS1", "GRPEL2", 
    "MGC57346", "SEPT1", "PRKACB", "AGMAT", "RPL11", "MYC", "ZZZ3", "RPL5"
]

# Sepsis candidate genes: a flat list of gene symbols.
# NOTE(review): "the research paper" this list comes from is not identified
# anywhere in this cell -- TODO add the citation.
# "LCN2" is listed here and also in SoM "Module_2".
Sepsis_Candidate_genes = [
    "SIRT1", "C5AR1", "TLR4", "NFKB1", "ADAM17", "FOXO3", "CD40LG", 
    "CX3CR1", "HIF1A", "HMGB1", "NLRP3", "STAT3", "IL10", "TREM1", 
    "S100A9", "HDAC6", "CXCR2", "ADAM10", "VWF", "CD14", "PDCD1", 
    "CD38", "HMOX1", "LCN2", "PTX3", "ANGPT2", "ADM"
]

# Report how many genes each reference set contains: one line per SoM module,
# then the totals for the IHM and sepsis candidate lists.
som_count_lines = [
    f"  {module}: {len(genes)} genes" for module, genes in SoM_genes.items()
]
print("SoM Gene Counts:", *som_count_lines, sep="\n")

ihm_total = len(IHM_genes)
sepsis_total = len(Sepsis_Candidate_genes)
print(f"\nIHM Genes: {ihm_total} genes")
print(f"Sepsis Candidate Genes: {sepsis_total} genes")

# Function to find overlaps
def find_overlaps(list1, list2, name1="List1", name2="List2"):
    """Print and return the genes that two collections have in common.

    Both inputs are converted to sets, so duplicates are ignored and matching
    is by exact equality (case-sensitive for strings).

    Args:
        list1: First iterable of gene symbols.
        list2: Second iterable of gene symbols.
        name1: Label for ``list1`` in the printed report.
        name2: Label for ``list2`` in the printed report.

    Returns:
        A set with the elements present in both inputs (empty if none).
    """
    shared = set(list1) & set(list2)

    # Nothing in common: report it and return the empty set right away.
    if not shared:
        print(f"\nNo overlap between {name1} and {name2}")
        return shared

    print(f"\nOverlap between {name1} and {name2}:")
    # One gene per line, alphabetically, indented by two spaces.
    print("\n".join(f"  {gene}" for gene in sorted(shared)))
    return shared

# Flatten the per-module SoM lists into one list, keeping module order.
all_som_genes = [
    gene for module_genes in SoM_genes.values() for gene in module_genes
]

# Section banner for the printed report.
print("\n" + "=" * 50, "OVERLAP ANALYSIS", "=" * 50, sep="\n")

# Pairwise comparisons between the three reference gene sets.
# SoM vs IHM
som_ihm_overlap = find_overlaps(
    list1=all_som_genes, list2=IHM_genes, name1="SoM", name2="IHM"
)

# SoM vs Sepsis Candidates
som_sepsis_overlap = find_overlaps(
    list1=all_som_genes,
    list2=Sepsis_Candidate_genes,
    name1="SoM",
    name2="Sepsis Candidates",
)

# IHM vs Sepsis Candidates
ihm_sepsis_overlap = find_overlaps(
    list1=IHM_genes,
    list2=Sepsis_Candidate_genes,
    name1="IHM",
    name2="Sepsis Candidates",
)

# Genes present in all three sets: narrow the SoM/IHM overlap by the sepsis
# candidate list.
all_three = som_ihm_overlap & set(Sepsis_Candidate_genes)
if not all_three:
    print("\nNo three-way overlap found")
else:
    print("\nThree-way overlap (SoM + IHM + Sepsis Candidates):")
    for gene in sorted(all_three):
        print(f"  {gene}")

SoM Gene Counts:
  Module_1: 11 genes
  Module_2: 13 genes
  Module_3: 10 genes
  Module_4: 8 genes

IHM Genes: 98 genes
Sepsis Candidate Genes: 27 genes

OVERLAP ANALYSIS

No overlap between SoM and IHM

Overlap between SoM and Sepsis Candidates:
  LCN2

No overlap between IHM and Sepsis Candidates

No three-way overlap found


In [8]:
# Load the PS gene table from the project data directory; later cells read its
# `gene_name` column. The bare expression on the last line displays the frame.
ps_genes = pd.read_csv(filepath_or_buffer="../data/naive_gene_wt_score.csv")
ps_genes

Unnamed: 0,gene_name,naive_score_wt,qcut_percentile,gene_cluster
0,TNF,1.0,1,#ca0020
1,THBD,1.0,1,#ca0020
2,IL6R,1.0,1,#ca0020
3,IL6,1.0,1,#ca0020
4,TREM1,1.0,1,#ca0020
...,...,...,...,...
604,CD2,0.4,5,#1f78b4
605,CD247,0.4,5,#1f78b4
606,CD226,0.4,5,#1f78b4
607,BATF,0.4,5,#1f78b4


In [11]:
# Compare each reference gene list against the PS gene table.
# The PS gene names are collected into a set once and reused for every
# comparison below.
ps_genes_set = set(ps_genes["gene_name"])

# Section banner for the printed report.
print("\n" + "=" * 50, "OVERLAP WITH PS GENES", "=" * 50, sep="\n")

# SoM vs PS genes
som_ps_overlap = find_overlaps(
    list1=all_som_genes, list2=ps_genes_set, name1="SoM", name2="PS Genes"
)

# IHM vs PS genes
ihm_ps_overlap = find_overlaps(
    list1=IHM_genes, list2=ps_genes_set, name1="IHM", name2="PS Genes"
)

# Sepsis Candidates vs PS genes
sepsis_ps_overlap = find_overlaps(
    list1=Sepsis_Candidate_genes,
    list2=ps_genes_set,
    name1="Sepsis Candidates",
    name2="PS Genes",
)


OVERLAP WITH PS GENES

Overlap between SoM and PS Genes:
  AQP9
  AZU1
  BCL6
  CASP7
  CCL2
  CEACAM8
  DEFA4
  HLA-DPB1
  LCN2
  OASL
  ORM1
  SLPI
  TXN
  TYK2

Overlap between IHM and PS Genes:
  CCR7
  CD27
  ETS1
  MYC
  NT5E

Overlap between Sepsis Candidates and PS Genes:
  ADAM10
  ADAM17
  ADM
  ANGPT2
  C5AR1
  CD14
  CD38
  CD40LG
  CX3CR1
  CXCR2
  FOXO3
  HDAC6
  HIF1A
  HMGB1
  HMOX1
  IL10
  LCN2
  NFKB1
  NLRP3
  PDCD1
  PTX3
  S100A9
  SIRT1
  STAT3
  TLR4
  TREM1
  VWF
