In [7]:
import pandas as pd
import numpy as np
import scipy.stats

# Load input files
module_genes_df = pd.read_excel("gtexv6_Substantianigra_darkorange2_4-3-2025.xlsx")
full_region_genes_df = pd.read_excel("CoExp_gtexv6_Substantianigra_GO-Report (4) (genes expressed in SN).xlsx")

# Extract gene list from darkorange2 module (unique, non-null values of the 'name' column)
module_genes = module_genes_df['name'].dropna().unique().tolist()

# Extract all genes expressed in the region from the GO enrichment report.
# Each non-null cell of the 'Unnamed: 7' column is split on commas and the
# pieces are pooled into one de-duplicated collection.
gene_sets_column = full_region_genes_df['Unnamed: 7'].dropna()
all_genes_in_region = set()
for gene_string in gene_sets_column:
    stripped_tokens = (token.strip() for token in gene_string.split(','))
    # Skip empty tokens (trailing commas, ",,") so that '' never becomes a "gene".
    all_genes_in_region.update(token for token in stripped_tokens if token)
# Sorted rather than hash-ordered so the list is identical from run to run.
all_genes_in_region = sorted(all_genes_in_region)

# Load lipid genes from text file (comma-separated symbols, possibly over several lines)
lipid_genes = []
with open("lipid_genes.txt", "r") as f:
    for line in f:
        for token in line.split(','):
            gene = token.strip()
            # Blank lines and trailing commas yield empty tokens; keeping them
            # would let '' match an empty token in the background gene set.
            if gene:
                lipid_genes.append(gene)

# Chi-square test function
def chi_square(region_module_gene_list, full_region_gene_list, gene_of_interest_list, region_module_name, gene_list_name):
    """Pearson chi-square statistic for a 2x2 module-vs-gene-list contingency table.

    The two rows are the module genes and the background genes that are not in
    the module; the two columns are genes that are / are not in the gene list
    of interest. No continuity correction is applied.

    Parameters
    ----------
    region_module_gene_list : iterable of hashable
        Genes belonging to the module. Duplicates are counted once.
    full_region_gene_list : iterable of hashable
        Background genes. Only those absent from the module form the second row.
    gene_of_interest_list : iterable of hashable
        Genes whose over/under-representation in the module is tested.
    region_module_name : str
        Label for the module row; the other row is labelled ``'not <name>'``.
    gene_list_name : str
        Label for the gene-list column; the other column is ``'not <name>'``.

    Returns
    -------
    tuple
        ``(observed_df, expected_df, chi_df, chi_stat)``: the 3x3 observed
        table including marginal totals (float), the 2x2 expected counts, the
        2x2 per-cell contributions ``(O - E)**2 / E``, and their sum.

    Notes
    -----
    * Module genes that are missing from ``full_region_gene_list`` are still
      counted in the module row; the module is not intersected with the
      background.
    * If a row or column marginal is zero the matching expected counts are
      zero, the contributions are 0/0, and ``chi_stat`` is NaN.
    """
    # Sets give O(1) membership tests and count each gene once in both rows.
    module_set = set(region_module_gene_list)
    interest_set = set(gene_of_interest_list)
    non_module_set = set(full_region_gene_list) - module_set

    module_hits = len(module_set & interest_set)
    non_module_hits = len(non_module_set & interest_set)

    observed_df = pd.DataFrame(
        {
            gene_list_name: [module_hits, non_module_hits],
            f'not {gene_list_name}': [
                len(module_set) - module_hits,
                len(non_module_set) - non_module_hits,
            ],
            'total': [len(module_set), len(non_module_set)],
        }
    ).rename({0: region_module_name, 1: f'not {region_module_name}'})
    observed_df.loc['total'] = observed_df.sum()
    observed_df = observed_df.astype('float')

    # Expected count of a cell = row marginal * column marginal / grand total.
    row_marginals = observed_df.iloc[0:2, -1].to_numpy()
    col_marginals = observed_df.iloc[-1, 0:2].to_numpy()
    total_n = observed_df.iloc[2, 2]
    expected_df = pd.DataFrame(
        np.outer(row_marginals, col_marginals) / total_n,
        index=observed_df.index[0:2],
        columns=observed_df.columns[0:2],
    )

    # Per-cell contribution to the statistic.
    chi_df = (observed_df.iloc[0:2, 0:2] - expected_df) ** 2 / expected_df

    chi_stat = chi_df.to_numpy().sum()
    return observed_df, expected_df, chi_df, chi_stat

# Test whether lipid genes are over/under-represented in the darkorange2 module
results = chi_square(
    region_module_gene_list=module_genes,
    full_region_gene_list=all_genes_in_region,
    gene_of_interest_list=lipid_genes,
    region_module_name='Substantianigra_darkorange2',
    gene_list_name='Lipid genes',
)
observed_table, expected_table, contribution_table, chi_statistic = results

# Calculate p-value: upper-tail probability of chi-square with 1 degree of freedom
pval = scipy.stats.chi2.sf(chi_statistic, 1)

# Report the three tables followed by the statistic and its p-value
report_sections = (
    ("Observed:", observed_table),
    ("\nExpected:", expected_table),
    ("\nChi-Square Contributions:", contribution_table),
)
for section_title, section_table in report_sections:
    print(section_title)
    print(section_table)
print(f"\nChi-square Statistic: {chi_statistic:.4f}")
print(f"P-value: {pval:.4e}")





Observed:
                                 Lipid genes  not Lipid genes    total
Substantianigra_darkorange2             21.0            632.0    653.0
not Substantianigra_darkorange2        419.0          10766.0  11185.0
total                                  440.0          11398.0  11838.0

Expected:
                                 Lipid genes  not Lipid genes
Substantianigra_darkorange2        24.270992       628.729008
not Substantianigra_darkorange2   415.729008     10769.270992

Chi-Square Contributions:
                                 Lipid genes  not Lipid genes
Substantianigra_darkorange2         0.440830         0.017017
not Substantianigra_darkorange2     0.025736         0.000994

Chi-square Statistic: 0.4846
P-value: 4.8636e-01


In [8]:
# Work on sets so the four contingency-table cells can be built with set algebra
module_genes_set = set(module_genes)
all_genes_set = set(all_genes_in_region)
lipid_genes_set = set(lipid_genes)

# Background genes that are outside the module (shared by cells 3 and 4)
outside_module_set = all_genes_set.difference(module_genes_set)

# 1. In module & in lipid genes
in_module_and_lipid = module_genes_set.intersection(lipid_genes_set)
# 2. In module & not in lipid genes
in_module_not_lipid = module_genes_set.difference(lipid_genes_set)
# 3. Not in module & in lipid genes
not_module_and_lipid = outside_module_set.intersection(lipid_genes_set)
# 4. Not in module & not in lipid genes
not_module_not_lipid = outside_module_set.difference(lipid_genes_set)

# Print out the genes in each category, alphabetically
category_listing = (
    ("\nGenes in module AND in lipid genes:", in_module_and_lipid),
    ("\nGenes in module AND NOT in lipid genes:", in_module_not_lipid),
    ("\nGenes NOT in module AND in lipid genes:", not_module_and_lipid),
    ("\nGenes NOT in module AND NOT in lipid genes:", not_module_not_lipid),
)
for category_heading, category_genes in category_listing:
    print(category_heading)
    print(sorted(category_genes))


Genes in module AND in lipid genes:
['ACHE', 'AGPAT2', 'CERK', 'CERS6', 'CSNK1G2', 'FABP6', 'FDX1', 'GRHL1', 'HPGD', 'INPP4B', 'OCRL', 'PHOSPHO1', 'PIP5K1B', 'PLD3', 'PRKACA', 'PTDSS1', 'SDC1', 'SEC23A', 'SLC44A5', 'SPHK2', 'SYNJ1']

Genes in module AND NOT in lipid genes:
['ABHD11', 'AC003973.1', 'AC005625.1', 'AC005789.11', 'AC006547.14', 'AC006547.15', 'AC007193.8', 'AC010761.8', 'AC018804.7', 'AC034220.3', 'AC046143.7', 'AC058791.1', 'AC064843.2', 'AC068491.1', 'AC068831.6', 'AC073641.2', 'AC104698.1', 'AC107016.1', 'AC107016.2', 'ACOT7', 'ADAMTS2', 'ADRBK2', 'AFF3', 'AGTR1', 'AHNAK2', 'AJAP1', 'AKAP12', 'AL132709.5', 'AL590710.1', 'ALDH1A1', 'ALPK3', 'AMIGO2', 'ANAPC13', 'ANK1', 'ANKRD29', 'ANKRD30BL', 'ANKRD50', 'ANXA6', 'AP003774.6', 'AP1M1', 'APBA1', 'APC', 'APOL2', 'AQP11', 'AR', 'ARHGAP24', 'ARHGAP6', 'ARHGEF11', 'ARHGEF28', 'ARL6IP1', 'ARMCX1', 'ARPC1A', 'ASAH2B', 'ASB16', 'ATP13A2', 'ATP2A3', 'ATP6V0B', 'ATP6V0D1', 'ATP6V0E2', 'ATP6V0E2-AS1', 'ATP6V1A', 'ATP6V1B2', 'ATP6V1

In [9]:
import os

# Output folder for the per-category gene lists; created on first use
output_folder = "chi_square_gene_lists"
os.makedirs(output_folder, exist_ok=True)

# Map each output file name to its alphabetically sorted gene list
gene_groups = {
    "in_module_and_lipid.txt": sorted(in_module_and_lipid),
    "in_module_not_lipid.txt": sorted(in_module_not_lipid),
    "not_module_and_lipid.txt": sorted(not_module_and_lipid),
    "not_module_not_lipid.txt": sorted(not_module_not_lipid),
}

# Write one gene per line, each line newline-terminated, one file per group
for group_filename in gene_groups:
    group_path = os.path.join(output_folder, group_filename)
    with open(group_path, "w") as group_file:
        group_file.writelines(f"{symbol}\n" for symbol in gene_groups[group_filename])

print(f"Saved gene lists to folder: {output_folder}")


Saved gene lists to folder: chi_square_gene_lists
