In [1]:
import pandas as pd

file_name = 'data/1-s2.0-S1097276521002665-mmc6.xlsx'
different_genes_xls = pd.ExcelFile(file_name)

In [2]:
xls_sheets = [sheets for sheets in different_genes_xls.sheet_names if sheets[:2] == '5B']

In [8]:
import joblib 
out_file_path = './'
gene_signature_dir = out_file_path + 'gene_signatures/'
out_filename = gene_signature_dir + 'metabolism_rxn_subsystem.pickle'
subsystem_to_genes = joblib.load(out_filename)
metabolism_sig = {key: [item[0] for item in subsystem_to_genes[key]] for key in subsystem_to_genes}


# Jaccard Index

In [9]:
for sheet_name in xls_sheets:
    df = pd.read_excel(file_name, sheet_name=sheet_name)
    diff_gene_set = set([item.upper() for item in df[df['log2FC']>1]['index'].tolist()])
    for pathway_key in metabolism_sig:
        pct_overlap = len(diff_gene_set.intersection(metabolism_sig[pathway_key]))/(len(diff_gene_set)+len(metabolism_sig[pathway_key]))
        if pct_overlap > 0:
            print(sheet_name, pathway_key, pct_overlap)

5BB_diffexpr-nonnaive-cluster2 Chondroitin synthesis 0.014084507042253521
5BD_diffexpr-nonnaive-cluster4 Nucleotide interconversion 0.01858736059479554
5BD_diffexpr-nonnaive-cluster4 Folate metabolism 0.0056179775280898875
5BD_diffexpr-nonnaive-cluster4 Glycine, serine, alanine and threonine metabolism 0.005263157894736842
5BD_diffexpr-nonnaive-cluster4 Pyruvate metabolism 0.005025125628140704
5BD_diffexpr-nonnaive-cluster4 Eicosanoid metabolism 0.005376344086021506
5BD_diffexpr-nonnaive-cluster4 Methionine and cysteine metabolism 0.0053475935828877
5BD_diffexpr-nonnaive-cluster4 Glyoxylate and dicarboxylate metabolism 0.00510204081632653
5BD_diffexpr-nonnaive-cluster4 Propanoate metabolism 0.005434782608695652
5BF_diffexpr-nonnaive-cluster6 Nucleotide interconversion 0.007518796992481203
5BG_diffexpr-nonnaive-cluster8 Nucleotide interconversion 0.011764705882352941
5BH_diffexpr-nonnaive-cluster9 Inositol phosphate metabolism 0.0078125
5BH_diffexpr-nonnaive-cluster9 Pyruvate metabolism

In [11]:
from utils import hypergeometric_test, adjust_p_value_fdr

M = 9822
res = {'cluster_name': [], 'pathway_name': [], 'p-value': []}
for sheet_name in xls_sheets:
    df = pd.read_excel(file_name, sheet_name=sheet_name)
    diff_gene_set = set([item.upper() for item in df[df['log2FC']>1]['index'].tolist()])
    for pathway_key in metabolism_sig:
        n = len(metabolism_sig[pathway_key])
        N = len(diff_gene_set)
        x = len(diff_gene_set.intersection(metabolism_sig[pathway_key]))
        pct_overlap = x/(N+n)
        
        if pct_overlap > 0:
            p_val = hypergeometric_test(total_genes_expressed=M, n_genes_of_interest=n, 
                                        n_genes_picked=N, n_overlap=x)
            res['cluster_name'].append(sheet_name)
            res['pathway_name'].append(pathway_key)
            res['p-value'].append(p_val)
        else:
            res['cluster_name'].append(sheet_name)
            res['pathway_name'].append(pathway_key)
            res['p-value'].append(1)

In [12]:
df = pd.DataFrame.from_dict(res)

In [13]:
p_adj = adjust_p_value_fdr(df['p-value'])

In [14]:
df['p-adj'] = p_adj

In [15]:
df[df['p-adj']<0.05]

Unnamed: 0,cluster_name,pathway_name,p-value,p-adj
1378,5BR_diffexpr-nonnaive-cluster20,Nucleotide interconversion,0.007942,0.025571


In [16]:
df

Unnamed: 0,cluster_name,pathway_name,p-value,p-adj
0,5BA_diffexpr-nonnaive-cluster1,Fatty acid oxidation,1.0,1.0
1,5BA_diffexpr-nonnaive-cluster1,Nucleotide interconversion,1.0,1.0
2,5BA_diffexpr-nonnaive-cluster1,Keratan sulfate degradation,1.0,1.0
3,5BA_diffexpr-nonnaive-cluster1,Sphingolipid metabolism,1.0,1.0
4,5BA_diffexpr-nonnaive-cluster1,Tyrosine metabolism,1.0,1.0
...,...,...,...,...
1534,5BS_diffexpr-nonnaive-cluster21,"Transport, lysosomal",1.0,1.0
1535,5BS_diffexpr-nonnaive-cluster21,Vitamin B12 metabolism,1.0,1.0
1536,5BS_diffexpr-nonnaive-cluster21,N-glycan synthesis,1.0,1.0
1537,5BS_diffexpr-nonnaive-cluster21,D-alanine metabolism,1.0,1.0
