In [1]:
#importing all the necessary packages 
import csv 
import pandas as pd 
import requests
import time 
from concurrent.futures import ThreadPoolExecutor, as_completed
from Bio.KEGG import REST
import csv
import os
from scipy.stats import hypergeom
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:

# loading the metabolites file
csv_path = "C:/Users/Vaishnavi Raikar/OneDrive/Desktop/MASTERS THESIS - VAISHNAVI/metabolites.csv"
df_metabolites = pd.read_csv(csv_path)
print(df_metabolites)

#loading the gene file ( treated vs untreated )
df_genes= pd.read_csv("C:/Users/Vaishnavi Raikar/Downloads/treated_vs_untreated.csv", delimiter = ";")
print(df_genes.head())

                     Metabolite
0          3-hydroxyisobutyrate
1                           AMP
2               Acetylcarnitine
3                     Adenosine
4                       Alanine
5                    Asparagine
6                     Aspartate
7                       Betaine
8                       Choline
9                       Citrate
10                     Creatine
11              Dimethylglycine
12                     Fumarate
13                      Glucose
14                    Glutamate
15                    Glutamine
16                      Glycine
17                    Histidine
18                 Hypoxanthine
19                          IMP
20                      Inosine
21                   Isoleucine
22                      Lactate
23                      Leucine
24                       Malate
25                   Methionine
26                 Myo-inositol
27                          NAD
28                  Niacinamide
29                Phenylalanine
30      

In [3]:

#converting p-values which might be incorrectly formatted as strings 
def convert_p_value(p_val):
    if isinstance(p_val, str):
        p_val = p_val.replace(',', '.')  #replacing the commas with dots 
    try:
        return float(p_val)
    except ValueError:
        return None

#using process_data function to filter the p-values < 0.01
def process_data(df_genes):
    #applying the convert_p_value function
    df_genes['P.Value'] = df_genes['P.Value'].apply(convert_p_value)
    
    #dropping the rows with invalid P values which cannot be converted
    df_genes = df_genes.dropna(subset=['P.Value'])
    
    #filtering the genes with P value < 0.1
    df_filtered_genes = df_genes[df_genes['P.Value'] < 0.1]
    
    #sorting the dataframe by raw P values in ascending order
    sorted_genes = df_filtered_genes.sort_values(by='P.Value', ascending=True)
    
    #selecting the top 500 genes with the lowest P values
    top_genes = sorted_genes.head(500)
    
    return top_genes

top_genes_df = process_data(df_genes)

#displaying the top 500 genes sorted by P value
print(top_genes_df[['feature', 'P.Value']])

#saving the results to a new csv file 
output_file_path = 'C:/Users/Vaishnavi Raikar/OneDrive/Desktop/MASTERS THESIS - VAISHNAVI/multiOmicsAnalysis/top_genes_df.csv'
top_genes_df.to_csv(output_file_path, index=False)


                feature   P.Value
45   ENSMUSG00000025196  0.000101
47   ENSMUSG00000037887  0.000103
49   ENSMUSG00000080115  0.000107
52   ENSMUSG00000034855  0.000111
51   ENSMUSG00000022877  0.000113
..                  ...       ...
542  ENSMUSG00000022951  0.009248
544  ENSMUSG00000044950  0.009330
543  ENSMUSG00000031930  0.009339
546  ENSMUSG00000025059  0.009448
547  ENSMUSG00000032532  0.009450

[500 rows x 2 columns]


Mapping the identifiers ( Ensembl IDs to KEGG IDs )

In [4]:

#fetching the entrez ID from Ensembl ID 
def get_entrez_id(ensembl_id):
    server = "https://rest.ensembl.org"
    ext = f"/xrefs/id/{ensembl_id}"
    headers = {"Content-Type": "application/json"}
    try:
        # Sending GET request to Ensembl REST API to retrieve Entrez IDs
        r = requests.get(f"{server}{ext}", headers=headers)
        r.raise_for_status()  #raising an HTTPError for bad responses
    except requests.exceptions.HTTPError as err:
        print(f"HTTP Error for {ensembl_id}: {err}")
        return None

#parsing JSON response to extract Entrez IDs
    decoded = r.json()
    entrez_ids = [entry['primary_id'] for entry in decoded if entry['dbname'] == 'EntrezGene']
    return entrez_ids[0] if entrez_ids else None

#fetching the KEGG ID from Entrez ID
def kegg_id_from_entrez(entrez_id, api_key=None):
    url = f"http://rest.kegg.jp/conv/genes/ncbi-geneid:{entrez_id}"
    headers = {}
    if api_key:  #adding the api_key if available 
        headers['Authorization'] = f'Bearer {api_key}'
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except requests.exceptions.HTTPError as err:
        print(f"HTTP Error for Entrez ID {entrez_id}: {err}")
        return None

    if '\t' in response.text:
        kegg_id = response.text.split('\t')[1].strip()
        return kegg_id
    else:
        return None

#processing a single ensembl ID 
def single_id_processing(ensembl_id, api_key=None):
    entrez_id = get_entrez_id(ensembl_id)  #fetching Entrez ID from Ensembl ID
    if entrez_id:
        time.sleep(0.5)  #pausing the execution of the program to prevent clashing of events. 
        kegg_id = kegg_id_from_entrez(entrez_id, api_key)
        if kegg_id:
            return ensembl_id, kegg_id #if successful, return both Ensembl and KEGG IDs 
    return ensembl_id, None #returning Ensembl ID and None if the KEGG ID is not found

# Function to process all Ensembl IDs
def all_ids_processing(ensemble_ids, api_key=None):
    all_results = []
    for ensembl_id in ensemble_ids:
        ensembl_id, kegg_id = single_id_processing(ensembl_id, api_key) #processing each Ensembl ID
        if kegg_id:
            all_results.append({'Ensembl_ID': ensembl_id, 'KEGG_ID': kegg_id}) #appending the result if KEGG ID is found
        else:
            all_results.append({'Ensembl_ID': ensembl_id, 'KEGG_ID': 'Not found'})  #appending 'Not found' if KEGG ID is missing
        print(f"Processed {ensembl_id}")
    return all_results

#this is the list of Ensembl IDs to process
ensemble_ids = top_genes_df['feature'].tolist()
results = all_ids_processing(ensemble_ids)

#saving the results to a new csv file 
results_df = pd.DataFrame(results)
results_df.to_csv('C:/Users/Vaishnavi Raikar/OneDrive/Desktop/MASTERS THESIS - VAISHNAVI/multiOmicsAnalysis/ensembl_to_kegg_ids.csv', index=False)
print(results_df)


Processed ENSMUSG00000025196
Processed ENSMUSG00000037887
Processed ENSMUSG00000080115
Processed ENSMUSG00000034855
Processed ENSMUSG00000022877
Processed ENSMUSG00000004356
Processed ENSMUSG00000020330
Processed ENSMUSG00000027490
Processed ENSMUSG00000020083
Processed ENSMUSG00000026981
Processed ENSMUSG00000022026
Processed ENSMUSG00000037440
Processed ENSMUSG00000071547
Processed ENSMUSG00000028937
Processed ENSMUSG00000051262
Processed ENSMUSG00000046840
Processed ENSMUSG00000048572
Processed ENSMUSG00000097927
Processed ENSMUSG00000023015
Processed ENSMUSG00000041939
Processed ENSMUSG00000019942
Processed ENSMUSG00000028356
Processed ENSMUSG00000032218
Processed ENSMUSG00000027306
Processed ENSMUSG00000028970
Processed ENSMUSG00000042029
Processed ENSMUSG00000000934
Processed ENSMUSG00000072949
Processed ENSMUSG00000070661
Processed ENSMUSG00000023505
Processed ENSMUSG00000032081
Processed ENSMUSG00000024521
Processed ENSMUSG00000042041
Processed ENSMUSG00000031016
Processed ENSM

Pre - processing of the Ensembl IDs - Removing the Ensembl IDs where the KEGG IDs are not found. 

In [5]:
""" Dropping the ids where the kegg id is not found and saving it to a new file """

ensemble_kegg_ids = 'C:/Users/Vaishnavi Raikar/OneDrive/Desktop/MASTERS THESIS - VAISHNAVI/multiOmicsAnalysis/ensembl_to_kegg_ids.csv'
ensemble_file = pd.read_csv(ensemble_kegg_ids)
#dropping the rows where 'KEGG_ID' is 'not found' or NaN
df_cleaned = ensemble_file.dropna(subset=['KEGG_ID'])
df_cleaned = df_cleaned[df_cleaned['KEGG_ID'].str.lower() != 'not found']

#saving the cleaned data to a new CSV file
output_file_path = 'C:/Users/Vaishnavi Raikar/OneDrive/Desktop/MASTERS THESIS - VAISHNAVI/multiOmicsAnalysis/cleaned_ensembl_to_kegg_ids.csv'
df_cleaned.to_csv(output_file_path, index=False)

print(output_file_path)
print(df_cleaned)

C:/Users/Vaishnavi Raikar/OneDrive/Desktop/MASTERS THESIS - VAISHNAVI/multiOmicsAnalysis/cleaned_ensembl_to_kegg_ids.csv
             Ensembl_ID        KEGG_ID
0    ENSMUSG00000025196      mmu:93721
1    ENSMUSG00000037887      mmu:18218
2    ENSMUSG00000080115  mmu:100504608
3    ENSMUSG00000034855      mmu:15945
4    ENSMUSG00000022877      mmu:94175
..                  ...            ...
495  ENSMUSG00000022951      mmu:54720
496  ENSMUSG00000044950      mmu:70802
497  ENSMUSG00000031930      mmu:66894
498  ENSMUSG00000025059      mmu:14933
499  ENSMUSG00000032532      mmu:12424

[471 rows x 2 columns]


Total Ensembl IDs ( p-value < 0.01 ) - 500 , 
Total Ensembl IDs mapped to KEGG IDs - 471 , 
Total Ensemble IDs not mapped to KEGG IDs - 29 

Finding the mmu pathways associated with each KEGG ID. 

In [6]:
#using the main function to fetch the pathways for the pre processed file - cleaned_ensembl_to_kegg_ids.csv
def main(file_path, output_file):
    def get_kegg_pathways(kegg_ids):
        base_url = "http://rest.kegg.jp/link/pathway/"
        pathways = {}
        for kegg_id in kegg_ids:
            response = requests.get(base_url + kegg_id)
            if response.status_code == 200:
                pathway_data = response.text.split('\n')
                pathway_ids = [line.split('\t')[1] for line in pathway_data if line]
                pathways[kegg_id] = pathway_ids
            else:
                pathways[kegg_id] = ["Error: Unable to retrieve data"]
        return pathways

    #loading and cleaning the initial data 
    df = pd.read_csv(file_path)
    kegg_ids = df['KEGG_ID'].unique()

    #Retrieving pathways for KEGG IDs
    pathways = get_kegg_pathways(kegg_ids)

    #mapping the pathways to KEGG IDs and saving them 
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Ensembl_ID', 'KEGG_ID', 'Pathways'])
        for index, row in df.iterrows():
            ensembl_id = row['Ensembl_ID']
            kegg_id = row['KEGG_ID']
            pathway_list = pathways.get(kegg_id, [])
            pathways_str = '[' + ', '.join(f"'{pathway}'" for pathway in pathway_list) + ']' if pathway_list else '[]'
            writer.writerow([ensembl_id, kegg_id, pathways_str])

    #loading, cleaning and exploding the new data. 
    data = pd.read_csv(output_file)
    data['Pathways'] = data['Pathways'].str.replace('[\[\]\' ]', '', regex=True).str.split(',')
    data_exploded = data.explode('Pathways')
    data_exploded['Pathways'] = data_exploded['Pathways'].str.replace('path:', '')
    data_exploded = data_exploded.rename(columns={'Pathways': 'Modified Pathway IDs'})
    data_exploded[['Ensembl_ID', 'KEGG_ID', 'Modified Pathway IDs']].to_csv(output_file, index=False)


main("C:/Users/Vaishnavi Raikar/OneDrive/Desktop/MASTERS THESIS - VAISHNAVI/multiOmicsAnalysis/cleaned_ensembl_to_kegg_ids.csv", 'C:/Users/Vaishnavi Raikar/OneDrive/Desktop/MASTERS THESIS - VAISHNAVI/multiOmicsAnalysis/ensembl_to_kegg_with_pathways.csv')


Finding the mmu pathways associated with each compound ID. 

In [11]:

def get_kegg_id(metabolite):
    url = f"https://rest.kegg.jp/find/compound/{metabolite}"
    response = requests.get(url)
    if response.status_code == 200:
        first_line = response.text.split('\n')[0]
        if first_line:
            kegg_id = first_line.split('\t')[0]
            # Remove 'cpd:' prefix from the kegg_id
            kegg_id = kegg_id.replace('cpd:', '')
            return kegg_id
    return None

#retrieving KEGG IDs for metabolites
metabolite_id_list = [get_kegg_id(metabolite) for metabolite in df_metabolites['Metabolite'] if get_kegg_id(metabolite)]
print("Metabolite KEGG IDs:", metabolite_id_list)

def main_process(final_output_path, metabolite_id_list):
    #fetching all mouse pathways and checking compounds in each mouse pathway
    def get_mouse_pathways():
        url = "https://rest.kegg.jp/list/pathway/mmu"
        response = requests.get(url)
        compound_pathway_map = {}
        if response.status_code == 200:
            pathways = response.text.split('\n')
            for pathway in pathways:
                if pathway:
                    pathway_id, _ = pathway.split('\t')
                    check_compounds_in_pathway(pathway_id, compound_pathway_map)
        else:
            print("Failed to retrieve data")
        return compound_pathway_map

    #using check_compounds_in_pathway function to check if a compound is in a pathway and map them
    def check_compounds_in_pathway(pathway_id, compound_pathway_map):
        url = f"https://rest.kegg.jp/get/{pathway_id}"
        response = requests.get(url)
        if response.status_code == 200:
            for compound in metabolite_id_list:
                if compound in response.text:
                    if compound not in compound_pathway_map:
                        compound_pathway_map[compound] = []
                    compound_pathway_map[compound].append(pathway_id)

    # using cwrite_to_csv function to write the compound-pathway mapping to csv file.
    def write_to_csv(compound_pathway_map, filename):
        with open(filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Compound ID', 'Compound Name', 'Pathway ID', 'Pathway Name'])
            for compound in compound_pathway_map:
                compound_name = get_compound_name(compound)
                for pid in compound_pathway_map[compound]:
                    pathway_name = get_pathway_name(pid)
                    writer.writerow([compound, compound_name, pid, pathway_name])

    #fetching the name of the compound using the function get_compound_name
    def get_compound_name(compound_id):
        url = f"https://rest.kegg.jp/get/{compound_id}"
        response = requests.get(url)
        if response.status_code == 200:
            lines = response.text.split('\n')
            for line in lines:
                if line.startswith("NAME"):
                    name_line = line.split('       ')[1]
                    name = name_line.split(';')[0].strip()
                    return name
        return "Unknown"

    #fetching the name of a pathway using the function get_pathway_name 
    def get_pathway_name(pathway_id):
        url = f"https://rest.kegg.jp/get/{pathway_id}"
        response = requests.get(url)
        if response.status_code == 200:
            lines = response.text.split('\n')
            for line in lines:
                if line.startswith("NAME"):
                    name_line = line.split('       ')[1]
                    name = name_line.split(';')[0].strip()
                    return name
        return "Unknown"

    #calling the sub function get_mouse_pathways 
    compound_pathway_map = get_mouse_pathways()
    write_to_csv(compound_pathway_map, final_output_path)

    return final_output_path

#calling the main function and saving the metabolite and pathway information in a new csv file 
output_csv_path = 'C:/Users/Vaishnavi Raikar/OneDrive/Desktop/MASTERS THESIS - VAISHNAVI/multiOmicsAnalysis/final_compound_pathway_mapping.csv'
final_path = main_process(output_csv_path, metabolite_id_list)
print(f"Processed data saved in: {final_path}")


Metabolite KEGG IDs: ['C01188', 'C00020', 'C02571', 'C00002', 'C00041', 'C00152', 'C00049', 'C00318', 'C00114', 'C00158', 'C00300', 'C01026', 'C00122', 'C00029', 'C00025', 'C00064', 'C00037', 'C00135', 'C00262', 'C00130', 'C00081', 'C00407', 'C00186', 'C00123', 'C00149', 'C00019', 'C00137', 'C00003', 'C00153', 'C00079', 'C00157', 'C00346', 'C00157', 'C00245', 'C00188', 'C00082', 'C00043', 'C00105', 'C00106', 'C00015', 'C00141']
Processed data saved in: C:/Users/Vaishnavi Raikar/OneDrive/Desktop/MASTERS THESIS - VAISHNAVI/multiOmicsAnalysis/final_compound_pathway_mapping.csv


Common mouse ( "mmu:") pathways found between the Genes and the Metabolites.

In [12]:
#using the two csv output files 
compounds_file_path = 'C:/Users/Vaishnavi Raikar/OneDrive/Desktop/MASTERS THESIS - VAISHNAVI/multiOmicsAnalysis/final_compound_pathway_mapping.csv'
genes_file_path = 'C:/Users/Vaishnavi Raikar/OneDrive/Desktop/MASTERS THESIS - VAISHNAVI/multiOmicsAnalysis/ensembl_to_kegg_with_pathways.csv'

compounds_data = pd.read_csv(compounds_file_path, delimiter=',')
genes_data = pd.read_csv(genes_file_path, delimiter=',')

#extracting the unique pathway IDs
compound_pathways = compounds_data['Pathway ID'].dropna().unique()
gene_pathway_ids = genes_data['Modified Pathway IDs'].dropna().unique()

#finding the common pathway IDs
common_ids = set(compound_pathways).intersection(gene_pathway_ids)
print("Common IDs:", common_ids)

#creating a list to store the combined information
common_data_list = []

for pathway_id in common_ids:
    #finding the Compound IDs and Gene KEGG IDs associated with the common pathway ID
    compound_ids = compounds_data[compounds_data['Pathway ID'] == pathway_id]['Compound ID'].dropna().unique()
    gene_kegg_ids = genes_data[genes_data['Modified Pathway IDs'] == pathway_id]['KEGG_ID'].dropna().unique()

    #combining the KEGG and Compound IDs into a single string
    associated_ids = ', '.join(list(gene_kegg_ids) + list(compound_ids))

    #creating a dictionary for the row and adding it to the list
    row = {'Pathway ID': pathway_id, 'Associated IDs': associated_ids}
    common_data_list.append(row)

#converting the list of dictionaries to a DataFrame
common_data = pd.DataFrame(common_data_list)

#saving the dataFrame to a new CSV file
common_data.to_csv('C:/Users/Vaishnavi Raikar/OneDrive/Desktop/MASTERS THESIS - VAISHNAVI/multiOmicsAnalysis/common_pathway_associated_ids.csv', index=False)
print("Data saved to common_pathway_associated_ids.csv")

Common IDs: {'mmu04024', 'mmu04211', 'mmu00330', 'mmu00250', 'mmu04713', 'mmu04148', 'mmu05211', 'mmu00350', 'mmu04066', 'mmu00450', 'mmu04727', 'mmu00770', 'mmu04723', 'mmu04728', 'mmu05032', 'mmu00410', 'mmu04911', 'mmu00260', 'mmu04925', 'mmu04931', 'mmu04978', 'mmu00640', 'mmu00280', 'mmu00053', 'mmu00040', 'mmu00590', 'mmu00290', 'mmu00600', 'mmu05016', 'mmu05231', 'mmu00270', 'mmu00591', 'mmu00561', 'mmu04720', 'mmu04611', 'mmu00340', 'mmu05033', 'mmu04070', 'mmu00785', 'mmu05030', 'mmu04068', 'mmu04936', 'mmu00500', 'mmu05012', 'mmu02010', 'mmu00480', 'mmu05017', 'mmu00650', 'mmu05022', 'mmu04725', 'mmu04714', 'mmu00920', 'mmu04216', 'mmu04022', 'mmu05031', 'mmu00592', 'mmu04152', 'mmu00310', 'mmu04072', 'mmu00240', 'mmu04974', 'mmu00860', 'mmu00562', 'mmu05133', 'mmu00120', 'mmu00360', 'mmu00630', 'mmu04020', 'mmu00730', 'mmu00910', 'mmu05014', 'mmu00230', 'mmu05415', 'mmu05208', 'mmu04750', 'mmu04977', 'mmu04142', 'mmu04922', 'mmu04928', 'mmu04934', 'mmu05200', 'mmu00052', 'mm

Updating the common pathways table with 'Metabolite ID Count' , 'KEGG ID Count' , 'Pathway Size', 'Pathway Name', and 'Associated Genes'.

In [14]:

def main(csv_file_path, output_csv_file_path):
    #loading the csv file 
    df_update_common = pd.read_csv(csv_file_path)
    
    #initializing the columns in the DataFrame
    df_update_common['Metabolite ID Count'] = 0
    df_update_common['KEGG ID Count'] = 0
    df_update_common['Pathway Size'] = 0
    df_update_common['Pathway Name'] = ''
    df_update_common['Associated Genes'] = ''  
    
    #defining a function to separate IDs and count them
    def separate_and_count_ids(associated_ids):
        metabolite_ids = []
        kegg_ids = []
        for id in associated_ids.split(', '):
            if id.startswith('C'):
                metabolite_ids.append(id)
            else:
                kegg_ids.append(id)
        return len(metabolite_ids), len(kegg_ids)
    
    #defining a function to get pathway information and associated genes from KEGG API
    def get_pathway_details_and_genes(pathway_id):
        url = f"http://rest.kegg.jp/get/{pathway_id}"
        response = requests.get(url)
        if response.ok:
            text = response.text
            pathway_size = text.count('KO:')
            name_line = next((line for line in text.split('\n') if line.startswith("NAME")), "NAME: Unknown")
            pathway_name = name_line.split(' ', 1)[1] if name_line else 'Unknown'
            
            genes = []
            current_section = None
            for line in text.split("\n"):
                section = line[:12].strip()
                if section:
                    current_section = section
                if current_section == "GENE":
                    genes_info = line[12:].strip()
                    if genes_info:
                        gene_id = genes_info.split(' ')[0]
                        genes.append(gene_id)
            
            return pathway_size, pathway_name, ', '.join(genes)
        else:
            print(f"Failed to get data for pathway: {pathway_id}")
            return 0, 'Unknown', ''
    
    #applying the fucntions to each row 
    for index, row in df_update_common.iterrows():
        metabolite_count, kegg_count = separate_and_count_ids(row['Associated IDs'])
        pathway_size, pathway_name, associated_genes = get_pathway_details_and_genes(row['Pathway ID'])
        df_update_common.at[index, 'Metabolite ID Count'] = metabolite_count
        df_update_common.at[index, 'KEGG ID Count'] = kegg_count
        df_update_common.at[index, 'Pathway Size'] = pathway_size
        df_update_common.at[index, 'Pathway Name'] = pathway_name
        df_update_common.at[index, 'Associated Genes'] = associated_genes
    
    #dropping the associate ID column and the metabolite ID count because its no longer needed.
    df_update_common.drop(columns=['Associated IDs'], inplace=True)
    df_update_common.drop(columns=['Metabolite ID Count'], inplace=True)
    df_update_common.to_csv(output_csv_file_path, index=False)

csv_file_path = 'C:/Users/Vaishnavi Raikar/OneDrive/Desktop/MASTERS THESIS - VAISHNAVI/multiOmicsAnalysis/common_pathway_associated_ids.csv'  
output_csv_file_path = 'C:/Users/Vaishnavi Raikar/OneDrive/Desktop/MASTERS THESIS - VAISHNAVI/multiOmicsAnalysis/updated_common_pathway_associated_ids.csv' 
main(csv_file_path, output_csv_file_path)

Performing the hypergeometric tests to find the significant pathways. 

In [15]:

input_file_path = 'C:/Users/Vaishnavi Raikar/OneDrive/Desktop/MASTERS THESIS - VAISHNAVI/multiOmicsAnalysis/updated_common_pathway_associated_ids.csv'  
data = pd.read_csv(input_file_path)

#Total number of genes in the background population 
M = 25995
#Total number of genes in the updated_common_pathway_associated_ids dataset. 
N = 120
#Pathway size = Total number of mouse genes in each pathway 
#KEGG ID Count = Number of genes that are also in the list and in the pathway
#calculating the hypergeometric test for each pathway 
data['P-Value'] = data.apply(lambda row: hypergeom.sf(row['KEGG ID Count'] - 1, M, row['Pathway Size'], N), axis=1)

#sorting the p-value in ascending order
sorted_data = data.sort_values(by='P-Value')

output_data = sorted_data[['Pathway ID', 'Pathway Name', 'Pathway Size', 'KEGG ID Count', 'Associated Genes', 'P-Value']]

output_file_path = 'C:/Users/Vaishnavi Raikar/OneDrive/Desktop/MASTERS THESIS - VAISHNAVI/multiOmicsAnalysis/pathway_enrichment_analysis.csv' 

#saving the file 
try:
    output_data.to_csv(output_file_path, index=False)
except Exception as e:
    print(f"An error occurred: {e}")