<a href="https://colab.research.google.com/github/vaishnavi132/testrepository/blob/main/kegg_api.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import time
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd

"""loading the data and selecting the top 30 genes based on their expression level"""

# Load the per-gene fragment counts and keep the most highly expressed genes.

# How many of the top-expressed genes are carried into the KEGG lookups.
TOP_GENE_COUNT = 40

# Summary rows that are not genes. These names match the special counters
# that htseq-count appends to its output (presumably the tool that produced
# this table -- confirm); names that are absent from the file are harmless.
non_gene_entries = [
    '__no_feature',
    '__ambiguous',
    '__too_low_aQual',
    '__not_aligned',
    '__alignment_not_unique',
]

gene_ids_path = "/content/all_samples.fragments_per_gene.tsv"
gene_ids_df = pd.read_csv(gene_ids_path, sep='\t')

# Total expression of a row = sum over every column after the first one
# (the first column is assumed to hold the feature ID -- see 'feature' below).
gene_ids_df['Total_Expression'] = gene_ids_df.iloc[:, 1:].sum(axis=1)

# Rank all rows by total expression, highest first.
ranked_df = gene_ids_df.sort_values(by='Total_Expression', ascending=False)

# Unfiltered top rows, kept under its original name for later cells.
top_genes = ranked_df.head(TOP_GENE_COUNT)

# Remove the non-gene rows BEFORE cutting to the top N. Filtering after the
# cut would let the summary rows use up slots and return fewer genes than
# requested.
filtered_genes = ranked_df[~ranked_df['feature'].isin(non_gene_entries)]
actual_top_genes = filtered_genes.head(TOP_GENE_COUNT)

# Ensembl gene IDs of the selected genes, in descending expression order.
ensemble_ids = actual_top_genes['feature'].tolist()

# Metabolite file: the first column holds the metabolite IDs.
metabolite_ids_path = "/content/metabolites.tsv"
metabolite_ids_df = pd.read_csv(metabolite_ids_path, sep='\t')
metabolite_ids_list = metabolite_ids_df.iloc[:, 0].tolist()

In [None]:
"""conversion of ensemble ids to kegg ids via entrez ids """

# Each Ensembl gene ID is converted to a KEGG gene ID in two steps: first to an
# Entrez (NCBI) gene ID, then from the Entrez ID to the KEGG gene ID.
# The successful conversions are then collected into a results DataFrame.

# The Ensembl REST API supplies the Ensembl -> Entrez cross-reference, and the
# KEGG REST API converts the Entrez ID to the KEGG gene ID.
def get_entrez_id(ensembl_id, timeout=30):
    """Return the Entrez (NCBI) gene ID cross-referenced to an Ensembl ID.

    Queries the Ensembl REST ``/xrefs/id/`` endpoint and picks the
    ``primary_id`` of the first cross-reference whose ``dbname`` is
    ``EntrezGene``.

    Args:
        ensembl_id: Ensembl stable ID to look up.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block a worker thread forever.

    Returns:
        The Entrez gene ID, or None when the response contains no
        EntrezGene cross-reference.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The request failed or timed out.
    """
    server = "https://rest.ensembl.org"
    ext = f"/xrefs/id/{ensembl_id}"
    headers = {"Content-Type": "application/json"}
    r = requests.get(f"{server}{ext}", headers=headers, timeout=timeout)
    # Raises on any error status; nothing after it runs in that case, so no
    # separate "return None" branch is needed.
    r.raise_for_status()
    decoded = r.json()
    return next(
        (entry['primary_id'] for entry in decoded if entry['dbname'] == 'EntrezGene'),
        None,
    )

def get_kegg_id_from_entrez(entrez_id, timeout=30):
    """Convert an Entrez (NCBI) gene ID to a KEGG gene ID via the KEGG REST API.

    The ``conv`` endpoint answers with tab-separated lines of the form
    ``ncbi-geneid:<id>\\t<kegg gene id>``; the KEGG ID of the first such line
    is returned.

    Args:
        entrez_id: Entrez gene ID to convert.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The KEGG gene ID (e.g. ``mmu:17721``), or None when the request was
        not successful or the response holds no mapping.

    Raises:
        requests.RequestException: The request failed or timed out.
    """
    url = f"http://rest.kegg.jp/conv/genes/ncbi-geneid:{entrez_id}"
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        return None
    # Parse line by line: splitting the whole body on tabs would glue the
    # next line onto the ID whenever more than one mapping is returned.
    for line in response.text.splitlines():
        fields = line.split('\t')
        if len(fields) > 1:
            kegg_id = fields[1].strip()
            if kegg_id:
                return kegg_id
    return None
def process_single_id(ensembl_id):
    """Map one Ensembl ID to its KEGG gene ID, going through the Entrez ID.

    Returns ``(ensembl_id, kegg_id)`` when both lookups succeed. Returns
    ``(None, None)`` when either lookup yields nothing, or when any exception
    is raised along the way (the error is printed, not propagated, so one
    failing ID does not abort a batch).
    """
    not_found = (None, None)
    try:
        entrez = get_entrez_id(ensembl_id)
        if not entrez:
            return not_found
        kegg = get_kegg_id_from_entrez(entrez)
        if not kegg:  # no KEGG mapping for this Entrez ID
            return not_found
        return ensembl_id, kegg
    except Exception as exc:
        print(f"Error processing {ensembl_id}: {exc}")
        return not_found

# Convert all selected Ensembl IDs concurrently (the work is network-bound).
# executor.map yields results in the same order as ensemble_ids, so the
# output keeps the expression ranking and is identical from run to run;
# as_completed would order rows by whichever request finished first.
results = []
with ThreadPoolExecutor(max_workers=10) as executor:
    for ensembl_id, mapped_kegg_id in executor.map(process_single_id, ensemble_ids):
        if mapped_kegg_id:  # keep only IDs for which a KEGG ID was found
            results.append({'Ensembl_ID': ensembl_id, 'KEGG_ID': mapped_kegg_id})

# Plain list of KEGG gene IDs, used by the pathway lookups further down.
kegg_id = [entry['KEGG_ID'] for entry in results]

# Keep both columns so the saved file is a real Ensembl -> KEGG mapping with
# named headers, not a single unnamed column of KEGG IDs.
results_df = pd.DataFrame(results, columns=['Ensembl_ID', 'KEGG_ID'])
print(results_df)
results_df.to_csv('/content/ensembl_to_kegg_ids.csv', index=False)

             0
0    mmu:17721
1    mmu:17711
2    mmu:17716
3    mmu:16483
4    mmu:14778
5    mmu:17719
6    mmu:17717
7    mmu:17708
8    mmu:14319
9    mmu:20505
10   mmu:13627
11  mmu:233799
12  mmu:230163
13   mmu:11928
14   mmu:14325
15  mmu:110213
16   mmu:26458
17   mmu:14775
18   mmu:11947
19   mmu:22242
20   mmu:11898
21   mmu:13629
22   mmu:13120
23   mmu:14433
24   mmu:11936
25   mmu:58810
26   mmu:66054
27   mmu:20363
28   mmu:71670
29   mmu:18263
30   mmu:11931
31   mmu:15481
32   mmu:21859
33   mmu:20521


In [None]:

"""Extracting the pathways associated with each gene kegg id """

def get_pathways_for_gene_kegg_id(kegg_gene_id, timeout=30):
    """Return the KEGG pathway IDs linked to a KEGG gene ID.

    Calls the KEGG REST ``link/pathway`` endpoint, whose response is one
    tab-separated ``<gene id>\\t<pathway id>`` pair per line.

    Args:
        kegg_gene_id: KEGG gene ID, e.g. ``mmu:17721``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook cell.

    Returns:
        A set of pathway IDs (e.g. ``path:mmu01100``). The set is empty when
        the gene has no linked pathway or when the request was not
        successful (a message is printed in the latter case).

    Raises:
        requests.RequestException: The request failed or timed out.
    """
    url = f"http://rest.kegg.jp/link/pathway/{kegg_gene_id}"
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        print(f"Failed to get the pathways for the kegg ids {kegg_gene_id}")
        return set()

    pathway_ids = set()
    for line in response.text.strip().split('\n'):
        fields = line.split('\t')
        if len(fields) > 1:
            # Second column is the pathway ID; the first repeats the gene ID.
            pathway_ids.add(fields[1].strip())
    return pathway_ids

# Pathway set of every converted gene, keyed by KEGG gene ID.
gene_pathways = {
    gene_id: get_pathways_for_gene_kegg_id(gene_id) for gene_id in kegg_id
}

# Report the pathways found for each gene.
for gene_id in gene_pathways:
    pathways = gene_pathways[gene_id]
    print(f"Gene {gene_id} : {pathways}")

# One record per gene, with its pathway set flattened to a comma-separated
# string so it fits in a single DataFrame cell.
data_for_df = []
for gene_id, pathways in gene_pathways.items():
    data_for_df.append({'Gene_ID': gene_id, 'Pathways': ', '.join(pathways)})

# Tabular view of the gene -> pathways mapping.
gene_pathways_df = pd.DataFrame(data_for_df)

print(gene_pathways_df)


Gene mmu:17721 : {'path:mmu01100', 'path:mmu04714', 'path:mmu05014', 'path:mmu05012', 'path:mmu00190', 'path:mmu04723', 'path:mmu05020', 'path:mmu05415', 'path:mmu05022', 'path:mmu05208', 'path:mmu05016', 'path:mmu05010'}
Gene mmu:17711 : {'path:mmu01100', 'path:mmu04714', 'path:mmu05014', 'path:mmu05012', 'path:mmu00190', 'path:mmu05016', 'path:mmu04260', 'path:mmu05020', 'path:mmu05415', 'path:mmu05022', 'path:mmu05208', 'path:mmu04932', 'path:mmu05010'}
Gene mmu:17716 : {'path:mmu01100', 'path:mmu04714', 'path:mmu05014', 'path:mmu05012', 'path:mmu00190', 'path:mmu04723', 'path:mmu05020', 'path:mmu05415', 'path:mmu05022', 'path:mmu05208', 'path:mmu05016', 'path:mmu05010'}
Gene mmu:16483 : set()
Gene mmu:14778 : {'path:mmu00480', 'path:mmu01100', 'path:mmu05014', 'path:mmu05022', 'path:mmu04918', 'path:mmu05016'}
Gene mmu:17719 : {'path:mmu01100', 'path:mmu04714', 'path:mmu05014', 'path:mmu05012', 'path:mmu00190', 'path:mmu04723', 'path:mmu05020', 'path:mmu05415', 'path:mmu05022', 'pa

In [None]:

# creating a function get_metabolites_for_pathway to get metabolites for a given pathway
def _parse_compound_ids(flat_text):
    """Extract the compound IDs from the COMPOUND section of a KEGG flat file."""
    compound_ids = set()
    in_compound_section = False
    for line in flat_text.splitlines():
        if line.startswith("COMPOUND"):
            # First line of the section: "COMPOUND    C00001  H2O"
            in_compound_section = True
            fields = line.split()[1:]
        elif in_compound_section and line[:1].isspace():
            # Indented continuation line: "            C00002  ATP"
            fields = line.split()
        else:
            # Any non-indented line starts another section (or ends the entry).
            in_compound_section = False
            continue
        # Only the first token is the ID; the remaining tokens are the
        # compound name and must not be mixed into the ID set.
        if fields and fields[0].startswith("C"):
            compound_ids.add(fields[0])
    return compound_ids


def get_metabolites_for_pathway(pathway_id, timeout=30):
    """Return the KEGG compound IDs listed for a pathway.

    Downloads the pathway's flat-file entry from the KEGG REST ``get``
    endpoint and reads every line of its COMPOUND section, which spans the
    ``COMPOUND`` keyword line plus the indented lines that follow it.

    Args:
        pathway_id: KEGG pathway ID, e.g. ``path:mmu00190``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A set of compound IDs (those starting with ``C``, e.g. ``C00001``).
        The set is empty when the entry has no COMPOUND section or when the
        request was not successful (a message is printed in the latter case).

    Raises:
        requests.RequestException: The request failed or timed out.
    """
    url = f"http://rest.kegg.jp/get/{pathway_id}"
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        print(f"Failed to retrieve metabolites for pathway {pathway_id}")
        return set()
    return _parse_compound_ids(response.text)

# Metabolites associated with each gene: the union of the metabolites of all
# pathways the gene belongs to.
gene_metabolites = {}

# Many genes share pathways, so remember each pathway's metabolites after the
# first download and do not request the same entry again.
pathway_metabolites = {}

for gene_id, pathways in gene_pathways.items():
    all_metabolites = set()
    for pathway_id in pathways:
        if pathway_id not in pathway_metabolites:
            pathway_metabolites[pathway_id] = get_metabolites_for_pathway(pathway_id)
        all_metabolites.update(pathway_metabolites[pathway_id])
    gene_metabolites[gene_id] = all_metabolites

# Printing the results
for gene_id, metabolites in gene_metabolites.items():
    print(f"Gene {gene_id} is associated with metabolites: {metabolites}")




Gene mmu:17721 is associated with metabolites: {'H2O', 'ATP', 'peroxide', 'C00001', 'C00025', 'Hydrogen', 'L-Glutamate', 'C00027', 'C00002'}
Gene mmu:17711 is associated with metabolites: {'H2O', 'ATP', 'peroxide', 'C00001', 'Calcium', 'D-Glucose', 'C00025', 'cation', 'Hydrogen', 'L-Glutamate', 'C00076', 'C00027', 'C00002', 'C00031'}
Gene mmu:17716 is associated with metabolites: {'H2O', 'ATP', 'peroxide', 'C00001', 'C00025', 'Hydrogen', 'L-Glutamate', 'C00027', 'C00002'}
Gene mmu:16483 is associated with metabolites: set()
Gene mmu:14778 is associated with metabolites: {'H2O', 'NADPH', 'L-Glutamate', 'C00001', 'C00025', 'C00005'}
Gene mmu:17719 is associated with metabolites: {'H2O', 'ATP', 'peroxide', 'C00001', 'C00025', 'Hydrogen', 'L-Glutamate', 'C00027', 'C00002'}
Gene mmu:17717 is associated with metabolites: {'H2O', 'ATP', 'peroxide', 'C00001', 'C00025', 'Hydrogen', 'L-Glutamate', 'C00027', 'C00002'}
Gene mmu:17708 is associated with metabolites: {'H2O', 'ATP', 'peroxide', 'C000

In [None]:

# Function to integrate genes and metabolites for enriched pathways
def integrate_gene_metabolites_for_pathways(gene_pathways, gene_metabolites):
    """Group genes and their metabolites by pathway.

    Args:
        gene_pathways: Mapping of gene ID -> iterable of pathway IDs.
        gene_metabolites: Mapping of gene ID -> iterable of metabolite IDs.
            Must contain every gene that has at least one pathway.

    Returns:
        A dict keyed by pathway ID. Each value is a dict with two sets:
        ``'genes'`` (the genes belonging to the pathway) and
        ``'metabolites'`` (the union of ALL metabolites of those genes, as
        given by ``gene_metabolites`` -- not only those of this pathway).
    """
    by_pathway = {}
    for gene, member_pathways in gene_pathways.items():
        for path_id in member_pathways:
            # Create the pathway's entry on first sight, then extend it.
            entry = by_pathway.setdefault(
                path_id, {'genes': set(), 'metabolites': set()}
            )
            entry['genes'].add(gene)
            entry['metabolites'].update(gene_metabolites[gene])
    return by_pathway

# Build the pathway-centred view from the two gene-centred mappings.
integrated_data = integrate_gene_metabolites_for_pathways(
    gene_pathways=gene_pathways,
    gene_metabolites=gene_metabolites,
)

# Print one summary line per pathway.
for pathway in integrated_data:
    data = integrated_data[pathway]
    print(f"Pathway {pathway} involves genes {data['genes']} and metabolites {data['metabolites']}")


Pathway path:mmu01100 involves genes {'mmu:18263', 'mmu:17711', 'mmu:17717', 'mmu:11898', 'mmu:17721', 'mmu:58810', 'mmu:17708', 'mmu:17719', 'mmu:66054', 'mmu:14433', 'mmu:230163', 'mmu:17716', 'mmu:14778', 'mmu:233799', 'mmu:11947', 'mmu:14775'} and metabolites {'H2O', 'ATP', 'C00012', 'CO2', 'Acetyl-CoA', 'C00014', 'C00025', 'C00029', 'C00076', 'Lipopolysaccharide', '6-phosphate', 'Peptide', 'peroxide', 'D-Glucose', 'Hydrogen', 'C00024', 'C00031', 'UDP-glucose', 'C00001', 'Calcium', 'Pyruvate', 'cation', 'Ammonia', 'NADPH', 'L-Glutamate', 'C00011', 'C00022', 'D-Fructose', 'C00085', 'C00005', 'C00338', 'C00027', 'C00002'}
Pathway path:mmu04714 involves genes {'mmu:17711', 'mmu:17721', 'mmu:17708', 'mmu:17719', 'mmu:17716', 'mmu:17717', 'mmu:11947'} and metabolites {'H2O', 'ATP', 'C00001', 'Calcium', 'C00025', 'cation', 'L-Glutamate', 'C00076', 'peroxide', 'D-Glucose', 'Hydrogen', 'C00027', 'C00002', 'C00031'}
Pathway path:mmu05014 involves genes {'mmu:17711', 'mmu:17717', 'mmu:17721'