<a href="https://colab.research.google.com/github/zgormez/biobank-data-extractor/blob/main/metabolites-data_from_hmdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
import requests
from bs4 import BeautifulSoup

def get_metabolite_details(accession_number, timeout=30):
    '''
    Get the details of a metabolite from HMDB by accession number.

    The metabolite page is downloaded from hmdb.ca and each field is read
    from the <td> cell that follows the <th> whose text equals the field label.

    Parameters
    ----------
    accession_number : str
        HMDB accession number, e.g. "HMDB0000001".
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    dict or None
        Mapping of field name to the stripped cell text (None for a field
        whose label is not found on the page), or None when the page could
        not be retrieved (network error or non-200 status).
    '''
    # Define the URL for the HMDB page
    url = f"https://hmdb.ca/metabolites/{accession_number}"

    # Without a timeout a single stalled connection would block the whole run;
    # network errors are reported the same way as a bad status code.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data for {accession_number}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data for {accession_number}: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    def extract_info(label):
        # Locate the header cell with exactly this label, then read its value cell.
        header_cell = soup.find('th', string=label)
        if header_cell is None:
            return None
        value_cell = header_cell.find_next_sibling('td')
        if value_cell is None:
            return None
        return value_cell.text.strip()

    # Labels as they appear on the HMDB page; also used as the output keys.
    field_labels = [
        "Common Name",
        "Chemical Formula",
        "IUPAC Name",
        "Traditional Name",
        "CAS Registry Number",
        "SMILES",
        "InChI Identifier",
        "InChI Key",
    ]

    metabolite_details = {"HMDB ID": accession_number}
    for label in field_labels:
        metabolite_details[label] = extract_info(label)
    return metabolite_details



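Retrieve the details for a list of metabolites: read the HMDB IDs from `metabolites_list.txt` (one ID per line) and save the results to a CSV file.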

In [34]:

import pandas as pd
# Read the list of HMDB IDs from a text file (one ID per line), fetch the
# details of each metabolite and write them all to a CSV file.

# Define the CSV file header (column order of the output file)
csv_header = [
        "HMDB ID", "Common Name", "Chemical Formula", "IUPAC Name",
        "Traditional Name", "CAS Registry Number", "SMILES",
        "InChI Identifier", "InChI Key"
]

# Collect one record per metabolite and build the data frame once at the end,
# instead of growing the frame row by row.
records = []

# Get all metabolite IDs from the text file and call the function once per ID
with open('metabolites_list.txt') as f:
    for line in f:
        metobolite_hmdb_id = line.strip()
        if not metobolite_hmdb_id:
            # Blank lines (e.g. a trailing newline) are not IDs; skip them
            # rather than requesting an empty accession number.
            continue
        # Fetch once and reuse the result for both the log line and the row,
        # so each metabolite costs a single HTTP request.
        details = get_metabolite_details(metobolite_hmdb_id)
        print(details)
        if details is None:
            # Keep a placeholder row so the failed ID stays visible in the CSV;
            # the remaining columns are left empty.
            details = {"HMDB ID": metobolite_hmdb_id}
        records.append(details)

# Create the data frame with the columns in header order
df = pd.DataFrame(records, columns=csv_header)

# Define the path of the output file. '/content/' is presumably the Colab
# runtime working directory - adjust MAIN_PATH to save elsewhere (e.g. a
# mounted Google Drive folder).
MAIN_PATH = '/content/'
OUT_FILE_PATH = MAIN_PATH+'Metabolites_details.csv'
df.to_csv(OUT_FILE_PATH, index=False)



{'HMDB ID': 'HMDB0000195', 'Common Name': 'Inosine', 'Chemical Formula': 'C10H12N4O5', 'IUPAC Name': '9-[(2R,3R,4S,5R)-3,4-dihydroxy-5-(hydroxymethyl)oxolan-2-yl]-6,9-dihydro-3H-purin-6-one', 'Traditional Name': 'inosine', 'CAS Registry Number': '58-63-9', 'SMILES': 'OC[C@H]1O[C@H]([C@H](O)[C@@H]1O)N1C=NC2=C(O)N=CN=C12', 'InChI Identifier': 'InChI=1S/C10H12N4O5/c15-1-4-6(16)7(17)10(19-4)14-3-13-5-8(14)11-2-12-9(5)18/h2-4,6-7,10,15-17H,1H2,(H,11,12,18)/t4-,6-,7-,10-/m1/s1', 'InChI Key': 'UGQMRVRMYYASKQ-KQYNXXCUSA-N'}
{'HMDB ID': 'HMDB0000001', 'Common Name': '1-Methylhistidine', 'Chemical Formula': 'C7H11N3O2', 'IUPAC Name': '2-amino-3-(1-methyl-1H-imidazol-4-yl)propanoic acid hydrate', 'Traditional Name': '4-methyl-histidine hydrate', 'CAS Registry Number': '332-80-9', 'SMILES': 'CN1C=NC(C[C@H](N)C(O)=O)=C1', 'InChI Identifier': 'InChI=1S/C7H11N3O2/c1-10-3-5(9-4-10)2-6(8)7(11)12/h3-4,6H,2,8H2,1H3,(H,11,12)/t6-/m0/s1', 'InChI Key': 'BRMWTNUJHUMWMS-LURJTMIESA-N'}
Failed to retrieve data 