<a href="https://colab.research.google.com/github/zgormez/biobank-data-extractor/blob/main/metabolites_data_from_hmdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook was created for [biocrates life sciences ag](https://biocrates.com/).

Dr. Zeliha Cetin

10.10.2024

Define a function to extract data from the biobank (HMDB) as an HTML page and parse it

In [None]:
import requests
from bs4 import BeautifulSoup

def get_metabolite_details(accession_number, timeout=30):
    '''
    Get the details of a metabolite from HMDB by accession number.

    Parameters
    ----------
    accession_number : str
        HMDB accession (e.g. "HMDB0000001"). It is interpolated into the
        metabolite page URL as given.
    timeout : float, optional
        Value passed to ``requests.get`` as ``timeout`` (seconds). Without
        it a stalled connection would block the whole batch indefinitely.

    Returns
    -------
    dict or None
        A dict keyed by "HMDB ID", "Common Name", "Chemical Formula",
        "IUPAC Name", "Traditional Name", "CAS Registry Number", "SMILES",
        "InChI Identifier" and "InChI Key". A field is None when its label
        is not present on the page. Returns None (after printing a message)
        when the ID is empty, the request fails, or the response status is
        not 200.
    '''
    # An empty ID would request the bare /metabolites/ URL, which is not a
    # metabolite page; reject it before touching the network.
    if accession_number is None or not str(accession_number).strip():
        print("Failed to retrieve data: empty accession number")
        return None

    # Define the URL for the HMDB page
    url = f"https://hmdb.ca/metabolites/{accession_number}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Report network-level failures the same way as HTTP errors so that
        # one bad request does not abort the caller's loop.
        print(f"Failed to retrieve data for {accession_number}: {error}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data for {accession_number}: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    def extract_info(label):
        # Find the <th> whose text equals the label and read the <td> that
        # follows it; either may be missing on a given page.
        header_cell = soup.find('th', string=label)
        if header_cell is None:
            return None
        value_cell = header_cell.find_next_sibling('td')
        if value_cell is None:
            return None
        return value_cell.text.strip()

    metabolite_details = {
        "HMDB ID": accession_number,
        "Common Name": extract_info("Common Name"),
        "Chemical Formula": extract_info("Chemical Formula"),
        "IUPAC Name": extract_info("IUPAC Name"),
        "Traditional Name": extract_info("Traditional Name"),
        "CAS Registry Number": extract_info("CAS Registry Number"),
        "SMILES": extract_info("SMILES"),
        "InChI Identifier": extract_info("InChI Identifier"),
        "InChI Key": extract_info("InChI Key")
    }
    return metabolite_details



Get a list of metabolites and extract data for all metabolites

In [None]:
import pandas as pd
from google.colab import drive

# Read the list of HMDB IDs from a text file, fetch the details of each
# metabolite and save them all to a CSV file.

# Mount Google Drive
drive.mount('/content/drive')
# Define the paths of the input and output files in Google Drive
MAIN_PATH = '/content/drive/MyDrive/biocrates/'
OUT_FILE_PATH = MAIN_PATH+'Metabolites_details.csv'
IN_FILE_PATH = MAIN_PATH+'metabolites_list.txt'

# Example content of the input file (one HMDB ID per line):
#   HMDB0000195
#   HMDB0000001
#   HMDB0000002
#   HMDB0000003
#   HMDB0000004

# Define the CSV file header; these names match the keys of the dict
# returned by get_metabolite_details.
csv_header = [
    "HMDB ID", "Common Name", "Chemical Formula", "IUPAC Name",
    "Traditional Name", "CAS Registry Number", "SMILES",
    "InChI Identifier", "InChI Key"
]

# Collect one dict per metabolite and build the frame once at the end,
# instead of growing a DataFrame row by row.
metabolite_records = []
num_found = 0
num_not_found = 0
with open(IN_FILE_PATH) as f:
    for line in f:
        metobolite_hmdb_id = line.strip()
        if not metobolite_hmdb_id:
            # Ignore blank lines (e.g. a trailing newline at end of file).
            continue
        details = get_metabolite_details(metobolite_hmdb_id)
        if details:
            num_found += 1
            # Store the extracted details themselves; storing only the ID
            # would fill every column of the row with the ID string.
            metabolite_records.append(details)
            print(f"Metabolite ( {metobolite_hmdb_id}) details have been extracted")
        else:
            num_not_found += 1
            print(f"No details found for HMDB ID: {metobolite_hmdb_id}")

# columns=csv_header fixes the column order and keeps the header row even
# when no metabolite was found.
df = pd.DataFrame(metabolite_records, columns=csv_header)
df.to_csv(OUT_FILE_PATH, index=False)

print(f"\nData saved to {OUT_FILE_PATH}")
print(f"Number of metabolites found: {num_found}")
print(f"Number of metabolites not found: {num_not_found}")


Mounted at /content/drive
Metabolite ( HMDB0000195) details have been extracted
Metabolite ( HMDB0000001) details have been extracted
Metabolite ( HMDB0000002) details have been extracted
Failed to retrieve data for HMDB0000003: 404
No details found for HMDB ID: HMDB0000003
Failed to retrieve data for HMDB0000004: 404
No details found for HMDB ID: HMDB0000004

Data saved to /content/drive/MyDrive/biocrates/Metabolites_details.csv
Number of metabolites found: 3
Number of metabolites not found: 2
