In [7]:
# Read the names of the molecules and their IDs.
# Each non-empty line is treated as "<name> <id>": the ID is the last
# space-separated token and everything before it is the molecule name.
# NOTE(review): the file is opened with the platform default encoding —
# confirm the encoding used by whatever produced names_ids.txt.
names_dict = {}
seen_names = set()  # names already stored, for O(1) duplicate checks
with open("data/gutmgene/names_ids.txt", "r") as file:
    for line in file:
        line = line.strip()
        if not line:
            # Skip blank lines instead of storing an empty id/name pair.
            continue

        # Split on the LAST space only, so spaces inside a multi-word name
        # are preserved rather than being glued together.
        name, _, mol_id = line.rpartition(" ")
        name = name.strip()

        # Keep only the first ID seen for each distinct name.
        if name not in seen_names:
            seen_names.add(name)
            names_dict[mol_id] = name

In [8]:
# Number of (id -> name) entries that were loaded.
len(names_dict)

22

In [9]:
import requests
import re
import urllib.parse

def chebi_to_smiles(name):
    """
    Query the ChEBI search page for `name` and try to extract a SMILES
    string from the HTML that comes back.

    Returns:
        - the SMILES string, if one is found
        - None if the request raises, the status code is not 200, or no
          non-empty SMILES can be extracted from the page
    """
    encoded = urllib.parse.quote(name)
    search_url = f"https://www.ebi.ac.uk/chebi/searchId.do?searchString={encoded}"

    try:
        reply = requests.get(search_url, timeout=10)
    except requests.RequestException:
        return None

    if reply.status_code != 200:
        return None

    page = reply.text

    # Patterns are tried in priority order; the first non-empty capture wins:
    #   1) main attempt: <dt>SMILES</dt><dd>...</dd>
    #   2) fallback:     data-smiles="..."
    patterns = (
        r'SMILES</dt>\s*<dd>\s*([^<]+?)\s*</dd>',
        r'data-smiles="([^"]+)"',
    )
    for pattern in patterns:
        hit = re.search(pattern, page, re.IGNORECASE)
        if hit is None:
            continue
        candidate = hit.group(1).strip()
        if candidate:
            return candidate

    return None


def name_to_mol(name):
    """
    Try to convert a name into SMILES using the OPSIN web service.

    Returns the stripped response text, or None when the request raises,
    the status code is not 200, or the response body is empty.
    """
    encoded = urllib.parse.quote(name)
    try:
        reply = requests.get(
            f"https://opsin.ch.cam.ac.uk/opsin/{encoded}.smi", timeout=10
        )
    except requests.RequestException:
        return None

    # Anything other than a plain 200 counts as "not found".
    if reply.status_code != 200:
        return None

    text = reply.text.strip()
    return text or None


def cactus_name_to_smiles(name):
    """
    Try to convert a name into SMILES using the CACTUS server (NIH).

    Returns the stripped response text, or None when the request raises,
    the status code is not 200, or the response body is empty.
    """
    endpoint = "https://cactus.nci.nih.gov/chemical/structure/{}/smiles".format(
        urllib.parse.quote(name)
    )
    try:
        reply = requests.get(endpoint, timeout=10)
    except requests.RequestException:
        return None

    # An empty body on a 200 response is treated the same as a failure.
    body = reply.text.strip() if reply.status_code == 200 else ""
    return body if body else None


def get_smiles(name):
    """
    Resolve `name` to a SMILES string by trying, in order:
    1) OPSIN
    2) ChEBI
    3) CACTUS

    Returns the first result that is not None, or None if every lookup
    returns None.
    """
    resolvers = (name_to_mol, chebi_to_smiles, cactus_name_to_smiles)
    # The inner generator is lazy, so later services are only contacted
    # when the earlier ones returned None.
    attempts = (resolve(name) for resolve in resolvers)
    return next((found for found in attempts if found is not None), None)


# names_dict must be a dict {id: molecule_name}.
# Build {id: SMILES}, keeping only the molecules for which a lookup succeeded.
smiles_dict = {}
for mol_id, name in names_dict.items():
    found = get_smiles(name)
    if found is None:
        continue
    smiles_dict[mol_id] = found




In [None]:
# Number of molecules for which a SMILES string was found.
len(smiles_dict)

9

In [13]:
# Append the SMILES and IDs to the file previously generated by the R code.
# One "<smiles> <id>" line is written per entry of smiles_dict.
# NOTE: the file is opened in append mode, so re-running this cell writes
# the same lines again.
with open("data/gutmgene/smiles_nocid.txt", "a") as file:
    # `mol_id` (not `id`) so the builtin id() is not shadowed in later cells.
    for mol_id, smiles in smiles_dict.items():
        file.write(f"{smiles} {mol_id}\n")