Retrieve all approved drugs from ChEMBL, together with each drug's year of first approval and its SMILES string.


In [1]:
!pip install datamol
!pip install chembl_webresource_client

Collecting datamol
  Downloading datamol-0.12.5-py3-none-any.whl.metadata (8.0 kB)
Collecting loguru (from datamol)
  Downloading loguru-0.7.2-py3-none-any.whl.metadata (23 kB)
Collecting selfies (from datamol)
  Downloading selfies-2.1.2-py3-none-any.whl.metadata (14 kB)
Collecting rdkit (from datamol)
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Downloading datamol-0.12.5-py3-none-any.whl (495 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m495.4/495.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading loguru-0.7.2-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading selfies-2.1.2-py3-none-any.whl (35 kB)
Instal

In [2]:
pip install chembl_webresource_client



In [3]:
import pandas as pd
import datamol as dm

from chembl_webresource_client.new_client import new_client as client


In [4]:
# Approved drugs are the molecules with max_phase=4; at this stage only their
# ChEMBL IDs are requested.
approved_query = client.molecule.filter(max_phase=4).only(["molecule_chembl_id"])
mol_ids = pd.DataFrame(approved_query)

# Number of approved molecules retrieved
mol_ids.shape[0]


4384

In [5]:
# Fields requested from ChEMBL for each approved molecule
columns = [
    "molecule_chembl_id",
    "pref_name",
    "first_approval",
    "molecule_structures",
    "molecule_type",
]


def _get_mol(molecule_chembl_id):
    """Fetch one ChEMBL molecule record and flatten it into a pandas Series.

    The record is requested with the module-level ``columns`` list. The nested
    ``molecule_structures`` entry is replaced by a flat ``smiles`` entry holding
    its ``canonical_smiles`` value.

    Args:
        molecule_chembl_id: ChEMBL identifier of the molecule to retrieve.

    Returns:
        pandas.Series with the requested fields (minus ``molecule_structures``)
        plus ``smiles``. ``smiles`` is always present and is ``None`` when the
        record carries no canonical SMILES, so the resulting table always has
        a ``smiles`` column.

    Raises:
        ValueError: if the lookup does not return exactly one record.
    """
    mols = client.molecule.filter(molecule_chembl_id=molecule_chembl_id).only(columns)

    # Explicit exception instead of `assert`, so the check is not stripped under `python -O`.
    if len(mols) != 1:
        raise ValueError(
            f"Expected exactly one ChEMBL record for {molecule_chembl_id!r}, got {len(mols)}"
        )
    mol = mols[0]

    # `molecule_structures` may be missing or None; treat both as "no structures".
    structures = mol.get("molecule_structures") or {}

    # Build a fresh mapping rather than mutating the object returned by the client.
    record = {key: value for key, value in mol.items() if key != "molecule_structures"}
    record["smiles"] = structures.get("canonical_smiles")

    return pd.Series(record)


# Fetch every approved molecule concurrently.
# The earlier run with 256 threads logged repeated
# "requests_cache.backends.sqlite: database is locked" errors, i.e. the threads were
# contending for the SQLite-backed request cache. A much smaller thread pool is used
# here to reduce that contention.
# NOTE(review): 16 is a conservative starting point, not a measured optimum — tune as needed.
mols = dm.parallelized(
    _get_mol,
    mol_ids["molecule_chembl_id"],
    n_jobs=16,
    scheduler="threads",
    progress=True,
)
mols = pd.DataFrame(mols)

mols.head()


  0%|          | 0/4384 [00:00<?, ?it/s]

ERROR:requests_cache.backends.sqlite:database is locked
ERROR:requests_cache.backends.sqlite:database is locked
ERROR:requests_cache.backends.sqlite:database is locked
ERROR:requests_cache.backends.sqlite:database is locked
ERROR:requests_cache.backends.sqlite:database is locked
ERROR:requests_cache.backends.sqlite:database is locked


Unnamed: 0,first_approval,molecule_chembl_id,molecule_type,pref_name,smiles
0,1976.0,CHEMBL2,Small molecule,PRAZOSIN,COc1cc2nc(N3CCN(C(=O)c4ccco4)CC3)nc(N)c2cc1OC
1,1984.0,CHEMBL3,Small molecule,NICOTINE,CN1CCC[C@H]1c1cccnc1
2,1990.0,CHEMBL4,Small molecule,OFLOXACIN,CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23
3,1964.0,CHEMBL5,Small molecule,NALIDIXIC ACID,CCn1cc(C(=O)O)c(=O)c2ccc(C)nc21
4,1965.0,CHEMBL6,Small molecule,INDOMETHACIN,COc1ccc2c(c1)c(CC(=O)O)c(C)n2C(=O)c1ccc(Cl)cc1


In [6]:
# Keep only small molecules that have both a SMILES string and a first-approval value
is_small_molecule = mols["molecule_type"] == "Small molecule"
has_smiles = mols["smiles"].notna()
has_first_approval = mols["first_approval"].notna()

mols = mols.loc[is_small_molecule & has_smiles & has_first_approval].reset_index(drop=True)

mols

Unnamed: 0,first_approval,molecule_chembl_id,molecule_type,pref_name,smiles
0,1976.0,CHEMBL2,Small molecule,PRAZOSIN,COc1cc2nc(N3CCN(C(=O)c4ccco4)CC3)nc(N)c2cc1OC
1,1984.0,CHEMBL3,Small molecule,NICOTINE,CN1CCC[C@H]1c1cccnc1
2,1990.0,CHEMBL4,Small molecule,OFLOXACIN,CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23
3,1964.0,CHEMBL5,Small molecule,NALIDIXIC ACID,CCn1cc(C(=O)O)c(=O)c2ccc(C)nc21
4,1965.0,CHEMBL6,Small molecule,INDOMETHACIN,COc1ccc2c(c1)c(CC(=O)O)c(C)n2C(=O)c1ccc(Cl)cc1
...,...,...,...,...,...
2771,2021.0,CHEMBL5315120,Small molecule,ODEVIXIBAT SESQUIHYDRATE,CCCCC1(CCCC)CN(c2ccccc2)c2cc(SC)c(OCC(=O)N[C@@...
2772,2018.0,CHEMBL5315121,Small molecule,TECOVIRIMAT MONOHYDRATE,O.O=C(NN1C(=O)[C@@H]2[C@@H]3C=C[C@@H]([C@H]4C[...
2773,2015.0,CHEMBL5315122,Small molecule,BORTEZOMIB D-MANNITOL,CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)c1cncc...
2774,1996.0,CHEMBL5315124,Small molecule,LEVOFLOXACIN,C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)...


In [7]:
!pip install gradio cohere

Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting cohere
  Downloading cohere-5.9.4-py3-none-any.whl.metadata (3.4 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-p

In [8]:
# Persist the filtered table in Parquet format, without the row index
parquet_path = "/content/chembl_approved_drugs.parquet"
mols.to_parquet(parquet_path, index=False)

In [9]:
import pandas as pd

# Load the Parquet export back into a DataFrame
df = pd.read_parquet("/content/chembl_approved_drugs.parquet")

# Write the same rows out as CSV, leaving out the index column
df.to_csv("output_file.csv", index=False)

In [10]:
import pandas as pd
import datamol as dm
import gradio as gr
import cohere
from chembl_webresource_client.new_client import new_client as client
from rdkit import Chem
from rdkit.Chem import Draw

# Set up Cohere API.
# The key is read from the COHERE_API_KEY environment variable, falling back to an
# interactive prompt, so that no credential is stored in the notebook.
# NOTE(review): the key that was previously hard-coded in this cell should be treated
# as leaked and revoked.
import os
from getpass import getpass

cohere_api_key = os.environ.get("COHERE_API_KEY") or getpass("Cohere API key: ")
co = cohere.Client(cohere_api_key)

# Fields requested from ChEMBL when looking up a molecule
columns = [
    "molecule_chembl_id",
    "pref_name",
    "first_approval",
    "molecule_structures",
    "molecule_type",
]

# Function to fetch molecule data by ChEMBL ID or SMILES
def fetch_molecule_data(chembl_id=None, smiles=None):
    """Look up a single molecule in ChEMBL by ChEMBL ID or, failing that, by SMILES.

    The ChEMBL ID takes precedence when both arguments are given. Inputs are
    stripped of surrounding whitespace and the ChEMBL ID is upper-cased before
    the lookup; blank or whitespace-only inputs are treated as missing.

    Args:
        chembl_id: ChEMBL identifier to match exactly, or None/empty.
        smiles: SMILES string matched against the record's canonical SMILES,
            or None/empty.

    Returns:
        A ``(pref_name, record)`` tuple when exactly one record matches, where
        ``pref_name`` falls back to "Name not found" if the record has no
        preferred name. Otherwise ``("Name not found", None)``.
    """
    chembl_id = (chembl_id or "").strip().upper()
    smiles = (smiles or "").strip()

    if chembl_id:
        mols = client.molecule.filter(molecule_chembl_id=chembl_id).only(columns)
    elif smiles:
        mols = client.molecule.filter(molecule_structures__canonical_smiles=smiles).only(columns)
    else:
        # Nothing to search for: skip the lookup entirely.
        return "Name not found", None

    # Zero matches or an ambiguous (multi-record) match are both reported as "not found".
    if len(mols) != 1:
        return "Name not found", None

    mol = mols[0]
    # `or` also covers records where the "pref_name" key is present but null.
    return mol.get("pref_name") or "Name not found", mol

# Function to query Cohere for detailed drug target information
def get_drug_target_info(chembl_id, smiles, pref_name):
    """Ask the Cohere client for a free-text summary about one drug.

    The ChEMBL ID, SMILES and preferred name are interpolated into a fixed
    prompt, which is sent through ``co.generate``.

    Args:
        chembl_id: ChEMBL identifier inserted into the prompt.
        smiles: SMILES string inserted into the prompt.
        pref_name: Preferred drug name inserted into the prompt.

    Returns:
        The text of the first generation in the response, with surrounding
        whitespace stripped.
    """
    # Generation settings for the request
    generation_settings = {
        "model": "command",
        "max_tokens": 300,
        "temperature": 0.5,
        "stop_sequences": ["--"],
    }

    prompt = f"""
    You are an expert in drug discovery. Based on the following information:
    - ChEMBL ID: {chembl_id}
    - SMILES: {smiles}
    - Preferred Name: {pref_name}

    Provide a detailed summary of the drug's known biological targets and its mechanism of action.
    Provide approaches towards the synthesis of new compounds approaching novel drugs for multi targets, give chemical and physical properties of the drug as well.
    Provide target protein structures and formulas for the drug.
    """

    response = co.generate(prompt=prompt, **generation_settings)
    first_generation = response.generations[0]
    return first_generation.text.strip()

# Function to generate molecular structure image
def generate_structure(smiles):
    """Render a SMILES string as a molecule image.

    Args:
        smiles: SMILES string to draw; may be empty or None.

    Returns:
        The result of ``Draw.MolToImage`` for the parsed molecule, or ``None``
        when ``smiles`` is falsy or ``Chem.MolFromSmiles`` gives a falsy result.
    """
    # Guard: nothing to draw without an input string
    if not smiles:
        return None

    parsed = Chem.MolFromSmiles(smiles)
    return Draw.MolToImage(parsed) if parsed else None

# Gradio function that integrates the molecule retrieval and Cohere integration
def gradio_interface(chembl_id, smiles):
    """Gradio callback: look up a molecule, describe it, and draw its structure.

    The ChEMBL ID and SMILES used for the description and the drawing are taken
    from the fetched record when it provides them, falling back to what the user
    typed. This way a search by ChEMBL ID alone still produces a structure image,
    and the description always refers to the molecule that was actually found.

    Args:
        chembl_id: Text entered in the "ChEMBL ID" box (may be empty).
        smiles: Text entered in the "SMILES" box (may be empty).

    Returns:
        ``(pref_name, drug_info, structure_img)`` when a record is found,
        otherwise ``("No data found", "No target information available.", None)``.
    """
    # Fetch molecule data
    pref_name, mol_data = fetch_molecule_data(chembl_id=chembl_id, smiles=smiles)

    if not mol_data:
        return "No data found", "No target information available.", None

    # `molecule_structures` may be absent or None on the record.
    structures = mol_data.get("molecule_structures") or {}
    resolved_id = mol_data.get("molecule_chembl_id") or chembl_id
    resolved_smiles = structures.get("canonical_smiles") or smiles

    # Get drug target information using Cohere
    drug_info = get_drug_target_info(resolved_id, resolved_smiles, pref_name)
    # Generate molecular structure image
    structure_img = generate_structure(resolved_smiles)
    return pref_name, drug_info, structure_img

# Create Gradio interface: two text inputs (ChEMBL ID, SMILES) feed `gradio_interface`,
# whose three return values fill the name box, the description box and the image.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="ChEMBL ID"),
        gr.Textbox(label="SMILES"),
    ],
    outputs=[
        gr.Textbox(label="Preferred Name"),
        gr.Textbox(label="Drug Target Information"),
        gr.Image(label="Molecular Structure"),
    ],
    title="DRUG__DISCOVER",
    # Spelling of "compound" corrected in the user-facing description.
    description="This app retrieves the preferred name, provides detailed drug target information using the Cohere API, and shows the molecular structure based on the target compound.",
)

# Launch the app
iface.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://b4d5481265c3e22a7a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


