In [None]:
# Copyright (C) Tahoe Therapeutics 2025. All rights reserved.
import copy
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.DataStructs import ConvertToNumpyArray
from tqdm.auto import tqdm


In [2]:
def generate_fingerprints(smiles_list: list, fingerprint_type: str = "morgan", radius: int = 2) -> np.ndarray:
    """Generate fingerprints for a list of SMILES strings."""
    print(f"Generating {fingerprint_type} fingerprints for {len(smiles_list)} SMILES strings.")
    fingerprints = []
    for smiles in tqdm(smiles_list, desc="Generating fingerprints"):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            fingerprints.append(None)
            continue
        if fingerprint_type == "morgan":
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius)
        else:
            raise ValueError(f"Unknown fingerprint type: {fingerprint_type}")
        fingerprints.append(np.array(fp.ToList()))
    print("Fingerprint generation complete.")
    return np.array(fingerprints)

In [None]:
def generate_fingerprints_np(smiles_list: list, radius: int = 2, nBits: int = 2048) -> np.ndarray:
    """Generate fingerprints for a list of SMILES strings using a NumPy array allocation."""
    print(f"Generating Morgan fingerprints for {len(smiles_list)} SMILES strings.")
    fingerprints = np.zeros((len(smiles_list), nBits), dtype=int)
    for i, smiles in enumerate(tqdm(smiles_list, desc="Generating fingerprints")):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Here you might want to set a row of NaNs or zeros to indicate failure
            fingerprints[i, :] = np.nan
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
        ConvertToNumpyArray(fp, fingerprints[i])
    print("Fingerprint generation complete.")
    return fingerprints

In [4]:
from datasets import load_dataset

drug_metadata =  load_dataset("vevotx/Tahoe-100M","drug_metadata", split="train")

In [5]:
drug_metadata

Dataset({
    features: ['drug', 'targets', 'moa-broad', 'moa-fine', 'human-approved', 'clinical-trials', 'gpt-notes-approval', 'canonical_smiles', 'pubchem_cid'],
    num_rows: 379
})

In [6]:
drugs = drug_metadata["drug"] #None in two places, 379 unique
cid = drug_metadata["pubchem_cid"] #None in two places, 377 unique
smiles = drug_metadata["canonical_smiles"] #identical in two places and none in two places

In [7]:
drug_to_id = {"<pad>": 0}
chosen_smiles = []
i = 1

for drug, smile in zip(drugs, smiles):
    if smile!=None:
        drug_to_id[drug]=i
        i += 1
        chosen_smiles.append(smile)


print(len(drug_to_id), drug_to_id)
print(len(chosen_smiles), chosen_smiles)

378 {'<pad>': 0, 'Talc': 1, 'Bortezomib': 2, 'Ixazomib': 3, 'Ixazomib citrate': 4, 'Lactate (calcium)': 5, 'Bisoprolol (hemifumarate)': 6, 'Fumaric acid': 7, 'Hydroxyurea': 8, 'L-Eflornithine (monohydrochloride)': 9, 'Cysteamine (hydrochloride)': 10, 'Darinaparsin': 11, 'Entecavir (monohydrate)': 12, 'Allantoin': 13, '5-Fluorouracil': 14, 'L-Thyroxine (sodium salt pentahydrate)': 15, 'Gallic acid': 16, 'Gallic acid (hydrate)': 17, 'ERK5-IN-2': 18, 'Vilanterol': 19, 'Niclosamide (olamine)': 20, 'Norepinephrine (hydrochloride)': 21, 'Triclosan': 22, 'Mitoxantrone (dihydrochloride)': 23, 'Pentamidine (isethionate)': 24, 'Folic acid': 25, 'Balsalazide (sodium hydrate)': 26, 'Resveratrol': 27, 'PF-06260933': 28, 'Daidzin': 29, 'Pemetrexed': 30, 'Econazole': 31, 'XRK3F2': 32, 'Arbutin': 33, 'Tucidinostat': 34, 'Pexidartinib (hydrochloride)': 35, 'Sodium Salicylate': 36, 'Salicylic acid': 37, 'Ataluren': 38, '4EGI-1': 39, 'Clotrimazole': 40, 'Phenytoin (sodium)': 41, 'SBI-0640756': 42, 'Oxapr

In [10]:
import json
with open("drug_to_id.json", "w") as f:
    json.dump(drug_to_id, f)

In [None]:
fps = fps_np = np.zeros((len(chosen_smiles)+1, 2048))
fps[1:, :] = generate_fingerprints(chosen_smiles, "morgan", 2)
fps_np[1:, :] = generate_fingerprints_np(chosen_smiles)
fps.shape, fps_np.shape, fps, fps_np

Generating morgan fingerprints for 377 SMILES strings.


Generating fingerprints:  37%|███▋      | 139/377 [00:17<00:29,  7.96it/s]ase use MorganGenerator


KeyboardInterrupt: 

: 

In [10]:
np.save("drug_fps", fps_np)
fps_np.shape

(378, 2048)