In [1]:
import scanpy as sc
import pandas as pd
import anndata as ad
import pyarrow.dataset as ds
import gcsfs
import requests
from io import StringIO
import re
import time
import urllib.parse

In [2]:
# Google Cloud Storage filesystem handle, created with gcsfs defaults (no
# arguments); it is reused below for the directory listing and the parquet read.
fs = gcsfs.GCSFileSystem()

In [3]:
# Base GCS location of the dataset release; the directory listing and the
# metadata file path below are both derived from it.
gcp_base_path = "gs://arc-ctc-tahoe100/2025-02-25/"

In [4]:
# List the entries directly under the base path and print them, one per line,
# indented beneath a heading.
files = fs.ls(gcp_base_path)
listing = ["Available files:"] + [f"  {file}" for file in files]
print("\n".join(listing))

Available files:
  arc-ctc-tahoe100/2025-02-25/
  arc-ctc-tahoe100/2025-02-25/README.md
  arc-ctc-tahoe100/2025-02-25/h5ad
  arc-ctc-tahoe100/2025-02-25/metadata
  arc-ctc-tahoe100/2025-02-25/tutorial


In [5]:
# Full GCS URI of the obs metadata parquet file: the base path (without its
# trailing slash) joined with the sub-directory and file name.
metadata_parts = (gcp_base_path.rstrip("/"), "metadata", "obs_metadata.parquet")
infile = "/".join(metadata_parts)
infile

'gs://arc-ctc-tahoe100/2025-02-25/metadata/obs_metadata.parquet'

In [6]:
# Load only the 'drug' column of the metadata parquet file and reduce it to
# the set of distinct values. Kept as one chained expression so no large
# intermediate table or Series stays referenced afterwards.
drugs = set(
    ds.dataset(infile, filesystem=fs, format="parquet")
    .to_table(columns=["drug"])
    .to_pandas()["drug"]
    .unique()
)

In [7]:
def clean_names(drug):
    """Return ``drug`` with Greek letters spelled out and '/' replaced by a space.

    A Greek letter (α, β, γ, δ, ω) at the very start of the name is replaced by
    its spelled-out form with no separator ('α-X' -> 'alpha-X'). Every other
    occurrence is replaced by the spelled-out form preceded by a hyphen
    ('18β-X' -> '18-beta-X'). Finally every '/' becomes a single space.

    Parameters
    ----------
    drug : str
        Drug name to normalise.

    Returns
    -------
    str
        The normalised name.
    """
    replacements = {'α': 'alpha', 'β': 'beta', 'γ': 'gamma', 'δ': 'delta', 'ω': 'omega'}
    for symbol, spelled in replacements.items():
        if drug.startswith(symbol):
            # Spell out the leading letter without a hyphen prefix.
            drug = spelled + drug[len(symbol):]
        # Always convert the remaining occurrences. Previously this step was
        # skipped whenever the name started with the symbol, which left later
        # occurrences of the same letter untouched.
        drug = drug.replace(symbol, f'-{spelled}')

    drug = drug.replace('/', ' ')
    return drug

# Normalise every drug name; collecting into a set merges names that become
# identical after cleaning.
cleaned_drugs = set(map(clean_names, drugs))

In [8]:
# Display the cleaned drug names to inspect the result of clean_names.
cleaned_drugs

{'(R)-Verapamil (hydrochloride)',
 '(S)-Crizotinib',
 '18-beta-Glycyrrhetinic acid',
 '4EGI-1',
 '5-Azacytidine',
 '5-Fluorouracil',
 '8-Hydroxyquinoline',
 '9-ING-41',
 'APTO-253',
 'AT7519',
 'AZD-7648',
 'AZD-8055',
 'AZD1390',
 'AZD2858',
 'Abemaciclib',
 'Abiraterone acetate',
 'Acetazolamide',
 'Acetohexamide',
 'Adagrasib',
 'Adenine',
 'Adenosine',
 'Afatinib',
 'Aliskiren',
 'Allantoin',
 'Allopurinol',
 'Almonertinib (hydrochloride)',
 'Almonertinib (mesylate)',
 'Alpelisib',
 'Altretamine',
 'Amsacrine',
 'Anastrozole',
 'Anethole trithione',
 'Apalutamide',
 'Aprepitant',
 'Arbutin',
 'Artemether',
 'Artesunate',
 'Asciminib',
 'Aspirin',
 'Ataluren',
 'Atazanavir (sulfate)',
 'Auranofin',
 'Azithromycin (hydrate)',
 'BAY1125976',
 'BI-3406',
 'BI-78D3',
 'Baicalin',
 'Balsalazide (sodium hydrate)',
 'Belinostat',
 'Belumosudil',
 'Belumosudil (mesylate)',
 'Belzutifan',
 'Bendamustine',
 'Benproperine (phosphate)',
 'Bentamapimod',
 'Benztropine (mesylate)',
 'Berbamine',


In [9]:
import requests
import time

# Common prefix of the PubChem PUG REST "compound by name" endpoints used below.
PUBCHEM_NAME_URL = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name'


def _fetch_pubchem_text(url, max_retries=3, timeout=30):
    """Fetch ``url`` and return the first line of the reply, or a status string.

    A 503 reply is retried up to ``max_retries`` times, sleeping
    ``0.5 * attempt`` seconds before each retry. Retrying stops as soon as a
    non-503 status is received, and that final status decides the result.

    Parameters
    ----------
    url : str
        Fully built (already percent-encoded) request URL.
    max_retries : int
        Maximum number of retries after an initial 503 reply.
    timeout : float
        Per-request timeout in seconds, so a stalled connection cannot hang
        the whole loop.

    Returns
    -------
    str
        * first line of the stripped body on HTTP 200,
        * ``"not found"`` on HTTP 404 or an empty 200 body,
        * ``"503 service unavailable"`` when every attempt returned 503,
        * ``"error: <status code>"`` for any other status,
        * ``"error: <exception class name>"`` when the request itself failed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        for retry in range(1, max_retries + 1):
            if response.status_code != 503:
                break
            time.sleep(0.5 * retry)
            response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Record the failure for this lookup instead of aborting the whole run.
        return f'error: {type(exc).__name__}'

    status = response.status_code
    if status == 200:
        body_lines = response.text.strip().splitlines()
        # An empty body would otherwise raise IndexError on [0].
        return body_lines[0] if body_lines else "not found"
    if status == 404:
        return "not found"
    if status == 503:
        return "503 service unavailable"
    return f'error: {status}'


# Maps each cleaned drug name to {"smiles": ..., "cid": ...}; each value is
# either the looked-up text or one of the status strings described above.
drug_to_pubchem = {}

for drug in cleaned_drugs:
    # Percent-encode the name so characters such as '#', '?', '%' or spaces
    # cannot alter the structure of the URL path.
    encoded_name = urllib.parse.quote(drug, safe='')
    smiles_url = f'{PUBCHEM_NAME_URL}/{encoded_name}/property/CanonicalSMILES/TXT'
    cid_url = f'{PUBCHEM_NAME_URL}/{encoded_name}/cids/TXT'

    result = {"smiles": None, "cid": None}

    # Get SMILES
    result["smiles"] = _fetch_pubchem_text(smiles_url)

    # Short pause between consecutive requests.
    time.sleep(0.1)

    # Get CID
    result["cid"] = _fetch_pubchem_text(cid_url)

    drug_to_pubchem[drug] = result

    time.sleep(0.1)


In [17]:
# Print one "name → SMILES" line per looked-up drug.
for name, record in drug_to_pubchem.items():
    smiles = record['smiles']
    print(f"{name} → {smiles}")

Minodronic acid → C1=CC2=NC=C(N2C=C1)CC(O)(P(=O)(O)O)P(=O)(O)O
Berbamine (dihydrochloride) → CN1CCC2=CC(=C3C=C2C1CC4=CC=C(C=C4)OC5=C(C=CC(=C5)CC6C7=C(O3)C(=C(C=C7CCN6C)OC)OC)O)OC.Cl.Cl
Indacaterol (maleate) → CCC1=C(C=C2CC(CC2=C1)NCC(C3=C4C=CC(=O)NC4=C(C=C3)O)O)CC.C(=CC(=O)O)C(=O)O
Furosemide → C1=COC(=C1)CNC2=CC(=C(C=C2C(=O)O)S(=O)(=O)N)Cl
Tofacitinib (citrate) → CC1CCN(CC1N(C)C2=NC=NC3=C2C=CN3)C(=O)CC#N.C(C(=O)O)C(CC(=O)O)(C(=O)O)O
Bergenin → COC1=C(C=C2C(=C1O)C3C(C(C(C(O3)CO)O)O)OC2=O)O
Ligustrazine → CC1=C(N=C(C(=N1)C)C)C
Elagolix sodium → CC1=C(C(=O)N(C(=O)N1CC2=C(C=CC=C2F)C(F)(F)F)CC(C3=CC=CC=C3)NCCCC(=O)[O-])C4=C(C(=CC=C4)OC)F.[Na+]
Imiquimod (hydrochloride) → CC(C)CN1C=NC2=C1C3=CC=CC=C3N=C2N.Cl
Sinomenine → CN1CCC23CC(=O)C(=CC2C1CC4=C3C(=C(C=C4)OC)O)OC
Dapagliflozin → CCOC1=CC=C(C=C1)CC2=C(C=CC(=C2)C3C(C(C(C(O3)CO)O)O)O)Cl
Clonidine (hydrochloride) → C1CN=C(N1)NC2=C(C=CC=C2Cl)Cl.Cl
Gallic acid (hydrate) → C1=C(C=C(C(=C1O)O)O)C(=O)O.O
Baicalin → C1=CC=C(C=C1)C2=CC(=O)C3=C(C(=C(C