In [None]:
# Imports come first so this cell also runs on a fresh kernel: the loads
# below use `pd`, which was previously imported only after its first use.
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import pubchempy as pcp

# TCM metadata workbook; header=1 takes the column names from the second row.
meta_TCM = pd.read_excel(
    '/Users/hornung_comp1/Downloads/TCM_Compounds_HEK293T_10_MetaData.xlsx', header=1
)

# TCM compound table ('Supplementary Table 4'), same header layout.
comp_TCM = pd.read_excel(
    '/Users/hornung_comp1/Downloads/41592_2025_2781_MOESM3_ESM.xlsx',
    sheet_name='Supplementary Table 4',
    header=1,
)

# 'Treat' is 'Catalog Number' with the literal 'Cpd' rewritten to 'Compd'.
# NOTE(review): presumably this matches the treatment labels used in
# meta_TCM — confirm against that table.
comp_TCM['Treat'] = comp_TCM['Catalog Number'].str.replace('Cpd', 'Compd', regex=False)

# ------------------------------------------------
# 0. Load comp_TCM (adjust path / sheet as needed)
# ------------------------------------------------
# comp_TCM = pd.read_excel("TCM_compound_table.xlsx")

# Both identifier columns are needed for the PubChem lookup; stop early
# with a clear message if the sheet does not provide them.
required_cols = ['Catalog Number.1', 'Compound name']
missing_cols = [col for col in required_cols if col not in comp_TCM.columns]
if missing_cols:
    raise ValueError(f"comp_TCM is missing required columns: {missing_cols}")

# Start from an all-empty 'smiles' column when the sheet has none,
# so every row counts as "missing" further down.
if 'smiles' not in comp_TCM:
    comp_TCM['smiles'] = None


# ------------------------------------------------
# 1. Single-compound lookup with retries
# ------------------------------------------------
def lookup_smiles(cas, name, max_retries=5, sleep_sec=0.5):
    """
    Look up a SMILES string on PubChem for a single compound.

    Identifiers are tried in this order, both through PubChem's 'name'
    namespace:
    1) CAS (Catalog Number.1)
    2) Compound name

    Parameters
    ----------
    cas, name
        Identifier values. NaN/None and blank strings are skipped.
    max_retries : int
        Maximum number of rounds over the identifiers.
    sleep_sec : float
        Pause in seconds between two consecutive rounds.

    Returns
    -------
    str or None
        SMILES of the first PubChem hit that carries one, otherwise None.

    Notes
    -----
    Only identifiers whose request *raised* are retried.  An identifier
    that PubChem answered with an empty hit list is not asked again, and
    no sleep happens after the final round.
    """
    queries = []
    for raw in (cas, name):
        if pd.notna(raw):
            text = str(raw).strip()
            if text:
                queries.append(text)

    if not queries:
        return None

    pending = queries
    for attempt in range(max_retries):
        errored = []
        for q in pending:
            try:
                compounds = pcp.get_compounds(q, 'name')
                smiles = None
                if compounds:
                    c = compounds[0]
                    # getattr keeps this working on PubChemPy releases that
                    # lack one of these attributes; a plain attribute access
                    # would raise and be swallowed, yielding None every time.
                    # NOTE(review): connectivity SMILES is preferred here as
                    # in the original — confirm stereochemistry is not needed.
                    smiles = (
                        getattr(c, 'connectivity_smiles', None)
                        or getattr(c, 'isomeric_smiles', None)
                        or getattr(c, 'canonical_smiles', None)
                    )
            except Exception:
                # Best-effort lookup: remember the failure and retry it in
                # the next round instead of aborting the whole batch.
                errored.append(q)
                continue
            if smiles:
                return smiles

        if not errored:
            # Every identifier got a definitive answer; retrying cannot help.
            return None

        pending = errored
        if attempt + 1 < max_retries:
            time.sleep(sleep_sec)

    return None


# ------------------------------------------------
# 2. Build unique CAS/name keys for missing SMILES
# ------------------------------------------------
# Only rows that still lack a SMILES need a PubChem query.
missing_df = comp_TCM.loc[comp_TCM['smiles'].isna()].copy()

# One lookup per distinct (CAS, name) pair, renumbered from zero.
keys = missing_df.loc[:, ['Catalog Number.1', 'Compound name']].drop_duplicates()
keys = keys.reset_index(drop=True)

print(f"Unique CAS/name pairs to query: {len(keys)}")


# ------------------------------------------------
# 3. Parallel PubChem querying
# ------------------------------------------------
def worker(row):
    """Resolve one key row; return (cas, name, smiles) so it can be re-joined."""
    cas, name = row['Catalog Number.1'], row['Compound name']
    return cas, name, lookup_smiles(cas, name)


results = []

# Number of concurrent PubChem requests; keep it small (roughly 4–8).
MAX_WORKERS = 6

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Submit one task per key row, remembering the row index of each future.
    futures = {}
    for idx, row in keys.iterrows():
        futures[executor.submit(worker, row)] = idx

    # Gather (cas, name, smiles) tuples in completion order; the order does
    # not matter because they are joined back by key afterwards.
    results.extend(fut.result() for fut in as_completed(futures))

# Lookup table keyed by the same two columns used to build `keys`.
map_df = pd.DataFrame(
    results,
    columns=['Catalog Number.1', 'Compound name', 'smiles'],
)


# ------------------------------------------------
# 4. Merge SMILES back into comp_TCM
# ------------------------------------------------
# Left-join the lookup results; the looked-up column arrives as 'smiles_new'.
comp_TCM = pd.merge(
    comp_TCM,
    map_df,
    how='left',
    on=['Catalog Number.1', 'Compound name'],
    suffixes=('', '_new'),
)

# Never overwrite an existing SMILES: copy only into rows that were empty
# and for which the lookup actually produced a value.
mask = comp_TCM['smiles_new'].notna() & comp_TCM['smiles'].isna()
comp_TCM.loc[mask, 'smiles'] = comp_TCM.loc[mask, 'smiles_new']

comp_TCM = comp_TCM.drop(columns='smiles_new')

num_missing_final = comp_TCM['smiles'].isna().sum()
print(f"Done. Remaining rows without SMILES: {num_missing_final}")


# ------------------------------------------------
# 5. Write the enriched comp_TCM table to CSV
# ------------------------------------------------
out_path = "comp_TCM_with_smiles_pubchem_parallel_5.csv"
comp_TCM.to_csv(path_or_buf=out_path, index=False)
print(f"Saved updated comp_TCM to: {out_path}")


In [None]:
import pandas as pd
import pubchempy as pcp
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# ------------------------------------------------
# 0. Load MCE tables
# ------------------------------------------------
# MCE compound table ('Supplementary Table 3'); header=1 takes the column
# names from the second row of the sheet.
comp_MCE = pd.read_excel(
    '/Users/hornung_comp1/Downloads/41592_2025_2781_MOESM3_ESM.xlsx',
    header=1,
    sheet_name='Supplementary Table 3',
)

# MCE metadata workbook, same header layout.
meta_MCE = pd.read_excel(
    '/Users/hornung_comp1/Downloads/MCE_Bioactive_Compounds_HEK293T_10μM_MetaData.xlsx',
    header=1,
)

# Both identifier columns are needed for the PubChem lookup; stop early
# with a clear message if comp_MCE does not provide them.
required_cols = ['CAS Number', 'Compound name']
missing_cols = [col for col in required_cols if col not in comp_MCE.columns]
if missing_cols:
    raise ValueError(f"comp_MCE is missing required columns: {missing_cols}")

# Start from an all-empty 'smiles' column when the sheet has none,
# so every row counts as "missing" further down.
if 'smiles' not in comp_MCE:
    comp_MCE['smiles'] = None


# ------------------------------------------------
# 1. Single-compound lookup with retries
# ------------------------------------------------
def lookup_smiles(cas, name, max_retries=4, sleep_sec=0.5):
    """
    Look up a SMILES string on PubChem for a single compound.

    Identifiers are tried in this order, both through PubChem's 'name'
    namespace:
    1) CAS Number
    2) Compound name

    Parameters
    ----------
    cas, name
        Identifier values. NaN/None and blank strings are skipped.
    max_retries : int
        Maximum number of rounds over the identifiers.
    sleep_sec : float
        Pause in seconds between two consecutive rounds.

    Returns
    -------
    str or None
        SMILES of the first PubChem hit that carries one, otherwise None.

    Notes
    -----
    Only identifiers whose request *raised* are retried.  An identifier
    that PubChem answered with an empty hit list is not asked again, and
    no sleep happens after the final round.
    """
    queries = []
    for raw in (cas, name):
        if pd.notna(raw):
            text = str(raw).strip()
            if text:
                queries.append(text)

    if not queries:
        return None

    pending = queries
    for attempt in range(max_retries):
        errored = []
        for q in pending:
            try:
                compounds = pcp.get_compounds(q, 'name')
                smiles = None
                if compounds:
                    c = compounds[0]
                    # getattr keeps this working on PubChemPy releases that
                    # lack one of these attributes; a plain attribute access
                    # would raise and be swallowed, yielding None every time.
                    # NOTE(review): connectivity SMILES is preferred here as
                    # in the original — confirm stereochemistry is not needed.
                    smiles = (
                        getattr(c, 'connectivity_smiles', None)
                        or getattr(c, 'isomeric_smiles', None)
                        or getattr(c, 'canonical_smiles', None)
                    )
            except Exception:
                # Best-effort lookup: remember the failure and retry it in
                # the next round instead of aborting the whole batch.
                errored.append(q)
                continue
            if smiles:
                return smiles

        if not errored:
            # Every identifier got a definitive answer; retrying cannot help.
            return None

        pending = errored
        if attempt + 1 < max_retries:
            time.sleep(sleep_sec)

    return None


# ------------------------------------------------
# 2. Build unique CAS/name keys for missing SMILES
# ------------------------------------------------
# Only rows that still lack a SMILES need a PubChem query.
missing_df = comp_MCE.loc[comp_MCE['smiles'].isna()].copy()

# One lookup per distinct (CAS, name) pair, renumbered from zero.
keys = missing_df.loc[:, ['CAS Number', 'Compound name']].drop_duplicates()
keys = keys.reset_index(drop=True)

print(f"Unique CAS/name pairs to query: {len(keys)}")


# ------------------------------------------------
# 3. Parallel PubChem querying
# ------------------------------------------------
def worker(row):
    """Resolve one key row; return (cas, name, smiles) so it can be re-joined."""
    cas, name = row['CAS Number'], row['Compound name']
    return cas, name, lookup_smiles(cas, name)


results = []

# Number of concurrent PubChem requests; keep it small (roughly 4–8).
MAX_WORKERS = 6

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Submit one task per key row, remembering the row index of each future.
    futures = {}
    for idx, row in keys.iterrows():
        futures[executor.submit(worker, row)] = idx

    # Gather (cas, name, smiles) tuples in completion order; the order does
    # not matter because they are joined back by key afterwards.
    results.extend(fut.result() for fut in as_completed(futures))

# Lookup table keyed by the same two columns used to build `keys`.
map_df = pd.DataFrame(
    results,
    columns=['CAS Number', 'Compound name', 'smiles'],
)


# ------------------------------------------------
# 4. Merge SMILES back into comp_MCE
# ------------------------------------------------
# Left-join the lookup results; the looked-up column arrives as 'smiles_new'.
comp_MCE = pd.merge(
    comp_MCE,
    map_df,
    how='left',
    on=['CAS Number', 'Compound name'],
    suffixes=('', '_new'),
)

# Never overwrite an existing SMILES: copy only into rows that were empty
# and for which the lookup actually produced a value.
mask = comp_MCE['smiles_new'].notna() & comp_MCE['smiles'].isna()
comp_MCE.loc[mask, 'smiles'] = comp_MCE.loc[mask, 'smiles_new']

comp_MCE = comp_MCE.drop(columns='smiles_new')

num_missing_final = comp_MCE['smiles'].isna().sum()
print(f"Done. Remaining rows without SMILES: {num_missing_final}")


# ------------------------------------------------
# 5. (Optional) merge into meta_MCE using Catalog Number
# ------------------------------------------------
# Attach compound annotations to each metadata row by matching
# meta_MCE['treatment'] against comp_MCE['Catalog Number'].
meta_MCE_with_smiles = pd.merge(
    meta_MCE,
    comp_MCE[['Catalog Number', 'Compound name', 'CAS Number', 'smiles']],
    how='left',
    left_on='treatment',
    right_on='Catalog Number',
)

print("\nmeta_MCE with SMILES joined:")
print(meta_MCE_with_smiles.head())


# ------------------------------------------------
# 6. Write both enriched tables to CSV
# ------------------------------------------------
out_comp_path = "comp_MCE_with_smiles_pubchem_parallel.csv"
comp_MCE.to_csv(path_or_buf=out_comp_path, index=False)
print(f"Saved updated comp_MCE to: {out_comp_path}")

out_meta_path = "meta_MCE_with_smiles_pubchem_parallel.csv"
meta_MCE_with_smiles.to_csv(path_or_buf=out_meta_path, index=False)
print(f"Saved updated meta_MCE to: {out_meta_path}")
