In [10]:
import pandas as pd
# Compound annotation table: sheet 'Supplementary Table 3' of the supplement
# workbook. header=1 takes the column labels from the second spreadsheet row.
comp_MCE = pd.read_excel(
    io='/Users/hornung_comp1/Downloads/41592_2025_2781_MOESM3_ESM.xlsx',
    sheet_name='Supplementary Table 3',
    header=1,
)

# Per-sample metadata workbook (default sheet), same header offset.
meta_MCE = pd.read_excel(
    io='/Users/hornung_comp1/Downloads/MCE_Bioactive_Compounds_HEK293T_10μM_MetaData.xlsx',
    header=1,
)



In [6]:
import pandas as pd
import pubchempy as pcp


# 1. Function: CAS/Name → SMILES via PubChem

def row_to_smiles(row, cas_col='CAS Number', name_col='Compound name'):
    """
    Look up a SMILES string for one compound row via PubChem.

    Queries are tried in this order, skipping missing or blank values:
    1) CAS number    (row[cas_col])
    2) Compound name (row[name_col])

    Both are sent to PubChem as 'name' queries. The first query whose top
    hit carries a SMILES wins; a hit without a SMILES falls through to the
    next query. A failed lookup is reported with warnings.warn and the next
    query is tried, so one bad request does not abort a DataFrame.apply run
    but is no longer invisible.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must support row[cas_col] and row[name_col].
    cas_col, name_col : str
        Column names to read; the defaults are the comp_MCE column names.

    Returns
    -------
    str or None
        connectivity_smiles of the first usable hit, or None if nothing matched.
    """
    import warnings  # local import keeps this cell self-contained

    # Candidates to try in order: CAS first, then the compound name
    queries = []
    for column in (cas_col, name_col):
        value = row[column]
        if pd.notna(value):
            text = str(value).strip()
            if text:
                queries.append(text)

    for q in queries:
        try:
            # PubChem treats CAS and names both as "name" queries
            compounds = pcp.get_compounds(q, 'name')
            smiles = compounds[0].connectivity_smiles if compounds else None
        except Exception as exc:
            # Stay best-effort, but surface the failure instead of hiding it
            warnings.warn(f"PubChem lookup failed for {q!r}: {exc!r}")
            continue
        if smiles:
            return smiles

    return None


# 2. Add SMILES column to comp_MCE (CAS table)

comp_MCE['smiles'] = comp_MCE.apply(row_to_smiles, axis=1)


# 3. Merge SMILES into meta_MCE via Catalog Number match
#     meta_MCE['treatment']     == comp_MCE['Catalog Number']
#    The merge indicator records, per meta row, whether a compound row matched.

meta_MCE_with_smiles = meta_MCE.merge(
    comp_MCE[['Catalog Number', 'Compound name', 'CAS Number', 'smiles']],
    left_on='treatment',
    right_on='Catalog Number',
    how='left',
    indicator='_catalog_match',
)

# A left join keeps unmatched meta rows with empty compound columns; report
# them so a key mismatch is noticed instead of silently producing NaNs.
n_unmatched = int((meta_MCE_with_smiles.pop('_catalog_match') == 'left_only').sum())
if n_unmatched:
    print(
        f"WARNING: {n_unmatched} of {len(meta_MCE_with_smiles)} meta_MCE rows "
        "have no matching 'Catalog Number' in comp_MCE"
    )

print("meta_MCE with SMILES joined:")
meta_MCE_with_smiles.head()

# ------------------------------------
# 4. Optional: save to file
# ------------------------------------
# meta_MCE_with_smiles.to_csv("meta_MCE_with_smiles_pubchem.csv", index=False)


comp_MCE with SMILES (PubChem):


Unnamed: 0,Compound name,Catalog Number,CAS Number,MOA,Clinical Information,Approved Type,smiles
0,Pirozadil,HY_100144,54110-25-7,Others,No Development Reported,,COC1=CC(=CC(=C1OC)OC)C(=O)OCC2=NC(=CC=C2)COC(=...
1,NKL 22,HY_100384,537034-15-4,HDAC,No Development Reported,,C1=CC=C(C=C1)NC(=O)CCCCCC(=O)NC2=CC=CC=C2N
2,Toll-like receptor modulator,HY_10018,926927-42-6,Toll-like Receptor (TLR),No Development Reported,,CCOC(=O)C1=CC2=C(C=C(C=C2)C(C(F)(F)F)(F)F)N=C(...
3,Lu AF21934,HY_100366,1445605-23-1,mGluR,No Development Reported,,C1CCC(C(C1)C(=O)N)C(=O)NC2=CC(=C(C=C2)Cl)Cl
4,Vonoprazan,HY_100007,881681-00-1,Proton Pump,Launched,FDA; Other Countries,CNCC1=CN(C(=C1)C2=CC=CC=C2F)S(=O)(=O)C3=CN=CC=C3


meta_MCE with SMILES joined:


Unnamed: 0,unique_ID,experiment_no,sample_plate,sample_row,sample_column,cell_id,pert_itime,pert_idose,sample,treatment,Catalog Number,Compound name,CAS Number,smiles
0,MCE5_293T_24H_X1:D10,MCE_20220606,MCE5_293T_24H_X1,4,10,cell_293T,24 h,10.0 uM,HY_50946,HY_50946,,,,
1,MCE5_293T_24H_X1:G14,MCE_20220606,MCE5_293T_24H_X1,7,14,cell_293T,24 h,10.0 uM,HY_18686,HY_18686,,,,
2,MCE5_293T_24H_X1:G22,MCE_20220606,MCE5_293T_24H_X1,7,22,cell_293T,24 h,10.0 uM,HY_17592A,HY_17592A,,,,
3,MCE5_293T_24H_X1:O22,MCE_20220606,MCE5_293T_24H_X1,15,22,cell_293T,24 h,10.0 uM,HY_19411,HY_19411,,,,
4,MCE5_293T_24H_X1:G08,MCE_20220606,MCE5_293T_24H_X1,7,8,cell_293T,24 h,10.0 uM,HY_18522,HY_18522,,,,


In [14]:
# Row cap for quick iteration. Both tables are truncated to their first
# TCM_PREVIEW_ROWS rows, so meta rows whose compound lies beyond the cap stay
# unmatched in the later join. Set to None to load the full tables.
TCM_PREVIEW_ROWS = 10

meta_TCM = pd.read_excel(
    '/Users/hornung_comp1/Downloads/TCM_Compounds_HEK293T_10_MetaData.xlsx',
    nrows=TCM_PREVIEW_ROWS,
    header=1,
)

comp_TCM = pd.read_excel(
    '/Users/hornung_comp1/Downloads/41592_2025_2781_MOESM3_ESM.xlsx',
    sheet_name='Supplementary Table 4',
    nrows=TCM_PREVIEW_ROWS,
    header=1,
)

# Build the join key: the compound table writes ids as 'Cpd0001' while the
# metadata 'Treat' column uses 'Compd0001'.
comp_TCM['Treat'] = comp_TCM['Catalog Number'].str.replace('Cpd', 'Compd', regex=False)

In [15]:
import pandas as pd
import pubchempy as pcp

# 1. Function: CAS/Name → SMILES via PubChem, using TCM columns
def row_to_smiles(row, cas_col='Catalog Number.1', name_col='Compound name'):
    """
    Look up a SMILES string for one TCM compound row via PubChem.

    Queries are tried in this order, skipping missing or blank values:
    1) CAS number    (row[cas_col], e.g. 61276-17-3)
    2) Compound name (row[name_col], e.g. Acteoside)

    Names in this table can contain line breaks from wrapped spreadsheet
    cells (e.g. 'Saikosaponin\\nB1'). For such names two repaired variants
    are tried: the break replaced by a space, then the break removed. Names
    without line breaks are queried unchanged.

    A failed lookup is reported with warnings.warn and the next query is
    tried; a hit without a SMILES also falls through to the next query.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must support row[cas_col] and row[name_col].
    cas_col, name_col : str
        Column names to read; the defaults are the comp_TCM column names.

    Returns
    -------
    str or None
        connectivity_smiles of the first usable hit, or None if nothing matched.
    """
    import re
    import warnings

    cas = row[cas_col]
    name = row[name_col]

    queries = []

    if pd.notna(cas):
        cas_str = str(cas).strip()
        if cas_str:
            queries.append(cas_str)

    if pd.notna(name):
        name_str = str(name).strip()
        if name_str:
            # Repair in-cell line breaks; both variants equal name_str when
            # there is no break, and duplicates are dropped.
            for joiner in (' ', ''):
                candidate = re.sub(r'\s*[\r\n]+\s*', joiner, name_str)
                if candidate not in queries:
                    queries.append(candidate)

    for q in queries:
        try:
            compounds = pcp.get_compounds(q, 'name')  # works for CAS or names
            smiles = compounds[0].connectivity_smiles if compounds else None
        except Exception as exc:
            # Stay best-effort, but surface the failure instead of hiding it
            warnings.warn(f"PubChem lookup failed for {q!r}: {exc!r}")
            continue
        if smiles:
            return smiles

    return None


# 2. Add SMILES column to comp_TCM (using CAS + name)
comp_TCM['smiles'] = comp_TCM.apply(row_to_smiles, axis='columns')


# 3. Left-join the compound annotations (including SMILES) onto meta_TCM
#    using the shared 'Treat' key; every meta_TCM row is kept.
meta_TCM_with_smiles = pd.merge(
    meta_TCM,
    comp_TCM.loc[:, ['Treat', 'Compound name', 'Catalog Number', 'Catalog Number.1', 'smiles']],
    how='left',
    on='Treat',
)

print("meta_TCM with SMILES joined:")
meta_TCM_with_smiles.head()

# 4. Optional: save
# meta_TCM_with_smiles.to_csv("meta_TCM_with_smiles_pubchem.csv", index=False)


comp_TCM with SMILES (PubChem):


Unnamed: 0,Compound name,Catalog Number,Catalog Number.1,Treat,smiles
0,Acteoside,Cpd0001,61276-17-3,Compd0001,CC1C(C(C(C(O1)OC2C(C(OC(C2OC(=O)C=CC3=CC(=C(C=...
1,Asiaticoside B,Cpd0002,125265-68-1,Compd0002,CC1C(C(C(C(O1)OC2C(OC(C(C2O)O)OCC3C(C(C(C(O3)O...
2,Brandioside,Cpd0003,133393-81-4,Compd0003,CC1C(C(C(C(O1)OCC2C(C(C(C(O2)OCCC3=CC(=C(C=C3)...
3,Clematichinenos\nide AR,Cpd0004,761425-93-8,Compd0004,CC1C(C(C(C(O1)OCC2C(C(C(C(O2)OC3C(OC(C(C3O)O)O...
4,Saikosaponin\nB1,Cpd0005,58558-08-0,Compd0005,CC1C(C(C(C(O1)OC2CCC3(C(C2(C)CO)CCC4(C3C=CC5=C...


meta_TCM with SMILES joined:


Unnamed: 0,Sample_unique_id,Treat,Plate,library,Cell,Dose,Time,Compound name,Catalog Number,Catalog Number.1,smiles
0,sample1_1,Compd0001,Plate1,1,293T,10uM,24h,Acteoside,Cpd0001,61276-17-3,CC1C(C(C(C(O1)OC2C(C(OC(C2OC(=O)C=CC3=CC(=C(C=...
1,sample1_10,Compd0010,Plate1,1,293T,10uM,24h,Ginsenoside Re,Cpd0010,52286-59-6,CC1C(C(C(C(O1)OC2C(C(C(OC2OC3CC4(C(CC(C5C4(CCC...
2,sample1_100,Compd0100,Plate1,1,293T,10uM,24h,,,,
3,sample1_1000,Compd0272,Plate3,1,293T,10uM,24h,,,,
4,sample1_1001,Compd0273,Plate3,1,293T,10uM,24h,,,,


## Inverted approach: merge compound info into meta_MCE first, then look up SMILES per meta row

In [11]:
import pandas as pd
import pubchempy as pcp

# 1. Function: CAS/Name → SMILES via PubChem, using columns from the merged meta+comp row
# Query string -> SMILES (or None when PubChem had no usable hit). This function
# is applied to every meta row, and many rows can share one compound, so
# completed lookups are remembered instead of being requested again.
_PUBCHEM_SMILES_CACHE = {}


def row_to_smiles(row, cas_col='CAS Number', name_col='Compound name'):
    """
    Look up a SMILES string for one merged meta+compound row via PubChem.

    Queries are tried in this order, skipping missing or blank values:
    1) CAS number    (row[cas_col], from comp_MCE matched by treatment)
    2) Compound name (row[name_col], fallback)

    Completed lookups (including "no hit") are cached per query string in
    _PUBCHEM_SMILES_CACHE, so repeated compounds cost one request. A failed
    lookup is reported with warnings.warn, is not cached, and the next query
    is tried. A hit without a SMILES also falls through to the next query.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must support row[cas_col] and row[name_col].
    cas_col, name_col : str
        Column names to read.

    Returns
    -------
    str or None
        connectivity_smiles of the first usable hit, or None if nothing matched.
    """
    import warnings  # local import keeps this cell self-contained

    queries = []
    for column in (cas_col, name_col):
        value = row[column]
        if pd.notna(value):
            text = str(value).strip()
            if text:
                queries.append(text)

    for q in queries:
        if q not in _PUBCHEM_SMILES_CACHE:
            try:
                compounds = pcp.get_compounds(q, 'name')  # works for CAS or names
                found = compounds[0].connectivity_smiles if compounds else None
            except Exception as exc:
                # Stay best-effort, but surface the failure instead of hiding it
                warnings.warn(f"PubChem lookup failed for {q!r}: {exc!r}")
                continue
            _PUBCHEM_SMILES_CACHE[q] = found

        smiles = _PUBCHEM_SMILES_CACHE[q]
        if smiles:
            return smiles

    return None


# 2. First: merge meta_MCE with comp_MCE to attach CAS + name per treatment
#    meta_MCE['treatment']  ==  comp_MCE['Catalog Number']
meta_with_info = pd.merge(
    meta_MCE,
    comp_MCE.loc[:, ['Catalog Number', 'Compound name', 'CAS Number']],
    how='left',
    left_on='treatment',
    right_on='Catalog Number',
)

# 3. Look up a SMILES for every merged row from its CAS number / compound name
meta_with_info['smiles'] = meta_with_info.apply(row_to_smiles, axis='columns')

# 4a. Keep everything: meta columns + Catalog Number + Compound name + CAS Number + smiles
#     (same object as meta_with_info, not a copy)
meta_MCE_with_smiles = meta_with_info

# 4b. Alternative: keep ONLY the original meta columns plus smiles:
# meta_MCE_with_smiles = meta_with_info[meta_MCE.columns.tolist() + ['smiles']]

print("meta_MCE with SMILES joined:")
display(meta_MCE_with_smiles.head())


meta_MCE with SMILES joined:


Unnamed: 0,unique_ID,experiment_no,sample_plate,sample_row,sample_column,cell_id,pert_itime,pert_idose,sample,treatment,Catalog Number,Compound name,CAS Number,smiles
0,MCE5_293T_24H_X1:D10,MCE_20220606,MCE5_293T_24H_X1,4,10,cell_293T,24 h,10.0 uM,HY_50946,HY_50946,HY_50946,Imatinib (Mesylate),220127-57-1,CC1=C(C=C(C=C1)NC(=O)C2=CC=C(C=C2)CN3CCN(CC3)C...
1,MCE5_293T_24H_X1:G14,MCE_20220606,MCE5_293T_24H_X1,7,14,cell_293T,24 h,10.0 uM,HY_18686,HY_18686,HY_18686,AS1949490,1203680-76-5,CC(C1=CC=CC=C1)NC(=O)C2=C(C=CS2)OCC3=CC=C(C=C3)Cl
2,MCE5_293T_24H_X1:G22,MCE_20220606,MCE5_293T_24H_X1,7,22,cell_293T,24 h,10.0 uM,HY_17592A,HY_17592A,HY_17592A,Bithionol (sulfoxide),844-26-8,C1=C(C=C(C(=C1S(=O)C2=C(C(=CC(=C2)Cl)Cl)O)O)Cl)Cl
3,MCE5_293T_24H_X1:O22,MCE_20220606,MCE5_293T_24H_X1,15,22,cell_293T,24 h,10.0 uM,HY_19411,HY_19411,HY_19411,SSR180711 (hydrochloride),446031-79-4,C1CN2CCC1N(CC2)C(=O)OC3=CC=C(C=C3)Br.Cl
4,MCE5_293T_24H_X1:G08,MCE_20220606,MCE5_293T_24H_X1,7,8,cell_293T,24 h,10.0 uM,HY_18522,HY_18522,HY_18522,AA26-9,1312782-34-5,C1CCN(C1)C(=O)N2C=CN=N2
