In [1]:
def auto_fill(file_path, sheet_name):
    """
    Automatically fills in the blanks for the specified columns in an Excel sheet.
    
    Parameters:
    - file_path: Path to the Excel file.
    - sheet_name: Name of the sheet to process.
    
    Returns:
    - Updated DataFrame with filled values.
    """
    import pandas as pd
    
    # Load the data from the specified sheet
    data = pd.read_excel(file_path, sheet_name=sheet_name)
    
    # Columns to fill
    columns_to_fill = [
        "Common names", "CAS number", "SMILES code (SciFinder)", 
        "Canonical SMILES (PubChem)", "Input IUPAC name (cactus)", "Input SMILES (cactus)"
    ]
    
    # Forward fill the specified columns
    data[columns_to_fill] = data[columns_to_fill].ffill()
    
    # Save the updated data back to the same Excel file
    with pd.ExcelWriter(file_path, engine='openpyxl', mode='a') as writer:
        data.to_excel(writer, sheet_name=sheet_name, index=False)
    
    return data

#updated_data = auto_fill("path_to_your_file.xlsx", "sheet name")


In [49]:
import pandas as pd
import requests
import time

def fetch_pubchem_data(file_path):
    # Read the Excel file
    df = pd.read_excel(file_path, engine='openpyxl')
    
    # Create empty lists to store the results
    iupac_names = [""] * len(df)
    canonical_smiles_list = [""] * len(df)
    
    def fetch_data_for_smiles(index, smiles):
        # Directly assign "Invalid" if the input is "Invalid"
        if smiles == "Invalid":
            iupac_names[index] = "Invalid"
            canonical_smiles_list[index] = "Invalid"
            return
        
        try:
            # Make the API call
            response = requests.get(base_url.format(smiles))
            data = response.json()
            
            # Extract the IUPAC name and canonical SMILES, or assign default values if not found
            iupac_name = data["PropertyTable"]["Properties"][0].get("IUPACName", "Unknown")
            canonical_smiles = data["PropertyTable"]["Properties"][0].get("CanonicalSMILES", "Unknown")
            
            iupac_names[index] = iupac_name
            canonical_smiles_list[index] = canonical_smiles
        except Exception as e:
            print(f"Error processing row {index + 1} with SMILES: {smiles}. Error: {e}")
            iupac_names[index] = "Error"
            canonical_smiles_list[index] = "Error"
    
    # Base URL for PubChem API
    base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{}/property/IUPACName,CanonicalSMILES/JSON"
    
    # Process each row in the "Output cactus SMILES" column for the first time
    for index, smiles in enumerate(df["Output SMILES"]):
        print(f"Processing row {index + 1} with SMILES: {smiles}")  # Print progress
        fetch_data_for_smiles(index, smiles)
    
    # Retry for errors
    for _ in range(2):  # Two more attempts
        error_indices = [i for i, name in enumerate(iupac_names) if name == "Error"]
        if not error_indices:
            break  # No errors, break out of the loop

        prev_error_count = len(error_indices)
        time.sleep(30)
        for index in error_indices:
            smiles = df["Output SMILES"].iloc[index]
            fetch_data_for_smiles(index, smiles)
            time.sleep(2)
        # Check if the number of errors has decreased
        current_error_count = len([i for i, name in enumerate(iupac_names) if name == "Error"])
        if current_error_count >= prev_error_count:
            break  # No improvement, break out of the loop

    # Check for remaining errors
    final_error_indices = [i for i, name in enumerate(iupac_names) if name == "Error"]
    if final_error_indices:
        error_rows = ", ".join(str(i+1) for i in final_error_indices)
        print(f"\nCompleted, but rows {error_rows} have errors and need to be verified.")
    
    # Assign the results to the dataframe
    df["Output IUPAC name"] = iupac_names
    df["Output Canonical SMILES"] = canonical_smiles_list
    
    # Save the updated dataframe to a new sheet named "output" in the same Excel file
    with pd.ExcelWriter(file_path, engine='openpyxl', mode='a') as writer:
        df.to_excel(writer, sheet_name="pubchem output", index=False)

# Example usage
file_path = "Method S.xlsx"
fetch_pubchem_data(file_path)



Processing row 1 with SMILES: C1(=CN=C(C(=N1)C(=O)O[H])C(=O)O[H])C
Processing row 2 with SMILES: C1(=CN=C(C(=N1)C(=O)O[H])C(=O)O[H])O[H]
Processing row 3 with SMILES: C1(=CN=C(C(=N1)C(=O)O[H])C(=O)O[H])N([H])[H]
Processing row 4 with SMILES: C1(=CN=C(C(=N1)C(=O)O[H])C(=O)O[H])[N+](=O)[O-]
Processing row 5 with SMILES: C1=CN=C(C(=N1)C(=O)O[H])C(=O)O[H]
Processing row 6 with SMILES: O=C(O[H])C1=CC=C(C=C1)C2=CC(=C(C=C2)C(=O)O[H])C
Processing row 7 with SMILES: O=C(O[H])C1=CC=C(C=C1)C2=CC(=C(C=C2)C(=O)O[H])O
Processing row 8 with SMILES: O=C(O[H])C1=CC=C(C=C1)C2=CC(=C(C=C2)C(=O)O[H])[N]
Processing row 9 with SMILES: O=C(O[H])C1=CC=C(C=C1)C2=CC(=C(C=C2)C(=O)O[H])[N+](=O)[O-]
Processing row 10 with SMILES: O=C(O[H])C1=CC=C(C=C1)C2=CC(=C(C=C2)C(=O)O[H])[F]
Processing row 11 with SMILES: O=C(O[H])C1=CC=C(C=C1)C2=C(C=C(C=C2)C(=O)O[H])C
Processing row 12 with SMILES: O=C(O[H])C1=CC=C(C=C1)C2=C(C=C(C=C2)C(=O)O[H])O
Processing row 13 with SMILES: O=C(O[H])C1=CC=C(C=C1)C2=C(C=C(C=C2)C(=O)O[H])[N]
P

Processing row 110 with SMILES: O(C(=O)C2=C1C=C(C=CC1=C(C=C2)C(O[H])=O)[N+](=O)[O-])[H]
Processing row 111 with SMILES: O(C(=O)C2=C1C=C(C=CC1=C(C=C2)C(O[H])=O)[F])[H]
Processing row 112 with SMILES: O(C(=O)C1=CN=C(C=C1)C2=NC(=C(C=C2)C(O[H])=O)[CH3])[H]
Processing row 113 with SMILES: O(C(=O)C1=CN=C(C=C1)C2=NC(=C(C=C2)C(O[H])=O)[OH])[H]
Processing row 114 with SMILES: O(C(=O)C1=CN=C(C=C1)C2=NC(=C(C=C2)C(O[H])=O)[NH2])[H]
Processing row 115 with SMILES: O(C(=O)C1=CN=C(C=C1)C2=NC(=C(C=C2)C(O[H])=O)[N+](=O)[O-])[H]
Processing row 116 with SMILES: O(C(=O)C1=CN=C(C=C1)C2=NC(=C(C=C2)C(O[H])=O)[F])[H]
Processing row 117 with SMILES: O(C(=O)C1=CN=C(C=C1)C2=NC=C(C=C2[CH3])C(O[H])=O)[H]
Processing row 118 with SMILES: O(C(=O)C1=CN=C(C=C1)C2=NC=C(C=C2[OH])C(O[H])=O)[H]
Processing row 119 with SMILES: O(C(=O)C1=CN=C(C=C1)C2=NC=C(C=C2[NH2])C(O[H])=O)[H]
Processing row 120 with SMILES: O(C(=O)C1=CN=C(C=C1)C2=NC=C(C=C2[N+](=O)[O-])C(O[H])=O)[H]
Processing row 121 with SMILES: O(C(=O)C1=CN=C(C=C1)C2=NC

Processing row 222 with SMILES: CC(=O)N(C1=CC(=C(C(=C1)C(O[H])=O)[OH])C(O[H])=O)[H]
Processing row 223 with SMILES: CC(=O)N(C1=CC(=C(C(=C1)C(O[H])=O)[NH2])C(O[H])=O)[H]
Processing row 224 with SMILES: CC(=O)N(C1=CC(=C(C(=C1)C(O[H])=O)[N+](=O)[O-])C(O[H])=O)[H]
Processing row 225 with SMILES: CC(=O)N(C1=CC(=C(C(=C1)C(O[H])=O)[F])C(O[H])=O)[H]
Processing row 226 with SMILES: O(C(=O)C1=C(C(=C(C=C1)C(O[H])=O)Br)[CH3])[H]
Processing row 227 with SMILES: O(C(=O)C1=C(C(=C(C=C1)C(O[H])=O)Br)[OH])[H]
Processing row 228 with SMILES: O(C(=O)C1=C(C(=C(C=C1)C(O[H])=O)Br)[NH2])[H]
Processing row 229 with SMILES: O(C(=O)C1=C(C(=C(C=C1)C(O[H])=O)Br)[N+](=O)[O-])[H]
Processing row 230 with SMILES: O(C(=O)C1=C(C(=C(C=C1)C(O[H])=O)Br)[F])[H]
Processing row 231 with SMILES: O(C(=O)C1=CC(=C(C(=C1)[CH3])C(O[H])=O)Br)[H]
Processing row 232 with SMILES: O(C(=O)C1=CC(=C(C(=C1)[OH])C(O[H])=O)Br)[H]
Processing row 233 with SMILES: O(C(=O)C1=CC(=C(C(=C1)[NH2])C(O[H])=O)Br)[H]
Processing row 234 with SMILES: O(C(=

Processing row 344 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=C(C(=CN=C2)C3=CC=C(C=C3)C(O[H])=O)[NH2])[H]
Processing row 345 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=C(C(=CN=C2)C3=CC=C(C=C3)C(O[H])=O)[N+](=O)[O-])[H]
Processing row 346 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=C(C(=CN=C2)C3=CC=C(C=C3)C(O[H])=O)[F])[H]
Processing row 347 with SMILES: O(C(=O)C1=CC(=C(C=C1)C2=CC(=CN=C2)C3=CC=C(C=C3)C(O[H])=O)[CH3])[H]
Processing row 348 with SMILES: O(C(=O)C1=CC(=C(C=C1)C2=CC(=CN=C2)C3=CC=C(C=C3)C(O[H])=O)[OH])[H]
Processing row 349 with SMILES: O(C(=O)C1=CC(=C(C=C1)C2=CC(=CN=C2)C3=CC=C(C=C3)C(O[H])=O)[NH2])[H]
Processing row 350 with SMILES: O(C(=O)C1=CC(=C(C=C1)C2=CC(=CN=C2)C3=CC=C(C=C3)C(O[H])=O)[N+](=O)[O-])[H]
Processing row 351 with SMILES: O(C(=O)C1=CC(=C(C=C1)C2=CC(=CN=C2)C3=CC=C(C=C3)C(O[H])=O)[F])[H]
Processing row 352 with SMILES: O(C(=O)C1=C(C=C(C=C1)C2=CC(=CN=C2)C3=CC=C(C=C3)C(O[H])=O)[CH3])[H]
Processing row 353 with SMILES: O(C(=O)C1=C(C=C(C=C1)C2=CC(=CN=C2)C3=CC=C(C=C3)C(O[H])=O)[OH])[H]
Pr

Processing row 431 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC(=C(C(=C2)C(O[H])=O)[N+](=O)[O-])C(O[H])=O)[H]
Processing row 432 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC(=C(C(=C2)C(O[H])=O)[F])C(O[H])=O)[H]
Processing row 433 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC(=CC(=C2[CH3])C(O[H])=O)C(O[H])=O)[H]
Processing row 434 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC(=CC(=C2[OH])C(O[H])=O)C(O[H])=O)[H]
Processing row 435 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC(=CC(=C2[NH2])C(O[H])=O)C(O[H])=O)[H]
Processing row 436 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC(=CC(=C2[N+](=O)[O-])C(O[H])=O)C(O[H])=O)[H]
Processing row 437 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC(=CC(=C2[F])C(O[H])=O)C(O[H])=O)[H]
Processing row 438 with SMILES: O(C(=O)C1=CC(=C(C=C1)C2=CC(=CC(=C2)C(O[H])=O)C(O[H])=O)[CH3])[H]
Processing row 439 with SMILES: O(C(=O)C1=CC(=C(C=C1)C2=CC(=CC(=C2)C(O[H])=O)C(O[H])=O)[OH])[H]
Processing row 440 with SMILES: O(C(=O)C1=CC(=C(C=C1)C2=CC(=CC(=C2)C(O[H])=O)C(O[H])=O)[NH2])[H]
Processing row 441 with SMILES: 

Processing row 510 with SMILES: O(C(=O)C1=CC(=CC(=C1)C(O[H])=O)C3=C2C=CC=NC2=C(C(=C3)[CH3])C4=CC(=CC(=C4)C(O[H])=O)C(O[H])=O)[H]
Processing row 511 with SMILES: O(C(=O)C1=CC(=CC(=C1)C(O[H])=O)C3=C2C=CC=NC2=C(C(=C3)[OH])C4=CC(=CC(=C4)C(O[H])=O)C(O[H])=O)[H]
Processing row 512 with SMILES: O(C(=O)C1=CC(=CC(=C1)C(O[H])=O)C3=C2C=CC=NC2=C(C(=C3)[NH2])C4=CC(=CC(=C4)C(O[H])=O)C(O[H])=O)[H]
Processing row 513 with SMILES: O(C(=O)C1=CC(=CC(=C1)C(O[H])=O)C3=C2C=CC=NC2=C(C(=C3)[N+](=O)[O-])C4=CC(=CC(=C4)C(O[H])=O)C(O[H])=O)[H]
Processing row 514 with SMILES: O(C(=O)C1=CC(=CC(=C1)C(O[H])=O)C3=C2C=CC=NC2=C(C(=C3)[F])C4=CC(=CC(=C4)C(O[H])=O)C(O[H])=O)[H]
Processing row 515 with SMILES: O(C(=O)C1=C(C(=CC(=C1)C(O[H])=O)C3=C2C=CC=NC2=C(C=C3)C4=CC(=CC(=C4)C(O[H])=O)C(O[H])=O)[CH3])[H]
Processing row 516 with SMILES: O(C(=O)C1=C(C(=CC(=C1)C(O[H])=O)C3=C2C=CC=NC2=C(C=C3)C4=CC(=CC(=C4)C(O[H])=O)C(O[H])=O)[OH])[H]
Processing row 517 with SMILES: O(C(=O)C1=C(C(=CC(=C1)C(O[H])=O)C3=C2C=CC=NC2=C(C=C3)C4=CC(=CC

Processing row 583 with SMILES: O(C(=O)C1=CC=C(C=C1)C(=O)C2=CC=C(C=C2[NH2])C(O[H])=O)[H]
Processing row 584 with SMILES: O(C(=O)C1=CC=C(C=C1)C(=O)C2=CC=C(C=C2[N+](=O)[O-])C(O[H])=O)[H]
Processing row 585 with SMILES: O(C(=O)C1=CC=C(C=C1)C(=O)C2=CC=C(C=C2[F])C(O[H])=O)[H]
Processing row 586 with SMILES: O(C(=O)C1=CC=C(C=C1)C(=O)C2=CC=C(C(=C2)[CH3])C(O[H])=O)[H]
Processing row 587 with SMILES: O(C(=O)C1=CC=C(C=C1)C(=O)C2=CC=C(C(=C2)[OH])C(O[H])=O)[H]
Processing row 588 with SMILES: O(C(=O)C1=CC=C(C=C1)C(=O)C2=CC=C(C(=C2)[NH2])C(O[H])=O)[H]
Processing row 589 with SMILES: O(C(=O)C1=CC=C(C=C1)C(=O)C2=CC=C(C(=C2)[N+](=O)[O-])C(O[H])=O)[H]
Processing row 590 with SMILES: O(C(=O)C1=CC=C(C=C1)C(=O)C2=CC=C(C(=C2)[F])C(O[H])=O)[H]
Processing row 591 with SMILES: O(C(=O)C1=CC(=C(C=C1)[S](=O)(=O)C2=CC=C(C=C2)C(O[H])=O)[CH3])[H]
Processing row 592 with SMILES: O(C(=O)C1=CC(=C(C=C1)[S](=O)(=O)C2=CC=C(C=C2)C(O[H])=O)[OH])[H]
Processing row 593 with SMILES: O(C(=O)C1=CC(=C(C=C1)[S](=O)(=O)C2=CC=C(C=C2

Processing row 663 with SMILES: O(C(=O)C1=CC(=C(C=C1)C2=C(C=C(C(=C2)CN=[N+]=[N-])C3=CC=C(C=C3)C(O[H])=O)CN=[N+]=[N-])[F])[H]
Processing row 664 with SMILES: O(C(=O)C1=C(C=C(C=C1)C2=C(C=C(C(=C2)CN=[N+]=[N-])C3=CC=C(C=C3)C(O[H])=O)CN=[N+]=[N-])[CH3])[H]
Processing row 665 with SMILES: O(C(=O)C1=C(C=C(C=C1)C2=C(C=C(C(=C2)CN=[N+]=[N-])C3=CC=C(C=C3)C(O[H])=O)CN=[N+]=[N-])[OH])[H]
Processing row 666 with SMILES: O(C(=O)C1=C(C=C(C=C1)C2=C(C=C(C(=C2)CN=[N+]=[N-])C3=CC=C(C=C3)C(O[H])=O)CN=[N+]=[N-])[NH2])[H]
Processing row 667 with SMILES: O(C(=O)C1=C(C=C(C=C1)C2=C(C=C(C(=C2)CN=[N+]=[N-])C3=CC=C(C=C3)C(O[H])=O)CN=[N+]=[N-])[N+](=O)[O-])[H]
Processing row 668 with SMILES: O(C(=O)C1=C(C=C(C=C1)C2=C(C=C(C(=C2)CN=[N+]=[N-])C3=CC=C(C=C3)C(O[H])=O)CN=[N+]=[N-])[F])[H]
Processing row 669 with SMILES: CC1=C(C(=C(C(=C1)C2=CC=C(C=C2)C(O[H])=O)C)C)C3=CC=C(C=C3)C(O[H])=O
Processing row 670 with SMILES: CC1=C(C(=C(C(=C1C)C2=CC=C(C=C2[CH3])C(O[H])=O)C)C)C3=CC=C(C=C3)C(O[H])=O
Processing row 671 with SMILES: 

Processing row 747 with SMILES: O(C(=O)C1=C(C=C(C=C1)N=NC2=CC=C(C=C2)C(O[H])=O)[OH])[H]
Processing row 748 with SMILES: O(C(=O)C1=C(C=C(C=C1)N=NC2=CC=C(C=C2)C(O[H])=O)[NH2])[H]
Processing row 749 with SMILES: O(C(=O)C1=C(C=C(C=C1)N=NC2=CC=C(C=C2)C(O[H])=O)[N+](=O)[O-])[H]
Processing row 750 with SMILES: O(C(=O)C1=C(C=C(C=C1)N=NC2=CC=C(C=C2)C(O[H])=O)[F])[H]
Processing row 751 with SMILES: O(C(=O)C1=CC=C(C=C1)C3=C2C=C(C=CC2=C(C4=CC=CC=C34)C5=CC=C(C=C5)C(O[H])=O)[CH3])[H]
Processing row 752 with SMILES: O(C(=O)C1=CC=C(C=C1)C3=C2C=C(C=CC2=C(C4=CC=CC=C34)C5=CC=C(C=C5)C(O[H])=O)[OH])[H]
Processing row 753 with SMILES: O(C(=O)C1=CC=C(C=C1)C3=C2C=C(C=CC2=C(C4=CC=CC=C34)C5=CC=C(C=C5)C(O[H])=O)[NH2])[H]
Processing row 754 with SMILES: O(C(=O)C1=CC=C(C=C1)C3=C2C=C(C=CC2=C(C4=CC=CC=C34)C5=CC=C(C=C5)C(O[H])=O)[N+](=O)[O-])[H]
Processing row 755 with SMILES: O(C(=O)C1=CC=C(C=C1)C3=C2C=C(C=CC2=C(C4=CC=CC=C34)C5=CC=C(C=C5)C(O[H])=O)[F])[H]
Processing row 756 with SMILES: O(C(=O)C1=CC=C(C=C1)C3=C2C(=C

Processing row 821 with SMILES: N(C1=C(C=CC(=C1)C(O[H])=O)C=CC2=C(C=C(C(=C2)[F])C(O[H])=O)N([H])[H])([H])[H]
Processing row 822 with SMILES: O(C(=O)C1=CC(=C(C=C1)N(C(=O)N(C2=CC=C(C=C2)C(O[H])=O)[H])[H])[CH3])[H]
Processing row 823 with SMILES: O(C(=O)C1=CC(=C(C=C1)N(C(=O)N(C2=CC=C(C=C2)C(O[H])=O)[H])[H])[OH])[H]
Processing row 824 with SMILES: O(C(=O)C1=CC(=C(C=C1)N(C(=O)N(C2=CC=C(C=C2)C(O[H])=O)[H])[H])[NH2])[H]
Processing row 825 with SMILES: O(C(=O)C1=CC(=C(C=C1)N(C(=O)N(C2=CC=C(C=C2)C(O[H])=O)[H])[H])[N+](=O)[O-])[H]
Processing row 826 with SMILES: O(C(=O)C1=CC(=C(C=C1)N(C(=O)N(C2=CC=C(C=C2)C(O[H])=O)[H])[H])[F])[H]
Processing row 827 with SMILES: O(C(=O)C1=C(C=C(C=C1)N(C(=O)N(C2=CC=C(C=C2)C(O[H])=O)[H])[H])[CH3])[H]
Processing row 828 with SMILES: O(C(=O)C1=C(C=C(C=C1)N(C(=O)N(C2=CC=C(C=C2)C(O[H])=O)[H])[H])[OH])[H]
Processing row 829 with SMILES: O(C(=O)C1=C(C=C(C=C1)N(C(=O)N(C2=CC=C(C=C2)C(O[H])=O)[H])[H])[NH2])[H]
Processing row 830 with SMILES: O(C(=O)C1=C(C=C(C=C1)N(C(=O)N(C2

Processing row 903 with SMILES: CC1=C(C(=C(C(=C1C3=CC2=CC=C(C=C2C=C3)C(O[H])=O)C)C5=CC4=CC=C(C=C4C=C5)C(O[H])=O)C)C7=C(C6=CC=C(C=C6C=C7)C(O[H])=O)[F]
Processing row 904 with SMILES: CC1=C(C(=C(C(=C1C3=CC2=CC=C(C=C2C=C3)C(O[H])=O)C)C5=CC4=CC=C(C=C4C=C5)C(O[H])=O)C)C7=CC6=C(C=C(C=C6C=C7)C(O[H])=O)[CH3]
Processing row 905 with SMILES: CC1=C(C(=C(C(=C1C3=CC2=CC=C(C=C2C=C3)C(O[H])=O)C)C5=CC4=CC=C(C=C4C=C5)C(O[H])=O)C)C7=CC6=C(C=C(C=C6C=C7)C(O[H])=O)[OH]
Processing row 906 with SMILES: CC1=C(C(=C(C(=C1C3=CC2=CC=C(C=C2C=C3)C(O[H])=O)C)C5=CC4=CC=C(C=C4C=C5)C(O[H])=O)C)C7=CC6=C(C=C(C=C6C=C7)C(O[H])=O)[NH2]
Processing row 907 with SMILES: CC1=C(C(=C(C(=C1C3=CC2=CC=C(C=C2C=C3)C(O[H])=O)C)C5=CC4=CC=C(C=C4C=C5)C(O[H])=O)C)C7=CC6=C(C=C(C=C6C=C7)C(O[H])=O)[N+](=O)[O-]
Processing row 908 with SMILES: CC1=C(C(=C(C(=C1C3=CC2=CC=C(C=C2C=C3)C(O[H])=O)C)C5=CC4=CC=C(C=C4C=C5)C(O[H])=O)C)C7=CC6=C(C=C(C=C6C=C7)C(O[H])=O)[F]
Processing row 909 with SMILES: CC1=C(C(=C(C(=C1C3=CC2=CC=C(C=C2C=C3)C(O[H])=O)C)C5=CC

Processing row 961 with SMILES: N(C1=C(C=CC(=C1)C2=NC(=NC(=N2)C3=CC=C(C=C3)C(O[H])=O)C4=CC=C(C(=C4)[OH])C(O[H])=O)C(O[H])=O)([H])[H]
Processing row 962 with SMILES: N(C1=C(C=CC(=C1)C2=NC(=NC(=N2)C3=CC=C(C=C3)C(O[H])=O)C4=CC=C(C(=C4)[NH2])C(O[H])=O)C(O[H])=O)([H])[H]
Processing row 963 with SMILES: N(C1=C(C=CC(=C1)C2=NC(=NC(=N2)C3=CC=C(C=C3)C(O[H])=O)C4=CC=C(C(=C4)[N+](=O)[O-])C(O[H])=O)C(O[H])=O)([H])[H]
Processing row 964 with SMILES: N(C1=C(C=CC(=C1)C2=NC(=NC(=N2)C3=CC=C(C=C3)C(O[H])=O)C4=CC=C(C(=C4)[F])C(O[H])=O)C(O[H])=O)([H])[H]
Processing row 965 with SMILES: O(C(=O)C1=C(C=C(C=C1F)C2=CC(=CC(=C2)C3=CC(=C(C=C3)C(O[H])=O)F)C4=CC(=C(C(=C4)F)C(O[H])=O)F)F)[H]
Processing row 966 with SMILES: O(C(=O)C1=C(C=C(C=C1F)C2=CC(=CC(=C2)C3=CC(=C(C(=C3[CH3])F)C(O[H])=O)F)C4=CC(=C(C(=C4)F)C(O[H])=O)F)F)[H]
Processing row 967 with SMILES: O(C(=O)C1=C(C=C(C=C1F)C2=CC(=CC(=C2)C3=CC(=C(C(=C3[OH])F)C(O[H])=O)F)C4=CC(=C(C(=C4)F)C(O[H])=O)F)F)[H]
Processing row 968 with SMILES: O(C(=O)C1=C(C=C(C=C1F)C2=C

Processing row 1019 with SMILES: O(C(=O)C1=C(C=C(C=C1)C2=CC(=CC(=C2)C3=CC(=C(C(=C3)[NH2])C(O[H])=O)F)C4=CC(=C(C=C4)C(O[H])=O)F)F)[H]
Processing row 1020 with SMILES: O(C(=O)C1=C(C=C(C=C1)C2=CC(=CC(=C2)C3=CC(=C(C(=C3)[N+](=O)[O-])C(O[H])=O)F)C4=CC(=C(C=C4)C(O[H])=O)F)F)[H]
Processing row 1021 with SMILES: O(C(=O)C1=C(C=C(C=C1)C2=CC(=CC(=C2)C3=CC(=C(C(=C3)[F])C(O[H])=O)F)C4=CC(=C(C=C4)C(O[H])=O)F)F)[H]
Processing row 1022 with SMILES: O(C(=O)C1=C(C=C(C=C1)C2=CC(=CC(=C2)C3=CC(=C(C=C3[CH3])C(O[H])=O)F)C4=CC(=C(C=C4)C(O[H])=O)F)F)[H]
Processing row 1023 with SMILES: O(C(=O)C1=C(C=C(C=C1)C2=CC(=CC(=C2)C3=CC(=C(C=C3[OH])C(O[H])=O)F)C4=CC(=C(C=C4)C(O[H])=O)F)F)[H]
Processing row 1024 with SMILES: O(C(=O)C1=C(C=C(C=C1)C2=CC(=CC(=C2)C3=CC(=C(C=C3[NH2])C(O[H])=O)F)C4=CC(=C(C=C4)C(O[H])=O)F)F)[H]
Processing row 1025 with SMILES: O(C(=O)C1=C(C=C(C=C1)C2=CC(=CC(=C2)C3=CC(=C(C=C3[N+](=O)[O-])C(O[H])=O)F)C4=CC(=C(C=C4)C(O[H])=O)F)F)[H]
Processing row 1026 with SMILES: O(C(=O)C1=C(C=C(C=C1)C2=CC(=CC(=C

Processing row 1080 with SMILES: N(C1=C(C=CC(=C1)C2=CC(=CC(=C2[OH])C3=CC=C(C=C3)C(O[H])=O)C5=C4C=CC=CC4=C(C=C5)C(O[H])=O)C(O[H])=O)([H])[H]
Processing row 1081 with SMILES: N(C1=C(C=CC(=C1)C2=CC(=CC(=C2[NH2])C3=CC=C(C=C3)C(O[H])=O)C5=C4C=CC=CC4=C(C=C5)C(O[H])=O)C(O[H])=O)([H])[H]
Processing row 1082 with SMILES: N(C1=C(C=CC(=C1)C2=CC(=CC(=C2[N+](=O)[O-])C3=CC=C(C=C3)C(O[H])=O)C5=C4C=CC=CC4=C(C=C5)C(O[H])=O)C(O[H])=O)([H])[H]
Processing row 1083 with SMILES: N(C1=C(C=CC(=C1)C2=CC(=CC(=C2[F])C3=CC=C(C=C3)C(O[H])=O)C5=C4C=CC=CC4=C(C=C5)C(O[H])=O)C(O[H])=O)([H])[H]
Processing row 1084 with SMILES: N(C1=C(C=CC(=C1)C2=CC(=CC(=C2)C3=C(C=C(C=C3)C(O[H])=O)[CH3])C5=C4C=CC=CC4=C(C=C5)C(O[H])=O)C(O[H])=O)([H])[H]
Processing row 1085 with SMILES: N(C1=C(C=CC(=C1)C2=CC(=CC(=C2)C3=C(C=C(C=C3)C(O[H])=O)[OH])C5=C4C=CC=CC4=C(C=C5)C(O[H])=O)C(O[H])=O)([H])[H]
Processing row 1086 with SMILES: N(C1=C(C=CC(=C1)C2=CC(=CC(=C2)C3=C(C=C(C=C3)C(O[H])=O)[NH2])C5=C4C=CC=CC4=C(C=C5)C(O[H])=O)C(O[H])=O)([H])[H]
Proc

Processing row 1136 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC=C(C=C2)C3=CC(=CC(=C3)C4=CC=C(C(=C4)[NH2])C5=CC=C(C=C5)C(O[H])=O)C6=CC=C(C=C6)C7=CC=C(C=C7)C(O[H])=O)[H]
Processing row 1137 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC=C(C=C2)C3=CC(=CC(=C3)C4=CC=C(C(=C4)[N+](=O)[O-])C5=CC=C(C=C5)C(O[H])=O)C6=CC=C(C=C6)C7=CC=C(C=C7)C(O[H])=O)[H]
Processing row 1138 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC=C(C=C2)C3=CC(=CC(=C3)C4=CC=C(C(=C4)[F])C5=CC=C(C=C5)C(O[H])=O)C6=CC=C(C=C6)C7=CC=C(C=C7)C(O[H])=O)[H]
Processing row 1139 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC=C(C=C2)C3=CC(=CC(=C3)C4=CC=C(C=C4)C5=CC=C(C=C5[CH3])C(O[H])=O)C6=CC=C(C=C6)C7=CC=C(C=C7)C(O[H])=O)[H]
Processing row 1140 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC=C(C=C2)C3=CC(=CC(=C3)C4=CC=C(C=C4)C5=CC=C(C=C5[OH])C(O[H])=O)C6=CC=C(C=C6)C7=CC=C(C=C7)C(O[H])=O)[H]
Processing row 1141 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC=C(C=C2)C3=CC(=CC(=C3)C4=CC=C(C=C4)C5=CC=C(C=C5[NH2])C(O[H])=O)C6=CC=C(C=C6)C7=CC=C(C=C7)C(O[H])=O)[H]
Processing row 1142 with

Processing row 1202 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC=C(C=C2[N+](=O)[O-])C3=CC(=CC(=C3)C(O[H])=O)C(O[H])=O)[H]
Processing row 1203 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC=C(C=C2[F])C3=CC(=CC(=C3)C(O[H])=O)C(O[H])=O)[H]
Processing row 1204 with SMILES: O(C(=O)C1=CC=C(C(=C1)[CH3])C2=CC=C(C=C2)C3=CC(=CC(=C3)C(O[H])=O)C(O[H])=O)[H]
Processing row 1205 with SMILES: O(C(=O)C1=CC=C(C(=C1)[OH])C2=CC=C(C=C2)C3=CC(=CC(=C3)C(O[H])=O)C(O[H])=O)[H]
Processing row 1206 with SMILES: O(C(=O)C1=CC=C(C(=C1)[NH2])C2=CC=C(C=C2)C3=CC(=CC(=C3)C(O[H])=O)C(O[H])=O)[H]
Processing row 1207 with SMILES: O(C(=O)C1=CC=C(C(=C1)[N+](=O)[O-])C2=CC=C(C=C2)C3=CC(=CC(=C3)C(O[H])=O)C(O[H])=O)[H]
Processing row 1208 with SMILES: O(C(=O)C1=CC=C(C(=C1)[F])C2=CC=C(C=C2)C3=CC(=CC(=C3)C(O[H])=O)C(O[H])=O)[H]
Processing row 1209 with SMILES: O(C(=O)C1=CC=C(C=C1[CH3])C2=CC=C(C=C2)C3=CC(=CC(=C3)C(O[H])=O)C(O[H])=O)[H]
Processing row 1210 with SMILES: O(C(=O)C1=CC=C(C=C1[OH])C2=CC=C(C=C2)C3=CC(=CC(=C3)C(O[H])=O)C(O[H])=O)[H]
P

Processing row 1273 with SMILES: CC4(C)C1=C(C=CC(=C1)C(O[H])=O)C5=C2C(C7=C(C2=C3C(C6=C(C3=C45)C=CC(=C6)C(O[H])=O)(C)C)C(=CC(=C7)C(O[H])=O)[F])(C)C
Processing row 1274 with SMILES: O(C(=O)C1=CC=C(C=C1)N(C2=CC=C(C=C2)C(O[H])=O)C3=CC=C(C=C3[CH3])C(O[H])=O)[H]
Processing row 1275 with SMILES: O(C(=O)C1=CC=C(C=C1)N(C2=CC=C(C=C2)C(O[H])=O)C3=CC=C(C=C3[OH])C(O[H])=O)[H]
Processing row 1276 with SMILES: O(C(=O)C1=CC=C(C=C1)N(C2=CC=C(C=C2)C(O[H])=O)C3=CC=C(C=C3[NH2])C(O[H])=O)[H]
Processing row 1277 with SMILES: O(C(=O)C1=CC=C(C=C1)N(C2=CC=C(C=C2)C(O[H])=O)C3=CC=C(C=C3[N+](=O)[O-])C(O[H])=O)[H]
Processing row 1278 with SMILES: O(C(=O)C1=CC=C(C=C1)N(C2=CC=C(C=C2)C(O[H])=O)C3=CC=C(C=C3[F])C(O[H])=O)[H]
Processing row 1279 with SMILES: O(C(=O)C1=CC=C(C=C1)N(C2=CC=C(C=C2)C(O[H])=O)C3=CC=C(C(=C3)[CH3])C(O[H])=O)[H]
Processing row 1280 with SMILES: O(C(=O)C1=CC=C(C=C1)N(C2=CC=C(C=C2)C(O[H])=O)C3=CC=C(C(=C3)[OH])C(O[H])=O)[H]
Processing row 1281 with SMILES: O(C(=O)C1=CC=C(C=C1)N(C2=CC=C(C=C2)C(O[H])=

Processing row 1341 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=C(C=C(C(=C2)C3=CC=C(C=C3)C(O[H])=O)C4=CC=C(C=C4[CH3])C(O[H])=O)C5=CC=C(C=C5)C(O[H])=O)[H]
Processing row 1342 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=C(C=C(C(=C2)C3=CC=C(C=C3)C(O[H])=O)C4=CC=C(C=C4[OH])C(O[H])=O)C5=CC=C(C=C5)C(O[H])=O)[H]
Processing row 1343 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=C(C=C(C(=C2)C3=CC=C(C=C3)C(O[H])=O)C4=CC=C(C=C4[NH2])C(O[H])=O)C5=CC=C(C=C5)C(O[H])=O)[H]
Processing row 1344 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=C(C=C(C(=C2)C3=CC=C(C=C3)C(O[H])=O)C4=CC=C(C=C4[N+](=O)[O-])C(O[H])=O)C5=CC=C(C=C5)C(O[H])=O)[H]
Processing row 1345 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=C(C=C(C(=C2)C3=CC=C(C=C3)C(O[H])=O)C4=CC=C(C=C4[F])C(O[H])=O)C5=CC=C(C=C5)C(O[H])=O)[H]
Processing row 1346 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=C(C(=C(C(=C2)C3=CC=C(C=C3)C(O[H])=O)C4=CC=C(C=C4)C(O[H])=O)[CH3])C5=CC=C(C=C5)C(O[H])=O)[H]
Processing row 1347 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=C(C(=C(C(=C2)C3=CC=C(C=C3)C(O[H])=O)C4=CC=C(C=C4)C(O[H])=O)[OH

Processing row 1396 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC=C(C=C2)C(C3=CC=C(C=C3)C4=CC=C(C=C4)C(O[H])=O)=C(C5=CC=C(C=C5)C6=CC=C(C=C6)C(O[H])=O)C7=CC=C(C(=C7)[CH3])C8=CC=C(C=C8)C(O[H])=O)[H]
Processing row 1397 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC=C(C=C2)C(C3=CC=C(C=C3)C4=CC=C(C=C4)C(O[H])=O)=C(C5=CC=C(C=C5)C6=CC=C(C=C6)C(O[H])=O)C7=CC=C(C(=C7)[OH])C8=CC=C(C=C8)C(O[H])=O)[H]
Processing row 1398 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC=C(C=C2)C(C3=CC=C(C=C3)C4=CC=C(C=C4)C(O[H])=O)=C(C5=CC=C(C=C5)C6=CC=C(C=C6)C(O[H])=O)C7=CC=C(C(=C7)[NH2])C8=CC=C(C=C8)C(O[H])=O)[H]
Processing row 1399 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC=C(C=C2)C(C3=CC=C(C=C3)C4=CC=C(C=C4)C(O[H])=O)=C(C5=CC=C(C=C5)C6=CC=C(C=C6)C(O[H])=O)C7=CC=C(C(=C7)[N+](=O)[O-])C8=CC=C(C=C8)C(O[H])=O)[H]
Processing row 1400 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC=C(C=C2)C(C3=CC=C(C=C3)C4=CC=C(C=C4)C(O[H])=O)=C(C5=CC=C(C=C5)C6=CC=C(C=C6)C(O[H])=O)C7=CC=C(C(=C7)[F])C8=CC=C(C=C8)C(O[H])=O)[H]
Processing row 1401 with SMILES: O(C(=O)C1=CC=

Processing row 1443 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=C(C(=C(C(=C2Br)C3=CC=C(C=C3)C(O[H])=O)C4=CC=C(C=C4)C(O[H])=O)Br)C5=C(C=C(C=C5)C(O[H])=O)[NH2])[H]
Processing row 1444 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=C(C(=C(C(=C2Br)C3=CC=C(C=C3)C(O[H])=O)C4=CC=C(C=C4)C(O[H])=O)Br)C5=C(C=C(C=C5)C(O[H])=O)[N+](=O)[O-])[H]
Processing row 1445 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=C(C(=C(C(=C2Br)C3=CC=C(C=C3)C(O[H])=O)C4=CC=C(C=C4)C(O[H])=O)Br)C5=C(C=C(C=C5)C(O[H])=O)[F])[H]
Processing row 1446 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=C(C(=C(C(=C2Br)C3=CC=C(C=C3)C(O[H])=O)C4=CC=C(C=C4)C(O[H])=O)Br)C5=CC(=C(C=C5)C(O[H])=O)[CH3])[H]
Processing row 1447 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=C(C(=C(C(=C2Br)C3=CC=C(C=C3)C(O[H])=O)C4=CC=C(C=C4)C(O[H])=O)Br)C5=CC(=C(C=C5)C(O[H])=O)[OH])[H]
Processing row 1448 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=C(C(=C(C(=C2Br)C3=CC=C(C=C3)C(O[H])=O)C4=CC=C(C=C4)C(O[H])=O)Br)C5=CC(=C(C=C5)C(O[H])=O)[NH2])[H]
Processing row 1449 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=C(C(=C(C(=C2Br)C

Processing row 1506 with SMILES: O(C(=O)C1=CC=C(C=C1)OCC2=C(C=C(C(=C2)COC3=CC=C(C=C3)C(O[H])=O)COC4=CC=C(C=C4)C(O[H])=O)COC5=CC(=C(C=C5)C(O[H])=O)[CH3])[H]
Processing row 1507 with SMILES: O(C(=O)C1=CC=C(C=C1)OCC2=C(C=C(C(=C2)COC3=CC=C(C=C3)C(O[H])=O)COC4=CC=C(C=C4)C(O[H])=O)COC5=CC(=C(C=C5)C(O[H])=O)[OH])[H]
Processing row 1508 with SMILES: O(C(=O)C1=CC=C(C=C1)OCC2=C(C=C(C(=C2)COC3=CC=C(C=C3)C(O[H])=O)COC4=CC=C(C=C4)C(O[H])=O)COC5=CC(=C(C=C5)C(O[H])=O)[NH2])[H]
Processing row 1509 with SMILES: O(C(=O)C1=CC=C(C=C1)OCC2=C(C=C(C(=C2)COC3=CC=C(C=C3)C(O[H])=O)COC4=CC=C(C=C4)C(O[H])=O)COC5=CC(=C(C=C5)C(O[H])=O)[N+](=O)[O-])[H]
Processing row 1510 with SMILES: O(C(=O)C1=CC=C(C=C1)OCC2=C(C=C(C(=C2)COC3=CC=C(C=C3)C(O[H])=O)COC4=CC=C(C=C4)C(O[H])=O)COC5=CC(=C(C=C5)C(O[H])=O)[F])[H]
Processing row 1511 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC(=CC(=C2)C3=CC=C(C=C3)C(O[H])=O)C4=CC(=C(C=C4)C5=CC(=CC(=C5)C6=CC=C(C=C6)C(O[H])=O)C7=CC=C(C=C7)C(O[H])=O)[CH3])[H]
Processing row 1512 with SMILES: O(C(=O)C1

Processing row 1553 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC(=C(C(=C2)C3=CC=C(C=C3)C(O[H])=O)[NH2])C5=C4C=CC=CC4=C(C=C5)C6=CC(=CC(=C6)C7=CC=C(C=C7)C(O[H])=O)C8=CC=C(C=C8)C(O[H])=O)[H]
Processing row 1554 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC(=C(C(=C2)C3=CC=C(C=C3)C(O[H])=O)[N+](=O)[O-])C5=C4C=CC=CC4=C(C=C5)C6=CC(=CC(=C6)C7=CC=C(C=C7)C(O[H])=O)C8=CC=C(C=C8)C(O[H])=O)[H]
Processing row 1555 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC(=C(C(=C2)C3=CC=C(C=C3)C(O[H])=O)[F])C5=C4C=CC=CC4=C(C=C5)C6=CC(=CC(=C6)C7=CC=C(C=C7)C(O[H])=O)C8=CC=C(C=C8)C(O[H])=O)[H]
Processing row 1556 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC(=CC(=C2[CH3])C3=CC=C(C=C3)C(O[H])=O)C5=C4C=CC=CC4=C(C=C5)C6=CC(=CC(=C6)C7=CC=C(C=C7)C(O[H])=O)C8=CC=C(C=C8)C(O[H])=O)[H]
Processing row 1557 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC(=CC(=C2[OH])C3=CC=C(C=C3)C(O[H])=O)C5=C4C=CC=CC4=C(C=C5)C6=CC(=CC(=C6)C7=CC=C(C=C7)C(O[H])=O)C8=CC=C(C=C8)C(O[H])=O)[H]
Processing row 1558 with SMILES: O(C(=O)C1=CC=C(C=C1)C2=CC(=CC(=C2[NH2])C3=CC=C(C=C3)C(O[H

Processing row 1597 with SMILES: O(C(=O)C1=CC=C(C=C1[OH])C2=CC(=CC(=C2)C3=CC=C(C=C3)C(O[H])=O)C5=C4C=CC=CC4=C(C6=CC=CC=C56)C7=CC(=CC(=C7)C8=CC=C(C=C8)C(O[H])=O)C9=CC=C(C=C9)C(O[H])=O)[H]
Processing row 1598 with SMILES: O(C(=O)C1=CC=C(C=C1[NH2])C2=CC(=CC(=C2)C3=CC=C(C=C3)C(O[H])=O)C5=C4C=CC=CC4=C(C6=CC=CC=C56)C7=CC(=CC(=C7)C8=CC=C(C=C8)C(O[H])=O)C9=CC=C(C=C9)C(O[H])=O)[H]
Processing row 1599 with SMILES: O(C(=O)C1=CC=C(C=C1[N+](=O)[O-])C2=CC(=CC(=C2)C3=CC=C(C=C3)C(O[H])=O)C5=C4C=CC=CC4=C(C6=CC=CC=C56)C7=CC(=CC(=C7)C8=CC=C(C=C8)C(O[H])=O)C9=CC=C(C=C9)C(O[H])=O)[H]
Processing row 1600 with SMILES: O(C(=O)C1=CC=C(C=C1[F])C2=CC(=CC(=C2)C3=CC=C(C=C3)C(O[H])=O)C5=C4C=CC=CC4=C(C6=CC=CC=C56)C7=CC(=CC(=C7)C8=CC=C(C=C8)C(O[H])=O)C9=CC=C(C=C9)C(O[H])=O)[H]
Processing row 1601 with SMILES: O(C(=O)C1=CC=C(C=C1)N(C2=CC=C(C=C2)C(O[H])=O)C3=CC=C(C(=C3)[CH3])C4=CC=C(C=C4)N(C5=CC=C(C=C5)C(O[H])=O)C6=CC=C(C=C6)C(O[H])=O)[H]
Processing row 1602 with SMILES: O(C(=O)C1=CC=C(C=C1)N(C2=CC=C(C=C2)C(O[H])=O)C3=

Processing row 1651 with SMILES: O(C(=O)C1=CC(=CC(=C1[NH2])C(O[H])=O)C2=CC=C(C=C2)C3=CC=C(C=C3)C4=CC(=CC(=C4)C(O[H])=O)C(O[H])=O)[H]
Processing row 1652 with SMILES: O(C(=O)C1=CC(=CC(=C1[N+](=O)[O-])C(O[H])=O)C2=CC=C(C=C2)C3=CC=C(C=C3)C4=CC(=CC(=C4)C(O[H])=O)C(O[H])=O)[H]
Processing row 1653 with SMILES: O(C(=O)C1=CC(=CC(=C1[F])C(O[H])=O)C2=CC=C(C=C2)C3=CC=C(C=C3)C4=CC(=CC(=C4)C(O[H])=O)C(O[H])=O)[H]
Processing row 1654 with SMILES: O(C(=O)C1=C(C(=C(C(=C1)N=NC2=CC(=C(C(=C2)C(O[H])=O)C(O[H])=O)C(O[H])=O)[CH3])C(O[H])=O)C(O[H])=O)[H]
Processing row 1655 with SMILES: O(C(=O)C1=C(C(=C(C(=C1)N=NC2=CC(=C(C(=C2)C(O[H])=O)C(O[H])=O)C(O[H])=O)[OH])C(O[H])=O)C(O[H])=O)[H]
Processing row 1656 with SMILES: O(C(=O)C1=C(C(=C(C(=C1)N=NC2=CC(=C(C(=C2)C(O[H])=O)C(O[H])=O)C(O[H])=O)[NH2])C(O[H])=O)C(O[H])=O)[H]
Processing row 1657 with SMILES: O(C(=O)C1=C(C(=C(C(=C1)N=NC2=CC(=C(C(=C2)C(O[H])=O)C(O[H])=O)C(O[H])=O)[N+](=O)[O-])C(O[H])=O)C(O[H])=O)[H]
Processing row 1658 with SMILES: O(C(=O)C1=C(C(=C(C(=C

Error processing row 1706 with SMILES: N(C1=C(C=CC(=C1[CH3])C#CC2=CC=C(C=C2)C(O[H])=O)C#CC3=CC=C(C=C3)C(O[H])=O)([H])[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1707 with SMILES: N(C1=C(C=CC(=C1[OH])C#CC2=CC=C(C=C2)C(O[H])=O)C#CC3=CC=C(C=C3)C(O[H])=O)([H])[H]
Error processing row 1707 with SMILES: N(C1=C(C=CC(=C1[OH])C#CC2=CC=C(C=C2)C(O[H])=O)C#CC3=CC=C(C=C3)C(O[H])=O)([H])[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1708 with SMILES: N(C1=C(C=CC(=C1[NH2])C#CC2=CC=C(C=C2)C(O[H])=O)C#CC3=CC=C(C=C3)C(O[H])=O)([H])[H]
Error processing row 1708 with SMILES: N(C1=C(C=CC(=C1[NH2])C#CC2=CC=C(C=C2)C(O[H])=O)C#CC3=CC=C(C=C3)C(O[H])=O)([H])[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1709 with SMILES: N(C1=C(C=CC(=C1[N+]

Error processing row 1729 with SMILES: N(C1=C(C=CC(=C1)C#CC2=CC=C(C=C2)C(O[H])=O)C#CC3=CC=C(C(=C3)[N+](=O)[O-])C(O[H])=O)([H])[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1730 with SMILES: N(C1=C(C=CC(=C1)C#CC2=CC=C(C=C2)C(O[H])=O)C#CC3=CC=C(C(=C3)[F])C(O[H])=O)([H])[H]
Error processing row 1730 with SMILES: N(C1=C(C=CC(=C1)C#CC2=CC=C(C=C2)C(O[H])=O)C#CC3=CC=C(C(=C3)[F])C(O[H])=O)([H])[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1731 with SMILES: N(C1=C(C=CC(=C1)C#CC2=C(C=C(C=C2)C(O[H])=O)[CH3])C#CC3=CC=C(C=C3)C(O[H])=O)([H])[H]
Error processing row 1731 with SMILES: N(C1=C(C=CC(=C1)C#CC2=C(C=C(C=C2)C(O[H])=O)[CH3])C#CC3=CC=C(C=C3)C(O[H])=O)([H])[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1732 with SMILES: N(C1

Error processing row 1753 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C(=C2)[NH2])C(O[H])=O)OC)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1754 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C(=C2)[N+](=O)[O-])C(O[H])=O)OC)C#CC3=CC=C(C=C3)C(O[H])=O
Error processing row 1754 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C(=C2)[N+](=O)[O-])C(O[H])=O)OC)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1755 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C(=C2)[F])C(O[H])=O)OC)C#CC3=CC=C(C=C3)C(O[H])=O
Error processing row 1755 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C(=C2)[F])C(O[H])=O)OC)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1756 with SMILES: COC1=C(C(=C(C(=C1)C#CC

Error processing row 1776 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C=C2)C(O[H])=O)OCC#C[H])C#CC3=CC=C(C=C3[CH3])C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1777 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C=C2)C(O[H])=O)OCC#C[H])C#CC3=CC=C(C=C3[OH])C(O[H])=O
Error processing row 1777 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C=C2)C(O[H])=O)OCC#C[H])C#CC3=CC=C(C=C3[OH])C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1778 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C=C2)C(O[H])=O)OCC#C[H])C#CC3=CC=C(C=C3[NH2])C(O[H])=O
Error processing row 1778 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C=C2)C(O[H])=O)OCC#C[H])C#CC3=CC=C(C=C3[NH2])C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1779 with SMILES: COC1=C(C=C(C(=

Error processing row 1798 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C=C2[NH2])C(O[H])=O)OCC#C[Si](C(C)C)(C(C)C)C(C)C)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1799 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C=C2[N+](=O)[O-])C(O[H])=O)OCC#C[Si](C(C)C)(C(C)C)C(C)C)C#CC3=CC=C(C=C3)C(O[H])=O
Error processing row 1799 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C=C2[N+](=O)[O-])C(O[H])=O)OCC#C[Si](C(C)C)(C(C)C)C(C)C)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1800 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C=C2[F])C(O[H])=O)OCC#C[Si](C(C)C)(C(C)C)C(C)C)C#CC3=CC=C(C=C3)C(O[H])=O
Error processing row 1800 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C=C2[F])C(O[H])=O)OCC#C[Si](C(C)C)(C(C)C)C(C)C)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGR

Error processing row 1819 with SMILES: CCCCCOC1=C(C=C(C(=C1[N+](=O)[O-])C#CC2=CC=C(C=C2)C(O[H])=O)OCCCCC)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1820 with SMILES: CCCCCOC1=C(C=C(C(=C1[F])C#CC2=CC=C(C=C2)C(O[H])=O)OCCCCC)C#CC3=CC=C(C=C3)C(O[H])=O
Error processing row 1820 with SMILES: CCCCCOC1=C(C=C(C(=C1[F])C#CC2=CC=C(C=C2)C(O[H])=O)OCCCCC)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1821 with SMILES: CCCCCOC1=C(C=C(C(=C1)C#CC2=CC=C(C=C2[CH3])C(O[H])=O)OCCCCC)C#CC3=CC=C(C=C3)C(O[H])=O
Error processing row 1821 with SMILES: CCCCCOC1=C(C=C(C(=C1)C#CC2=CC=C(C=C2[CH3])C(O[H])=O)OCCCCC)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1822 with SMILES:

Error processing row 1841 with SMILES: CCCCCCCCCCCCC1=C(C=C(C(=C1)C#CC2=CC=C(C(=C2)[CH3])C(O[H])=O)CCCCCCCCCCCC)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1842 with SMILES: CCCCCCCCCCCCC1=C(C=C(C(=C1)C#CC2=CC=C(C(=C2)[OH])C(O[H])=O)CCCCCCCCCCCC)C#CC3=CC=C(C=C3)C(O[H])=O
Error processing row 1842 with SMILES: CCCCCCCCCCCCC1=C(C=C(C(=C1)C#CC2=CC=C(C(=C2)[OH])C(O[H])=O)CCCCCCCCCCCC)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1843 with SMILES: CCCCCCCCCCCCC1=C(C=C(C(=C1)C#CC2=CC=C(C(=C2)[NH2])C(O[H])=O)CCCCCCCCCCCC)C#CC3=CC=C(C=C3)C(O[H])=O
Error processing row 1843 with SMILES: CCCCCCCCCCCCC1=C(C=C(C(=C1)C#CC2=CC=C(C(=C2)[NH2])C(O[H])=O)CCCCCCCCCCCC)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output f

Error processing row 1863 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C(=C2)[NH2])C(O[H])=O)OCCCC3=CC=CO3)C#CC4=CC=C(C=C4)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1864 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C(=C2)[N+](=O)[O-])C(O[H])=O)OCCCC3=CC=CO3)C#CC4=CC=C(C=C4)C(O[H])=O
Error processing row 1864 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C(=C2)[N+](=O)[O-])C(O[H])=O)OCCCC3=CC=CO3)C#CC4=CC=C(C=C4)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1865 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C(=C2)[F])C(O[H])=O)OCCCC3=CC=CO3)C#CC4=CC=C(C=C4)C(O[H])=O
Error processing row 1865 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C(=C2)[F])C(O[H])=O)OCCCC3=CC=CO3)C#CC4=CC=C(C=C4)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0


Error processing row 1886 with SMILES: O(C(=O)C1=CC=C(C(=C1)[CH3])C#CC3=C2C=CC=CC2=C(C4=CC=CC=C34)C#CC5=CC=C(C=C5)C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1887 with SMILES: O(C(=O)C1=CC=C(C(=C1)[OH])C#CC3=C2C=CC=CC2=C(C4=CC=CC=C34)C#CC5=CC=C(C=C5)C(O[H])=O)[H]
Error processing row 1887 with SMILES: O(C(=O)C1=CC=C(C(=C1)[OH])C#CC3=C2C=CC=CC2=C(C4=CC=CC=C34)C#CC5=CC=C(C=C5)C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1888 with SMILES: O(C(=O)C1=CC=C(C(=C1)[NH2])C#CC3=C2C=CC=CC2=C(C4=CC=CC=C34)C#CC5=CC=C(C=C5)C(O[H])=O)[H]
Error processing row 1888 with SMILES: O(C(=O)C1=CC=C(C(=C1)[NH2])C#CC3=C2C=CC=CC2=C(C4=CC=CC=C34)C#CC5=CC=C(C=C5)C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 18

Error processing row 1908 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=CC(=CC(=C2)C#CC3=CC=C(C(=C3)[NH2])C(O[H])=O)C#CC4=CC=C(C=C4)C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1909 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=CC(=CC(=C2)C#CC3=CC=C(C(=C3)[N+](=O)[O-])C(O[H])=O)C#CC4=CC=C(C=C4)C(O[H])=O)[H]
Error processing row 1909 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=CC(=CC(=C2)C#CC3=CC=C(C(=C3)[N+](=O)[O-])C(O[H])=O)C#CC4=CC=C(C=C4)C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1910 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=CC(=CC(=C2)C#CC3=CC=C(C(=C3)[F])C(O[H])=O)C#CC4=CC=C(C=C4)C(O[H])=O)[H]
Error processing row 1910 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=CC(=CC(=C2)C#CC3=CC=C(C(=C3)[F])C(O[H])=O)C#CC4=CC=C(C=C4)C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest


Error processing row 1926 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=CC(=CC(=C2)C#CC3=CC=C(C=C3)C(O[H])=O)C4=CC(=CC(=C4)C#CC5=CC=C(C=C5)C(O[H])=O)C#CC6=CC(=C(C=C6)C(O[H])=O)[CH3])[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1927 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=CC(=CC(=C2)C#CC3=CC=C(C=C3)C(O[H])=O)C4=CC(=CC(=C4)C#CC5=CC=C(C=C5)C(O[H])=O)C#CC6=CC(=C(C=C6)C(O[H])=O)[OH])[H]
Error processing row 1927 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=CC(=CC(=C2)C#CC3=CC=C(C=C3)C(O[H])=O)C4=CC(=CC(=C4)C#CC5=CC=C(C=C5)C(O[H])=O)C#CC6=CC(=C(C=C6)C(O[H])=O)[OH])[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1928 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=CC(=CC(=C2)C#CC3=CC=C(C=C3)C(O[H])=O)C4=CC(=CC(=C4)C#CC5=CC=C(C=C5)C(O[H])=O)C#CC6=CC(=C(C=C6)C(O[H])=O)[NH2])[H]
Error processing row 1928 with SMILES: O(C(=O)C1=CC=C(C=C1)C

Error processing row 1943 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=C4C=CC5=C(C=C(C6=CC=C(C(=C2)C#CC3=CC=C(C=C3)C(O[H])=O)C4=C56)C#CC7=C(C=C(C=C7)C(O[H])=O)[NH2])C#CC8=CC=C(C=C8)C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1944 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=C4C=CC5=C(C=C(C6=CC=C(C(=C2)C#CC3=CC=C(C=C3)C(O[H])=O)C4=C56)C#CC7=C(C=C(C=C7)C(O[H])=O)[N+](=O)[O-])C#CC8=CC=C(C=C8)C(O[H])=O)[H]
Error processing row 1944 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=C4C=CC5=C(C=C(C6=CC=C(C(=C2)C#CC3=CC=C(C=C3)C(O[H])=O)C4=C56)C#CC7=C(C=C(C=C7)C(O[H])=O)[N+](=O)[O-])C#CC8=CC=C(C=C8)C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1945 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=C4C=CC5=C(C=C(C6=CC=C(C(=C2)C#CC3=CC=C(C=C3)C(O[H])=O)C4=C56)C#CC7=C(C=C(C=C7)C(O[H])=O)[F])C#CC8=CC=C(C=C8)C(O[H])=O)[H]
Error 

Error processing row 1960 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=CC=C(C=C2)C(C3=CC=C(C=C3)C#CC4=CC=C(C=C4)C(O[H])=O)(C5=CC(=C(C=C5)C#CC6=CC=C(C=C6)C(O[H])=O)[F])C7=CC=C(C=C7)C#CC8=CC=C(C=C8)C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1961 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=CC=C(C=C2)C(C3=CC=C(C=C3)C#CC4=CC=C(C=C4)C(O[H])=O)(C5=CC=C(C=C5)C#CC6=C(C=C(C=C6)C(O[H])=O)[CH3])C7=CC=C(C=C7)C#CC8=CC=C(C=C8)C(O[H])=O)[H]
Error processing row 1961 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=CC=C(C=C2)C(C3=CC=C(C=C3)C#CC4=CC=C(C=C4)C(O[H])=O)(C5=CC=C(C=C5)C#CC6=C(C=C(C=C6)C(O[H])=O)[CH3])C7=CC=C(C=C7)C#CC8=CC=C(C=C8)C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1962 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=CC=C(C=C2)C(C3=CC=C(C=C3)C#CC4=CC=C(C=C4)C(O[H])=O)(C5=CC=C(C=C5)C#CC6=C(C=C(C=C6)C(O[H])=O)[

Error processing row 1977 with SMILES: O(C(=O)C1=CC(=CC(=C1)C#CC3=C2C(=CC=CC2=C(C4=CC=CC=C34)C#CC5=CC(=CC(=C5)C(O[H])=O)C(O[H])=O)[OH])C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1978 with SMILES: O(C(=O)C1=CC(=CC(=C1)C#CC3=C2C(=CC=CC2=C(C4=CC=CC=C34)C#CC5=CC(=CC(=C5)C(O[H])=O)C(O[H])=O)[NH2])C(O[H])=O)[H]
Error processing row 1978 with SMILES: O(C(=O)C1=CC(=CC(=C1)C#CC3=C2C(=CC=CC2=C(C4=CC=CC=C34)C#CC5=CC(=CC(=C5)C(O[H])=O)C(O[H])=O)[NH2])C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Processing row 1979 with SMILES: O(C(=O)C1=CC(=CC(=C1)C#CC3=C2C(=CC=CC2=C(C4=CC=CC=C34)C#CC5=CC(=CC(=C5)C(O[H])=O)C(O[H])=O)[N+](=O)[O-])C(O[H])=O)[H]
Error processing row 1979 with SMILES: O(C(=O)C1=CC(=CC(=C1)C#CC3=C2C(=CC=CC2=C(C4=CC=CC=C34)C#CC5=CC(=CC(=C5)C(O[H])=O)C(O[H])=O)[N+](=O)[O-])C(O[H])=O)[H]. Error: [Er

Error processing row 1716 with SMILES: N(C1=C(C=C(C(=C1)C#CC2=CC=C(C=C2)C(O[H])=O)[CH3])C#CC3=CC=C(C=C3)C(O[H])=O)([H])[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1717 with SMILES: N(C1=C(C=C(C(=C1)C#CC2=CC=C(C=C2)C(O[H])=O)[OH])C#CC3=CC=C(C=C3)C(O[H])=O)([H])[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1718 with SMILES: N(C1=C(C=C(C(=C1)C#CC2=CC=C(C=C2)C(O[H])=O)[NH2])C#CC3=CC=C(C=C3)C(O[H])=O)([H])[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1719 with SMILES: N(C1=C(C=C(C(=C1)C#CC2=CC=C(C=C2)C(O[H])=O)[N+](=O)[O-])C#CC3=CC=C(C=C3)C(O[H])=O)([H])[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1720 

Error processing row 1751 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C(=C2)[CH3])C(O[H])=O)OC)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1752 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C(=C2)[OH])C(O[H])=O)OC)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1753 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C(=C2)[NH2])C(O[H])=O)OC)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1754 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C(=C2)[N+](=O)[O-])C(O[H])=O)OC)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1755 with SMILES: COC1=C(

Error processing row 1785 with SMILES: COC1=C(C=C(C(=C1)C#CC2=CC=C(C=C2)C(O[H])=O)OCC#C[H])C#CC3=CC=C(C(=C3)[F])C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1786 with SMILES: COC1=C(C(=C(C(=C1)C#CC2=CC=C(C=C2)C(O[H])=O)OCC#C[Si](C(C)C)(C(C)C)C(C)C)[CH3])C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1787 with SMILES: COC1=C(C(=C(C(=C1)C#CC2=CC=C(C=C2)C(O[H])=O)OCC#C[Si](C(C)C)(C(C)C)C(C)C)[OH])C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1788 with SMILES: COC1=C(C(=C(C(=C1)C#CC2=CC=C(C=C2)C(O[H])=O)OCC#C[Si](C(C)C)(C(C)C)C(C)C)[NH2])C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output forma

Error processing row 1817 with SMILES: CCCCCOC1=C(C=C(C(=C1[OH])C#CC2=CC=C(C=C2)C(O[H])=O)OCCCCC)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1818 with SMILES: CCCCCOC1=C(C=C(C(=C1[NH2])C#CC2=CC=C(C=C2)C(O[H])=O)OCCCCC)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1819 with SMILES: CCCCCOC1=C(C=C(C(=C1[N+](=O)[O-])C#CC2=CC=C(C=C2)C(O[H])=O)OCCCCC)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1820 with SMILES: CCCCCOC1=C(C=C(C(=C1[F])C#CC2=CC=C(C=C2)C(O[H])=O)OCCCCC)C#CC3=CC=C(C=C3)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 182

Error processing row 1850 with SMILES: COC1=C(C(=C(C(=C1)C#CC2=CC=C(C=C2)C(O[H])=O)OCCCC3=CC=CO3)[F])C#CC4=CC=C(C=C4)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1851 with SMILES: COC1=C(C=C(C(=C1[CH3])C#CC2=CC=C(C=C2)C(O[H])=O)OCCCC3=CC=CO3)C#CC4=CC=C(C=C4)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1852 with SMILES: COC1=C(C=C(C(=C1[OH])C#CC2=CC=C(C=C2)C(O[H])=O)OCCCC3=CC=CO3)C#CC4=CC=C(C=C4)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1853 with SMILES: COC1=C(C=C(C(=C1[NH2])C#CC2=CC=C(C=C2)C(O[H])=O)OCCCC3=CC=CO3)C#CC4=CC=C(C=C4)C(O[H])=O. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing 

Error processing row 1884 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC3=C2C(=CC=CC2=C(C4=CC=CC=C34)C#CC5=CC=C(C=C5)C(O[H])=O)[N+](=O)[O-])[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1885 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC3=C2C(=CC=CC2=C(C4=CC=CC=C34)C#CC5=CC=C(C=C5)C(O[H])=O)[F])[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1886 with SMILES: O(C(=O)C1=CC=C(C(=C1)[CH3])C#CC3=C2C=CC=CC2=C(C4=CC=CC=C34)C#CC5=CC=C(C=C5)C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1887 with SMILES: O(C(=O)C1=CC=C(C(=C1)[OH])C#CC3=C2C=CC=CC2=C(C4=CC=CC=C34)C#CC5=CC=C(C=C5)C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error pr

Error processing row 1916 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=CC(=CC(=C2)C#CC3=CC=C(C=C3)C(O[H])=O)C4=CC(=C(C(=C4)C#CC5=CC=C(C=C5)C(O[H])=O)[CH3])C#CC6=CC=C(C=C6)C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1917 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=CC(=CC(=C2)C#CC3=CC=C(C=C3)C(O[H])=O)C4=CC(=C(C(=C4)C#CC5=CC=C(C=C5)C(O[H])=O)[OH])C#CC6=CC=C(C=C6)C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1918 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=CC(=CC(=C2)C#CC3=CC=C(C=C3)C(O[H])=O)C4=CC(=C(C(=C4)C#CC5=CC=C(C=C5)C(O[H])=O)[NH2])C#CC6=CC=C(C=C6)C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1919 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=CC(=CC(=C2)C#CC3=CC=C(C=C3)C(O[H])=O)C4=CC

Error processing row 1944 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=C4C=CC5=C(C=C(C6=CC=C(C(=C2)C#CC3=CC=C(C=C3)C(O[H])=O)C4=C56)C#CC7=C(C=C(C=C7)C(O[H])=O)[N+](=O)[O-])C#CC8=CC=C(C=C8)C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1945 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=C4C=CC5=C(C=C(C6=CC=C(C(=C2)C#CC3=CC=C(C=C3)C(O[H])=O)C4=C56)C#CC7=C(C=C(C=C7)C(O[H])=O)[F])C#CC8=CC=C(C=C8)C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1946 with SMILES: O(C(=O)C1=C(C=C(C=C1)C#CC2=C4C=CC5=C(C=C(C6=CC=C(C(=C2)C#CC3=CC=C(C=C3)C(O[H])=O)C4=C56)C#CC7=CC=C(C=C7)C(O[H])=O)C#CC8=CC=C(C=C8)C(O[H])=O)[CH3])[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1947 with SMILES: O(C(=O)C1=C(C=C(C=C1)C#CC2=C4C=C

Error processing row 1970 with SMILES: O(C(=O)C1=CC=C(C=C1)C#CC2=CC=C(C=C2)C(C3=CC=C(C=C3)C#CC4=CC=C(C=C4)C(O[H])=O)(C5=CC=C(C=C5)C#CC6=CC(=C(C=C6)C(O[H])=O)[F])C7=CC=C(C=C7)C#CC8=CC=C(C=C8)C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1971 with SMILES: O(C(=O)C1=CC(=CC(=C1)C#CC3=C2C=C(C=CC2=C(C4=CC=CC=C34)C#CC5=CC(=CC(=C5)C(O[H])=O)C(O[H])=O)[CH3])C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1972 with SMILES: O(C(=O)C1=CC(=CC(=C1)C#CC3=C2C=C(C=CC2=C(C4=CC=CC=C34)C#CC5=CC(=CC(=C5)C(O[H])=O)C(O[H])=O)[OH])C(O[H])=O)[H]. Error: [Errno Expecting value] Status: 400
Code: PUGREST.BadRequest
Message: Output format unspecified or invalid
: 0
Error processing row 1973 with SMILES: O(C(=O)C1=CC(=CC(=C1)C#CC3=C2C=C(C=CC2=C(C4=CC=CC=C34)C#CC5=CC(=CC(=C5)C(O[H])=O)C(O[H])=O)[NH2])C(O[

In [48]:
import pandas as pd
import requests
import openpyxl
import urllib

def request_castus_SMILES(file_path, base_url="https://cactus.nci.nih.gov/chemical/structure/"):
    df = pd.read_excel(file_path, engine='openpyxl')
    
    total_rows = len(df)

    for idx, row in df.iterrows():
        # Skip if "Output cactus SMILES" is already filled
        if pd.notna(row["Output cactus SMILES"]):
            continue

        # If "Output Canonical SMILES" or "Output IUPAC name" or input is "Invalid", set "Output cactus SMILES" to "Invalid"
        if row["Output Canonical SMILES"] == "Invalid" or row["Output IUPAC name"] == "Invalid" or row['Output SMILES'] =="Invalid":
            df.at[idx, 'Output cactus SMILES'] = "Invalid"
            continue

        print(f"Processing row {idx + 1} of {total_rows}...")

        # First, try fetching with "Output Canonical SMILES"
        response = requests.get(base_url + urllib.parse.quote(row['Output SMILES']) + "/SMILES")

        # If that fails, try "Output IUPAC name"
       # if response.status_code == 404:
         #   response = requests.get(base_url + urllib.parse.quote(row['Output IUPAC name']) + "/SMILES")

        # If the response code is 200 (OK), update the "Output cactus SMILES" column
        if response.status_code == 200:
            df.at[idx, 'Output cactus SMILES'] = response.text
            #print(f"SMILES code {response.text} was found using {row['Output SMILES'] if response.text in row['Output SMILES'] else row['Output IUPAC name']}")
        else:
            df.at[idx, 'Output cactus SMILES'] = "Invalid"
            if not (row['Output SMILES'] == "Invalid"):
                print(f"SMILES code {urllib.parse.quote(row['Output SMILES'])} was not found.") 

    # Save the final DataFrame to a new Excel file
    with pd.ExcelWriter(file_path, engine='openpyxl', mode='a') as writer:
        df.to_excel(writer, sheet_name="castus output", index=False)
    print(f"Data saved to the 'castus output' sheet in {file_path}")
    

    
    
    
# Run the function with your file path
request_castus_SMILES("Method S.xlsx")




Processing row 1 of 1990...
Processing row 2 of 1990...
Processing row 3 of 1990...
Processing row 4 of 1990...
Processing row 5 of 1990...
Processing row 6 of 1990...
Processing row 7 of 1990...
Processing row 8 of 1990...
Processing row 9 of 1990...
Processing row 10 of 1990...
Processing row 11 of 1990...
Processing row 12 of 1990...
Processing row 13 of 1990...
Processing row 14 of 1990...
Processing row 15 of 1990...
Processing row 16 of 1990...
Processing row 17 of 1990...
Processing row 18 of 1990...
Processing row 19 of 1990...
Processing row 20 of 1990...
Processing row 21 of 1990...
Processing row 22 of 1990...
Processing row 23 of 1990...
Processing row 24 of 1990...
Processing row 25 of 1990...
Processing row 26 of 1990...
Processing row 27 of 1990...
Processing row 28 of 1990...
Processing row 29 of 1990...
Processing row 30 of 1990...
Processing row 31 of 1990...
Processing row 32 of 1990...
Processing row 33 of 1990...
Processing row 34 of 1990...
Processing row 35 of 19

Processing row 345 of 1990...
Processing row 346 of 1990...
Processing row 347 of 1990...
Processing row 348 of 1990...
Processing row 349 of 1990...
Processing row 350 of 1990...
Processing row 351 of 1990...
Processing row 352 of 1990...
Processing row 353 of 1990...
Processing row 354 of 1990...
Processing row 355 of 1990...
Processing row 356 of 1990...
Processing row 357 of 1990...
Processing row 358 of 1990...
Processing row 359 of 1990...
Processing row 360 of 1990...
Processing row 361 of 1990...
Processing row 362 of 1990...
Processing row 363 of 1990...
Processing row 364 of 1990...
Processing row 365 of 1990...
Processing row 366 of 1990...
Processing row 367 of 1990...
Processing row 368 of 1990...
Processing row 369 of 1990...
Processing row 370 of 1990...
Processing row 371 of 1990...
Processing row 372 of 1990...
Processing row 373 of 1990...
Processing row 374 of 1990...
Processing row 375 of 1990...
Processing row 376 of 1990...
Processing row 377 of 1990...
Processing

Processing row 629 of 1990...
Processing row 630 of 1990...
Processing row 631 of 1990...
Processing row 632 of 1990...
Processing row 633 of 1990...
Processing row 634 of 1990...
Processing row 635 of 1990...
Processing row 636 of 1990...
Processing row 637 of 1990...
Processing row 638 of 1990...
Processing row 639 of 1990...
Processing row 640 of 1990...
Processing row 641 of 1990...
Processing row 642 of 1990...
Processing row 643 of 1990...
Processing row 644 of 1990...
Processing row 645 of 1990...
Processing row 646 of 1990...
Processing row 647 of 1990...
Processing row 648 of 1990...
Processing row 649 of 1990...
Processing row 650 of 1990...
Processing row 651 of 1990...
Processing row 652 of 1990...
Processing row 653 of 1990...
Processing row 654 of 1990...
Processing row 655 of 1990...
Processing row 656 of 1990...
Processing row 657 of 1990...
Processing row 658 of 1990...
Processing row 659 of 1990...
Processing row 660 of 1990...
Processing row 661 of 1990...
Processing

Processing row 903 of 1990...
Processing row 904 of 1990...
Processing row 905 of 1990...
Processing row 906 of 1990...
Processing row 907 of 1990...
Processing row 908 of 1990...
Processing row 909 of 1990...
Processing row 910 of 1990...
Processing row 911 of 1990...
Processing row 912 of 1990...
Processing row 913 of 1990...
Processing row 914 of 1990...
Processing row 915 of 1990...
Processing row 916 of 1990...
Processing row 917 of 1990...
Processing row 918 of 1990...
Processing row 919 of 1990...
Processing row 920 of 1990...
Processing row 921 of 1990...
Processing row 922 of 1990...
Processing row 923 of 1990...
Processing row 924 of 1990...
Processing row 925 of 1990...
Processing row 926 of 1990...
Processing row 927 of 1990...
Processing row 928 of 1990...
Processing row 929 of 1990...
Processing row 930 of 1990...
Processing row 931 of 1990...
Processing row 932 of 1990...
Processing row 933 of 1990...
Processing row 934 of 1990...
Processing row 935 of 1990...
Processing

Processing row 1171 of 1990...
Processing row 1172 of 1990...
Processing row 1173 of 1990...
Processing row 1174 of 1990...
Processing row 1175 of 1990...
Processing row 1176 of 1990...
Processing row 1177 of 1990...
Processing row 1178 of 1990...
Processing row 1179 of 1990...
Processing row 1180 of 1990...
Processing row 1181 of 1990...
Processing row 1182 of 1990...
Processing row 1183 of 1990...
Processing row 1184 of 1990...
Processing row 1185 of 1990...
Processing row 1186 of 1990...
Processing row 1187 of 1990...
Processing row 1188 of 1990...
Processing row 1189 of 1990...
Processing row 1190 of 1990...
Processing row 1191 of 1990...
Processing row 1192 of 1990...
Processing row 1193 of 1990...
Processing row 1194 of 1990...
Processing row 1195 of 1990...
Processing row 1196 of 1990...
Processing row 1197 of 1990...
Processing row 1198 of 1990...
Processing row 1199 of 1990...
Processing row 1200 of 1990...
Processing row 1201 of 1990...
Processing row 1202 of 1990...
Processi

Processing row 1436 of 1990...
Processing row 1437 of 1990...
Processing row 1438 of 1990...
Processing row 1439 of 1990...
Processing row 1440 of 1990...
Processing row 1441 of 1990...
Processing row 1442 of 1990...
Processing row 1443 of 1990...
Processing row 1444 of 1990...
Processing row 1445 of 1990...
Processing row 1446 of 1990...
Processing row 1447 of 1990...
Processing row 1448 of 1990...
Processing row 1449 of 1990...
Processing row 1450 of 1990...
Processing row 1451 of 1990...
Processing row 1452 of 1990...
Processing row 1453 of 1990...
Processing row 1454 of 1990...
Processing row 1455 of 1990...
Processing row 1456 of 1990...
Processing row 1457 of 1990...
Processing row 1458 of 1990...
Processing row 1459 of 1990...
Processing row 1460 of 1990...
Processing row 1461 of 1990...
Processing row 1462 of 1990...
Processing row 1463 of 1990...
Processing row 1464 of 1990...
Processing row 1465 of 1990...
Processing row 1466 of 1990...
Processing row 1467 of 1990...
Processi

Processing row 1701 of 1990...
Processing row 1702 of 1990...
Processing row 1703 of 1990...
Processing row 1704 of 1990...
Processing row 1705 of 1990...
Processing row 1706 of 1990...
Processing row 1707 of 1990...
Processing row 1708 of 1990...
Processing row 1709 of 1990...
Processing row 1710 of 1990...
Processing row 1711 of 1990...
Processing row 1712 of 1990...
Processing row 1713 of 1990...
Processing row 1714 of 1990...
Processing row 1715 of 1990...
Processing row 1716 of 1990...
Processing row 1717 of 1990...
Processing row 1718 of 1990...
Processing row 1719 of 1990...
Processing row 1720 of 1990...
Processing row 1721 of 1990...
Processing row 1722 of 1990...
Processing row 1723 of 1990...
Processing row 1724 of 1990...
Processing row 1725 of 1990...
Processing row 1726 of 1990...
Processing row 1727 of 1990...
Processing row 1728 of 1990...
Processing row 1729 of 1990...
Processing row 1730 of 1990...
Processing row 1731 of 1990...
Processing row 1732 of 1990...
Processi

Processing row 1966 of 1990...
Processing row 1967 of 1990...
Processing row 1968 of 1990...
Processing row 1969 of 1990...
Processing row 1970 of 1990...
Processing row 1971 of 1990...
Processing row 1972 of 1990...
Processing row 1973 of 1990...
Processing row 1974 of 1990...
Processing row 1975 of 1990...
Processing row 1976 of 1990...
Processing row 1977 of 1990...
Processing row 1978 of 1990...
Processing row 1979 of 1990...
Processing row 1980 of 1990...
Processing row 1981 of 1990...
Processing row 1982 of 1990...
Processing row 1983 of 1990...
Processing row 1984 of 1990...
Processing row 1985 of 1990...
Processing row 1986 of 1990...
Processing row 1987 of 1990...
Processing row 1988 of 1990...
Processing row 1989 of 1990...
Processing row 1990 of 1990...
Data saved to the 'output' sheet in Method S.xlsx


Data saved to the 'output' sheet in Method S.xlsx


In [50]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time, re

def smiles_to_iupac_name(KeyWord):
    
    if KeyWord =="Invalid" or "":
        return "Invalid"
    
    # Initialize Chrome browser
    browser = webdriver.Chrome()
    
    # Navigate to the specified URL
    browser.get("https://app.syntelly.com/smiles2iupac")
    
    # Find the input element on the webpage
    input_1 = browser.find_element(By.CSS_SELECTOR, 'input[aria-invalid="false"]')
    
    # Send the keyword to the input element and press Enter
    input_1.send_keys(KeyWord)
    input_1.send_keys("\n")

    # Wait for the page to load the results
    try:
        wait = WebDriverWait(browser, 10)
        div_element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.sc-gQSkpc.eWIKTX')))
    except:
        return "Unknown"

    # Get the HTML content of the page
    html = browser.page_source
    try:
        # Use regex to find the string that starts with "Results: " and ends with "</h4>"
        FindStr = re.compile(r'Results: (.*?)</h4>')
        result = re.findall(FindStr, html)[0]

        # If the result is a space, return "Invalid"
        if result == " ":
            return "Unknown"

        # Find the position of the first comma in the result
        comma_index = result.find(', ')
        if comma_index != -1:
            # If a comma exists, truncate the string up to the first comma
            truncated_result = result[:comma_index]
        else:
            # If no comma exists, return the entire result string
            truncated_result = result
        return truncated_result
    except:
        return "Unknown"
    
    
#smiles_to_iupac_name("O(C(=O)C1=CC=C(C=C1)C2=CC(=CC(=C2[S](O)(=O)=O)C3=CC=C(C=C3)C(O[H])=O)C5=C4C=CC=CC4=C(C6=CC=CC=C56)C7=CC(=CC(=C7)C8=CC=C(C=C8)C(O[H])=O)C9=CC=C(C=C9)C(O[H])=O)[H]")




def generate_iupac_names(filename):
    # Load the data from the given filename
    df = pd.read_excel(filename)
    
    # Create a new column for IUPAC names if it doesn't exist
    if 'Output IUPAC name' not in df.columns:
        df['Output IUPAC name'] = ""
    
    # Counters
    total_updates = 0
    blanks_filled = 0
    unknown_or_invalid_updated = 0
    
    for i, smiles in enumerate(df['Output cactus SMILES']):
        # Check if the value is neither empty nor "Invalid"
        if pd.notna(smiles) and smiles != "Invalid":
            # Check if the 'Output IUPAC name' cell is empty, "Unknown", or "Error"
            current_iupac_name = df.loc[i, 'Output IUPAC name']
            if pd.isna(current_iupac_name) or current_iupac_name in ["Unknown", "Error"]:
                try:
                    # Generate the IUPAC name using the provided function
                    iupac_name = smiles_to_iupac_name(smiles)
                    # Assign the generated IUPAC name to the "Output IUPAC name" column in the same row
                    df.loc[i, 'Output IUPAC name'] = iupac_name
                    
                    # Update counters
                    total_updates += 1
                    if pd.isna(current_iupac_name):
                        blanks_filled += 1
                        print(f"Row {i+1}: Filled blank with {iupac_name}")
                    elif current_iupac_name in ["Unknown", "Error"]:
                        unknown_or_invalid_updated += 1
                        print(f"Row {i+1}: Updated '{current_iupac_name}' to {iupac_name}")
                    
                except Exception as e:
                    print(f"Exception at row {i}: {e}")

    # Save the modified dataframe to a new sheet called "IUPAC output" in the same Excel file
    with pd.ExcelWriter(filename, engine='openpyxl', mode='a') as writer:
        df.to_excel(writer, sheet_name='IUPAC output', index=False)

    # Print summary
    print(f"\nSummary:")
    print(f"Total updates made: {total_updates}")
    print(f"Number of blanks filled: {blanks_filled}")
    print(f"Number of 'Unknown' or 'Invalid' entries updated: {unknown_or_invalid_updated}")
    print(f"Data saved to the 'IUPAC output' sheet in {filename}")

# Run the function
generate_iupac_names('Method S.xlsx')



Row 4: Updated 'Unknown' to 5-nitropyrazine-2,3-dicarboxylic acid
Row 46: Updated 'Unknown' to 2,5-dihydroxy-3-methylterephthalic acid
Row 50: Updated 'Unknown' to 3-fluoro-2,5-dihydroxyterephthalic acid
Row 56: Updated 'Unknown' to 2-methylcyclopenta-1,3-diene-1,4-dicarboxylic acid
Row 57: Updated 'Unknown' to 5-hydroxycyclopenta-3,5-diene-1,3-dicarboxylic acid
Row 58: Updated 'Unknown' to 2-aminocyclopenta-1,3-diene-1,4-dicarboxylic acid
Row 59: Updated 'Unknown' to 5-nitrocyclopenta-3,5-diene-1,3-dicarboxylic acid
Row 60: Updated 'Unknown' to 5-fluorocyclopenta-3,5-diene-1,3-dicarboxylic acid
Row 61: Updated 'Unknown' to 2-methylcyclopenta-3,5-diene-1,3-dicarboxylic acid
Row 62: Updated 'Unknown' to 2-hydroxycyclopenta-3,5-diene-1,3-dicarboxylic acid
Row 63: Updated 'Unknown' to 2-aminocyclopenta-3,5-diene-1,3-dicarboxylic acid
Row 64: Updated 'Unknown' to 2-nitrocyclopenta-3,5-diene-1,3-dicarboxylic acid
Row 65: Updated 'Unknown' to 2-fluorocyclopenta-3,5-diene-1,3-dicarboxylic aci

Row 269: Updated 'Unknown' to 2-amino-4-(4-carboxy-3-hydroxyphenyl)-6-hydroxybenzoic acid
Row 270: Updated 'Unknown' to 4-(4-carboxy-3-hydroxyphenyl)-2-hydroxy-6-nitrobenzoic acid
Row 271: Updated 'Unknown' to 4-(4-carboxy-3-hydroxyphenyl)-2-fluoro-6-hydroxybenzoic acid
Row 272: Updated 'Unknown' to 4-(4-carboxy-3-hydroxyphenyl)-2-hydroxy-5-methylbenzoic acid
Row 273: Updated 'Unknown' to 4-(4-carboxy-3-hydroxyphenyl)-2,5-dihydroxybenzoic acid
Row 274: Updated 'Unknown' to 5-amino-4-(4-carboxy-3-hydroxyphenyl)-2-hydroxybenzoic acid
Row 275: Updated 'Unknown' to 4-(4-carboxy-3-hydroxyphenyl)-2-hydroxy-5-nitrobenzoic acid
Row 276: Updated 'Unknown' to 4-(4-carboxy-3-hydroxyphenyl)-5-fluoro-2-hydroxybenzoic acid
Row 282: Updated 'Unknown' to 5-methyl-3,7,11-trithiatricyclo[6.3.0.0^2,6]undeca-1(8),2(6),4,9-tetraene-4,10-dicarboxylic acid
Row 283: Updated 'Unknown' to 5-hydroxy-3,7,11-trithiatricyclo[6.3.0.0^2,6]undeca-1(8),2(6),4,9-tetraene-4,10-dicarboxylic acid
Row 284: Updated 'Unknown'

Row 437: Updated 'Unknown' to 5-(4-carboxyphenyl)-4-fluorobenzene-1,3-dicarboxylic acid
Row 443: Updated 'Unknown' to 5-(4-carboxy-3-methylphenyl)benzene-1,3-dicarboxylic acid
Row 445: Updated 'Unknown' to 5-(3-amino-4-carboxyphenyl)benzene-1,3-dicarboxylic acid
Row 448: Updated 'Unknown' to 3,5-bis(4-carboxyphenyl)-2-methylbenzoic acid
Row 449: Updated 'Unknown' to 3,5-bis(4-carboxyphenyl)-2-hydroxybenzoic acid
Row 450: Updated 'Unknown' to 2-amino-3,5-bis(4-carboxyphenyl)benzoic acid
Row 451: Updated 'Unknown' to 3,5-bis(4-carboxyphenyl)-2-nitrobenzoic acid
Row 452: Updated 'Unknown' to 3,5-bis(4-carboxyphenyl)-2-fluorobenzoic acid
Row 453: Updated 'Unknown' to 3,5-bis(4-carboxyphenyl)-4-methylbenzoic acid
Row 454: Updated 'Unknown' to 3,5-bis(4-carboxyphenyl)-4-hydroxybenzoic acid
Row 456: Updated 'Unknown' to 3,5-bis(4-carboxyphenyl)-4-nitrobenzoic acid
Row 458: Updated 'Unknown' to 4-[3-carboxy-5-(4-carboxyphenyl)phenyl]-3-methylbenzoic acid
Row 459: Updated 'Unknown' to 4-[3-carb

Row 539: Updated 'Unknown' to 5-[2-(3,5-dicarboxyphenyl)-4-fluoropyrimidin-5-yl]benzene-1,3-dicarboxylic acid
Row 540: Updated 'Unknown' to 5-[2-(3,5-dicarboxyphenyl)pyrimidin-5-yl]-4-methylbenzene-1,3-dicarboxylic acid
Row 541: Updated 'Unknown' to 5-[2-(3,5-dicarboxyphenyl)pyrimidin-5-yl]-4-hydroxybenzene-1,3-dicarboxylic acid
Row 542: Updated 'Unknown' to 4-amino-5-[2-(3,5-dicarboxyphenyl)pyrimidin-5-yl]benzene-1,3-dicarboxylic acid
Row 543: Updated 'Unknown' to 5-[2-(3,5-dicarboxyphenyl)pyrimidin-5-yl]-4-nitrobenzene-1,3-dicarboxylic acid
Row 544: Updated 'Unknown' to 5-[2-(3,5-dicarboxyphenyl)pyrimidin-5-yl]-4-fluorobenzene-1,3-dicarboxylic acid
Row 545: Updated 'Unknown' to 5-[2-(3,5-dicarboxyphenyl)pyrimidin-5-yl]-2-methylbenzene-1,3-dicarboxylic acid
Row 546: Updated 'Unknown' to 5-[2-(3,5-dicarboxyphenyl)pyrimidin-5-yl]-2-hydroxybenzene-1,3-dicarboxylic acid
Row 547: Updated 'Unknown' to 2-amino-5-[2-(3,5-dicarboxyphenyl)pyrimidin-5-yl]benzene-1,3-dicarboxylic acid
Row 548: Up

Row 646: Updated 'Unknown' to 3-amino-4-[4-(4-carboxyphenyl)-2,5-dimethylphenyl]benzoic acid
Row 647: Updated 'Unknown' to 4-[4-(4-carboxyphenyl)-2,5-dimethylphenyl]-3-nitrobenzoic acid
Row 648: Updated 'Unknown' to 4-[4-(4-carboxyphenyl)-2,5-dimethylphenyl]-3-fluorobenzoic acid
Row 649: Updated 'Unknown' to 4-[4-(4-carboxyphenyl)-2,5-dimethylphenyl]-2-methylbenzoic acid
Row 650: Updated 'Unknown' to 4-[4-(4-carboxyphenyl)-2,5-dimethylphenyl]-2-hydroxybenzoic acid
Row 651: Updated 'Unknown' to 2-amino-4-[4-(4-carboxyphenyl)-2,5-dimethylphenyl]benzoic acid
Row 652: Updated 'Unknown' to 4-[4-(4-carboxyphenyl)-2,5-dimethylphenyl]-2-nitrobenzoic acid
Row 653: Updated 'Unknown' to 4-[4-(4-carboxyphenyl)-2,5-dimethylphenyl]-2-fluorobenzoic acid
Row 654: Updated 'Unknown' to 4-[2,5-bis(azidomethyl)-4-(4-carboxyphenyl)-3-methylphenyl]benzoic acid
Row 655: Updated 'Unknown' to 4-[2,5-bis(azidomethyl)-4-(4-carboxyphenyl)-3-hydroxyphenyl]benzoic acid
Row 656: Updated 'Unknown' to 4-[3-amino-2,5-b

Row 727: Updated 'Unknown' to 4-[(4-carboxy-2-chlorophenyl)diazenyl]-3-chloro-2-hydroxybenzoic acid
Row 728: Updated 'Unknown' to 2-amino-4-[(4-carboxy-2-chlorophenyl)diazenyl]-3-chlorobenzoic acid
Row 729: Updated 'Unknown' to 4-[(4-carboxy-2-chlorophenyl)diazenyl]-3-chloro-2-nitrobenzoic acid
Row 730: Updated 'Unknown' to 4-[(4-carboxy-2-chlorophenyl)diazenyl]-3-chloro-2-fluorobenzoic acid
Row 731: Updated 'Unknown' to 4-[(4-carboxy-2-chlorophenyl)diazenyl]-3-chloro-5-methylbenzoic acid
Row 732: Updated 'Unknown' to 4-[(4-carboxy-2-chlorophenyl)diazenyl]-3-chloro-5-hydroxybenzoic acid
Row 733: Updated 'Unknown' to 3-amino-4-[(4-carboxy-2-chlorophenyl)diazenyl]-5-chlorobenzoic acid
Row 734: Updated 'Unknown' to 4-[(4-carboxy-2-chlorophenyl)diazenyl]-3-chloro-5-nitrobenzoic acid
Row 735: Updated 'Unknown' to 4-[(4-carboxy-2-chlorophenyl)diazenyl]-3-chloro-5-fluorobenzoic acid
Row 736: Updated 'Unknown' to 4-[(4-carboxy-2-chlorophenyl)diazenyl]-5-chloro-2-methylbenzoic acid
Row 737: Upd

Row 811: Updated 'Unknown' to 3-amino-4-[2-(2-amino-4-carboxyphenyl)ethenyl]-2-fluorobenzoic acid
Row 812: Updated 'Unknown' to 3-amino-4-[2-(2-amino-4-carboxyphenyl)ethenyl]-5-methylbenzoic acid
Row 813: Updated 'Unknown' to 3-amino-4-[2-(2-amino-4-carboxyphenyl)ethenyl]-5-hydroxybenzoic acid
Row 814: Updated 'Unknown' to 3,5-diamino-4-[2-(2-amino-4-carboxyphenyl)ethenyl]benzoic acid
Row 815: Updated 'Unknown' to 3-amino-4-[2-(2-amino-4-carboxyphenyl)ethenyl]-5-nitrobenzoic acid
Row 816: Updated 'Unknown' to 3-amino-4-[2-(2-amino-4-carboxyphenyl)ethenyl]-5-fluorobenzoic acid
Row 817: Updated 'Unknown' to 5-amino-4-[2-(2-amino-4-carboxyphenyl)ethenyl]-2-methylbenzoic acid
Row 818: Updated 'Unknown' to 5-amino-4-[2-(2-amino-4-carboxyphenyl)ethenyl]-2-hydroxybenzoic acid
Row 819: Updated 'Unknown' to 2,5-diamino-4-[2-(2-amino-4-carboxyphenyl)ethenyl]benzoic acid
Row 820: Updated 'Unknown' to 5-amino-4-[2-(2-amino-4-carboxyphenyl)ethenyl]-2-nitrobenzoic acid
Row 821: Updated 'Unknown' to 

Row 905: Updated 'Unknown' to 6-[3,5-bis(6-carboxynaphthalen-2-yl)-2,4,6-trimethylphenyl]-4-hydroxynaphthalene-2-carboxylic acid
Row 906: Updated 'Unknown' to 4-amino-6-[3,5-bis(6-carboxynaphthalen-2-yl)-2,4,6-trimethylphenyl]naphthalene-2-carboxylic acid
Row 907: Updated 'Unknown' to 6-[3,5-bis(6-carboxynaphthalen-2-yl)-2,4,6-trimethylphenyl]-4-nitronaphthalene-2-carboxylic acid
Row 908: Updated 'Unknown' to 6-[3,5-bis(6-carboxynaphthalen-2-yl)-2,4,6-trimethylphenyl]-4-fluoronaphthalene-2-carboxylic acid
Row 909: Updated 'Unknown' to 6-[3,5-bis(6-carboxynaphthalen-2-yl)-2,4,6-trimethylphenyl]-3-methylnaphthalene-2-carboxylic acid
Row 910: Updated 'Unknown' to 6-[3,5-bis(6-carboxynaphthalen-2-yl)-2,4,6-trimethylphenyl]-3-hydroxynaphthalene-2-carboxylic acid
Row 911: Updated 'Unknown' to 3-amino-6-[3,5-bis(6-carboxynaphthalen-2-yl)-2,4,6-trimethylphenyl]naphthalene-2-carboxylic acid
Row 912: Updated 'Unknown' to 6-[3,5-bis(6-carboxynaphthalen-2-yl)-2,4,6-trimethylphenyl]-3-nitronaphthal

Row 981: Updated 'Unknown' to 4-[3,5-bis(4-carboxynaphthalen-1-yl)phenyl]-8-methylnaphthalene-1-carboxylic acid
Row 982: Updated 'Unknown' to 4-[3,5-bis(4-carboxynaphthalen-1-yl)phenyl]-8-hydroxynaphthalene-1-carboxylic acid
Row 983: Updated 'Unknown' to 8-amino-4-[3,5-bis(4-carboxynaphthalen-1-yl)phenyl]naphthalene-1-carboxylic acid
Row 984: Updated 'Unknown' to 4-[3,5-bis(4-carboxynaphthalen-1-yl)phenyl]-8-nitronaphthalene-1-carboxylic acid
Row 985: Updated 'Unknown' to 4-[3,5-bis(4-carboxynaphthalen-1-yl)phenyl]-8-fluoronaphthalene-1-carboxylic acid
Row 986: Updated 'Unknown' to 4-[3,5-bis(4-carboxynaphthalen-1-yl)phenyl]-7-methylnaphthalene-1-carboxylic acid
Row 987: Updated 'Unknown' to 4-[3,5-bis(4-carboxynaphthalen-1-yl)phenyl]-7-hydroxynaphthalene-1-carboxylic acid
Row 988: Updated 'Unknown' to 7-amino-4-[3,5-bis(4-carboxynaphthalen-1-yl)phenyl]naphthalene-1-carboxylic acid
Row 989: Updated 'Unknown' to 4-[3,5-bis(4-carboxynaphthalen-1-yl)phenyl]-7-nitronaphthalene-1-carboxylic

Row 1057: Updated 'Unknown' to 4-[3-(3-amino-4-carboxy-2-nitrophenyl)-5-(4-carboxyphenyl)phenyl]naphthalene-1-carboxylic acid
Row 1058: Updated 'Unknown' to 4-[3-(3-amino-4-carboxy-2-fluorophenyl)-5-(4-carboxyphenyl)phenyl]naphthalene-1-carboxylic acid
Row 1059: Updated 'Unknown' to 4-[3-(3-amino-4-carboxy-5-methylphenyl)-5-(4-carboxyphenyl)phenyl]naphthalene-1-carboxylic acid
Row 1060: Updated 'Unknown' to 4-[3-(3-amino-4-carboxy-5-hydroxyphenyl)-5-(4-carboxyphenyl)phenyl]naphthalene-1-carboxylic acid
Row 1061: Updated 'Unknown' to 4-[3-(4-carboxyphenyl)-5-(3,5-diamino-4-carboxyphenyl)phenyl]naphthalene-1-carboxylic acid
Row 1062: Updated 'Unknown' to 4-[3-(3-amino-4-carboxy-5-nitrophenyl)-5-(4-carboxyphenyl)phenyl]naphthalene-1-carboxylic acid
Row 1063: Updated 'Unknown' to 4-[3-(3-amino-4-carboxy-5-fluorophenyl)-5-(4-carboxyphenyl)phenyl]naphthalene-1-carboxylic acid
Row 1064: Updated 'Unknown' to 4-[3-(5-amino-4-carboxy-2-methylphenyl)-5-(4-carboxyphenyl)phenyl]naphthalene-1-carbox

Row 1122: Updated 'Unknown' to 4-[3-(3-amino-4-carboxyphenyl)-5-(4-carboxyphenyl)phenyl]-5-nitronaphthalene-1-carboxylic acid
Row 1123: Updated 'Unknown' to 4-[3-(3-amino-4-carboxyphenyl)-5-(4-carboxyphenyl)phenyl]-5-fluoronaphthalene-1-carboxylic acid
Row 1124: Updated 'Unknown' to 4-[4-[3,5-bis[4-(4-carboxyphenyl)phenyl]-2-methylphenyl]phenyl]benzoic acid
Row 1126: Updated 'Unknown' to 4-[4-[4-amino-3,5-bis[4-(4-carboxyphenyl)phenyl]phenyl]phenyl]benzoic acid
Row 1127: Updated 'Unknown' to 4-[4-[3,5-bis[4-(4-carboxyphenyl)phenyl]-4-nitrophenyl]phenyl]benzoic acid
Row 1128: Updated 'Unknown' to 4-[4-[3,5-bis[4-(4-carboxyphenyl)phenyl]-4-fluorophenyl]phenyl]benzoic acid
Row 1129: Updated 'Unknown' to 4-[4-[3,5-bis[4-(4-carboxyphenyl)phenyl]phenyl]-3-methylphenyl]benzoic acid
Row 1130: Updated 'Unknown' to 4-[4-[3-[4-(4-carboxyphenyl)-2-hydroxyphenyl]-5-[4-(4-carboxyphenyl)phenyl]phenyl]phenyl]benzoic acid
Row 1131: Updated 'Unknown' to 4-[3-amino-4-[3,5-bis[4-(4-carboxyphenyl)phenyl]ph

Row 1205: Updated 'Unknown' to 5-[4-(4-carboxy-2-hydroxyphenyl)phenyl]benzene-1,3-dicarboxylic acid
Row 1206: Updated 'Unknown' to 5-[4-(2-amino-4-carboxyphenyl)phenyl]benzene-1,3-dicarboxylic acid
Row 1207: Updated 'Unknown' to 5-[4-(4-carboxy-2-nitrophenyl)phenyl]benzene-1,3-dicarboxylic acid
Row 1208: Updated 'Unknown' to 5-[4-(4-carboxy-2-fluorophenyl)phenyl]benzene-1,3-dicarboxylic acid
Row 1209: Updated 'Unknown' to 5-[4-(4-carboxy-3-methylphenyl)phenyl]benzene-1,3-dicarboxylic acid
Row 1210: Updated 'Unknown' to 5-[4-(4-carboxy-3-hydroxyphenyl)phenyl]benzene-1,3-dicarboxylic acid
Row 1211: Updated 'Unknown' to 5-[4-(3-amino-4-carboxyphenyl)phenyl]benzene-1,3-dicarboxylic acid
Row 1212: Updated 'Unknown' to 5-[4-(4-carboxy-3-nitrophenyl)phenyl]benzene-1,3-dicarboxylic acid
Row 1213: Updated 'Unknown' to 5-[4-(4-carboxy-3-fluorophenyl)phenyl]benzene-1,3-dicarboxylic acid
Row 1214: Updated 'Unknown' to 5-[(4-carboxy-3-methylphenyl)methoxy]benzene-1,3-dicarboxylic acid
Row 1215: Upd

Row 1304: Updated 'Unknown' to 4-[2-amino-3,5-bis(4-carboxyphenyl)phenyl]-3-fluorobenzoic acid
Row 1305: Updated 'Unknown' to 4-[2-amino-3,5-bis(4-carboxyphenyl)phenyl]-2-methylbenzoic acid
Row 1306: Updated 'Unknown' to 4-[2-amino-3,5-bis(4-carboxyphenyl)phenyl]-2-hydroxybenzoic acid
Row 1307: Updated 'Unknown' to 2-amino-4-[2-amino-3,5-bis(4-carboxyphenyl)phenyl]benzoic acid
Row 1308: Updated 'Unknown' to 4-[2-amino-3,5-bis(4-carboxyphenyl)phenyl]-2-nitrobenzoic acid
Row 1309: Updated 'Unknown' to 4-[2-amino-3,5-bis(4-carboxyphenyl)phenyl]-2-fluorobenzoic acid
Row 1311: Updated 'Unknown' to 2-fluoro-4-methylbenzene-1,3,5-tricarboxylic acid
Row 1312: Updated 'Unknown' to 2-fluoro-4-hydroxybenzene-1,3,5-tricarboxylic acid
Row 1313: Updated 'Unknown' to 2-amino-4-fluorobenzene-1,3,5-tricarboxylic acid
Row 1314: Updated 'Unknown' to 2-fluoro-4-nitrobenzene-1,3,5-tricarboxylic acid
Row 1315: Updated 'Unknown' to 2,4-difluorobenzene-1,3,5-tricarboxylic acid
Row 1316: Updated 'Unknown' to 3

Row 1396: Updated 'Unknown' to 4-[4-[1-[4-(4-carboxyphenyl)-3-methylphenyl]-2,2-bis[4-(4-carboxyphenyl)phenyl]ethenyl]phenyl]benzoic acid
Row 1397: Updated 'Unknown' to 4-[4-[1-[4-(4-carboxyphenyl)-3-hydroxyphenyl]-2,2-bis[4-(4-carboxyphenyl)phenyl]ethenyl]phenyl]benzoic acid
Row 1398: Updated 'Unknown' to 4-[4-[1-[3-amino-4-(4-carboxyphenyl)phenyl]-2,2-bis[4-(4-carboxyphenyl)phenyl]ethenyl]phenyl]benzoic acid
Row 1399: Updated 'Unknown' to 4-[4-[1-[4-(4-carboxyphenyl)-3-nitrophenyl]-2,2-bis[4-(4-carboxyphenyl)phenyl]ethenyl]phenyl]benzoic acid
Row 1400: Updated 'Unknown' to 4-[4-[1-[4-(4-carboxyphenyl)-3-fluorophenyl]-2,2-bis[4-(4-carboxyphenyl)phenyl]ethenyl]phenyl]benzoic acid
Row 1401: Updated 'Unknown' to 3-methyl-4-[4-[1,2,2-tris[4-(4-carboxyphenyl)phenyl]ethenyl]phenyl]benzoic acid
Row 1402: Updated 'Unknown' to 3-hydroxy-4-[4-[1,2,2-tris[4-(4-carboxyphenyl)phenyl]ethenyl]phenyl]benzoic acid
Row 1403: Updated 'Unknown' to 3-amino-4-[4-[1,2,2-tris[4-(4-carboxyphenyl)phenyl]etheny

Row 1472: Updated 'Unknown' to 5-[(3,5-dicarboxyphenyl)diazenyl]-4-hydroxybenzene-1,3-dicarboxylic acid
Row 1473: Updated 'Unknown' to 4-amino-5-[(3,5-dicarboxyphenyl)diazenyl]benzene-1,3-dicarboxylic acid
Row 1474: Updated 'Unknown' to 5-[(3,5-dicarboxyphenyl)diazenyl]-4-nitrobenzene-1,3-dicarboxylic acid
Row 1475: Updated 'Unknown' to 5-[(3,5-dicarboxyphenyl)diazenyl]-4-fluorobenzene-1,3-dicarboxylic acid
Row 1476: Updated 'Unknown' to 5-[(3,5-dicarboxyphenyl)diazenyl]-2-methylbenzene-1,3-dicarboxylic acid
Row 1477: Updated 'Unknown' to 5-[(3,5-dicarboxyphenyl)diazenyl]-2-hydroxybenzene-1,3-dicarboxylic acid
Row 1478: Updated 'Unknown' to 2-amino-5-[(3,5-dicarboxyphenyl)diazenyl]benzene-1,3-dicarboxylic acid
Row 1479: Updated 'Unknown' to 5-[(3,5-dicarboxyphenyl)diazenyl]-2-nitrobenzene-1,3-dicarboxylic acid
Row 1480: Updated 'Unknown' to 5-[(3,5-dicarboxyphenyl)diazenyl]-2-fluorobenzene-1,3-dicarboxylic acid
Row 1483: Updated 'Unknown' to 3-amino-4-(3,4-dicarboxyphenyl)phthalic acid

Row 1549: Updated 'Unknown' to 4-[3-[4-[3,5-bis(4-carboxyphenyl)phenyl]-6-nitronaphthalen-1-yl]-5-(4-carboxyphenyl)phenyl]benzoic acid
Row 1550: Updated 'Unknown' to 4-[3-[4-[3,5-bis(4-carboxyphenyl)phenyl]-6-fluoronaphthalen-1-yl]-5-(4-carboxyphenyl)phenyl]benzoic acid
Row 1551: Updated 'Unknown' to 4-[3-[4-[3,5-bis(4-carboxyphenyl)phenyl]naphthalen-1-yl]-5-(4-carboxyphenyl)-4-methylphenyl]benzoic acid
Row 1552: Updated 'Unknown' to 4-[3-[4-[3,5-bis(4-carboxyphenyl)-2-hydroxyphenyl]naphthalen-1-yl]-5-(4-carboxyphenyl)phenyl]benzoic acid
Row 1553: Updated 'Unknown' to 4-[4-amino-3-[4-[3,5-bis(4-carboxyphenyl)phenyl]naphthalen-1-yl]-5-(4-carboxyphenyl)phenyl]benzoic acid
Row 1554: Updated 'Unknown' to 4-[3-[4-[3,5-bis(4-carboxyphenyl)-2-nitrophenyl]naphthalen-1-yl]-5-(4-carboxyphenyl)phenyl]benzoic acid
Row 1555: Updated 'Unknown' to 4-[3-[4-[3,5-bis(4-carboxyphenyl)-2-fluorophenyl]naphthalen-1-yl]-5-(4-carboxyphenyl)phenyl]benzoic acid
Row 1556: Updated 'Unknown' to 4-[5-[4-[3,5-bis(4-

Row 1610: Updated 'Unknown' to 4-(4-carboxy-N-[4-[4-(4-carboxy-N-(4-carboxyphenyl)anilino)-3-fluorophenyl]phenyl]anilino)benzoic acid
Row 1611: Updated 'Unknown' to 4-(4-carboxy-N-[4-[4-(4-carboxy-N-(4-carboxyphenyl)anilino)phenyl]phenyl]anilino)-3-methylbenzoic acid
Row 1612: Updated 'Unknown' to 4-[4-[4-(4-carboxy-N-(4-carboxyphenyl)anilino)phenyl]-N-(4-carboxyphenyl)anilino]-3-hydroxybenzoic acid
Row 1613: Updated 'Unknown' to 3-amino-4-(4-carboxy-N-[4-[4-(4-carboxy-N-(4-carboxyphenyl)anilino)phenyl]phenyl]anilino)benzoic acid
Row 1614: Updated 'Unknown' to 4-[4-[4-(4-carboxy-N-(4-carboxyphenyl)anilino)phenyl]-N-(4-carboxyphenyl)anilino]-3-nitrobenzoic acid
Row 1615: Updated 'Unknown' to 4-[4-carboxy-N-[4-[4-(4-carboxy-N-(4-carboxyphenyl)anilino)phenyl]phenyl]anilino]-3-fluorobenzoic acid
Row 1616: Updated 'Unknown' to 4-(4-carboxy-N-[4-[4-(4-carboxy-N-(4-carboxyphenyl)anilino)phenyl]phenyl]anilino)-2-methylbenzoic acid
Row 1617: Updated 'Unknown' to 4-[4-carboxy-N-[4-[4-(4-carboxy-

Row 1677: Updated 'Unknown' to 2,5-bis(2,4-dicarboxyphenyl)-3-nitroterephthalic acid
Row 1678: Updated 'Unknown' to 2,5-bis(2,4-dicarboxyphenyl)-3-fluoroterephthalic acid
Row 1679: Updated 'Unknown' to 5-[4,5-dicarboxy-2-(3,5-dicarboxyphenyl)phenyl]-2-methylbenzene-1,3-dicarboxylic acid
Row 1680: Updated 'Unknown' to 5-[4,5-dicarboxy-2-(3,5-dicarboxyphenyl)phenyl]-2-hydroxybenzene-1,3-dicarboxylic acid
Row 1681: Updated 'Unknown' to 2-amino-5-[4,5-dicarboxy-2-(3,5-dicarboxyphenyl)phenyl]benzene-1,3-dicarboxylic acid
Row 1682: Updated 'Unknown' to 5-[4,5-dicarboxy-2-(3,5-dicarboxyphenyl)phenyl]-2-nitrobenzene-1,3-dicarboxylic acid
Row 1683: Updated 'Unknown' to 5-[4,5-dicarboxy-2-(3,5-dicarboxyphenyl)phenyl]-2-fluorobenzene-1,3-dicarboxylic acid
Row 1684: Updated 'Unknown' to 5-[4,5-dicarboxy-2-(3,5-dicarboxyphenyl)phenyl]-4-methylbenzene-1,3-dicarboxylic acid
Row 1685: Updated 'Unknown' to 4-(3,5-dicarboxy-2-hydroxyphenyl)-5-(3,5-dicarboxyphenyl)phthalic acid
Row 1686: Updated 'Unknown

Row 1749: Updated 'Unknown' to 4-[2-[4-[2-(4-carboxyphenyl)ethynyl]-2,5-dimethoxyphenyl]ethynyl]-3-nitrobenzoic acid
Row 1750: Updated 'Unknown' to 4-[2-[4-[2-(4-carboxyphenyl)ethynyl]-2,5-dimethoxyphenyl]ethynyl]-3-fluorobenzoic acid
Row 1751: Updated 'Unknown' to 4-[2-[4-[2-(4-carboxyphenyl)ethynyl]-2,5-dimethoxyphenyl]ethynyl]-2-methylbenzoic acid
Row 1752: Updated 'Unknown' to 4-[2-[4-[2-(4-carboxyphenyl)ethynyl]-2,5-dimethoxyphenyl]ethynyl]-2-hydroxybenzoic acid
Row 1753: Updated 'Unknown' to 2-amino-4-[2-[4-[2-(4-carboxyphenyl)ethynyl]-2,5-dimethoxyphenyl]ethynyl]benzoic acid
Row 1754: Updated 'Unknown' to 4-[2-[4-[2-(4-carboxyphenyl)ethynyl]-2,5-dimethoxyphenyl]ethynyl]-2-nitrobenzoic acid
Row 1755: Updated 'Unknown' to 4-[2-[4-[2-(4-carboxyphenyl)ethynyl]-2,5-dimethoxyphenyl]ethynyl]-2-fluorobenzoic acid
Row 1756: Updated 'Unknown' to 4-[2-[4-[2-(4-carboxyphenyl)ethynyl]-5-methoxy-3-methyl-2-prop-2-ynoxyphenyl]ethynyl]benzoic acid
Row 1757: Updated 'Unknown' to 4-[2-[4-[2-(4-ca

Row 1809: Updated 'Unknown' to 4-[2-[4-[2-(4-carboxyphenyl)ethynyl]-2-methoxy-5-[3-tri(propan-2-yl)silylprop-2-ynoxy]phenyl]ethynyl]-3-nitrobenzoic acid
Row 1810: Updated 'Unknown' to 4-[2-[4-[2-(4-carboxyphenyl)ethynyl]-2-methoxy-5-[3-tri(propan-2-yl)silylprop-2-ynoxy]phenyl]ethynyl]-3-fluorobenzoic acid
Row 1811: Updated 'Unknown' to 4-[2-[4-[2-(4-carboxyphenyl)ethynyl]-2-methoxy-5-[3-tri(propan-2-yl)silylprop-2-ynoxy]phenyl]ethynyl]-2-methylbenzoic acid
Row 1812: Updated 'Unknown' to 4-[2-[4-[2-(4-carboxyphenyl)ethynyl]-2-methoxy-5-[3-tri(propan-2-yl)silylprop-2-ynoxy]phenyl]ethynyl]-2-hydroxybenzoic acid
Row 1813: Updated 'Unknown' to 2-amino-4-[2-[4-[2-(4-carboxyphenyl)ethynyl]-2-methoxy-5-[3-tri(propan-2-yl)silylprop-2-ynoxy]phenyl]ethynyl]benzoic acid
Row 1814: Updated 'Unknown' to 4-[2-[4-[2-(4-carboxyphenyl)ethynyl]-2-methoxy-5-[3-tri(propan-2-yl)silylprop-2-ynoxy]phenyl]ethynyl]-2-nitrobenzoic acid
Row 1815: Updated 'Unknown' to 4-[2-[4-[2-(4-carboxyphenyl)ethynyl]-2-methoxy-

Row 1872: Updated 'Unknown' to 4-[2-[4-[2-(4-carboxyphenyl)ethynyl]-5-[3-(furan-2-yl)propoxy]-2-methoxyphenyl]ethynyl]-2-hydroxybenzoic acid
Row 1873: Updated 'Unknown' to 2-amino-4-[2-[4-[2-(4-carboxyphenyl)ethynyl]-5-[3-(furan-2-yl)propoxy]-2-methoxyphenyl]ethynyl]benzoic acid
Row 1874: Updated 'Unknown' to 4-[2-[4-[2-(4-carboxyphenyl)ethynyl]-5-[3-(furan-2-yl)propoxy]-2-methoxyphenyl]ethynyl]-2-nitrobenzoic acid
Row 1875: Updated 'Unknown' to 4-[2-[4-[2-(4-carboxyphenyl)ethynyl]-5-[3-(furan-2-yl)propoxy]-2-methoxyphenyl]ethynyl]-2-fluorobenzoic acid
Row 1876: Updated 'Unknown' to 4-[2-[10-[2-(4-carboxyphenyl)ethynyl]-2-methylanthracen-9-yl]ethynyl]benzoic acid
Row 1877: Updated 'Unknown' to 4-[2-[10-[2-(4-carboxyphenyl)ethynyl]-3-hydroxyanthracen-9-yl]ethynyl]benzoic acid
Row 1878: Updated 'Unknown' to 4-[2-[2-amino-10-[2-(4-carboxyphenyl)ethynyl]anthracen-9-yl]ethynyl]benzoic acid
Row 1879: Updated 'Unknown' to 4-[2-[10-[2-(4-carboxyphenyl)ethynyl]-3-nitroanthracen-9-yl]ethynyl]ben

Row 1938: Updated 'Unknown' to 4-[2-[7-amino-3,6,8-tris[2-(4-carboxyphenyl)ethynyl]pyren-1-yl]ethynyl]benzoic acid
Row 1939: Updated 'Unknown' to 4-[2-[3,6,8-tris[2-(4-carboxyphenyl)ethynyl]-7-nitropyren-1-yl]ethynyl]benzoic acid
Row 1940: Updated 'Unknown' to 4-[2-[3,6,8-tris[2-(4-carboxyphenyl)ethynyl]-2-fluoropyren-1-yl]ethynyl]benzoic acid
Row 1941: Updated 'Unknown' to 3-methyl-4-[2-[3,6,8-tris[2-(4-carboxyphenyl)ethynyl]pyren-1-yl]ethynyl]benzoic acid
Row 1942: Updated 'Unknown' to 3-hydroxy-4-[2-[3,6,8-tris[2-(4-carboxyphenyl)ethynyl]pyren-1-yl]ethynyl]benzoic acid
Row 1943: Updated 'Unknown' to 3-amino-4-[2-[3,6,8-tris[2-(4-carboxyphenyl)ethynyl]pyren-1-yl]ethynyl]benzoic acid
Row 1944: Updated 'Unknown' to 3-nitro-4-[2-[3,6,8-tris[2-(4-carboxyphenyl)ethynyl]pyren-1-yl]ethynyl]benzoic acid
Row 1945: Updated 'Unknown' to 3-fluoro-4-[2-[3,6,8-tris[2-(4-carboxyphenyl)ethynyl]pyren-1-yl]ethynyl]benzoic acid
Row 1946: Updated 'Unknown' to 2-methyl-4-[2-[3,6,8-tris[2-(4-carboxyphenyl

In [52]:
import pandas as pd
import openpyxl
import selfies as sf

def smiles_to_selfies(file_path):
    # Read the Excel file
    df = pd.read_excel(file_path, engine='openpyxl')

    # Process "Input SMILES (cactus)" column
    for idx, row in df.iterrows():
        smiles_value = row["Input SMILES (cactus)"]
        if smiles_value == "Invalid":
            df.at[idx, "Input SELFIES"] = "Invalid"
        else:
            try:
                selfies_value = sf.encoder(smiles_value)
                df.at[idx, "Input SELFIES"] = selfies_value
            except sf.EncoderError:
                print(f"Error encoding input SMILES at row {idx + 1}: {smiles_value}")
                df.at[idx, "Input SELFIES"] = "Invalid"

    # Process "Output cactus SMILES" column
    for idx, row in df.iterrows():
        smiles_value = row["Output cactus SMILES"]
        if smiles_value == "Invalid":
            df.at[idx, "Output SELFIES"] = "Invalid"
        else:
            try:
                selfies_value = sf.encoder(smiles_value)
                df.at[idx, "Output SELFIES"] = selfies_value
            except sf.EncoderError:
                print(f"Error encoding output SMILES at row {idx + 1}: {smiles_value}")
                df.at[idx, "Output SELFIES"] = "Invalid"


    with pd.ExcelWriter(file_path, engine='openpyxl',mode='a') as writer:
        df.to_excel(writer, sheet_name="SELFIES output", index=False)

    print(f"Updated data saved to {file_path}")

# Run the function with the provided file path
smiles_to_selfies("Method S.xlsx")

Updated data saved to Method S.xlsx


In [53]:
import requests
import pandas as pd

def double_check(file_path):
    # Read the Excel file into a pandas DataFrame using the provided file path.
    df = pd.read_excel(file_path, engine='openpyxl')
    
    # Define the columns to check, the corresponding base URLs, endpoints, and any values to skip.
    columns_to_check = [
        ("Input SMILES (cactus)", "https://cactus.nci.nih.gov/chemical/structure/", "/SMILES", []),
        ("Input IUPAC name (cactus)", "https://cactus.nci.nih.gov/chemical/structure/", "/iupac_name", []),
        ("Output IUPAC name", "https://cactus.nci.nih.gov/chemical/structure/", "/iupac_name", ["Unknown", "Invalid"]),
        ("Output cactus SMILES", "https://cactus.nci.nih.gov/chemical/structure/", "/SMILES", ["Unknown", "Invalid"])
    ]
    
    # Initialize a counter for the number of values checked.
    checked_count = 0
    
    # Iterate over the columns to check.
    for column_name, base_url, endpoint, skip_values in columns_to_check:
        previous_value = None  # Keep track of the previous value to identify duplicates.
        
        # For each column, iterate over each row in the DataFrame.
        for idx, row in df.iterrows():
            input_value = row[column_name]
            
            # If the input value is the same as the previous row, skip checking.
            if input_value == previous_value:
                continue
            
            # Update the previous value tracker.
            previous_value = input_value
            
            # Additional check for "Output IUPAC name" based on "Output Canonical SMILES" column.
            if column_name == "Output IUPAC name" and (row["Output Canonical SMILES"] in ["Unknown", "Invalid"]):
                continue
            
            # Check if the value in the current row and column is NaN or if it's in the list of values to be skipped.
            if pd.isna(input_value) or (skip_values and input_value in skip_values):
                continue

            # Replace any '#' characters with '%23' for URL encoding.
            url_value = input_value.replace('#', '%23')

            # Make an HTTP request to the corresponding URL to validate the url_value.
            response = requests.get(base_url + url_value + endpoint)
            
            # If the response is more than 100 words, truncate it.
            truncated_response = ' '.join(response.text.split()[:100])
            
            # Check conditions for printing error messages
            if "IUPAC" in column_name and response.status_code == 404:
                # Skip printing for IUPAC names with 404 response
                pass
            elif response.status_code != 200 or truncated_response.lower() != input_value.lower():
                print(f"Error in row {idx + 1} with CAS number {row['CAS number']}. Input: {input_value}, Response: {truncated_response}")
            
            # Increment the counter since a value was checked.
            checked_count += 1

    # Print the final number of values checked.
    print(f"{checked_count} values have been checked and done.")




# To run the function, uncomment the following line and provide the path to your Excel file.
double_check("Method S.xlsx")



Error in row 741 with CAS number 586-91-4. Input: 4-[(4-carboxyphenyl)diazenyl]-3-methylbenzoic acid, Response: 4-(4-carboxyphenyl)diazenyl-3-methylbenzoic acid
Error in row 747 with CAS number 586-91-4. Input: 4-[(4-carboxyphenyl)diazenyl]-2-hydroxybenzoic acid, Response: 4-(4-carboxyphenyl)diazenyl-2-hydroxybenzoic acid


ConnectionError: ('Connection aborted.', ConnectionAbortedError(10053, '你的主机中的软件中止了一个已建立的连接。', None, 10053, None))

Updated data saved to excel_to_convert_Updated.xlsx


In [62]:
import pandas as pd

import pandas as pd

method_type = "S"  # This can be changed to "S", "I", "P" as needed
file_path = "Method " + method_type + ".xlsx"

# Define the action maps for each method type
action_maps = {
    "S": {
        1: "Introduce or remove a methyl group from the ring.",
        2: "Introduce or remove a hydroxyl group from the ring.",
        3: "Introduce or remove an amino group from the ring.",
        4: "Introduce or remove a nitro group from the ring.",
        5: "Introduce or remove a fluoro group to the ring."
    },
    "I": {
        1: "Introduce or remove a phenyl ring within the linker.",
        2: "Introduce or remove a triple bond within the linker.",
        3: "Introduce or remove a double bond within the linker.",
        4: "Introduce or remove an azo group within the linker."
    },
    "R": {
        1: "Replace a carbon atom in the ring with nitrogen, or vice versa.",
        2: "Replace a carbon atom in the ring with oxygen, or vice versa.",
        3: "Replace a carbon atom in the ring with sulfur, or vice versa."
    },
    "P": {
        1: "Shift the position of COOH within any ring type to another position on the same ring.",
        2: "Relocate the position of N donor, excluding NH, within any ring type to another position on the same ring."
    }
}

# Select the action map based on the method type
action_map = action_maps[method_type]

# Construct the mutation actions section of the prompt
mutation_actions = "\n".join([f"({key}) {value}" for key, value in action_map.items()])

# Define the objectives and mutation issues for each method type
objectives = {
    "S": "introduce new functional groups or alter existing ones to the linker, then provide the correct molecular representation for the modified linker",
    "I": "introduce new functional groups or alter existing ones to the linker, then provide the correct correct molecular representation for the modified linker",
    "R": "swap out atoms in the linker with different heteroatoms (e.g., replace a carbon atom with a nitrogen or sulfur atom), while adhering to general chemical rules and bonding constraints, such as ensuring ring stability and proper valence for atoms, then provide the correct correct molecular representation for the modified linker",
    "P": "change the position of coordination sites, such as COOH or N, within aromatic or non-aromatic rings including 5-membered, 6-membered, 7-membered, and fused rings, then provide the correct correct molecular representation for the modified linker"
}

mutation_issues = {
    "S": "(e.g., it lacks a ring or a suitable substitution site)",
    "I": "(e.g., it lacks a ring or a suitable substitution site)",
    "R": "(e.g., it lacks a ring or a suitable substitution site)",
    "P": "(e.g., it lacks a ring or a suitable position for the coordination site shift)"
}

# Define the common prompt with placeholders
common_prompt = f"""You are an AI assistant with expertise in organic chemistry. Your task is to make theoretical modifications to a given {{desc}} of a MOF linker. {{additional_info}} Your objective is {objectives[method_type]}. You should never remove or modify the carboxylate groups, as they are essential to MOF linkers. The user can choose from {len(action_map)} mutation actions:

{mutation_actions}

The user will first specify the desired mutation action, followed by 'Action: '. In the next line, the user will provide the {{type}} of the MOF linker to be mutated, starting with 'Compound: '.

Your response should begin with 'New Compound: ', followed by the updated {{type}}. If the requested mutation isn't chemically feasible, due to bonding constraints or if the given structure isn't compatible with the mutation {mutation_issues[method_type]}, you should respond with 'New Compound: Invalid'."""

# Specific parts for each type of data
prompts = {
    'smiles': common_prompt.format(
        desc='SMILES code', 
        additional_info='', 
        type='SMILES code'
    ),
    'selfies': common_prompt.format(
        desc='SELFIES string', 
        additional_info='Here SELFIES (SELF-referencIng Embedded Strings) is a string-based representation of molecules. Every SELFIES string corresponds to a valid molecule, similar to the way Canonical SMILES representations work. ',
        type='SELFIES string'
    ),
    'iupac': common_prompt.format(
        desc='IUPAC name', 
        additional_info='', 
        type='IUPAC name'
    )
}





# Function to construct the user message
def construct_user_message(row, data_type):

    
    input_columns_map = {
        'smiles': 'Input SMILES (cactus)',
        'canonical smiles': 'Input Canonical SMILES (PubChem)',
        'selfies': 'Input SELFIES',
        'iupac': 'Input IUPAC name (cactus)'
    }
    
    action = action_map.get(row["Action"], "Invalid Action")
    return f"Action: {action}\nCompound: {row[input_columns_map[data_type]]}"

def construct_assistant_message(row, data_type):
    output_columns_map = {
        'smiles': 'Output cactus SMILES',
        'canonical smiles': 'Output Canonical SMILES',
        'selfies': 'Output SELFIES',
        'iupac': 'Output IUPAC name'
    }
    
    return f"New Compound: {row[output_columns_map[data_type]]}"

# 1. Load the data from the provided Excel file
data = pd.read_excel(file_path)

# For model using SMILES code
data["Modified User Message SMILES"] = data.apply(lambda row: construct_user_message(row, 'smiles'), axis=1)
data["Modified Assistant Message SMILES"] = data.apply(lambda row: construct_assistant_message(row, 'smiles'), axis=1)

# For model using SMILES code
data["Modified User Message Canonical SMILES"] = data.apply(lambda row: construct_user_message(row, 'canonical smiles'), axis=1)
data["Modified Assistant Message Canonical SMILES"] = data.apply(lambda row: construct_assistant_message(row, 'canonical smiles'), axis=1)

# For model using SELFIES string
data["Modified User Message SELFIES"] = data.apply(lambda row: construct_user_message(row, 'selfies'), axis=1)
data["Modified Assistant Message SELFIES"] = data.apply(lambda row: construct_assistant_message(row, 'selfies'), axis=1)

# For model using IUPAC name
data["Modified User Message IUPAC"] = data.apply(lambda row: construct_user_message(row, 'iupac'), axis=1)
data["Modified Assistant Message IUPAC"] = data.apply(lambda row: construct_assistant_message(row, 'iupac'), axis=1)



# 2. Model 1R - Using SMILES code
model_1R = pd.DataFrame({
    "system": [prompts['smiles']] * len(data),
    "user": data["Modified User Message SMILES"],
    "assistant": data["Modified Assistant Message SMILES"]
})
model_1R.to_excel("Model 1"+method_type+".xlsx", index=False)


# 3. Model 2R - Using SELFIES string
model_2R = pd.DataFrame({
    "system": [prompts['selfies']] * len(data),
    "user": data["Modified User Message SELFIES"],
    "assistant": data["Modified Assistant Message SELFIES"]
})
model_2R.to_excel("Model 2"+method_type+".xlsx", index=False)



# 4. Model 3R - Using IUPAC name with filter
filtered_data_3R = data[data["Output IUPAC name"] != "Unknown"]
model_3R = pd.DataFrame({
    "system": [prompts['iupac']] * len(filtered_data_3R),
    "user": filtered_data_3R["Modified User Message IUPAC"],
    "assistant": filtered_data_3R["Modified Assistant Message IUPAC"]
})
model_3R.to_excel("Model 3"+method_type+".xlsx", index=False)



# Mapping for data type alignment with the prompts keys
data_type_map = {
    'SMILES (cactus)': 'smiles',
    'SELFIES': 'selfies',
    'IUPAC name (cactus)': 'iupac'
}

def create_combined_model(data_type_input, data_type_output, model_number):
    # Duplicate the dataframe for reversed roles
    reversed_data = data.copy()
    
    # Swap input and output columns
    reversed_data[f'Input {data_type_input}'], reversed_data[f'Output {data_type_output}'] = reversed_data[f'Output {data_type_output}'], reversed_data[f'Input {data_type_input}']

    # Filter out rows where the input in the reversed data contains "Unknown" or "Invalid"
    reversed_data = reversed_data[~reversed_data[f'Input {data_type_input}'].isin(['Unknown', 'Invalid'])]

    # Constructing the user and assistant messages
    def construct_message(row, input_or_output):
        action = action_map.get(row["Action"], "Invalid Action")
        if input_or_output == "input":
            return f"Action: {action}\nCompound: {row[f'Input {data_type_input}']}"
        else:  # output
            return f"New Compound: {row[f'Output {data_type_output}']}"

    reversed_data[f"Modified User Message {model_number}R"] = reversed_data.apply(lambda row: construct_message(row, "input"), axis=1)
    reversed_data[f"Modified Assistant Message {model_number}R"] = reversed_data.apply(lambda row: construct_message(row, "output"), axis=1)

    # Combine the original and new messages
    combined_data = pd.concat([data, reversed_data], ignore_index=True)

    # Drop duplicates
    combined_data.drop_duplicates(subset=[f'Input {data_type_input}', f'Output {data_type_output}'], inplace=True, ignore_index=True)

    model_df = pd.DataFrame({
        "system": [prompts[data_type_map[data_type_input]]] * len(combined_data),
        "user": combined_data.apply(lambda row: construct_message(row, "input"), axis=1),
        "assistant": combined_data.apply(lambda row: construct_message(row, "output"), axis=1)
    })
    # Filter out rows where "user" column contains "Unknown" or "Invalid"
    model_df = model_df[~model_df['user'].str.contains('Unknown|Invalid')]
    
    model_df.to_excel(f"Model {model_number}"+method_type+".xlsx", index=False)

# Load the data from the provided Excel file
data = pd.read_excel(file_path)

# Generate Model 6R for SMILES
create_combined_model('SMILES (cactus)', 'cactus SMILES',4 )
# Generate Model 7R for SELFIES
create_combined_model('SELFIES', 'SELFIES', 5)
# Generate Model 8R for IUPAC name
create_combined_model('IUPAC name (cactus)', 'IUPAC name', 6)

print ("Training data for Model 1 2 3 4 5 6 " + method_type + " generated.")

Training data for Model 1 2 3 4 5 6 S generated.


'4-[3-(4-carboxyphenyl)-5-[10-[3,5-bis(4-carboxyphenyl)-4-sulfophenyl]anthracen-9-yl]phenyl]benzoic acid'

In [None]:
import pandas as pd
#for Method S only
def SO3_to_5_mutation(filename):
    # Load the data from the given filename
    df = pd.read_excel(filename)

    # Initialize a list to keep track of mutated SMILES
    mutated_smiles = []

    # Iterate over the dataframe rows
    i = 0
    while i < len(df):
        current_smiles = df['Output raw SMILES'].iloc[i]

        # Check for consecutive actions 1 to 5
        if i <= len(df) - 5 and list(df['Action'].iloc[i:i+5]) == [1, 2, 3, 4, 5]:
            # Check if all 5 rows have the same "Output raw SMILES" containing the target string
            if all(df['Output raw SMILES'].iloc[i:i+5] == current_smiles) and "[S](O)(=O)=O" in current_smiles:
                replacements = [
                    ("[CH3]", 1),
                    ("[OH]", 2),
                    ("[NH2]", 3),
                    ("[N+](=O)[O-]", 4),
                    ("[F]", 5)
                ]
                for j, (replacement, action) in enumerate(replacements):
                    # Create the new mutated SMILES
                    mutated = current_smiles.replace("[S](O)(=O)=O", replacement)
                    mutated_smiles.append(mutated)
                    
                    # Check if action matches and print error message if not
                    if df['Action'].iloc[i] != action:
                        print(f"Error at index {i}: Expected action {action} but got {df['Action'].iloc[i]}")
                    i += 1
            else:
                mutated_smiles.append(current_smiles)
                i += 1
        else:
            mutated_smiles.append(current_smiles)
            i += 1

    # Check if "Output SMILES" column already exists
    if 'Output SMILES' in df.columns:
        df['Output SMILES'] = mutated_smiles
    else:
        # Insert the mutated smiles as a new column named "Output SMILES" right after "Output raw SMILES"
        df.insert(df.columns.get_loc('Output raw SMILES') + 1, 'Output SMILES', mutated_smiles)

    # Save the modified dataframe to a new sheet called "output" in the same Excel file
    with pd.ExcelWriter(filename, engine='openpyxl', mode='a') as writer:
        df.to_excel(writer, sheet_name='output', index=False)

    print(f"Data saved to the 'output' sheet in {filename}")
    return 



# Run the function
SO3_to_5_mutation('Method S.xlsx')