In [2]:
import pandas as pd 

In [1]:
from rdkit import Chem

# Step 2 — read the solubility table (path is relative to the working directory)
DATASET_PATH = "AqSolDB.csv"
df = pd.read_csv(DATASET_PATH)

# Preview the first rows as the cell's output.
df.head()


ModuleNotFoundError: No module named 'rdkit'

In [14]:
from rdkit.Chem.rdmolops import SanitizeMol

In [8]:
# Two independent, initially empty accumulators: canonical SMILES strings
# and their matching LogS values, appended to in parallel by the next cell.
valid_smiles, valid_logs = [], []

In [9]:
# Start from empty accumulators inside this cell so that re-running it
# rebuilds the result from scratch instead of appending a second copy of
# every molecule to lists left over from a previous run.
valid_smiles = []
valid_logs = []

# Only the SMILES and LogS columns are read, so walk them in lockstep
# rather than building a full row Series per record with iterrows().
for smi, logS in zip(df['SMILES'], df['LogS']):
    mol = Chem.MolFromSmiles(smi)

    # MolFromSmiles yields None for SMILES it cannot turn into a molecule
    # (RDKit logs the reason, e.g. an over-valent atom); skip those rows.
    if mol is None:
        continue

    try:
        # Sanitize molecule (check valence, aromaticity, conjugation, hybridization)
        SanitizeMol(mol)

        # Canonicalize SMILES to have consistent representation
        clean_smi = Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Deliberate best-effort: skip molecules that fail sanitization
        continue

    # Append both values together so the two lists always stay aligned.
    valid_smiles.append(clean_smi)
    valid_logs.append(logS)

# Create a cleaned DataFrame
clean_df = pd.DataFrame({'SMILES': valid_smiles, 'LogS': valid_logs})

print(f"✅ Valid molecules after sanitization: {len(clean_df)} / {len(df)}")
clean_df.head()

[16:05:54] Explicit valence for atom # 5 N, 4, is greater than permitted
[16:05:54] Explicit valence for atom # 5 N, 4, is greater than permitted


✅ Valid molecules after sanitization: 9980 / 9982


Unnamed: 0,SMILES,LogS
0,CCCCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-],-3.616127
1,O=C1Nc2cccc3cccc1c23,-3.254767
2,O=Cc1ccc(Cl)cc1,-2.177078
3,CC(c1ccccc1)c1cc(C(=O)[O-])c(O)c(C(C)c2ccccc2)...,-3.924409
4,c1cc(N(CC2CO2)CC2CO2)ccc1Cc1ccc(N(CC2CO2)CC2CO...,-4.662065


In [15]:
from rdkit.Chem.MolStandardize import rdMolStandardize

def standardize_molecule(mol):
    """Run a molecule through the ``rdMolStandardize`` pipeline.

    Applies ``Cleanup``, then ``Normalize``, then ``Reionize``, each step
    consuming the result of the previous one.

    Args:
        mol: Molecule object accepted by ``rdMolStandardize`` (this notebook
            passes the result of ``Chem.MolFromSmiles``).

    Returns:
        The standardized molecule, or ``mol`` itself, unchanged, if any step
        raises an ``Exception``.

    Note:
        None of these steps strips counter-ions or keeps only the largest
        fragment: a salt such as ``CCCC[N+](C)(C)C.[Br-]`` still carries its
        ``[Br-]`` after standardization.
    """
    try:
        # General cleanup pass (the RDKit log shows it running a
        # MetalDisconnector followed by a Normalizer).
        cleaned_mol = rdMolStandardize.Cleanup(mol)
        # Normalize functional groups
        normalized_mol = rdMolStandardize.Normalize(cleaned_mol)
        # Reionize to standard charge states
        return rdMolStandardize.Reionize(normalized_mol)
    except Exception:
        # Best-effort fallback: hand back the input untouched. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long batch run can still be interrupted.
        return mol




In [17]:
def _standardized_smiles_or_none(smiles):
    """Return the canonical SMILES of the standardized molecule, or None if parsing fails."""
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return None
    return Chem.MolToSmiles(standardize_molecule(parsed), canonical=True)


# One entry per input row, in row order; None marks SMILES that could not be parsed.
standardized_smiles = [_standardized_smiles_or_none(raw) for raw in df['SMILES']]

df['Standardized_SMILES'] = standardized_smiles
# Discard the rows flagged with None and renumber the index from zero.
df = df.dropna(subset=['Standardized_SMILES']).reset_index(drop=True)

# Persist the result next to the notebook, without writing the index.
df.to_csv("standardized_AqSolDB.csv", index=False)
print(f"✅ Standardized dataset saved with {len(df)} molecules")
df.head()

[16:42:36] Initializing MetalDisconnector
[16:42:36] Running MetalDisconnector
[16:42:36] Initializing Normalizer
[16:42:36] Running Normalizer
[16:42:36] Initializing Normalizer
[16:42:36] Running Normalizer
[16:42:37] Initializing MetalDisconnector
[16:42:37] Running MetalDisconnector
[16:42:37] Initializing Normalizer
[16:42:37] Running Normalizer
[16:42:37] Initializing Normalizer
[16:42:37] Running Normalizer
[16:42:37] Initializing MetalDisconnector
[16:42:37] Running MetalDisconnector
[16:42:37] Initializing Normalizer
[16:42:37] Running Normalizer
[16:42:37] Initializing Normalizer
[16:42:37] Running Normalizer
[16:42:37] Initializing MetalDisconnector
[16:42:37] Running MetalDisconnector
[16:42:37] Initializing Normalizer
[16:42:37] Running Normalizer
[16:42:37] Initializing Normalizer
[16:42:37] Running Normalizer
[16:42:37] Initializing MetalDisconnector
[16:42:37] Running MetalDisconnector
[16:42:37] Initializing Normalizer
[16:42:37] Running Normalizer
[16:42:37] Initializ

✅ Standardized dataset saved with 9980 molecules


[16:43:04] Initializing Normalizer
[16:43:04] Running Normalizer
[16:43:04] Initializing MetalDisconnector
[16:43:04] Running MetalDisconnector
[16:43:04] Initializing Normalizer
[16:43:04] Running Normalizer
[16:43:04] Initializing Normalizer
[16:43:04] Running Normalizer
[16:43:04] Initializing MetalDisconnector
[16:43:04] Running MetalDisconnector
[16:43:04] Initializing Normalizer
[16:43:04] Running Normalizer
[16:43:04] Initializing Normalizer
[16:43:04] Running Normalizer
[16:43:04] Initializing MetalDisconnector
[16:43:04] Running MetalDisconnector
[16:43:04] Initializing Normalizer
[16:43:04] Running Normalizer
[16:43:04] Initializing Normalizer
[16:43:04] Running Normalizer
[16:43:04] Initializing MetalDisconnector
[16:43:04] Running MetalDisconnector
[16:43:04] Initializing Normalizer
[16:43:04] Running Normalizer
[16:43:04] Initializing Normalizer
[16:43:04] Running Normalizer
[16:43:04] Initializing MetalDisconnector
[16:43:04] Running MetalDisconnector
[16:43:04] Initializ

Unnamed: 0.1,Unnamed: 0,SMILES,LogS,Standardized_SMILES
0,0,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C,-3.616127,CCCCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-]
1,1,O=C1Nc2cccc3cccc1c23,-3.254767,O=C1Nc2cccc3cccc1c23
2,2,Clc1ccc(C=O)cc1,-2.177078,O=Cc1ccc(Cl)cc1
3,3,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...,-3.924409,CC(c1ccccc1)c1cc(C(=O)[O-])c(O)c(C(C)c2ccccc2)...
4,4,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...,-4.662065,c1cc(N(CC2CO2)CC2CO2)ccc1Cc1ccc(N(CC2CO2)CC2CO...
