In [None]:
import pandas as pd 

In [None]:
from rdkit import Chem

# Step 2 — read the AqSolDB table from disk and preview the first rows
AQSOLDB_CSV = "./data/AqSolDB.csv"
df = pd.read_csv(AQSOLDB_CSV)
df.head()


In [None]:
from rdkit.Chem.rdmolops import SanitizeMol

In [None]:
# Parallel accumulators: one SMILES string and one LogS value per kept molecule
valid_smiles, valid_logs = [], []

In [None]:
# Start from empty accumulators inside this cell so that re-running it
# cannot append duplicate rows to lists left over from a previous run.
valid_smiles = []
valid_logs = []

# Walk the two needed columns in lockstep; iterrows() would build a full
# Series per row, which is slower and does not preserve column dtypes.
for smi, logS in zip(df['SMILES'], df['LogS']):
    # A missing SMILES is read from CSV as NaN (a float), which
    # MolFromSmiles cannot accept; skip it like any other unusable entry.
    if not isinstance(smi, str):
        continue

    # MolFromSmiles returns None when the SMILES cannot be parsed
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        continue

    try:
        # Sanitize molecule (check valence, aromaticity, conjugation, hybridization)
        SanitizeMol(mol)

        # Canonicalize SMILES to have consistent representation
        clean_smi = Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Skip molecules that fail sanitization
        continue

    # Append both values together so the two lists stay aligned
    valid_smiles.append(clean_smi)
    valid_logs.append(logS)

# Create a cleaned DataFrame
clean_df = pd.DataFrame({'SMILES': valid_smiles, 'LogS': valid_logs})

print(f"✅ Valid molecules after sanitization: {len(clean_df)} / {len(df)}")
clean_df.head()

In [None]:
from rdkit.Chem.MolStandardize import rdMolStandardize

def standardize_molecule(mol):
    """Best-effort standardization of an RDKit molecule.

    Runs rdMolStandardize.Cleanup, then Normalize, then Reionize, feeding
    each step the result of the previous one.

    NOTE(review): per the RDKit documentation, Cleanup already performs
    normalization and reionization and does NOT strip counter-ions or
    pick a parent fragment, so multi-component inputs (salts) keep all
    their components. If salt stripping is wanted, it has to be added
    explicitly — confirm the intended behavior before changing it.

    Args:
        mol: the molecule to standardize.

    Returns:
        The standardized molecule, or `mol` itself unchanged if any
        standardization step raises.
    """
    try:
        # General cleanup pass
        cleaned = rdMolStandardize.Cleanup(mol)
        # Normalize functional groups
        normalized = rdMolStandardize.Normalize(cleaned)
        # Reionize to standard charge states
        return rdMolStandardize.Reionize(normalized)
    except Exception:
        # Deliberate fallback: keep the original molecule if standardization
        # fails. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit still stop a long-running loop.
        return mol




In [None]:
# One entry per row of df: the standardized canonical SMILES, or None when
# the input cannot be turned into a molecule (those rows are dropped below).
standardized_smiles = []

for smi in df['SMILES']:
    # A missing SMILES is read from CSV as NaN (a float), which
    # MolFromSmiles cannot accept; treat it as unparseable instead of crashing.
    mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
    if mol is None:
        standardized_smiles.append(None)
        continue

    mol = standardize_molecule(mol)
    clean_smi = Chem.MolToSmiles(mol, canonical=True)
    standardized_smiles.append(clean_smi)

# Build the result as a new frame rather than adding a column to the loaded
# one in place; the name `df` is rebound so later cells see the same result.
df = (
    df.assign(Standardized_SMILES=standardized_smiles)
    .dropna(subset=['Standardized_SMILES'])
    .reset_index(drop=True)
)

df.to_csv("standardized_AqSolDB.csv", index=False)
print(f"✅ Standardized dataset saved with {len(df)} molecules")
df.head()