## Validate SMILES format using RDKit

This step validates whether SMILES strings can be parsed by RDKit. 
Invalid SMILES are filtered out. No SMILES correction is performed.

In [1]:
from pathlib import Path

import pandas as pd
from rdkit import Chem

In [8]:
# Read the raw dataset from disk and preview its first rows.
raw_csv_path = '../data/raw/Tg_dataset.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,SMILES,Tg,PID,Polymer Class
0,*C*,-54.0,P010001,Polyolefins
1,*CC(*)C,-3.0,P010002,Polyolefins
2,*CC(*)CC,-24.1,P010003,Polyolefins
3,*CC(*)CCC,-37.0,P010004,Polyolefins
4,*CC(*)C(C)C,60.0,P010006,Polyolefins


In [9]:
def is_valid_smiles(smiles):
    """Return True if ``smiles`` is a non-empty string that RDKit can parse.

    Parameters
    ----------
    smiles : object
        Candidate SMILES value, typically one cell of a DataFrame column.

    Returns
    -------
    bool
        False for missing values (None/NaN), any other non-string value, and
        empty or whitespace-only strings. Otherwise True exactly when
        ``Chem.MolFromSmiles`` returns a molecule rather than ``None``.
    """
    # The type check covers None/NaN as well as numeric or list-like cells,
    # none of which should be handed to the SMILES parser.
    if not isinstance(smiles, str):
        return False
    # An empty SMILES parses to an empty (zero-atom) molecule instead of None,
    # so it must be rejected explicitly or it would be counted as valid.
    if not smiles.strip():
        return False
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None

In [10]:
# Flag every row with whether its SMILES value passes is_valid_smiles.
df["is_valid_smiles"] = df["SMILES"].map(is_valid_smiles)

In [11]:
# Tally the rows that passed and failed the validity check.
total_rows = len(df)
valid_count = df["is_valid_smiles"].sum()
invalid_count = total_rows - valid_count

print(f"Valid SMILES: {valid_count}", f"Invalid SMILES: {invalid_count}", sep="\n")

Valid SMILES: 7284
Invalid SMILES: 0


In [12]:
# Keep only the rows flagged as valid; .copy() gives an independent frame.
valid_mask = df["is_valid_smiles"]
df_valid = df.loc[valid_mask].copy()
df_valid.shape

(7284, 5)

In [13]:
# Drop the helper flag column and write the validated rows to disk.
# errors="ignore" keeps this cell re-runnable: df_valid is rebound here, so on
# a second run the column is already gone and a plain drop would raise KeyError.
df_valid = df_valid.drop(columns=["is_valid_smiles"], errors="ignore")

# Create the output folder if it is missing so to_csv does not fail when the
# processed directory has not been created yet.
processed_csv_path = Path('../data/processed/Tg_dataset_valid_smiles.csv')
processed_csv_path.parent.mkdir(parents=True, exist_ok=True)
df_valid.to_csv(processed_csv_path, index=False)