## Generate RDKit molecular descriptors from SMILES

This notebook generates RDKit molecular descriptors from validated SMILES.

In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from tqdm import tqdm

In [21]:
# Load the Tg dataset of validated SMILES; the trailing expression displays
# (n_rows, n_columns) as a quick sanity check.
df = pd.read_csv("../data/processed/Tg_dataset_valid_smiles.csv")
df.shape

(7284, 4)

In [None]:
# Names of every descriptor RDKit registers, read from the public
# ``Descriptors.descList`` registry of (name, function) pairs rather than the
# underscore-prefixed private attribute.
descriptor_names = [name for name, _func in Descriptors.descList]

# One reusable calculator that evaluates all descriptors above, in that order.
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

In [7]:
# Fixed-seed 20-row subset used to try the descriptor pipeline before the full run.
sample_df = df.sample(n=20, random_state=42)

In [None]:
def calc_descriptors(smiles: str) -> list:
    """Calculate RDKit descriptors for a given SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string to featurise. Non-string values (e.g. ``None`` or the
        float NaN pandas uses for empty CSV cells) and blank strings are
        treated as invalid.

    Returns
    -------
    list
        One value per entry in ``descriptor_names``, in the same order.
        If the SMILES is missing, blank, or cannot be parsed, a list of
        ``None`` values of the same length is returned instead.
    """
    n_descriptors = len(descriptor_names)

    # Guard before calling RDKit: MolFromSmiles raises on non-string input
    # rather than returning None, and a blank string carries no structure to
    # describe (RDKit is expected to parse it into an atom-less molecule).
    if not isinstance(smiles, str) or not smiles.strip():
        return [None] * n_descriptors

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [None] * n_descriptors

    # CalcDescriptors returns a tuple; convert so both the success and the
    # failure path return the list that the docstring promises.
    return list(calculator.CalcDescriptors(mol))

In [18]:
# Trial run on the sample: each SMILES yields one sequence of descriptor values,
# which is then expanded into one column per descriptor.
sample_desc = sample_df["SMILES"].apply(calc_descriptors)
sample_desc_df = pd.DataFrame(
    data=sample_desc.to_list(),
    columns=descriptor_names,
)
sample_desc_df.head()

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,10.000076,10.000076,0.206217,-0.77399,0.651524,21.533333,207.317,186.149,207.162314,84,...,0,0,0,0,0,0,0,0,0,0
1,6.048317,6.048317,0.147202,-0.16539,0.678449,11.65,258.32,244.208,258.104465,96,...,0,0,0,0,0,0,0,0,0,0
2,12.6831,12.6831,0.015103,-1.066019,0.047847,18.960784,722.992,664.528,722.38501,280,...,0,0,0,0,0,0,0,0,8,0
3,13.411777,13.411777,0.094773,-3.819114,0.11892,11.375,760.824,728.568,760.187937,276,...,0,0,1,0,0,0,0,0,0,0
4,12.437778,12.437778,0.007741,-0.925108,0.408372,10.923077,354.314,340.202,354.073953,132,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Register ``progress_apply`` on pandas objects so the long run shows a progress bar.
tqdm.pandas()

# Full dataset: compute descriptors for every SMILES, then expand the
# per-row sequences into one column per descriptor.
desc_series = df["SMILES"].progress_apply(calc_descriptors)
desc_df = pd.DataFrame(
    data=desc_series.to_list(),
    columns=descriptor_names,
)


100%|██████████| 7284/7284 [02:46<00:00, 43.81it/s] 


In [22]:
# A column-wise concat aligns on the index and pads missing rows with NaN
# instead of failing, so verify the two frames have the same length first.
assert len(df) == len(desc_df), (len(df), len(desc_df))

# Join metadata and descriptors side by side. Both indexes are reset so rows
# are matched by position rather than by their original labels.
df_desc = pd.concat(
    [df.reset_index(drop=True), desc_df.reset_index(drop=True)],
    axis=1,
)

# Only the last expression of a cell is rendered, so print the shape explicitly.
print(df_desc.shape)
df_desc.head(10)

Unnamed: 0,SMILES,Tg,PID,Polymer Class,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,*C*,-54.0,P010001,Polyolefins,1.75,1.75,0.875,0.875,0.355446,20.0,...,0,0,0,0,0,0,0,0,0,0
1,*CC(*)C,-3.0,P010002,Polyolefins,2.395833,2.395833,0.75,0.75,0.41472,25.666667,...,0,0,0,0,0,0,0,0,0,0
2,*CC(*)CC,-24.1,P010003,Polyolefins,2.332824,2.332824,0.743056,0.743056,0.451401,22.25,...,0,0,0,0,0,0,0,0,0,0
3,*CC(*)CCC,-37.0,P010004,Polyolefins,2.31,2.31,0.734306,0.734306,0.476641,20.2,...,0,0,0,0,0,0,0,0,0,0
4,*CC(*)C(C)C,60.0,P010006,Polyolefins,2.374491,2.374491,0.699074,0.699074,0.465496,21.4,...,0,0,0,0,0,0,0,0,0,0
5,*CC(*)CCCC,-50.0,P010007,Polyolefins,2.300547,2.300547,0.726528,0.726528,0.490565,18.833333,...,0,0,0,0,0,0,0,0,1,0
6,*CC(*)CC(C)C,35.2,P010008,Polyolefins,2.351667,2.351667,0.704722,0.704722,0.492023,19.833333,...,0,0,0,0,0,0,0,0,0,0
7,*CC(*)CCCCC,-45.5,P010009,Polyolefins,2.296561,2.296561,0.720009,0.720009,0.463684,17.857143,...,0,0,0,0,0,0,0,0,2,0
8,*CC(*)CCCCCCC,-47.0,P010012,Polyolefins,2.294834,2.294834,0.710054,0.710054,0.45663,16.555556,...,0,0,0,0,0,0,0,0,4,0
9,*CC(*)CCCCCCCC,-35.0,P010013,Polyolefins,2.295206,2.295206,0.706227,0.706227,0.44622,16.1,...,0,0,0,0,0,0,0,0,5,0


In [23]:
# Ten columns with the highest number of missing values.
(
    df_desc
    .isna()
    .sum()
    .sort_values(ascending=False)
    .iloc[:10]
)

BCUT2D_MWLOW           7284
BCUT2D_LOGPLOW         7284
BCUT2D_LOGPHI          7284
BCUT2D_MRHI            7284
BCUT2D_CHGLO           7284
BCUT2D_CHGHI           7284
BCUT2D_MWHI            7284
BCUT2D_MRLOW           7284
MinPartialCharge       6909
MinAbsPartialCharge    6909
dtype: int64

In [24]:
# Fraction of missing values per column (mean of the boolean NaN mask).
nan_ratio = df_desc.isna().mean()

# Show the 20 columns with the largest missing fraction.
nan_ratio.sort_values(ascending=False).iloc[:20]


BCUT2D_MWLOW           1.000000
BCUT2D_LOGPLOW         1.000000
BCUT2D_LOGPHI          1.000000
BCUT2D_MRHI            1.000000
BCUT2D_CHGLO           1.000000
BCUT2D_CHGHI           1.000000
BCUT2D_MWHI            1.000000
BCUT2D_MRLOW           1.000000
MinPartialCharge       0.948517
MinAbsPartialCharge    0.948517
MaxPartialCharge       0.948517
MaxAbsPartialCharge    0.948517
Tg                     0.000000
ExactMolWt             0.000000
HeavyAtomMolWt         0.000000
MolWt                  0.000000
SPS                    0.000000
qed                    0.000000
MinEStateIndex         0.000000
MinAbsEStateIndex      0.000000
dtype: float64

In [25]:
# Persist the full descriptor table before any NaN handling (row index omitted).
df_desc.to_csv(
    path_or_buf="../data/processed/tg_with_rdkit_descriptors.csv",
    index=False,
)


### Handle descriptors with NaN values

Drop descriptor columns with more than 80% missing values.

In [27]:
# Columns missing in more than this fraction of rows are considered unusable.
NAN_THRESHOLD = 0.8

# ``nan_ratio`` also covers the metadata/target columns, so restrict the
# candidates to RDKit descriptor columns: a sparse identifier or target
# column must never be dropped silently by this step.
drop_cols = nan_ratio.index[
    (nan_ratio > NAN_THRESHOLD).to_numpy()
    & nan_ratio.index.isin(descriptor_names)
]
len(drop_cols)

12

In [29]:
# Remove the sparse columns into a new frame; ``df_desc`` itself is left untouched.
df_desc_step1 = df_desc.drop(labels=drop_cols, axis="columns")

Remaining NaNs are filled with the column median.

In [31]:
# Non-descriptor columns; everything else in the frame is treated as a descriptor.
meta_cols = ["SMILES", "Tg", "PID", "Polymer Class"]
descriptor_cols = [col for col in df_desc_step1 if col not in meta_cols]

# ``fillna`` with a Series fills each column named in the Series index with
# that column's own value, so passing the descriptor medians imputes only the
# descriptor columns and returns a new frame with the other columns unchanged.
df_desc_clean = df_desc_step1.fillna(df_desc_step1[descriptor_cols].median())

In [34]:
# Total number of NaN cells left in the cleaned table.
(
    df_desc_clean
    .isna()
    .sum()  # NaN count per column
    .sum()  # grand total over all columns
)

np.int64(0)

In [36]:
# Persist the cleaned (column-dropped and median-imputed) table, without the row index.
df_desc_clean.to_csv(
    path_or_buf="../data/processed/tg_with_rdkit_descriptors_cleaned.csv",
    index=False,
)

### Feature Cleaning
- Removed the 12 molecular descriptors with more than 80% missing values (the eight `BCUT2D_*` descriptors and the four partial-charge descriptors).
- Remaining missing values were imputed using the column-wise median.
- This resulted in a clean descriptor table suitable for baseline modeling.
