In [16]:
import pandas as pd
import numpy as np

# Columns that are not model features: "Tg" is used as the regression target
# below, and the rest are removed together with it when the feature matrix
# is built.
meta_cols = ["SMILES", "Tg", "PID", "Polymer Class"]

df = pd.read_csv("../data/intermediate/tg_with_rdkit_descriptors.csv")

# Feature matrix: every column of df that is not listed in meta_cols.
x = df.drop(meta_cols, axis="columns")
# Target vector.
y = df.loc[:, "Tg"]

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer

# --- Tunable settings for the baseline model ---------------------------------
NAN_THRESHOLD = 0.8   # columns whose NaN fraction is >= this value are dropped
RANDOM_SEED = 42      # shared random seed (random forest here)
N_ESTIMATORS = 200    # number of trees in the random forest
VAR_THRESHOLD = 1e-5  # threshold handed to VarianceThreshold
CLIP_VALUE = 1e6      # values are clipped to [-CLIP_VALUE, CLIP_VALUE]

# Drop feature columns that are mostly missing before modelling.
# NOTE(review): the NaN ratio is computed on the full dataset (outside the CV
# folds) and the resulting filter is applied to `x` only; any other feature
# matrix passed to this pipeline is not filtered the same way -- confirm that
# this is intended when comparing feature sets.
nan_ratio = x.isna().mean()
cols_to_keep = nan_ratio[nan_ratio < NAN_THRESHOLD].index
x = x[cols_to_keep]

# Clip extreme values to a finite range. np.clip is passed directly with
# keyword arguments instead of being wrapped in a lambda: scikit-learn
# documents that a FunctionTransformer built from a lambda cannot be pickled,
# which would prevent persisting the fitted pipeline. The bounds are also fixed
# here, at construction time, rather than being looked up from the global
# CLIP_VALUE every time the transformer is called.
clipper = FunctionTransformer(
    np.clip,
    kw_args={"a_min": -CLIP_VALUE, "a_max": CLIP_VALUE},
    feature_names_out="one-to-one",
)

# Baseline model: clip -> median-impute remaining NaNs -> variance filter ->
# random forest. All steps live inside the Pipeline so they are re-fitted on
# the training part of every cross-validation fold.
baseline_pipeline = Pipeline(
    [
        ("clip", clipper),
        ("impute", SimpleImputer(strategy="median")),
        ("var", VarianceThreshold(threshold=VAR_THRESHOLD)),
        (
            "rf",
            RandomForestRegressor(
                n_estimators=N_ESTIMATORS,
                random_state=RANDOM_SEED,
            ),
        ),
    ]
)

In [18]:
from sklearn.model_selection import KFold

# Shuffled 5-fold splitter; seeding it with RANDOM_SEED keeps the fold
# assignment identical every time it is used.
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

In [19]:
from sklearn.model_selection import cross_val_score

# Cross-validate the baseline pipeline on (x, y) with the `cv` splitter,
# using every available core.
scores = cross_val_score(
    estimator=baseline_pipeline,
    X=x,
    y=y,
    cv=cv,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
)

# The scorer reports RMSE with its sign flipped; negate it to get plain RMSE.
rmse_scores = np.negative(scores)
rmse_scores

array([40.2683733 , 41.03864849, 45.24712788, 39.34952534, 38.69689903])

In [20]:
# Summarise the per-fold RMSE: mean, spread (NumPy's default population
# standard deviation, ddof=0) and the spread relative to the mean.
rmse_mean, rmse_std = np.mean(rmse_scores), np.std(rmse_scores)

print(f"CV RMSE = {rmse_mean:.2f} ± {rmse_std:.2f}")
print(f"{rmse_std / rmse_mean * 100:.2f}% variation between folds")

CV RMSE = 40.92 ± 2.31
5.63% variation between folds


### Baseline Cross-Validation Interpretation

The baseline Random Forest model demonstrates stable performance
across 5 cross-validation folds, with a relative RMSE variation
(standard deviation of the fold RMSEs divided by their mean) of
5.63%. This indicates that the baseline model is robust to
different train-validation splits and provides a reliable
reference for evaluating polymer-aware feature augmentation.


# Cross-validation for baseline with polymer-aware features

In [31]:
# Load the table that carries the polymer-aware features in addition to the
# descriptors, then split it into target and features using the same
# meta_cols as the baseline.
df_polymer = pd.read_csv("../data/intermediate/tg_with_rdkit_and_polymer_features.csv")

y_polymer = df_polymer.loc[:, "Tg"]
x_polymer = df_polymer.drop(meta_cols, axis="columns")

In [32]:
# Same pipeline and same `cv` splitter as the baseline run, now evaluated on
# the feature matrix that includes the polymer-aware columns.
scores = cross_val_score(
    estimator=baseline_pipeline,
    X=x_polymer,
    y=y_polymer,
    cv=cv,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
)

# The scorer reports RMSE with its sign flipped; negate it to get plain RMSE.
rmse_scores_polymer = np.negative(scores)
rmse_scores_polymer

array([39.57838949, 41.00775447, 44.89479074, 38.48272628, 40.03640189])

In [None]:
# Mean and population standard deviation (ddof=0) of the per-fold RMSE for the
# run that includes the polymer-aware features.
rmse_mean_polymer, rmse_std_polymer = (
    np.mean(rmse_scores_polymer),
    np.std(rmse_scores_polymer),
)

print(f"CV RMSE with polymer features = {rmse_mean_polymer:.2f} ± {rmse_std_polymer:.2f}")
print(f"{rmse_std_polymer / rmse_mean_polymer * 100:.2f}% variation between folds with polymer features")

CV RMSE with polymer features = 40.80 ± 2.20
5.40% variation between folds with polymer features


Cross-validation results indicate that incorporating polymer-aware features leads to a marginal reduction in average RMSE (40.80 vs. 40.92) and a slightly lower spread across folds (standard deviation 2.20 vs. 2.31). However, the observed improvement of 0.12 is substantially smaller than the baseline cross-validation uncertainty (fold-to-fold standard deviation of 2.31), suggesting that the added polymer-aware descriptors do not provide a statistically meaningful performance gain under the current modeling setup. This result suggests that the baseline RDKit descriptors already capture the dominant structure–property relationships for Tg, and that further gains may require either more expressive polymer-specific features or models better suited to exploit them.

In [33]:
# Feature columns of the baseline table `df` (its columns minus meta_cols).
descriptor_names = df.columns.drop(meta_cols)

# Target is unchanged; the feature matrix keeps only the columns of x_polymer
# that are not baseline descriptor columns.
y_polymer_only = df_polymer.loc[:, "Tg"]
x_polymer_only = x_polymer.drop(descriptor_names, axis="columns")

In [34]:
# Same pipeline and same `cv` splitter, evaluated on the polymer-only
# feature matrix.
scores_polymer_only = cross_val_score(
    estimator=baseline_pipeline,
    X=x_polymer_only,
    y=y_polymer_only,
    cv=cv,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
)

# The scorer reports RMSE with its sign flipped; negate it to get plain RMSE.
rmse_scores_polymer_only = np.negative(scores_polymer_only)
rmse_scores_polymer_only

array([48.30757105, 48.2356042 , 52.66852885, 45.41492047, 46.48558806])

In [35]:
# Mean and population standard deviation (ddof=0) of the per-fold RMSE for the
# polymer-only run.
rmse_mean_polymer_only = rmse_scores_polymer_only.mean()
rmse_std_polymer_only = rmse_scores_polymer_only.std()

# The labels say "polymer-only" so this output cannot be mistaken for the
# earlier run that combined the baseline descriptors with the polymer
# features (the previous labels were identical to that run's).
print(f"CV RMSE with polymer-only features = {rmse_mean_polymer_only:.2f} ± {rmse_std_polymer_only:.2f}")
print(f"{rmse_std_polymer_only / rmse_mean_polymer_only * 100:.2f}% variation between folds with polymer-only features")

CV RMSE with polymer features = 48.22 ± 2.48
5.14% variation between folds with polymer features


Cross-validation using polymer-aware features alone resulted in a substantially higher RMSE than the RDKit baseline (48.22 vs. 40.92), indicating that these descriptors do not, on their own, capture the dominant structure–property relationships governing Tg. When combined with RDKit descriptors, polymer-aware features led to a marginal reduction in average RMSE (40.80 vs. 40.92) and slightly improved stability across folds, but the improvement was well within the baseline cross-validation uncertainty. These results suggest that while polymer-aware features provide complementary structural context, their predictive signal is relatively weak compared to conventional molecular descriptors under the current modeling setup.