In [3]:
import pandas as pd
import numpy as np

# Columns that are not used as model inputs: identifiers, the class label and
# the regression target itself.
meta_cols = ["SMILES", "Tg", "PID", "Polymer Class"]

# Descriptor table produced by an earlier step (presumably RDKit descriptors,
# judging by the file name — confirm against the step that writes it).
DESCRIPTOR_CSV = "../data/intermediate/tg_with_rdkit_descriptors.csv"
df = pd.read_csv(DESCRIPTOR_CSV)

# Feature matrix: every column except the metadata columns listed above.
x = df.drop(labels=meta_cols, axis="columns")
# Regression target: the "Tg" column.
y = df.loc[:, "Tg"]

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer

# --- Baseline hyperparameters ------------------------------------------------
NAN_THRESHOLD = 0.8    # feature columns with a NaN fraction >= this are dropped
RANDOM_SEED = 42       # passed as random_state to the random forest
N_ESTIMATORS = 200     # number of trees in the random forest
VAR_THRESHOLD = 1e-5   # threshold handed to VarianceThreshold
CLIP_VALUE = 1e6       # features are clipped to [-CLIP_VALUE, CLIP_VALUE]

# Keep only the feature columns whose share of missing values is below
# NAN_THRESHOLD (computed once, over all rows of x).
nan_ratio = x.isna().mean()
cols_to_keep = nan_ratio[nan_ratio < NAN_THRESHOLD].index
x = x[cols_to_keep]

# Clip every feature to [-CLIP_VALUE, CLIP_VALUE]; np.clip also maps +/-inf to
# the bounds and leaves NaN untouched for the imputer that follows.
# np.clip is passed directly (with its bounds in kw_args) instead of a lambda:
# a FunctionTransformer wrapping a lambda cannot be pickled, which would prevent
# saving the fitted pipeline with pickle / joblib.dump.
clipper = FunctionTransformer(
    np.clip,
    kw_args={"a_min": -CLIP_VALUE, "a_max": CLIP_VALUE},
    feature_names_out="one-to-one",
)

# Baseline model: clip -> median-impute -> drop low-variance features -> random
# forest. All preprocessing lives inside the Pipeline, so it is re-fit on each
# training split when the pipeline is cross-validated.
baseline_pipeline = Pipeline(
    [
        ("clip", clipper),
        ("impute", SimpleImputer(strategy="median")),
        ("var", VarianceThreshold(threshold=VAR_THRESHOLD)),
        (
            "rf",
            RandomForestRegressor(
                n_estimators=N_ESTIMATORS,
                random_state=RANDOM_SEED,
            ),
        ),
    ]
)

In [6]:
from sklearn.model_selection import KFold

# Number of cross-validation folds, kept as a named constant so it can be tuned
# in one place instead of being a bare literal in the call below.
N_SPLITS = 5

# Shuffled K-fold splitter; the fixed random_state makes the fold assignment
# reproducible from run to run.
cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)

In [7]:
from sklearn.model_selection import cross_val_score

# Score the baseline pipeline on every split produced by `cv`
# (n_jobs=-1: use all available processors).
scores = cross_val_score(
    estimator=baseline_pipeline,
    X=x,
    y=y,
    cv=cv,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
)

# The scorer returns negated RMSE (higher is better); flip the sign to obtain
# the plain per-fold RMSE and display it.
rmse_scores = np.negative(scores)
rmse_scores

array([40.2683733 , 41.03864849, 45.24712788, 39.34952534, 38.69689903])

In [10]:
# Summarise the per-fold RMSE values: mean and standard deviation
# (NumPy default, ddof=0), plus their ratio as a percentage.
rmse_mean = np.mean(rmse_scores)
rmse_std = np.std(rmse_scores)

print(f"CV RMSE = {rmse_mean:.2f} ± {rmse_std:.2f}")
print(f"{rmse_std / rmse_mean * 100:.2f}% variation between folds")

CV RMSE = 40.92 ± 2.31
5.63% variation between folds


### Interpretation

The baseline Random Forest model demonstrates stable performance
across the 5 cross-validation folds: the RMSE is 40.92 ± 2.31
(mean ± standard deviation over folds), which corresponds to a
relative fold-to-fold variation (standard deviation divided by
mean) of 5.63%. This indicates that the baseline model is robust to
different train-validation splits and provides a reliable
reference for evaluating polymer-aware feature augmentation.
