# Training Models 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    StackingRegressor
)
from xgboost import XGBRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, LinearRegression


In [2]:
#Loading data
df = pd.read_csv("../data/processed/qm9_mordred_clean.csv")

In [7]:
#Define Features and Targets
targets = ["mu", "HOMO", "LUMO", "gap"]
X = df.drop(columns=["smiles"] + targets)

#### Train Ridge Regression for each target

In [None]:
metrics = []
for target in targets:
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = Ridge()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    metrics.append({
        "target": target,
        "model": "Ridge",
        "r2_score": r2,
        "mse": mse
    })

# --- Save metrics to CSV ---
metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv("../results/metrics/ridge_metrics.csv", index=False)

In [12]:
print("\nRidge Regression Performance Metrics:\n")
print(metrics_df.to_string(index=False, float_format="%.4f"))


Ridge Regression Performance Metrics:

target model  r2_score    mse
    mu Ridge    0.3406 1.4616
  HOMO Ridge    0.5506 0.0002
  LUMO Ridge    0.8320 0.0004
   gap Ridge    0.7829 0.0005


#### Train Random Forest Regression on each target

In [14]:
metrics = []
for target in targets:
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestRegressor(random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    metrics.append({
        "target": target,
        "model": "RandomForest",
        "r2_score": r2,
        "mse": mse
    })

# Save to results/metrics
metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv("../results/metrics/random_forest_metrics.csv", index=False)

print("\nRandom Forest Regression Performance Metrics:\n")
print(metrics_df.to_string(index=False, float_format="%.4f"))



Random Forest Regression Performance Metrics:

target        model  r2_score    mse
    mu RandomForest    0.6238 0.8338
  HOMO RandomForest    0.8517 0.0001
  LUMO RandomForest    0.9620 0.0001
   gap RandomForest    0.9443 0.0001


#### Train Random Forest Regression on each target

In [19]:
metrics = []
for target in targets:
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = XGBRegressor(random_state=42, n_jobs=-1, verbosity=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    metrics.append({
        "target": target,
        "model": "XGBoost",
        "r2_score": r2,
        "mse": mse
    })

# Save to results/metrics 
metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv("../results/metrics/xgboost_metrics.csv", index=False)

print("\nXGBoost Regression Performance Metrics:\n")
print(metrics_df.to_string(index=False, float_format="%.4f"))


XGBoost Regression Performance Metrics:

target   model  r2_score    mse
    mu XGBoost    0.5884 0.9123
  HOMO XGBoost    0.8363 0.0001
  LUMO XGBoost    0.9525 0.0001
   gap XGBoost    0.9283 0.0002


#### Training NN regression model

In [26]:
# NN regression models
from tensorflow.keras.regularizers import l2
def train_nn_models(df,
                    targets=None,
                    epochs=100,
                    batch_size=256,
                    patience=7,
                    l2_reg=1e-6,
                    verbose=1,
                    seed=42):
    """
    Train separate NN regression models for each specified target.
    Metrics are printed and returned as a DataFrame.
    Optionally saves each model.
    """

    tf.random.set_seed(seed)
    np.random.seed(seed)

    #excluding identifiers and targets
    excluded = ["smiles"] + targets
    features = [col for col in df.columns if col not in excluded]

    X = df[features].values
    metrics = []

    for target in targets:
        y = df[target].values.reshape(-1, 1)

        # Split
        X_train, X_test, y_train_raw, y_test_raw = train_test_split(
            X, y, test_size=0.18, random_state=seed
        )

        # Scale
        X_scaler = StandardScaler()
        y_scaler = StandardScaler()

        X_train_scaled = X_scaler.fit_transform(X_train)
        X_test_scaled = X_scaler.transform(X_test)

        y_train = y_scaler.fit_transform(y_train_raw)
        y_test = y_scaler.transform(y_test_raw)

        # Define model
        model = Sequential([
            Input(shape=(X_train_scaled.shape[1],)),
            Dense(512, activation='relu', kernel_regularizer=l2(l2_reg)),
            Dropout(0.3),
            Dense(256, activation='relu', kernel_regularizer=l2(l2_reg)),
            Dropout(0.3),
            Dense(128, activation='relu', kernel_regularizer=l2(l2_reg)),
            Dropout(0.2),
            Dense(64, activation='relu', kernel_regularizer=l2(l2_reg)),
            Dense(1)
        ])

        model.compile(optimizer='adam', loss='mse', metrics=['mae'])

        # Early stopping
        callbacks = [
            tf.keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=patience,
                restore_best_weights=True,
                verbose=verbose
            )
        ]

        # Train
        history = model.fit(
            X_train_scaled, y_train,
            validation_data=(X_test_scaled, y_test),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks,
            verbose=verbose
        )

        # Predict
        y_pred_scaled = model.predict(X_test_scaled, verbose=0)
        y_pred = y_scaler.inverse_transform(y_pred_scaled)

        # Evaluate
        r2 = r2_score(y_test_raw, y_pred)
        mse = mean_squared_error(y_test_raw, y_pred)


        print(f"\n{target.upper()} Evaluation:")
        print(f"  MSE:       {mse:.5f}")
        print(f"  R²:        {r2:.5f}")

        # Save metrics
        metrics.append({
            "target": target,
            "model": "TensorFlow_NN",
            "MSE": mse,
            "R2": r2
        })

    # Print summary
    metrics_df = pd.DataFrame(metrics)
    print("\nSummary of all models:\n")
    print(metrics_df.to_string(index=False, float_format="%.5f"))

    return metrics_df

In [27]:
metrics_df = train_nn_models(df, targets=["mu", "gap","LUMO", "HOMO"])
metrics_df.to_csv("../results/metrics/tf_nn_metrics.csv", index=False)

Epoch 1/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 0.6715 - mae: 0.5992 - val_loss: 0.5114 - val_mae: 0.5265
Epoch 2/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.5117 - mae: 0.5227 - val_loss: 0.4634 - val_mae: 0.5016
Epoch 3/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.4827 - mae: 0.5042 - val_loss: 0.4471 - val_mae: 0.4918
Epoch 4/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.4608 - mae: 0.4926 - val_loss: 0.4399 - val_mae: 0.4887
Epoch 5/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.4567 - mae: 0.4856 - val_loss: 0.4346 - val_mae: 0.4874
Epoch 6/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.4420 - mae: 0.4806 - val_loss: 0.4169 - val_mae: 0.4766
Epoch 7/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/

#### Train SVM for each target 

In [30]:
# Feature scaling
X_scaler = StandardScaler()
X_scaled = X_scaler.fit_transform(X)

metrics = []
for target in targets:
    y = df[target].values.reshape(-1, 1)
    y_scaler = StandardScaler()
    y_scaled = y_scaler.fit_transform(y).ravel()  # flatten for SVR

    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_scaled, test_size=0.2, random_state=42
    )

    # Model
    model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
    model.fit(X_train, y_train)
    y_pred_scaled = model.predict(X_test)

    # Inverse scale predictions
    y_test_unscaled = y_scaler.inverse_transform(y_test.reshape(-1, 1)).ravel()
    y_pred_unscaled = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

    # Evaluate
    mse = mean_squared_error(y_test_unscaled, y_pred_unscaled)
    r2 = r2_score(y_test_unscaled, y_pred_unscaled)

    print(f"\n{target.upper()} SVM Evaluation:")
    print(f"  MSE:       {mse:.5f}")
    print(f"  R²:        {r2:.5f}")

    metrics.append({
        "target": target,
        "model": "SVM",
        "MSE": mse,
        "R2": r2
    })

# Save Metrics 
metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv("../results/metrics/svm_metrics.csv", index=False)

print("\nSVM Regression Summary:\n")
print(metrics_df.to_string(index=False, float_format="%.5f"))


MU SVM Evaluation:
  MSE:       1.03210
  R²:        0.53437

HOMO SVM Evaluation:
  MSE:       0.00008
  R²:        0.84151

LUMO SVM Evaluation:
  MSE:       0.00010
  R²:        0.95512

GAP SVM Evaluation:
  MSE:       0.00016
  R²:        0.92630

SVM Regression Summary:

target model     MSE      R2
    mu   SVM 1.03210 0.53437
  HOMO   SVM 0.00008 0.84151
  LUMO   SVM 0.00010 0.95512
   gap   SVM 0.00016 0.92630


#### Stacked Ensemble (Ridge + RF + XGB + SVM + CatBoost + LGBM + GB → LR)

In [32]:
metrics = []
# Train for each target 
for target in targets:
    y = df[target].values

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    #Base learners
    base_learners = [
        ("ridge", Ridge(alpha=1.0)),
        ("rf", RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
        ("xgb", XGBRegressor(n_estimators=100, random_state=42, verbosity=0, n_jobs=-1)),
        ("svm", SVR(kernel="rbf", C=1.0, epsilon=0.1)),
        ("gb", GradientBoostingRegressor(n_estimators=100, random_state=42)),
        ("cat", CatBoostRegressor(verbose=0, random_seed=42)),
        ("lgbm", LGBMRegressor(n_estimators=100, random_state=42))
    ]

    # Meta-learner
    meta_model = LinearRegression()

    # Stacking model
    stack = StackingRegressor(
        estimators=base_learners,
        final_estimator=meta_model,
        passthrough=True,
        n_jobs=-1
    )

    # Pipeline
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("stacked", stack)
    ])

    # Train
    pipeline.fit(X_train, y_train)

    # Predict
    y_pred = pipeline.predict(X_test)

    # Evaluate
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\n{target.upper()} - Stacked Model Evaluation:")
    print(f"  MSE:       {mse:.5f}")
    print(f"  R²:        {r2:.5f}")

    # Save metrics
    metrics.append({
        "target": target,
        "model": "StackedRegressor",
        "MSE": mse,
        "R2": r2
    })

# Save metrics to CSV
metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv("../results/metrics/stacked_model_metrics.csv", index=False)

print("\nStacked Model Summary:\n")
print(metrics_df.to_string(index=False, float_format="%.5f"))





MU - Stacked Model Evaluation:
  MSE:       0.81134
  R²:        0.63397





HOMO - Stacked Model Evaluation:
  MSE:       0.00006
  R²:        0.87317





LUMO - Stacked Model Evaluation:
  MSE:       0.00008
  R²:        0.96566

GAP - Stacked Model Evaluation:
  MSE:       0.00011
  R²:        0.94874

Stacked Model Summary:

target            model     MSE      R2
    mu StackedRegressor 0.81134 0.63397
  HOMO StackedRegressor 0.00006 0.87317
  LUMO StackedRegressor 0.00008 0.96566
   gap StackedRegressor 0.00011 0.94874




### Using a stacked approach that first learns HOMO, LUMO, and gap and then uses these predictions to predict mu

In [None]:
# STAGE 1: Predict HOMO and LUMO from descriptors
def train_and_predict_qm_props(df, targets=['HOMO', 'LUMO'], seed=42, results_path="../results/metrics"):
    """
    Train Random Forest models to predict HOMO and LUMO from descriptors.
    Saves predictions for train/test and logs metrics for each target.
    """
    np.random.seed(seed)

    # Define input features
    features = [col for col in df.columns if col not in ['smiles'] + targets + ['mu', 'gap']]
    X = df[features].values

    # Split features and full DF (for retaining original labels)
    X_train, X_test, df_train, df_test = train_test_split(
        X, df.copy(), test_size=0.18, random_state=seed
    )

    # Scale features
    X_scaler = StandardScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # DataFrames to hold predictions
    pred_train_df = df_train.copy()
    pred_test_df = df_test.copy()

    # Track metrics
    metrics = []

    for target in targets:
        y_train = df_train[target].values
        y_test = df_test[target].values

        model = RandomForestRegressor(random_state=seed, n_jobs=-1)
        model.fit(X_train_scaled, y_train)

        # Predictions
        y_pred_train = model.predict(X_train_scaled)
        y_pred_test = model.predict(X_test_scaled)

        pred_train_df[f'pred_{target}'] = y_pred_train
        pred_test_df[f'pred_{target}'] = y_pred_test

        # Evaluate
        r2 = r2_score(y_test, y_pred_test)
        mse = mean_squared_error(y_test, y_pred_test)
        metrics.append({
            "target": target,
            "model": "RandomForest",
            "r2_score": r2,
            "mse": mse
        })

    # Save metrics
    metrics_df = pd.DataFrame(metrics)
    print("\nRandom Forest Regression Performance Metrics:\n")
    print(metrics_df.to_string(index=False, float_format="%.4f"))

    return pred_train_df, pred_test_df, X_scaler


In [43]:
pred_train_df, pred_test_df, _ = train_and_predict_qm_props(df, targets=['HOMO', 'LUMO', 'gap'])


Random Forest Regression Performance Metrics:

target        model  r2_score    mse
  HOMO RandomForest    0.8529 0.0001
  LUMO RandomForest    0.9626 0.0001
   gap RandomForest    0.9452 0.0001


In [44]:
# STAGE 2: Predict dipole moment using predictions from Stage 1
def train_dipole_with_qm_preds(pred_train_df, pred_test_df, seed=42):
    """
    Train model to predict dipole moment using original features + predicted HOMO, LUMO
    """
    # Targets and Features
    used_features = [col for col in pred_train_df.columns if col.startswith('pred_') or col not in ['smiles', 'mu', 'HOMO', 'LUMO', 'gap']]
    
    X_train = pred_train_df[used_features].values
    y_train = pred_train_df['mu'].values

    X_test = pred_test_df[used_features].values
    y_test = pred_test_df['mu'].values

    # Optional: Normalize (could experiment with raw vs. scaled)
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = GradientBoostingRegressor(random_state=seed)
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)

    print("\nDIPOLE MOMENT PREDICTION:")
    print(f"  MSE: {mean_squared_error(y_test, y_pred):.5f}")
    print(f"  R²:  {r2_score(y_test, y_pred):.5f}")

    return model


In [47]:
mu_model = train_dipole_with_qm_preds(pred_train_df, pred_test_df)


DIPOLE MOMENT PREDICTION:
  MSE: 1.16975
  R²:  0.47147
