<a href="https://colab.research.google.com/github/ubaidillah-chem/fouling-ml/blob/main/00_solubility_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scikit-optimize
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from google.colab import drive
# Mount Google Drive under /content/gdrive; the dataset CSV is read from
# the MyDrive folder of this mount.
drive.mount('/content/gdrive')

In [None]:
# Load data
# The path is anchored at the absolute Drive mount point ('/content/gdrive')
# so the read does not depend on the notebook's current working directory.
data = pd.read_csv("/content/gdrive/MyDrive/Solubility.csv")

# Add jitter noise to MW and NBP (simulating experimental variability).
# The global NumPy seed is fixed so the same noise is drawn on every run;
# MW is perturbed first, then NBP, each with Gaussian noise (mean 0, std 0.5).
# NOTE: the jitter is applied before the split, so test rows are perturbed too.
np.random.seed(123)
data['MW'] += np.random.normal(0, 0.5, size=len(data))
data['NBP'] += np.random.normal(0, 0.5, size=len(data))

# Create feature matrix and target: every column except 'O2solubility'
# is used as a feature.
X = data.drop(columns=['O2solubility'])
y = data['O2solubility']

# Train-test split: 20% of the rows are held out, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Hyperparameter search space explored by the Bayesian optimizer.
# The order of the dimensions matters: optimizer results (result.x) are
# returned as a list in this same order.
space = [
    Real(low=0.01, high=0.3, name='eta'),              # learning rate
    Integer(low=3, high=10, name='max_depth'),         # maximum tree depth
    Real(low=0.5, high=1.0, name='subsample'),         # row sampling ratio
    Real(low=0.5, high=1.0, name='colsample_bytree'),  # column sampling ratio per tree
]

# Objective function for Bayesian Optimization
@use_named_args(space)
def objective(**params):
    """Return the validation RMSE of an XGBoost model for one hyperparameter set.

    The training data is split into a fitting part and a validation part.
    The validation part drives early stopping and provides the score that
    the optimizer minimizes, so the test set is never touched during tuning
    and remains an unbiased estimate for the final evaluation.

    Args:
        **params: Hyperparameters named after the dimensions in ``space``
            (``eta``, ``max_depth``, ``subsample``, ``colsample_bytree``);
            they are forwarded unchanged to ``XGBRegressor``.

    Returns:
        Root-mean-squared error on the validation part of the training data.
    """
    # Fixed random_state: every optimizer call is scored on the same
    # validation rows, which keeps the scores comparable across calls.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=123
    )

    model = XGBRegressor(
        objective='reg:squarederror',
        eval_metric='rmse',
        n_estimators=1000,  # upper bound; early stopping picks the actual count
        early_stopping_rounds=10,
        verbosity=0,
        random_state=123,
        **params
    )
    # Early stopping monitors the last entry of eval_set, i.e. the
    # validation slice (not the test set).
    model.fit(
        X_fit, y_fit,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    val_preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    return rmse

# Bayesian optimization with a Gaussian-process surrogate: evaluate the
# objective 30 times over the search space, seeded for reproducibility.
result = gp_minimize(
    func=objective,
    dimensions=space,
    n_calls=30,
    random_state=123,
)

# Report the lowest objective value found and the point that produced it
# (values are listed in the same order as the dimensions of `space`).
print(f"Best RMSE: {result.fun:.4f}")
print(f"Best parameters: {result.x}")

# Train final model with best parameters
# Pair each optimized value with its dimension name so the mapping cannot
# drift out of sync if the search space is reordered or extended.
best_params = {dim.name: value for dim, value in zip(space, result.x)}

final_model = XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    n_estimators=1000,  # upper bound; early stopping picks the actual count
    early_stopping_rounds=10,
    verbosity=1,
    random_state=123,
    **best_params
)

# Hold out a validation slice of the training data for early stopping.
# The test set must not influence training (including the stopping point),
# otherwise the test RMSE reported below would be optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=123
)

# Both sets are logged every 100 rounds; early stopping monitors the last
# entry of eval_set, i.e. the validation slice.
final_model.fit(
    X_fit, y_fit,
    eval_set=[(X_fit, y_fit), (X_val, y_val)],
    verbose=100
)

# Evaluate on the untouched test set
preds = final_model.predict(X_test)
rmse_final = np.sqrt(mean_squared_error(y_test, preds))
print(f"Final Test RMSE: {rmse_final:.4f}")
