In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle as pickle

# model imports
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from lightgbm import LGBMRegressor, LGBMClassifier 
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, f1_score, roc_auc_score
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
import optuna

In [2]:
import sklearn
print(sklearn.__version__)

1.7.1


In [19]:
class RegressionTest:

    def __init__(self, X, y, rand_state):
        """initialise class and load data as pandas dataframes"""
        self.X = X
        self.y = y
        self.rand_state = rand_state

    def test_models(self, scoring):
        """run untuned models against one another to assess performance, perfomance reported as MAE and r2"""

        # dictionary containing all the models:

        models = {
            "DECISION TREE REGRESSOR":(DecisionTreeRegressor(random_state=(int(self.rand_state)))),
            "RANDOM FOREST REGRESSOR":(RandomForestRegressor(random_state=(int(self.rand_state)),verbose=0)),
            "XGB REGRESSOR":(XGBRegressor(random_state=(int(self.rand_state)))),
            "CATBOOST REGRESSOR":(CatBoostRegressor(random_state=(int(self.rand_state)),verbose=0)),
            "LIGHT GBM REGRESSOR":(LGBMRegressor(random_state=(int(self.rand_state)),verbose=0)),
            "GAUSSIAN PROCESS REGRESSOR":(GaussianProcessRegressor(kernel=(1.0 * RBF(length_scale=1.0)), n_restarts_optimizer=10))
        }

        if scoring == 'MAE':
            score_dict = {}
            for x in models:
                print(f"---- TRAINING {x} ----")
                scores = -(cross_val_score(models[x], self.X, self.y, cv=5, scoring="neg_mean_absolute_error"))
                score_dict[x] = f"MAE: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})"           
            
            print("---- TRAINING COMPLETE ----")
            for x in score_dict:
                print(f'{x} {score_dict[x]}')

        elif scoring == 'r2':
            score_dict = {}
            for x in models:
                print(f"---- TRAINING {x} ----")
                scores = cross_val_score(models[x], self.X, self.y, cv=5, scoring="r2")           
                score_dict[x] = f"r2: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})" 

            print("---- TRAINING COMPLETE ----")
            for x in score_dict:
                print(f'{x} {score_dict[x]}')

        else:
            print('input r2 or MAE as an argument')

In [None]:
# load data from pickel file 

df = pd.read_pickle('mat_bandgap_morgan_rdkit.pkl')

# Get the columns with arrays
array_cols = df.drop(['homo','lumo','gap','confnum','smiles'], axis=1)

# Stack each column, then concatenate horizontally
X = np.hstack([np.vstack(array_cols[col].values) for col in array_cols.columns])
y = df['gap'].values
y = y.ravel()

In [20]:
test_object = RegressionTest(X, y, 42)
test_object.test_models('r2')

---- TRAINING DECISION TREE REGRESSOR ----
---- TRAINING RANDOM FOREST REGRESSOR ----
---- TRAINING XGB REGRESSOR ----
---- TRAINING CATBOOST REGRESSOR ----
---- TRAINING LIGHT GBM REGRESSOR ----








---- TRAINING GAUSSIAN PROCESS REGRESSOR ----




---- TRAINING COMPLETE ----
DECISION TREE REGRESSOR r2: 0.4407 (+/- 0.3274)
RANDOM FOREST REGRESSOR r2: 0.6859 (+/- 0.1973)
XGB REGRESSOR r2: 0.6513 (+/- 0.4019)
CATBOOST REGRESSOR r2: 0.7780 (+/- 0.0751)
LIGHT GBM REGRESSOR r2: 0.6592 (+/- 0.1417)
GAUSSIAN PROCESS REGRESSOR r2: nan (+/- nan)


Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\envs\pytorch_env\Lib\site-packages\sklearn\metrics\_scorer.py", line 152, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\envs\pytorch_env\Lib\site-packages\sklearn\metrics\_scorer.py", line 400, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\envs\pytorch_env\Lib\site-packages\sklearn\metrics\_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\envs\pytorch_env\Lib\site-packages\sklearn\utils\_response.py", line 242, in _get_response_values
    y_pred, pos_label = prediction_method(X), None
                        ^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\envs\pytorch_env\Lib\site-packages\sklearn\gaussian_process\_gpr.py", line 412, in predict
    X = validate_data(self, X, ensure_2d=ensure_2d, dtype=dtype, reset

for this case, catboost is identified as the best model, with random forest second. 

In [33]:
# tune catboost model using optuna:

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 845, 845),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.1, log=True),
        'depth': trial.suggest_int('depth', 1, 5),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 3.7, 3.7),
        'random_strength': trial.suggest_float('random_strength', 1.4, 1.4),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.58, 0.58),
        'verbose': 0
    }

    model = CatBoostRegressor(**params)

    scores = cross_val_score(model, X, y, cv=5, scoring="r2")
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print(study.best_params)

[I 2026-01-25 18:53:21,957] A new study created in memory with name: no-name-24d5c85b-97f2-4127-a3cb-b6514606d6e6
[I 2026-01-25 18:53:30,903] Trial 0 finished with value: 0.7259410858361033 and parameters: {'iterations': 845, 'learning_rate': 0.1, 'depth': 1, 'l2_leaf_reg': 3.7, 'random_strength': 1.4, 'bagging_temperature': 0.58}. Best is trial 0 with value: 0.7259410858361033.
[I 2026-01-25 18:53:48,625] Trial 1 finished with value: 0.7768029480121719 and parameters: {'iterations': 845, 'learning_rate': 0.1, 'depth': 2, 'l2_leaf_reg': 3.7, 'random_strength': 1.4, 'bagging_temperature': 0.58}. Best is trial 1 with value: 0.7768029480121719.
[I 2026-01-25 18:54:14,397] Trial 2 finished with value: 0.7834880040803721 and parameters: {'iterations': 845, 'learning_rate': 0.1, 'depth': 3, 'l2_leaf_reg': 3.7, 'random_strength': 1.4, 'bagging_temperature': 0.58}. Best is trial 2 with value: 0.7834880040803721.
[I 2026-01-25 18:54:24,213] Trial 3 finished with value: 0.7259410858361033 and pa

KeyboardInterrupt: 

In [25]:
model = CatBoostRegressor(random_state=42, iterations=845, learning_rate=0.1, depth=4, l2_leaf_reg=3.7, random_strength=1.4, bagging_temperature=0.58)

scores = -(cross_val_score(model, X, y, cv=5, scoring="neg_mean_absolute_error"))
print(f'MAE: {np.mean(scores)} +/- {np.std(scores)}')

scores = cross_val_score(model, X, y, cv=5, scoring="r2")
print(f"r2: {np.mean(scores)} +/- {np.std(scores)}")

0:	learn: 0.0251177	total: 6.84ms	remaining: 5.77s
1:	learn: 0.0245464	total: 12.5ms	remaining: 5.27s
2:	learn: 0.0239282	total: 18ms	remaining: 5.05s
3:	learn: 0.0232284	total: 23.9ms	remaining: 5.03s
4:	learn: 0.0225284	total: 29.5ms	remaining: 4.96s
5:	learn: 0.0219184	total: 35ms	remaining: 4.89s
6:	learn: 0.0213696	total: 41.6ms	remaining: 4.98s
7:	learn: 0.0208165	total: 47.3ms	remaining: 4.95s
8:	learn: 0.0202062	total: 52.9ms	remaining: 4.91s
9:	learn: 0.0197999	total: 59.2ms	remaining: 4.94s
10:	learn: 0.0195394	total: 64.6ms	remaining: 4.9s
11:	learn: 0.0192728	total: 70.8ms	remaining: 4.92s
12:	learn: 0.0187797	total: 78.6ms	remaining: 5.03s
13:	learn: 0.0186829	total: 84.1ms	remaining: 4.99s
14:	learn: 0.0185331	total: 89.7ms	remaining: 4.96s
15:	learn: 0.0178556	total: 95.2ms	remaining: 4.93s
16:	learn: 0.0176076	total: 101ms	remaining: 4.91s
17:	learn: 0.0172928	total: 107ms	remaining: 4.9s
18:	learn: 0.0168170	total: 112ms	remaining: 4.88s
19:	learn: 0.0165975	total: 118

In [32]:
def objective(trial):
    
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 200),
        'max_depth': trial.suggest_int('max_depth',10, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 5),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf',1, 2),
        'bootstrap': trial.suggest_categorical('bootstrap',[True, False])
        }


    model = RandomForestRegressor(**params)

    scores = cross_val_score(model, X, y, cv=5, scoring="r2")
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print(study.best_params)

[I 2026-01-25 18:34:33,056] A new study created in memory with name: no-name-720cd31b-f5f8-4b57-9c09-b6f94c53af07
[I 2026-01-25 18:34:56,747] Trial 0 finished with value: 0.5421861757765953 and parameters: {'n_estimators': 156, 'max_depth': 11, 'min_samples_split': 5, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 0 with value: 0.5421861757765953.
[I 2026-01-25 18:35:11,194] Trial 1 finished with value: 0.6845336567303695 and parameters: {'n_estimators': 146, 'max_depth': 16, 'min_samples_split': 5, 'min_samples_leaf': 2, 'bootstrap': True}. Best is trial 1 with value: 0.6845336567303695.
[I 2026-01-25 18:35:24,422] Trial 2 finished with value: 0.6752075157934575 and parameters: {'n_estimators': 117, 'max_depth': 13, 'min_samples_split': 5, 'min_samples_leaf': 1, 'bootstrap': True}. Best is trial 1 with value: 0.6845336567303695.
[I 2026-01-25 18:35:44,328] Trial 3 finished with value: 0.5386375385777767 and parameters: {'n_estimators': 119, 'max_depth': 12, 'min_samples_spl

{'n_estimators': 192, 'max_depth': 20, 'min_samples_split': 3, 'min_samples_leaf': 1, 'bootstrap': True}


best model is a tuned catboost regressor:

MAE: 0.0077 +/- 0.0009

R2: 0.79 +/- 0.04