In [18]:
# Cell 1: Install Dependencies
# Run this cell first!
!pip install h2o
!pip install tpot
!pip install mljar-supervised



In [19]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# 1. Load Data
housing = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(
    pd.DataFrame(housing.data, columns=housing.feature_names),
    pd.Series(housing.target, name='Price'),
    test_size=0.2,
    random_state=42
)
print("Data Loaded successfully.")

Data Loaded successfully.


In [20]:
import h2o
from h2o.automl import H2OAutoML

# Initialize H2O
h2o.init()

# Convert to H2O format
train_hf = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
test_hf = h2o.H2OFrame(pd.concat([X_test, y_test], axis=1))

# Run AutoML
aml = H2OAutoML(max_models=5, max_runtime_secs=60, seed=42)
aml.train(y='Price', training_frame=train_hf)

# Results
lb = aml.leaderboard
print(lb.head(rows=lb.nrows))

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 hour 3 mins
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.9
H2O_cluster_version_age:,1 month and 20 days
H2O_cluster_name:,H2O_from_python_unknownUser_uedgau
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.126 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
model_id                                rmse       mse       mae       rmsle    mean_residual_deviance
GBM_1_AutoML_2_20260113_221356      0.460705  0.21225   0.304322    0.139069                  0.21225
XGBoost_1_AutoML_2_20260113_221356  0.501019  0.25102   0.334594    0.153306                  0.25102
XGBoost_2_AutoML_2_20260113_221356  0.508323  0.258392  0.342598    0.155179                  0.258392
GLM_1_AutoML_2_20260113_221356      0.720958  0.519781  0.529129  nan                         0.519781
[4 rows x 6 columns]



In [25]:
from tpot import TPOTRegressor
from sklearn.metrics import r2_score

tpot = TPOTRegressor(
    generations=1,
    population_size=5,
    max_time_mins=2,
    random_state=42
)
# Train on a smaller slice of data if needed
tpot.fit(X_train[:1000], y_train[:1000]) # Uses only first 1000 rows

# Get predictions from the best model
y_pred = tpot.predict(X_test)

# Calculate the R2 score
tpot_score = r2_score(y_test, y_pred)
print(f"TPOT R2 Score: {tpot_score}")

Perhaps you already have a cluster running?
Hosting the HTTP server on port 39621 instead
INFO:distributed.scheduler:State start
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:35647
INFO:distributed.scheduler:  dashboard at:  http://127.0.0.1:39621/status
INFO:distributed.scheduler:Registering Worker plugin shuffle
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:34819'
INFO:distributed.scheduler:Register worker addr: tcp://127.0.0.1:38785 name: 0
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:38785
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:41436
INFO:distributed.scheduler:Receive client connection: Client-800f33c2-f0ce-11f0-837c-0242ac1c000c
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:41448
Generation: 100%|██████████| 1/1 [00:46<00:00, 46.84s/it]
INFO:distributed.scheduler:Retire worker addresses (stimulus_id='retire-workers-1768343064.694578') (0,)
INFO:distributed.

TPOT R2 Score: 0.7319646004614612


In [22]:
# import autosklearn.regression
# import sklearn.metrics

# automl_askl = autosklearn.regression.AutoSklearnRegressor(time_left_for_this_task=120, per_run_time_limit=30)
# automl_askl.fit(X_train, y_train)
# y_pred = automl_askl.predict(X_test)
# print("Auto-sklearn R2 score:", sklearn.metrics.r2_score(y_test, y_pred))

In [23]:
from supervised.automl import AutoML

# Use mode="Explain" for fast results
automl_mljar = AutoML(mode="Explain")
automl_mljar.fit(X_train, y_train)
print("MLJAR R2 score:", automl_mljar.score(X_test, y_test))

Linear algorithm was disabled.
AutoML directory: AutoML_2
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 2 models
1_Baseline rmse 1.159509 trained in 1.24 seconds
2_DecisionTree rmse 0.805325 trained in 3.17 seconds
* Step default_algorithms will try to check up to 3 models
3_Default_Xgboost rmse 0.468276 trained in 8.39 seconds
4_Default_NeuralNetwork rmse 0.55069 trained in 2.32 seconds
5_Default_RandomForest rmse 0.729023 trained in 5.42 seconds
* Step ensemble will try to check up to 1 model
Ensemble rmse 0.467368 trained in 0.28 seconds
AutoML fit time: 29.19 seconds
AutoML best model: Ensemble
MLJAR R2 score: 0.8403741737876076
