In [1]:
try:
    import h2o
except:
    !pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o

In [2]:
import h2o
from h2o.estimators import (
    H2OGeneralizedLinearEstimator, 
    H2ORandomForestEstimator, 
    H2OGradientBoostingEstimator, 
    H2ONaiveBayesEstimator,
    H2OStackedEnsembleEstimator
)
from h2o.frame import H2OFrame
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Connect to an H2O instance on localhost, starting a local server if none is
# running (see the connection log printed below this cell).
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM JBR-11.0.13.7-1751.21-jcef (build 11.0.13+7-b1751.21, mixed mode)
  Starting server from D:\Archivos de programa\Anaconda3\envs\Master_1\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\david\AppData\Local\Temp\tmpk3h0bmsp
  JVM stdout: C:\Users\david\AppData\Local\Temp\tmpk3h0bmsp\h2o_david_started_from_python.out
  JVM stderr: C:\Users\david\AppData\Local\Temp\tmpk3h0bmsp\h2o_david_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,05 secs
H2O_cluster_timezone:,Europe/Paris
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,2 months and 3 days
H2O_cluster_name:,H2O_from_python_david_09j9k5
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.979 Gb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


In [3]:
# Fraction of rows intended for the held-out test set.
TEST_SIZE = 0.2
# Number of cross-validation folds; passed as `nfolds` to every base learner.
N_FOLDS = 5

# DATA LOADING AND PREPROCESSING

In [4]:
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

In [5]:
# Fetch OpenML dataset 44 as pandas objects (features DataFrame + target Series).
spam_data = fetch_openml(data_id=44, as_frame=True)
spam_df = spam_data.data

# Split into features and target
X = spam_df
# Convert target to numeric
y = spam_data.target.astype(int)

# Train-Test Split: the hold-out fraction comes from TEST_SIZE in the config
# cell instead of a hard-coded literal, so changing the constant takes effect.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=42
)

# Convert to H2O Frames. The target is appended as a trailing "label" column;
# `.values` drops the pandas index so the labels are attached positionally.
h2o_train = H2OFrame(X_train.assign(label=y_train.values))
h2o_test = H2OFrame(X_test.assign(label=y_test.values))

# Mark the label as categorical so the H2O estimators treat it as a class.
h2o_train['label'] = h2o_train['label'].asfactor()
h2o_test['label'] = h2o_test['label'].asfactor()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


# BASE LEARNERS - SPECIFICATION

In [6]:
# Cross-validation settings shared by every base learner: same fold count and
# seed for all models, and the per-fold predictions are retained on each model.
cv_settings = dict(
    nfolds=N_FOLDS,
    seed=42,
    keep_cross_validation_predictions=True,
)

# Display name -> untrained H2O estimator. Insertion order is the order in
# which the models are later trained and reported.
base_learners = {
    "LogisticRegression": H2OGeneralizedLinearEstimator(family="binomial", **cv_settings),
    "RandomForest": H2ORandomForestEstimator(ntrees=50, max_depth=10, **cv_settings),
    "GradientBoosting": H2OGradientBoostingEstimator(ntrees=50, max_depth=5, **cv_settings),
    "NaiveBayes": H2ONaiveBayesEstimator(**cv_settings),
}

# BASE LEARNERS - TRAINING & EVALUATION

In [7]:
# Predictors are addressed by position: the first `n_features` columns of the
# H2O frames are the feature columns, with "label" appended after them.
n_features = X_train.shape[1]

# Fit every base learner on the training frame (cross-validated per its own settings).
for model_name, model in base_learners.items():
    print(f"Training {model_name} with {N_FOLDS}-fold cross-validation...")
    model.train(x=list(range(n_features)), y="label", training_frame=h2o_train)

# Score each fitted model on the held-out frame.
# NOTE(review): accuracy() is called without a threshold; per the H2O docs it
# then reports the accuracy at the accuracy-maximizing threshold, which here
# is chosen on the test frame itself. Confirm this is the intended metric.
results = {}
for model_name, model in base_learners.items():
    test_perf = model.model_performance(test_data=h2o_test)
    # accuracy() returns a list of [threshold, value] pairs; take the value.
    test_acc = test_perf.accuracy()[0][1]
    results[model_name] = test_acc
    print(f"{model_name} Accuracy (Test Set): {test_acc:.4f}")

# Print results
print("Base Learner Results:", results)

# Trained models, in dict order, for use as the ensemble's base models.
base_models = [*base_learners.values()]

Training LogisticRegression with 5-fold cross-validation...
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Training RandomForest with 5-fold cross-validation...
drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Training GradientBoosting with 5-fold cross-validation...
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Training NaiveBayes with 5-fold cross-validation...
naivebayes Model Build progress: |███████████████████████████████████████████████| (done) 100%
LogisticRegression Accuracy (Test Set): 0.9251
RandomForest Accuracy (Test Set): 0.9479
GradientBoosting Accuracy (Test Set): 0.9566
NaiveBayes Accuracy (Test Set): 0.8588
Base Learner Results: {'LogisticRegression': 0.9250814332247557, 'RandomForest': 0.9478827361563518, 'GradientBoosting': 0.9565689467969598, 'NaiveBayes': 0.8588490770901195}


# METALEARNER - TRAINING & EVALUATION

In [8]:
# TRAIN
# The ensemble builds its own metalearner from `metalearner_algorithm`; a
# separately constructed GLM estimator was never passed in, so it is not
# created here.
super_learner = H2OStackedEnsembleEstimator(
    base_models=base_models,
    metalearner_algorithm="glm"  # Uses Logistic Regression as the metalearner
)

print("\nTraining Super Learner...")
# Same positional predictor columns and label that the base learners were trained with.
super_learner.train(x=list(range(X_train.shape[1])), y="label", training_frame=h2o_train)

# EVAL
# Scored the same way as the base learners so the numbers are comparable.
super_performance = super_learner.model_performance(test_data=h2o_test)
# accuracy() returns a list of [threshold, value] pairs; take the value.
super_accuracy = super_performance.accuracy()[0][1]
print(f"\nSuper Learner Accuracy: {super_accuracy:.4f}")


# Side-by-side summary of the base learners and the ensemble.
print("\nFinal Results Comparison:")
for name, acc in results.items():
    print(f"{name} Accuracy: {acc:.4f}")
print(f"Super Learner Accuracy: {super_accuracy:.4f}")



Training Super Learner...
stackedensemble Model Build progress: |██████████████████████████████████████████| (done) 100%

Super Learner Accuracy: 0.9609

Final Results Comparison:
LogisticRegression Accuracy: 0.9251
RandomForest Accuracy: 0.9479
GradientBoosting Accuracy: 0.9566
NaiveBayes Accuracy: 0.8588
Super Learner Accuracy: 0.9609
