In [1]:
#!pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o

In [2]:
import h2o
from h2o.estimators import (
    H2OGeneralizedLinearEstimator, 
    H2ORandomForestEstimator, 
    H2OGradientBoostingEstimator, 
    H2ONaiveBayesEstimator,
    H2OStackedEnsembleEstimator,
    H2ODeepLearningEstimator

)
from h2o.frame import H2OFrame
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Connect to a local H2O server; if none is running, one is started.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM JBR-11.0.13.7-1751.21-jcef (build 11.0.13+7-b1751.21, mixed mode)
  Starting server from D:\Archivos de programa\Anaconda3\envs\Master_1\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\david\AppData\Local\Temp\tmp73tmtmmz
  JVM stdout: C:\Users\david\AppData\Local\Temp\tmp73tmtmmz\h2o_david_started_from_python.out
  JVM stderr: C:\Users\david\AppData\Local\Temp\tmp73tmtmmz\h2o_david_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,04 secs
H2O_cluster_timezone:,Europe/Paris
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,2 months and 4 days
H2O_cluster_name:,H2O_from_python_david_9k1xw8
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.979 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


### GLOBAL PRESETS

In [349]:
import warnings

# Suppress every warning for the remainder of the session.
warnings.simplefilter('ignore')

# Fraction of the rows held out as the test set.
TEST_SIZE = 0.2
# Number of cross-validation folds used when training the models.
N_FOLDS = 5

### DATA LOADING AND PREPROCESSING

In [4]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
import pandas as pd

In [348]:
# Fetch the spam dataset (OpenML data_id=44) as a pandas DataFrame.
spam_data = fetch_openml(data_id=44, as_frame=True)
spam_df = spam_data.frame

# Every column but the last is a feature; the last column is the target
# (spam or not), converted to an integer.
X = spam_df.iloc[:, :-1]
y = spam_df.iloc[:, -1].astype(int)

# Hold out TEST_SIZE of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=42
)

# Build the H2O frames: the feature columns followed by a "label" column.
h2o_train = H2OFrame(X_train.assign(label=y_train.to_numpy()))
h2o_test = H2OFrame(X_test.assign(label=y_test.to_numpy()))

# Mark the label as categorical in both frames.
h2o_train['label'] = h2o_train['label'].asfactor()
h2o_test['label'] = h2o_test['label'].asfactor()

# Example of dataset
X.head()

  warn(


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,word_freq_conference,char_freq_%3B,char_freq_%28,char_freq_%5B,char_freq_%21,char_freq_%24,char_freq_%23,capital_run_length_average,capital_run_length_longest,capital_run_length_total
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.0,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191


### Is the SPAM class underrepresented?

In [6]:
# Count the rows of each class; in the raw frame the labels are the strings '1' and '0'.
spam_count = (spam_df['class'] == '1').sum()
ham_count = (spam_df['class'] == '0').sum()

print(f"Records containing spam: {spam_count}")
print(f"Records not containing spam: {ham_count}")

Records containing spam: 1813
Records not containing spam: 2788


### BASE LEARNERS - SPECIFICATION

When choosing the base learners for the first layer, we considered the properties of the dataset and the task, which in this case is binary classification. As the paper XXX (TODO: add the citation) suggests, "the first layer should consist of diverse algorithms with different inductive biases to ensure a rich set of predictions for the meta-learner".

We have selected: 

**Random Forest:**

Random forests are robust to overfitting on datasets with many features, and they handle noisy or irrelevant features well, which can be an issue in this dataset.

**Generalized Linear Model - Logistic regression:**

We chose to add it because it's a simple yet effective baseline model, especially logistic regression for binary classification. It should provide a low-variance learner to complement the other high-variance ones.

**Deep Learning (H2O's MLP):**

We add a neural network because of its flexibility: it can capture non-linear relationships, which should broaden the diversity of the stack's predictions.

**Naive Bayes:**

Spam datasets often benefit from Naive Bayes: because it assumes independence among the features, it might capture something more general than the other models.

**Gradient Boosting Machines:**

We chose them as a complementary ensemble method that can capture rather complex relationships, and may therefore overfit the data more.


We assume that the simpler models, such as Naive Bayes and logistic regression, bring a more general view into the stack without focusing too much on the quirks in the data; to balance this, we also selected more accurate and flexible methods such as the MLP and GBM.

In [371]:
# Cross-validation settings shared by every base learner. The cross-validation
# predictions are kept because the stacked ensemble's metalearner is trained on
# them.
# TODO: consider balance_classes=True
cv_params = dict(nfolds=N_FOLDS, seed=42, keep_cross_validation_predictions=True)

# First-layer models of the stack, keyed by a unique display name.
# Keys must be unique: a repeated key silently replaces the earlier entry, so
# each neural-network architecture gets its own name.
base_learners = {
    "LogisticRegression_binomial": H2OGeneralizedLinearEstimator(
        family="binomial", **cv_params
    ),
    "RandomForest_50trees": H2ORandomForestEstimator(
        ntrees=50, max_depth=10, **cv_params
    ),
    # max_depth=0 removes the depth limit; leaving max_depth out would fall
    # back to H2O's default limit of 20 instead of an unbounded depth.
    "RandomForest_50trees_unbounded_D": H2ORandomForestEstimator(
        ntrees=50, max_depth=0, **cv_params
    ),
    "RandomForest_10trees": H2ORandomForestEstimator(
        ntrees=10, max_depth=10, **cv_params
    ),
    "RandomForest_10trees_unbounded_D": H2ORandomForestEstimator(
        ntrees=10, max_depth=0, **cv_params
    ),
    # Gradient boosting is currently disabled. To re-enable it:
    # "GradientBoosting": H2OGradientBoostingEstimator(
    #     ntrees=50, max_depth=5, balance_classes=True, **cv_params
    # ),
    "NaiveBayes": H2ONaiveBayesEstimator(**cv_params),
    "NeuralNetwork_32_16": H2ODeepLearningEstimator(
        hidden=[32, 16], epochs=300, **cv_params
    ),
    "NeuralNetwork_32_32": H2ODeepLearningEstimator(
        hidden=[32, 32], epochs=300, **cv_params
    ),
    "NeuralNetwork_32": H2ODeepLearningEstimator(
        hidden=[32], epochs=300, **cv_params
    ),
}

### BASE LEARNERS - TRAINING & EVALUATION

In [8]:
# Train each base learner using cross-validation.
# Predictors are addressed by column index 0..n_features-1.
for name, learner in base_learners.items():
    print(f"Training {name} with {N_FOLDS}-fold cross-validation...")
    learner.train(x=list(range(X_train.shape[1])), y="label", training_frame=h2o_train)

# Score every trained learner on the held-out test frame.
# NOTE(review): F1() and accuracy() are called without a threshold, so they
# presumably report the value at the threshold that maximises each metric on
# the test data (optimistic) -- confirm against the H2O metrics API.
results = {}
for name, learner in base_learners.items():
    performance = learner.model_performance(test_data=h2o_test)
    f1_score = performance.F1()[0][1]
    auc_pr = performance.aucpr()
    accuracy = performance.accuracy()[0][1]
    results[name] = {"F1-Score": f1_score, "AUC-PR": auc_pr, "Accuracy": accuracy}
    print(f"{name} - F1-Score: {f1_score:.4f}, AUC-PR: {auc_pr:.4f}, Accuracy (Test Set): {accuracy:.4f}")


# Print results
print("Base Learner Results:", results)

# Trained models, in the form expected by the stacked ensemble's base_models.
base_models = list(base_learners.values())

Training LogisticRegression with 5-fold cross-validation...
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Training RandomForest with 5-fold cross-validation...
drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Training GradientBoosting with 5-fold cross-validation...
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Training NaiveBayes with 5-fold cross-validation...
naivebayes Model Build progress: |███████████████████████████████████████████████| (done) 100%
LogisticRegression - F1-Score: 0.9125, AUC-PR: 0.9597, Accuracy (Test Set): 0.9251
RandomForest - F1-Score: 0.9375, AUC-PR: 0.9790, Accuracy (Test Set): 0.9479
GradientBoosting - F1-Score: 0.9490, AUC-PR: 0.9848, Accuracy (Test Set): 0.9566
NaiveBayes - F1-Score: 0.8303, AUC-PR: 0.8182, Accuracy (Test Set): 0.8588
Base Learner Results: {'LogisticRegression': {'F1-Score': 0.9125475285171102, 'A

In [379]:
def train_evaluate_stack(base_learners, metalearner, h2o_train, h2o_test, X_train):
    """Train the base learners and the metalearner, then report test-set metrics.

    Parameters
    ----------
    base_learners : dict
        Maps a display name to an estimator exposing ``train`` and
        ``model_performance``.
    metalearner : estimator
        The stacked ensemble to train and evaluate. This is the object that is
        used; no global variable is consulted.
    h2o_train, h2o_test : frames
        Training and test frames; both must contain a ``"label"`` column.
    X_train : object with a ``shape`` attribute
        Only ``X_train.shape[1]`` (the number of predictor columns) is read.
        Predictors are passed to ``train`` as the column indices
        ``0 .. X_train.shape[1] - 1``, which assumes the predictor columns come
        first in ``h2o_train``.

    Returns
    -------
    dict
        ``{name: {"F1-Score", "AUC-PR", "Accuracy"}}`` for every base learner,
        plus one entry under ``"Super Learner"`` for the metalearner.

    Notes
    -----
    NOTE(review): ``metalearner`` is constructed by the caller before the base
    learners are (re)trained here. Presumably it must reference the models
    trained in this call to stack them -- verify how it resolves its
    ``base_models``.
    """
    predictors = list(range(X_train.shape[1]))

    def _test_metrics(model):
        """Return the test-set F1, AUC-PR and accuracy of a trained model."""
        performance = model.model_performance(test_data=h2o_test)
        return {
            "F1-Score": performance.F1()[0][1],
            "AUC-PR": performance.aucpr(),
            "Accuracy": performance.accuracy()[0][1],
        }

    # TRAIN BASE LEARNERS
    print("\n>>> Training base learners:\n")
    for name, learner in base_learners.items():
        print(f"    Training {name} with {N_FOLDS}-fold cross-validation...")
        learner.train(x=predictors, y="label", training_frame=h2o_train)

    # TRAIN THE METALEARNER
    print("\n>>> Training super learner:\n")
    metalearner.train(x=predictors, y="label", training_frame=h2o_train)

    # EVAL BASE LEARNERS
    print("\n>>> Base learners' results:\n")
    results = {}
    for name, learner in base_learners.items():
        metrics = _test_metrics(learner)
        results[name] = metrics
        print(
            f"    {name} - F1-Score: {metrics['F1-Score']:.4f}, "
            f"AUC-PR: {metrics['AUC-PR']:.4f}, "
            f"Accuracy (Test Set): {metrics['Accuracy']:.4f}"
        )

    # EVAL THE METALEARNER
    print("\n>>> Metalearner's results:\n")
    super_metrics = _test_metrics(metalearner)
    print(
        f"    Super Learner - F1-Score: {super_metrics['F1-Score']:.4f}, "
        f"AUC-PR: {super_metrics['AUC-PR']:.4f}, "
        f"Accuracy: {super_metrics['Accuracy']:.4f}"
    )

    results["Super Learner"] = super_metrics
    return results


### METALEARNER - TRAINING & EVALUATION


For the metalearner we principally selected two possible options for testing:

**GLM:**

We chose logistic regression because it is simple and interpretable. As a meta-learner, we want it just to combine the predictions of the individual learners by weighting them, which reduces the risk of overfitting when combining predictions. In this case we are more focused on finding the best combination of predictions than on adding more complexity.

**Gradient Boosting Machine / MLP:**

As an alternative, second option we wanted something stronger — a bigger hammer, so to speak — especially for our more diverse stacks, whose predictions could be more complex, so that the metalearner can capture non-linear relationships among them.

In [9]:
# TRAIN
# Stacked ensemble over the trained base models, combined by a GLM metalearner.
# (metalearner_algorithm="deeplearning" is the alternative that was tried.)
super_learner = H2OStackedEnsembleEstimator(
    metalearner_algorithm="glm",
    base_models=base_models,
)

print("\nTraining Super Learner:\n")
super_learner.train(
    x=list(range(X_train.shape[1])),
    y="label",
    training_frame=h2o_train,
)

# EVAL
# Score the ensemble on the held-out test frame.
super_performance = super_learner.model_performance(test_data=h2o_test)
super_f1 = super_performance.F1()[0][1]
super_auc_pr = super_performance.aucpr()
super_accuracy = super_performance.accuracy()[0][1]

# One report line per base learner, followed by the ensemble's own line.
report_lines = [
    f"    {name} - F1-Score: {metrics['F1-Score']:.4f}, AUC-PR: {metrics['AUC-PR']:.4f}, Accuracy: {metrics['Accuracy']:.4f}"
    for name, metrics in results.items()
]
report_lines.append(
    f"    Super Learner - F1-Score: {super_f1:.4f}, AUC-PR: {super_auc_pr:.4f}, Accuracy: {super_accuracy:.4f}"
)

print("\nFinal Results Comparison:\n")
print("\n".join(report_lines))





Training Super Learner...
stackedensemble Model Build progress: |██████████████████████████████████████████| (done) 100%

Super Learner - F1-Score: 0.9537, AUC-PR: 0.9839 | Super Learner Accuracy: 0.9609

Final Results Comparison:
LogisticRegression - F1-Score: 0.9125, AUC-PR: 0.9597, Accuracy: 0.9251
RandomForest - F1-Score: 0.9375, AUC-PR: 0.9790, Accuracy: 0.9479
GradientBoosting - F1-Score: 0.9490, AUC-PR: 0.9848, Accuracy: 0.9566
NaiveBayes - F1-Score: 0.8303, AUC-PR: 0.8182, Accuracy: 0.8588
Super Learner - F1-Score: 0.9537, AUC-PR: 0.9839, Accuracy: 0.9609


In [381]:
# Train the current base learners and the stacked ensemble, then print their
# test-set metrics.
# NOTE(review): `super_learner` is whichever ensemble was assigned most
# recently in the kernel -- confirm it was built from these base learners.
train_evaluate_stack(base_learners, super_learner, h2o_train, h2o_test, X_train)


>>> Training base learners:

    Training LogisticRegression_binomial with 5-fold cross-validation...
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
    Training RandomForest_50trees with 5-fold cross-validation...
drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
    Training RandomForest_50trees_unbounded_D with 5-fold cross-validation...
drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
    Training RandomForest_10trees with 5-fold cross-validation...
drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
    Training RandomForest_10trees_unbounded_D with 5-fold cross-validation...
drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
    Training NaiveBayes with 5-fold cross-validation...
naivebayes Model Build progress: |████████████████████████████████████████████

## TRYING OUT GRID SEARCH FROM
https://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/stacked-ensembles.html

In [10]:
from h2o.grid.grid_search import H2OGridSearch

# Specify GBM hyperparameters for the grid
hyper_params = {
    "learn_rate": [0.01, 0.03],
    "max_depth": [3, 4, 5, 6, 9],
    "sample_rate": [0.7, 0.8, 0.9, 1.0],
    "col_sample_rate": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
}
# Random search over the grid, capped at three models.
search_criteria = dict(strategy="RandomDiscrete", max_models=3, seed=1)

# Train the grid. Every GBM is cross-validated with the same "Modulo" fold
# assignment and keeps its cross-validation predictions.
grid = H2OGridSearch(
    model=H2OGradientBoostingEstimator(
        ntrees=10,
        seed=1,
        nfolds=N_FOLDS,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
    ),
    hyper_params=hyper_params,
    search_criteria=search_criteria,
    grid_id="gbm_grid_binomial",
)
grid.train(
    x=list(range(X_train.shape[1])),
    y="label",
    training_frame=h2o_train,
)

# Train a stacked ensemble using the GBM grid
super_learner = H2OStackedEnsembleEstimator(
    model_id="my_ensemble_gbm_grid_binomial",
    base_models=grid.model_ids,
)
super_learner.train(
    x=list(range(X_train.shape[1])),
    y="label",
    training_frame=h2o_train,
)

# Score the ensemble on the held-out test frame.
super_performance = super_learner.model_performance(test_data=h2o_test)
super_f1 = super_performance.F1()[0][1]
super_auc_pr = super_performance.aucpr()
super_accuracy = super_performance.accuracy()[0][1]
print(f"\nSuper Learner - F1-Score: {super_f1:.4f}, AUC-PR: {super_auc_pr:.4f} | Super Learner Accuracy: {super_accuracy:.4f}")

# One report line per earlier base learner, followed by this ensemble's line.
report_lines = [
    f"{name} - F1-Score: {metrics['F1-Score']:.4f}, AUC-PR: {metrics['AUC-PR']:.4f}, Accuracy: {metrics['Accuracy']:.4f}"
    for name, metrics in results.items()
]
report_lines.append(
    f"Super Learner - F1-Score: {super_f1:.4f}, AUC-PR: {super_auc_pr:.4f}, Accuracy: {super_accuracy:.4f}"
)

print("\nFinal Results Comparison:")
print("\n".join(report_lines))



gbm Grid Build progress: |███████████████████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: |██████████████████████████████████████████| (done) 100%

Super Learner - F1-Score: 0.9096, AUC-PR: 0.9690 | Super Learner Accuracy: 0.9240

Final Results Comparison:
LogisticRegression - F1-Score: 0.9125, AUC-PR: 0.9597, Accuracy: 0.9251
RandomForest - F1-Score: 0.9375, AUC-PR: 0.9790, Accuracy: 0.9479
GradientBoosting - F1-Score: 0.9490, AUC-PR: 0.9848, Accuracy: 0.9566
NaiveBayes - F1-Score: 0.8303, AUC-PR: 0.8182, Accuracy: 0.8588
Super Learner - F1-Score: 0.9096, AUC-PR: 0.9690, Accuracy: 0.9240
