## 5. H2O AutoML and Hyperparameter Search

We run AutoML after establishing baselines. AutoML explores multiple model families and hyperparameters to find strong candidates.



## 1. Setup

We load core libraries and enable the local H2O environment.

Logs will be written to ./h2o_logs and H2O will run in verbose mode.


In [17]:
import os
import sys
import numpy as np
import pandas as pd

# Ensure h2o is available from /tmp/pydeps
sys.path.insert(0, '/tmp/pydeps')
import h2o
from h2o.automl import H2OAutoML

pd.set_option('display.max_columns', 200)
np.random.seed(42)





In [18]:
# --- Run configuration ---
TEST_MODE = False  # Set to True for quick validation
RUN_AUTOML = 1
RUN_XGBOOST = 1

AUTOML_MAX_MODELS = 20
AUTOML_MAX_RUNTIME_SECS = 18000  # 5 hours
AUTOML_NFOLDS = 5

XGB_N_TRIALS = 15  # target 20-30 in final run
XGB_EARLY_STOPPING_ROUNDS = 50
XGB_DEVICE = 'cuda:0'
XGB_TREE_METHOD = 'hist'

if TEST_MODE:
    AUTOML_MAX_MODELS = 1
    AUTOML_MAX_RUNTIME_SECS = 300
    AUTOML_NFOLDS = 2
    XGB_N_TRIALS = 1

os.makedirs('results', exist_ok=True)
os.makedirs('h2o_logs', exist_ok=True)


## 2. Data loading

We load the OULAD CSV files and build the same final dataset used in the main notebook.


In [19]:
# CHANGE THIS to your local folder containing the 7 OULAD CSV files
DEFAULT_DATA_DIR = os.path.join('.', 'data')

required_files = [
    'assessments.csv','courses.csv','studentAssessment.csv','studentInfo.csv',
    'studentRegistration.csv','studentVle.csv','vle.csv'
]

# Resolve DATA_DIR from common locations
candidate_dirs = [DEFAULT_DATA_DIR, '.']
resolved = None
for d in candidate_dirs:
    if all(os.path.exists(os.path.join(d, f)) for f in required_files):
        resolved = d
        break

if resolved is None:
    missing = [f for f in required_files if not os.path.exists(os.path.join(DEFAULT_DATA_DIR, f))]
    print('Missing files in DEFAULT_DATA_DIR:', missing)
    print('DEFAULT_DATA_DIR currently set to:', os.path.abspath(DEFAULT_DATA_DIR))
    print('Also checked:', os.path.abspath('.'))
else:
    DATA_DIR = resolved
    print('Using DATA_DIR:', os.path.abspath(DATA_DIR))


Using DATA_DIR: d:\Project\DBM_FINAL\data


In [20]:
def read_csv(name, usecols=None, dtype=None):
    path = os.path.join(DATA_DIR, name)
    return pd.read_csv(path, usecols=usecols, dtype=dtype, low_memory=False)

assessments = read_csv('assessments.csv')
courses = read_csv('courses.csv')
student_info = read_csv('studentInfo.csv')
student_reg = read_csv('studentRegistration.csv')
student_assess = read_csv('studentAssessment.csv')
vle = read_csv('vle.csv')

student_vle = read_csv(
    'studentVle.csv',
    usecols=['code_module','code_presentation','id_student','id_site','date','sum_click'],
    dtype={
        'code_module':'category','code_presentation':'category',
        'id_student':'int32','id_site':'int32',
        'date':'int16','sum_click':'int32'
    }
)


## 3. Cleaning and feature engineering

We apply the same cleaning and feature engineering used in the main notebook, then drop leakage features and remove zero-activity rows.


In [21]:
# --- assessments: drop known invalid IDs
invalid_ids = {40087, 40088}
assessments = assessments[~assessments['id_assessment'].isin(invalid_ids)].copy()
assessments['weight'] = pd.to_numeric(assessments['weight'], errors='coerce')
assessments['date'] = pd.to_numeric(assessments['date'], errors='coerce')

# --- studentAssessment: clean score/date
student_assess['score'] = student_assess['score'].replace(['?', '', ' '], np.nan)
student_assess['score'] = pd.to_numeric(student_assess['score'], errors='coerce')
student_assess['date_submitted'] = pd.to_numeric(student_assess['date_submitted'], errors='coerce')
student_assess['is_banked'] = pd.to_numeric(student_assess['is_banked'], errors='coerce').fillna(0).astype(int)
student_assess = student_assess.dropna(subset=['score']).copy()

# --- studentInfo cleanup
student_info['imd_band'] = student_info['imd_band'].replace('?', np.nan)
student_info['age_band'] = student_info['age_band'].replace({'55<=': '>=55'})

# --- studentVle: drop duplicates
student_vle = student_vle.drop_duplicates().copy()


In [22]:
# --- target
student_info['final_result_upd'] = student_info['final_result'].replace({
    'Distinction': 'Pass',
    'Withdrawn': 'Fail'
})
student_info = student_info[student_info['final_result_upd'].isin(['Pass','Fail'])].copy()
student_info['final_result_upd_numeric'] = (student_info['final_result_upd'] == 'Pass').astype(int)

# --- VLE aggregates
vle_agg = (
    student_vle
    .groupby(['id_student','code_module','code_presentation'], as_index=False, observed=True)
    .agg(
        total_sum_click=('sum_click','sum'),
        active_days=('date','nunique'),
        distinct_sites=('id_site','nunique')
    )
)

early = student_vle[student_vle['date'] <= 14]
early_agg = (
    early.groupby(['id_student','code_module','code_presentation'], as_index=False, observed=True)
         .agg(early_sum_click=('sum_click','sum'), early_active_days=('date','nunique'))
)

vle_features = vle_agg.merge(early_agg, on=['id_student','code_module','code_presentation'], how='left')
vle_features[['early_sum_click','early_active_days']] = vle_features[['early_sum_click','early_active_days']].fillna(0)

vle_features['early_click_ratio'] = vle_features['early_sum_click'] / vle_features['total_sum_click'].replace(0, np.nan)
vle_features['early_click_ratio'] = vle_features['early_click_ratio'].fillna(0)

vle_features['clicks_per_active_day'] = vle_features['total_sum_click'] / vle_features['active_days'].replace(0, np.nan)
vle_features['clicks_per_active_day'] = vle_features['clicks_per_active_day'].fillna(0)

vle_features['early_clicks_per_active_day'] = vle_features['early_sum_click'] / vle_features['early_active_days'].replace(0, np.nan)
vle_features['early_clicks_per_active_day'] = vle_features['early_clicks_per_active_day'].fillna(0)

vle_features['early_active_ratio'] = vle_features['early_active_days'] / vle_features['active_days'].replace(0, np.nan)
vle_features['early_active_ratio'] = vle_features['early_active_ratio'].fillna(0)

# activity type aggregates
vle_types = student_vle.merge(vle[['id_site','activity_type']], on='id_site', how='left')

vle_type_agg = (
    vle_types.groupby(['id_student','code_module','code_presentation','activity_type'], observed=True)
             .agg(type_clicks=('sum_click','sum'))
             .reset_index()
)

vle_type_pivot = vle_type_agg.pivot_table(
    index=['id_student','code_module','code_presentation'],
    columns='activity_type',
    values='type_clicks',
    fill_value=0
).reset_index()

for col in vle_type_pivot.columns:
    if col not in ['id_student','code_module','code_presentation']:
        vle_type_pivot = vle_type_pivot.rename(columns={col: f'clicks_type_{col}'})

vle_features = vle_features.merge(vle_type_pivot, on=['id_student','code_module','code_presentation'], how='left')

# diversity features
activity_cols = [c for c in vle_features.columns if c.startswith('clicks_type_')]
type_sum = vle_features[activity_cols].sum(axis=1).replace(0, np.nan)
type_probs = vle_features[activity_cols].div(type_sum, axis=0).fillna(0)
vle_features['distinct_activity_types'] = (vle_features[activity_cols] > 0).sum(axis=1)
vle_features['activity_entropy'] = -(type_probs * np.log(type_probs + 1e-9)).sum(axis=1)
vle_features['top_type_share'] = type_probs.max(axis=1)

vle_features['log1p_total_sum_click'] = np.log1p(vle_features['total_sum_click'])
vle_features['log1p_early_sum_click'] = np.log1p(vle_features['early_sum_click'])


  vle_type_pivot = vle_type_agg.pivot_table(


In [23]:
# --- assessment features (kept for completeness, later dropped)
sa = student_assess.merge(
    assessments[['id_assessment','code_module','code_presentation','assessment_type','weight','date']],
    on='id_assessment', how='left'
)

sa['weighted_score'] = np.where(sa['is_banked'] == 1, 0, sa['score'] * (sa['weight'] / 100.0))
sa['lateness_days'] = sa['date_submitted'] - sa['date']

assess_agg = (
    sa.groupby(['id_student','code_module','code_presentation'], as_index=False, observed=True)
      .agg(
          exam_weighted=('weighted_score', lambda x: x[sa.loc[x.index,'assessment_type'].eq('Exam')].sum()),
          non_exam_weighted=('weighted_score', lambda x: x[~sa.loc[x.index,'assessment_type'].eq('Exam')].sum()),
          mean_score=('score','mean'),
          late_submissions=('lateness_days', lambda s: (s>0).sum())
      )
)

assess_agg['has_exam'] = (assess_agg['exam_weighted'] > 0).astype(int)
assess_agg['has_non_exam'] = (assess_agg['non_exam_weighted'] > 0).astype(int)
assess_agg['overall_grade'] = np.where(
    (assess_agg['has_exam']==1) & (assess_agg['has_non_exam']==1),
    (assess_agg['exam_weighted'] + assess_agg['non_exam_weighted']) / 2.0,
    np.where(assess_agg['has_exam']==1, assess_agg['exam_weighted'], assess_agg['non_exam_weighted'])
)


In [24]:
# --- registration features
student_reg['date_registration'] = pd.to_numeric(student_reg['date_registration'], errors='coerce')
student_reg['date_unregistration'] = pd.to_numeric(student_reg['date_unregistration'], errors='coerce')

reg_features = student_reg[['id_student','code_module','code_presentation','date_registration','date_unregistration']].copy()
reg_features['unregistered_flag'] = reg_features['date_unregistration'].notna().astype(int)


In [25]:
# --- build final dataset
final_data = (
    student_info
      .merge(vle_features, on=['id_student','code_module','code_presentation'], how='left')
      .merge(assess_agg, on=['id_student','code_module','code_presentation'], how='left')
      .merge(reg_features, on=['id_student','code_module','code_presentation'], how='left')
      .merge(courses, on=['code_module','code_presentation'], how='left')
)

# Derived registration and engagement ratios
if 'date_registration' in final_data.columns:
    final_data['registered_early_flag'] = (final_data['date_registration'] < 0).astype(int)
    final_data['registration_lead_days'] = (-final_data['date_registration']).clip(lower=0)

if 'date_unregistration' in final_data.columns:
    final_data['unregistered_flag'] = final_data['date_unregistration'].notna().astype(int)

if 'module_presentation_length' in final_data.columns:
    final_data['active_days_ratio'] = final_data['active_days'] / final_data['module_presentation_length'].replace(0, np.nan)
    final_data['active_days_ratio'] = final_data['active_days_ratio'].fillna(0)

# Drop rows with missing values in this feature list
feature_cols = [
    'total_sum_click','active_days','early_sum_click','early_active_days','early_click_ratio',
    'clicks_per_active_day','early_clicks_per_active_day','early_active_ratio',
    'distinct_sites','distinct_activity_types','activity_entropy','top_type_share',
    'log1p_total_sum_click','log1p_early_sum_click','exam_weighted','non_exam_weighted',
    'mean_score','late_submissions','overall_grade','unregistered_flag',
    'registered_early_flag','registration_lead_days','active_days_ratio'
]

row_na_cols = [c for c in feature_cols if c in final_data.columns]
if row_na_cols:
    before = len(final_data)
    final_data = final_data.dropna(subset=row_na_cols)
    print('Dropped rows with NA in feature list:', before - len(final_data))

# remove zero-activity rows
zero_cols = [c for c in ['total_sum_click','active_days','early_sum_click','early_active_days','distinct_sites'] if c in final_data.columns]
if zero_cols:
    zero_mask = (final_data[zero_cols].sum(axis=1) == 0)
    final_data = final_data.loc[~zero_mask].copy()

# drop leakage features
LEAKAGE_FEATURES = [
    'overall_grade','exam_weighted','non_exam_weighted','mean_score','late_submissions',
    'final_result','final_result_upd'
]
leak_drop = [c for c in LEAKAGE_FEATURES if c in final_data.columns]
if leak_drop:
    final_data = final_data.drop(columns=leak_drop)
    print('Dropped leakage features:', leak_drop)


Dropped rows with NA in feature list: 6827
Dropped leakage features: ['overall_grade', 'exam_weighted', 'non_exam_weighted', 'mean_score', 'late_submissions', 'final_result', 'final_result_upd']


In [26]:
# --- leakage audit (heuristics) ---
print('='*70)
print('LEAKAGE AUDIT')
print('='*70)

# Columns that often encode the label directly
suspicious = [
    c for c in final_data.columns
    if any(k in c.lower() for k in ['final', 'result', 'grade', 'score', 'withdraw', 'unregister'])
]
print('Suspicious columns (name-based):')
print(suspicious)

# Single-feature AUC scan (rough signal)
from sklearn.metrics import roc_auc_score

# target is still present in final_data at this stage
_tmp_target = 'final_result_upd_numeric'
if _tmp_target in final_data.columns:
    y = final_data[_tmp_target].astype(int)
else:
    y = None

auc_rank = []
if y is not None:
    for col in final_data.columns:
        if col == _tmp_target:
            continue
        s = final_data[col]
        if s.isna().all() or s.nunique(dropna=True) <= 1:
            continue
        if s.dtype.kind in 'ifc':
            x = s
        else:
            x = pd.Series(pd.factorize(s)[0], index=s.index)
        try:
            auc = roc_auc_score(y, x)
        except Exception:
            continue
        auc_rank.append((col, auc, int(s.nunique(dropna=True)), str(s.dtype)))

    auc_rank = sorted(auc_rank, key=lambda x: x[1], reverse=True)
    print()
    print('Top 15 single-feature AUCs:')
    for row in auc_rank[:15]:
        print(row)
print('='*70)


LEAKAGE AUDIT
Suspicious columns (name-based):
['final_result_upd_numeric', 'unregistered_flag']

Top 15 single-feature AUCs:
('active_days', 0.8346724183825532, 283, 'float64')
('active_days_ratio', 0.8327413027489992, 1619, 'float64')
('clicks_type_homepage', 0.7984884623869734, 1514, 'float64')
('total_sum_click', 0.7957161310727129, 5252, 'float64')
('log1p_total_sum_click', 0.7957161310727129, 5252, 'float64')
('distinct_sites', 0.7600020139394409, 324, 'float64')
('clicks_type_resource', 0.7211647788097261, 375, 'float64')
('clicks_type_forumng', 0.7202220272763901, 1984, 'float64')
('clicks_type_oucontent', 0.6989721615598872, 2653, 'float64')
('clicks_type_subpage', 0.6818249446526621, 944, 'float64')
('clicks_type_url', 0.6745309029835852, 286, 'float64')
('clicks_type_quiz', 0.6683192395134292, 1865, 'float64')
('distinct_activity_types', 0.64678900300169, 15, 'float64')
('early_sum_click', 0.6197310411109083, 1427, 'float64')
('log1p_early_sum_click', 0.6197310411109083, 142

In [27]:
import h2o
import os
from datetime import datetime

# Enable GPU for XGBoost (will be used in standalone training)
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

print('='*70)
print('Initializing H2O')
print('='*70)

# Initialize H2O - this starts a local H2O server
h2o.init(
    max_mem_size='6G',  # Allocate 6GB for H2O
    verbose=True,
    log_level='INFO',
    log_dir='./h2o_logs',
    nthreads=-1  # Use all CPU threads
)

print('\nH2O cluster information:')
print(f'  Cluster name: {h2o.cluster().cloud_name}')
print(f'  H2O version: {h2o.__version__}')
print(f'  Cluster size: {h2o.cluster().cloud_size}')
print('\nH2O initialized successfully!')
print('='*70)


Initializing H2O
Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 17.0.12+8-LTS-286, mixed mode, sharing)
  Starting server from C:\Users\Than Minh\AppData\Roaming\Python\Python311\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\THANMI~1\AppData\Local\Temp\tmpw9pmr085
  JVM stdout: C:\Users\THANMI~1\AppData\Local\Temp\tmpw9pmr085\h2o_Than_Minh_started_from_python.out
  JVM stderr: C:\Users\THANMI~1\AppData\Local\Temp\tmpw9pmr085\h2o_Than_Minh_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Asia/Ho_Chi_Minh
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.9
H2O_cluster_version_age:,2 months and 8 days
H2O_cluster_name:,H2O_from_python_Than_Minh_oi4er4
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,6 Gb
H2O_cluster_total_cores:,20
H2O_cluster_allowed_cores:,20



H2O cluster information:
  Cluster name: H2O_from_python_Than_Minh_oi4er4
  H2O version: 3.46.0.9
  Cluster size: 1

H2O initialized successfully!


In [28]:
# Convert pandas DataFrame to H2O Frame
print('='*70)
print('Converting data to H2O format')
print('='*70)

# Import H2O if not already
from h2o.estimators import H2OGradientBoostingEstimator, H2OGeneralizedLinearEstimator, H2ORandomForestEstimator
from h2o.automl import H2OAutoML

print(f'\nDataset shape: {final_data.shape}')
print(f'Columns: {len(final_data.columns)}')

# Convert to H2O Frame
h2o_data = h2o.H2OFrame(final_data)

print(f'\nH2O Frame created:')
print(f'  Rows: {h2o_data.nrow}')
print(f'  Columns: {h2o_data.ncol}')

# Define target and features
target = 'final_result_upd_numeric'

# Convert target to factor for classification
h2o_data[target] = h2o_data[target].asfactor()

# Get feature names (all columns except target)
features = [col for col in h2o_data.columns if col != target]

print(f'\nTarget: {target}')
print(f'Features: {len(features)}')
print(f'  First 5: {features[:5]}')
print('='*70)


Converting data to H2O format

Dataset shape: (25766, 55)
Columns: 55
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%

H2O Frame created:
  Rows: 25766
  Columns: 55

Target: final_result_upd_numeric
Features: 54
  First 5: ['code_module', 'code_presentation', 'id_student', 'gender', 'region']


In [29]:
# Limit features to a curated whitelist
WHITELIST_FEATURES = ['gender', 'region', 'highest_education', 'imd_band', 'age_band', 'disability', 'num_of_prev_attempts', 'studied_credits', 'code_module', 'code_presentation', 'module_presentation_length', 'has_exam', 'has_non_exam', 'registered_early_flag', 'registration_lead_days', 'date_registration', 'clicks_type_dataplus', 'clicks_type_dualpane', 'clicks_type_externalquiz', 'clicks_type_folder', 'clicks_type_forumng', 'clicks_type_glossary', 'clicks_type_homepage', 'clicks_type_htmlactivity', 'clicks_type_oucollaborate', 'clicks_type_oucontent', 'clicks_type_ouelluminate', 'clicks_type_ouwiki', 'clicks_type_page', 'clicks_type_questionnaire', 'clicks_type_quiz', 'clicks_type_repeatactivity', 'clicks_type_resource', 'clicks_type_sharedsubpage', 'clicks_type_subpage', 'clicks_type_url', 'distinct_activity_types', 'activity_entropy', 'top_type_share', 'clicks_per_active_day']

features = [f for f in features if f in WHITELIST_FEATURES]
print('FEATURE LIST (model inputs)')
print(f'Count: {len(features)}')
print(features)

# Save for inspection
import pandas as pd
pd.DataFrame({'feature': features}).to_csv('results/feature_list.csv', index=False)
print('Saved: results/feature_list.csv')


FEATURE LIST (model inputs)
Count: 40
['code_module', 'code_presentation', 'gender', 'region', 'highest_education', 'imd_band', 'age_band', 'num_of_prev_attempts', 'studied_credits', 'disability', 'clicks_per_active_day', 'clicks_type_dataplus', 'clicks_type_dualpane', 'clicks_type_externalquiz', 'clicks_type_folder', 'clicks_type_forumng', 'clicks_type_glossary', 'clicks_type_homepage', 'clicks_type_htmlactivity', 'clicks_type_oucollaborate', 'clicks_type_oucontent', 'clicks_type_ouelluminate', 'clicks_type_ouwiki', 'clicks_type_page', 'clicks_type_questionnaire', 'clicks_type_quiz', 'clicks_type_repeatactivity', 'clicks_type_resource', 'clicks_type_sharedsubpage', 'clicks_type_subpage', 'clicks_type_url', 'distinct_activity_types', 'activity_entropy', 'top_type_share', 'has_exam', 'has_non_exam', 'date_registration', 'module_presentation_length', 'registered_early_flag', 'registration_lead_days']
Saved: results/feature_list.csv


In [30]:
# Split data into train, validation, and test sets
print('='*70)
print('Creating train/validation/test splits')
print('='*70)

# Split: 70% train, 15% valid, 15% test
train, valid, test = h2o_data.split_frame(ratios=[0.7, 0.15], seed=42)

print(f'\nData splits:')
print(f'  Training: {train.nrow} rows ({train.nrow/h2o_data.nrow*100:.1f}%)')
print(f'  Validation: {valid.nrow} rows ({valid.nrow/h2o_data.nrow*100:.1f}%)')
print(f'  Test: {test.nrow} rows ({test.nrow/h2o_data.nrow*100:.1f}%)')

# Check class distribution
print(f'\nTarget distribution in training set:')
print(train[target].table())

print('='*70)


Creating train/validation/test splits

Data splits:
  Training: 18095 rows (70.2%)
  Validation: 3841 rows (14.9%)
  Test: 3830 rows (14.9%)

Target distribution in training set:
  final_result_upd_numeric    Count
                         0     7382
                         1    10713
[2 rows x 2 columns]



In [31]:
# Train baseline models
import pandas as pd
from datetime import datetime

timestamp_str = datetime.now().strftime('%Y%m%d_%H%M%S')

print('='*70)
print('Training Baseline Models')
print('='*70)

baseline_results = []

# GLM
print('\n*** Training GLM ***')
glm = H2OGeneralizedLinearEstimator(family='binomial', seed=42)
glm.train(x=features, y=target, training_frame=train, validation_frame=valid)
perf_glm = glm.model_performance(valid=True)
print(f'  AUC: {perf_glm.auc():.4f}')

baseline_results.append({
    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'source': 'baseline',
    'model_type': 'GLM',
    'model_name': 'GLM_baseline',
    'auc': perf_glm.auc(),
    'accuracy': perf_glm.accuracy()[0][1] if perf_glm.accuracy() else None,
    'f1_score': perf_glm.F1()[0][1] if perf_glm.F1() else None,
    'precision': perf_glm.precision()[0][1] if perf_glm.precision() else None,
    'recall': perf_glm.recall()[0][1] if perf_glm.recall() else None,
    'logloss': perf_glm.logloss()
})

# GBM
print('\n*** Training GBM ***')
gbm = H2OGradientBoostingEstimator(ntrees=100, max_depth=5, learn_rate=0.1, seed=42)
gbm.train(x=features, y=target, training_frame=train, validation_frame=valid)
perf_gbm = gbm.model_performance(valid=True)
print(f'  AUC: {perf_gbm.auc():.4f}')

baseline_results.append({
    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'source': 'baseline',
    'model_type': 'GBM',
    'model_name': 'GBM_baseline',
    'auc': perf_gbm.auc(),
    'accuracy': perf_gbm.accuracy()[0][1] if perf_gbm.accuracy() else None,
    'f1_score': perf_gbm.F1()[0][1] if perf_gbm.F1() else None,
    'precision': perf_gbm.precision()[0][1] if perf_gbm.precision() else None,
    'recall': perf_gbm.recall()[0][1] if perf_gbm.recall() else None,
    'logloss': perf_gbm.logloss()
})

# DRF
print('\n*** Training DRF ***')
drf = H2ORandomForestEstimator(ntrees=100, max_depth=10, seed=42)
drf.train(x=features, y=target, training_frame=train, validation_frame=valid)
perf_drf = drf.model_performance(valid=True)
print(f'  AUC: {perf_drf.auc():.4f}')

baseline_results.append({
    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'source': 'baseline',
    'model_type': 'DRF',
    'model_name': 'DRF_baseline',
    'auc': perf_drf.auc(),
    'accuracy': perf_drf.accuracy()[0][1] if perf_drf.accuracy() else None,
    'f1_score': perf_drf.F1()[0][1] if perf_drf.F1() else None,
    'precision': perf_drf.precision()[0][1] if perf_drf.precision() else None,
    'recall': perf_drf.recall()[0][1] if perf_drf.recall() else None,
    'logloss': perf_drf.logloss()
})

# Save baseline results
baseline_results = pd.DataFrame(baseline_results)
baseline_results.to_csv(f'results/model_logs_baseline_{timestamp_str}.csv', index=False)

print('\n' + '='*70)
print('Baseline models completed!')
print(f'Results saved to: results/model_logs_baseline_{timestamp_str}.csv')
print('='*70)


Training Baseline Models

*** Training GLM ***
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
  AUC: 0.9127

*** Training GBM ***
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
  AUC: 0.9353

*** Training DRF ***
drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
  AUC: 0.9246

Baseline models completed!
Results saved to: results/model_logs_baseline_20260201_163020.csv


## 4. H2O AutoML

We run AutoML to test multiple model families and return the best models ranked by the selected metric.


## 4. Baseline Models

We first train a few standard models to establish a baseline before running AutoML. This gives a clear point of comparison for later tuning.



In [32]:
# H2O AutoML - Single Comprehensive Run
# Excludes XGBoost (trained separately)

if RUN_AUTOML:
    print('='*70)
    print('H2O AutoML - COMPREHENSIVE RUN')
    print('='*70)
    print(f'Training samples: {train.nrow}, Validation samples: {valid.nrow}')
    print(f'Features: {len(features)}')
    print()
    print('Configuration:')
    print(f'  - Max models: {AUTOML_MAX_MODELS}')
    print(f'  - Max runtime: {AUTOML_MAX_RUNTIME_SECS} seconds')
    print('  - Algorithms: GLM, GBM, DRF, DeepLearning, StackedEnsemble')
    print('  - Excluded: XGBoost (will train separately)')
    print('='*70)

    from datetime import datetime
    start_time = datetime.now()
    print()
    print(f"Starting at: {start_time.strftime('%H:%M:%S')}")

    aml = H2OAutoML(
        max_runtime_secs=AUTOML_MAX_RUNTIME_SECS,
        max_models=AUTOML_MAX_MODELS,
        balance_classes=True,
        sort_metric='AUC',
        seed=42,
        exclude_algos=['XGBoost'],
        verbosity='info',
        nfolds=AUTOML_NFOLDS,
        project_name='AutoML_Main'
    )

    print()
    print('Training H2O AutoML...')
    print('-'*70)
    aml.train(x=features, y=target, training_frame=train, validation_frame=valid)

    end_time = datetime.now()
    duration = (end_time - start_time).seconds

    print()
    print('='*70)
    print('H2O AutoML COMPLETED')
    print('='*70)
    print(f"Completed at: {end_time.strftime('%H:%M:%S')}")
    print(f'Duration: {duration // 3600}h {(duration % 3600) // 60}m')
    print(f'Models trained: {len(aml.leaderboard)}')

    print()
    print('*** Top 10 Models (by AUC) ***')
    print(aml.leaderboard.head(10)[['model_id', 'auc', 'logloss']])

    print()
    print('H2O AutoML run completed successfully!')
else:
    aml = None
    print('Skipping H2O AutoML (RUN_AUTOML=0)')


H2O AutoML - COMPREHENSIVE RUN
Training samples: 18095, Validation samples: 3841
Features: 40

Configuration:
  - Max models: 20
  - Max runtime: 18000 seconds
  - Algorithms: GLM, GBM, DRF, DeepLearning, StackedEnsemble
  - Excluded: XGBoost (will train separately)

Starting at: 16:30:31

Training H2O AutoML...
----------------------------------------------------------------------
AutoML progress: |
16:30:31.668: Project: AutoML_Main
16:30:31.669: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
16:30:31.669: Setting stopping tolerance adaptively based on the training frame: 0.007433968324597509
16:30:31.669: Build control seed: 42
16:30:31.669: training frame: Frame key: AutoML_1_20260201_163031_training_py_9_sid_80d6    cols: 55    rows: 18095  chunks: 3    size: 2319123  check

## 5. Evaluate top models

We compute Accuracy and F1 for the top models on the validation set.


In [33]:
# ============================================================
# STANDALONE XGBoost with Hyperparameter Optimization
# ============================================================

if RUN_XGBOOST:
    print('='*70)
    print('XGBoost Training with Hyperparameter Optimization')
    print('='*70)

    # Install optuna if needed
    try:
        import optuna
    except ImportError:
        !pip install optuna
        import optuna

    import xgboost as xgb
    from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, log_loss

    print()
    print('1. Preparing data for XGBoost...')
    train_pd = train.as_data_frame()
    valid_pd = valid.as_data_frame()

    numeric_features = [f for f in features if train_pd[f].dtype in ['int64', 'float64']]
    print(f'   Using {len(numeric_features)} numeric features (out of {len(features)} total)')

    X_train = train_pd[numeric_features]
    y_train = train_pd[target].astype(int)
    X_valid = valid_pd[numeric_features]
    y_valid = valid_pd[target].astype(int)

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)

    print(f'   Training samples: {len(X_train)}')
    print(f'   Validation samples: {len(X_valid)}')
    print(f'   Features: {len(features)}')

    def objective(trial):
        params = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'device': XGB_DEVICE,
            'tree_method': XGB_TREE_METHOD,
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        }

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=params['n_estimators'],
            evals=[(dvalid, 'validation')],
            early_stopping_rounds=XGB_EARLY_STOPPING_ROUNDS,
            verbose_eval=False
        )

        y_pred_proba = model.predict(dvalid)
        auc = roc_auc_score(y_valid, y_pred_proba)
        return auc

    print()
    print('2. Running hyperparameter optimization with Optuna...')
    print(f'   Target: {XGB_N_TRIALS} trials')
    print('   Optimization method: TPE (Tree-structured Parzen Estimator)')

    study = optuna.create_study(
        direction='maximize',
        study_name='xgboost_optimization',
        sampler=optuna.samplers.TPESampler(seed=42)
    )
    study.optimize(objective, n_trials=XGB_N_TRIALS, show_progress_bar=True)

    best_params = study.best_params
    best_auc = study.best_value

    print()
    print('3. Best hyperparameters found:')
    for param, value in best_params.items():
        print(f'   {param}: {value}')
    print(f'   Best validation AUC: {best_auc:.4f}')

    print()
    print('4. Training final XGBoost model with best parameters...')
    final_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'device': XGB_DEVICE,
        'tree_method': XGB_TREE_METHOD,
        **best_params
    }
    n_estimators = final_params.pop('n_estimators')
    final_model = xgb.train(
        final_params,
        dtrain,
        num_boost_round=n_estimators,
        evals=[(dtrain, 'train'), (dvalid, 'validation')],
        early_stopping_rounds=XGB_EARLY_STOPPING_ROUNDS,
        verbose_eval=10
    )

    print()
    print('5. Final XGBoost model evaluation:')
    y_pred_proba = final_model.predict(dvalid)
    y_pred = (y_pred_proba > 0.5).astype(int)

    xgb_metrics = {
        'model_type': 'XGBoost_Optimized',
        'model_name': 'XGBoost_Optuna_Best',
        'auc': roc_auc_score(y_valid, y_pred_proba),
        'accuracy': accuracy_score(y_valid, y_pred),
        'f1_score': f1_score(y_valid, y_pred),
        'precision': precision_score(y_valid, y_pred),
        'recall': recall_score(y_valid, y_pred),
        'logloss': log_loss(y_valid, y_pred_proba),
    }

    print(f"   AUC: {xgb_metrics['auc']:.4f}")
    print(f"   Accuracy: {xgb_metrics['accuracy']:.4f}")
    print(f"   F1 Score: {xgb_metrics['f1_score']:.4f}")
    print(f"   Precision: {xgb_metrics['precision']:.4f}")
    print(f"   Recall: {xgb_metrics['recall']:.4f}")
    print(f"   Log Loss: {xgb_metrics['logloss']:.4f}")

    final_model.save_model('results/xgboost_best_model.json')
    print()
    print('Model saved to: results/xgboost_best_model.json')
    print()
    print('='*70)
    print('XGBoost optimization completed successfully!')
    print('='*70)
else:
    xgb_metrics = None
    print('Skipping XGBoost (RUN_XGBOOST=0)')


XGBoost Training with Hyperparameter Optimization

1. Preparing data for XGBoost...


  from .autonotebook import tqdm as notebook_tqdm



   Using 32 numeric features (out of 40 total)



[32m[I 2026-02-01 16:53:54,263][0m A new study created in memory with name: xgboost_optimization[0m


   Training samples: 18095
   Validation samples: 3841
   Features: 40

2. Running hyperparameter optimization with Optuna...
   Target: 15 trials
   Optimization method: TPE (Tree-structured Parzen Estimator)


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()
Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[32m[I 2026-02-01 16:53:54,852][0m Trial 0 finished with value: 0.9251079476148555 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'n_estimators': 759, 'min_child_weight': 6, 'subsample': 0.6624074561769746, 'colsample_bytree': 0.662397808134481, 'gamma': 0.2904180608409973, 'reg_alpha': 4.330880728874676, 'reg_lambda': 3.005575058716044}. Best is trial 0 with value: 0.9251079476148555.[0m


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[32m[I 2026-02-01 16:53:59,669][0m Trial 1 finished with value: 0.9304286674466655 and parameters: {'max_depth': 8, 'learning_rate': 0.010725209743171996, 'n_estimators': 973, 'min_child_weight': 9, 'subsample': 0.6849356442713105, 'colsample_bytree': 0.6727299868828402, 'gamma': 0.9170225492671691, 'reg_alpha': 1.5212112147976886, 'reg_lambda': 2.6237821581611893}. Best is trial 1 with value: 0.9304286674466655.[0m


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[32m[I 2026-02-01 16:54:01,821][0m Trial 2 finished with value: 0.9298026504873483 and parameters: {'max_depth': 6, 'learning_rate': 0.02692655251486473, 'n_estimators': 651, 'min_child_weight': 2, 'subsample': 0.7168578594140873, 'colsample_bytree': 0.7465447373174767, 'gamma': 2.28034992108518, 'reg_alpha': 3.925879806965068, 'reg_lambda': 0.9983689107917987}. Best is trial 1 with value: 0.9304286674466655.[0m


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[32m[I 2026-02-01 16:54:02,296][0m Trial 3 finished with value: 0.9259772427418242 and parameters: {'max_depth': 7, 'learning_rate': 0.07500118950416987, 'n_estimators': 141, 'min_child_weight': 7, 'subsample': 0.6682096494749166, 'colsample_bytree': 0.6260206371941118, 'gamma': 4.7444276862666666, 'reg_alpha': 4.828160165372797, 'reg_lambda': 4.041986740582305}. Best is trial 1 with value: 0.9304286674466655.[0m


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[32m[I 2026-02-01 16:54:04,719][0m Trial 4 finished with value: 0.9267265960745088 and parameters: {'max_depth': 5, 'learning_rate': 0.013940346079873234, 'n_estimators': 716, 'min_child_weight': 5, 'subsample': 0.6488152939379115, 'colsample_bytree': 0.798070764044508, 'gamma': 0.17194260557609198, 'reg_alpha': 4.546602010393911, 'reg_lambda': 1.2938999080000846}. Best is trial 1 with value: 0.9304286674466655.[0m


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[32m[I 2026-02-01 16:54:06,132][0m Trial 5 finished with value: 0.9280838619447318 and parameters: {'max_depth': 8, 'learning_rate': 0.028869220380495747, 'n_estimators': 568, 'min_child_weight': 6, 'subsample': 0.6739417822102108, 'colsample_bytree': 0.9878338511058234, 'gamma': 3.8756641168055728, 'reg_alpha': 4.697494707820946, 'reg_lambda': 4.474136752138244}. Best is trial 1 with value: 0.9304286674466655.[0m


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[32m[I 2026-02-01 16:54:06,567][0m Trial 6 finished with value: 0.9257370762716659 and parameters: {'max_depth': 7, 'learning_rate': 0.22999586428143728, 'n_estimators': 179, 'min_child_weight': 2, 'subsample': 0.6180909155642152, 'colsample_bytree': 0.7301321323053057, 'gamma': 1.9433864484474102, 'reg_alpha': 1.3567451588694794, 'reg_lambda': 4.143687545759647}. Best is trial 1 with value: 0.9304286674466655.[0m


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[32m[I 2026-02-01 16:54:08,019][0m Trial 7 finished with value: 0.924697203545751 and parameters: {'max_depth': 5, 'learning_rate': 0.026000059117302653, 'n_estimators': 588, 'min_child_weight': 2, 'subsample': 0.9208787923016158, 'colsample_bytree': 0.6298202574719083, 'gamma': 4.9344346830025865, 'reg_alpha': 3.861223846483287, 'reg_lambda': 0.993578407670862}. Best is trial 1 with value: 0.9304286674466655.[0m


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[32m[I 2026-02-01 16:54:08,856][0m Trial 8 finished with value: 0.9279739624469031 and parameters: {'max_depth': 3, 'learning_rate': 0.1601531217136121, 'n_estimators': 736, 'min_child_weight': 8, 'subsample': 0.9085081386743783, 'colsample_bytree': 0.6296178606936361, 'gamma': 1.7923286427213632, 'reg_alpha': 0.5793452976256486, 'reg_lambda': 4.315517129377968}. Best is trial 1 with value: 0.9304286674466655.[0m


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[32m[I 2026-02-01 16:54:09,556][0m Trial 9 finished with value: 0.9260635216268633 and parameters: {'max_depth': 7, 'learning_rate': 0.030816017044468066, 'n_estimators': 157, 'min_child_weight': 4, 'subsample': 0.7300733288106989, 'colsample_bytree': 0.8918424713352255, 'gamma': 3.1877873567760657, 'reg_alpha': 4.436063712881633, 'reg_lambda': 2.3610746258097466}. Best is trial 1 with value: 0.9304286674466655.[0m


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[32m[I 2026-02-01 16:54:15,282][0m Trial 10 finished with value: 0.9304112702288625 and parameters: {'max_depth': 10, 'learning_rate': 0.010206070557577008, 'n_estimators': 973, 'min_child_weight': 10, 'subsample': 0.8182873120328862, 'colsample_bytree': 0.876098829427658, 'gamma': 1.2772172938259945, 'reg_alpha': 2.020130360093224, 'reg_lambda': 2.396475863679739}. Best is trial 1 with value: 0.9304286674466655.[0m


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[32m[I 2026-02-01 16:54:20,209][0m Trial 11 finished with value: 0.9303983991165041 and parameters: {'max_depth': 10, 'learning_rate': 0.010290509463842875, 'n_estimators': 971, 'min_child_weight': 10, 'subsample': 0.8153994220705195, 'colsample_bytree': 0.8810796931031994, 'gamma': 0.9996988961166513, 'reg_alpha': 2.317959276605608, 'reg_lambda': 2.417094853247184}. Best is trial 1 with value: 0.9304286674466655.[0m


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[32m[I 2026-02-01 16:54:24,515][0m Trial 12 finished with value: 0.9303998135244557 and parameters: {'max_depth': 10, 'learning_rate': 0.014031806090818524, 'n_estimators': 974, 'min_child_weight': 10, 'subsample': 0.8376702913106802, 'colsample_bytree': 0.883574864264175, 'gamma': 0.9081932656466285, 'reg_alpha': 2.371413413327705, 'reg_lambda': 3.191925527471864}. Best is trial 1 with value: 0.9304286674466655.[0m


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


[32m[I 2026-02-01 16:54:25,856][0m Trial 13 finished with value: 0.9294971383698326 and parameters: {'max_depth': 9, 'learning_rate': 0.0651739755127525, 'n_estimators': 887, 'min_child_weight': 9, 'subsample': 0.7667698433310002, 'colsample_bytree': 0.9500805503504125, 'gamma': 1.2077746649016994, 'reg_alpha': 1.5456697915226405, 'reg_lambda': 1.764674394385593}. Best is trial 1 with value: 0.9304286674466655.[0m


Best trial: 1. Best value: 0.930429: 100%|██████████| 15/15 [00:35<00:00,  2.34s/it]

[32m[I 2026-02-01 16:54:29,393][0m Trial 14 finished with value: 0.9297381534847616 and parameters: {'max_depth': 9, 'learning_rate': 0.010230091818268105, 'n_estimators': 851, 'min_child_weight': 8, 'subsample': 0.9979843515077919, 'colsample_bytree': 0.8124496121405841, 'gamma': 2.7939929152165717, 'reg_alpha': 3.059041469985783, 'reg_lambda': 3.2616637833365636}. Best is trial 1 with value: 0.9304286674466655.[0m

3. Best hyperparameters found:
   max_depth: 8
   learning_rate: 0.010725209743171996
   n_estimators: 973
   min_child_weight: 9
   subsample: 0.6849356442713105
   colsample_bytree: 0.6727299868828402
   gamma: 0.9170225492671691
   reg_alpha: 1.5212112147976886
   reg_lambda: 2.6237821581611893
   Best validation AUC: 0.9304

4. Training final XGBoost model with best parameters...
[0]	train-auc:0.86570	validation-auc:0.85459
[10]	train-auc:0.91430	validation-auc:0.90170
[20]	train-auc:0.92062	validation-auc:0.90936





[30]	train-auc:0.92215	validation-auc:0.91053
[40]	train-auc:0.92419	validation-auc:0.91213
[50]	train-auc:0.92633	validation-auc:0.91387
[60]	train-auc:0.92735	validation-auc:0.91424
[70]	train-auc:0.92875	validation-auc:0.91511
[80]	train-auc:0.93011	validation-auc:0.91594
[90]	train-auc:0.93136	validation-auc:0.91661
[100]	train-auc:0.93229	validation-auc:0.91729
[110]	train-auc:0.93325	validation-auc:0.91763
[120]	train-auc:0.93418	validation-auc:0.91813
[130]	train-auc:0.93519	validation-auc:0.91891
[140]	train-auc:0.93610	validation-auc:0.91956
[150]	train-auc:0.93682	validation-auc:0.91995
[160]	train-auc:0.93762	validation-auc:0.92057
[170]	train-auc:0.93825	validation-auc:0.92104
[180]	train-auc:0.93898	validation-auc:0.92147
[190]	train-auc:0.93969	validation-auc:0.92200
[200]	train-auc:0.94046	validation-auc:0.92240
[210]	train-auc:0.94117	validation-auc:0.92258
[220]	train-auc:0.94191	validation-auc:0.92295
[230]	train-auc:0.94251	validation-auc:0.92339
[240]	train-auc:0.94

In [34]:
# Evaluate ALL H2O AutoML models + XGBoost
print('='*70)
print('COMPREHENSIVE MODEL EVALUATION')
print('='*70)

all_results = []

# Evaluate H2O AutoML models
if RUN_AUTOML and aml is not None:
    print()
    print('*** Evaluating H2O AutoML Models ***')
    leaderboard = aml.leaderboard.as_data_frame()
    print(f'Models from H2O AutoML: {len(leaderboard)}')

    for idx, model_id in enumerate(leaderboard['model_id'], 1):
        if idx <= 5 or idx > len(leaderboard) - 5:
            print(f'  Model {idx}/{len(leaderboard)}: {model_id}')
        elif idx == 6:
            print(f'  ... processing {len(leaderboard) - 10} models ...')

        model = h2o.get_model(model_id)
        perf = model.model_performance(valid=True)

        auc = perf.auc()
        acc = perf.accuracy()[0][1] if perf.accuracy() else None
        f1 = perf.F1()[0][1] if perf.F1() else None
        precision = perf.precision()[0][1] if perf.precision() else None
        recall = perf.recall()[0][1] if perf.recall() else None
        logloss = perf.logloss()

        model_type = model_id.split('_')[0]

        all_results.append({
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'source': 'H2O_AutoML',
            'model_type': model_type,
            'model_name': model_id,
            'auc': auc,
            'accuracy': acc,
            'f1_score': f1,
            'precision': precision,
            'recall': recall,
            'logloss': logloss,
            'rank_in_source': idx
        })
else:
    print()
    print('Skipping H2O AutoML evaluation (not run)')

# Add XGBoost result
if RUN_XGBOOST and xgb_metrics is not None:
    print()
    print('*** Adding XGBoost Model ***')
    xgb_metrics['timestamp'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    xgb_metrics['source'] = 'XGBoost_Optuna'
    xgb_metrics['model_name'] = 'XGBoost_Optimized'
    xgb_metrics['rank_in_source'] = 1
    all_results.append(xgb_metrics)
    print(f"  XGBoost AUC: {xgb_metrics['auc']:.4f}")
else:
    print()
    print('Skipping XGBoost evaluation (not run)')

# Combine with baseline results
all_results_combined = pd.concat([baseline_results, pd.DataFrame(all_results)], ignore_index=True)
all_results_combined = all_results_combined.sort_values('auc', ascending=False).reset_index(drop=True)
all_results_combined['overall_rank'] = range(1, len(all_results_combined) + 1)

# Save results
results_file = f'results/model_logs_complete_{timestamp_str}.csv'
all_results_combined.to_csv(results_file, index=False)

# Save just H2O AutoML
h2o_results = all_results_combined[all_results_combined['source'] == 'H2O_AutoML']
h2o_results.to_csv(f'results/model_logs_h2o_automl_{timestamp_str}.csv', index=False)

# Save just XGBoost
xgb_results = all_results_combined[all_results_combined['source'] == 'XGBoost_Optuna']
xgb_results.to_csv(f'results/model_logs_xgboost_{timestamp_str}.csv', index=False)

print()
print('='*70)
print('RESULTS SUMMARY')
print('='*70)
print(f'Total models evaluated: {len(all_results_combined)}')
print(f'  - Baseline: {len(baseline_results)}')
print(f'  - H2O AutoML: {len(h2o_results)}')
print(f'  - XGBoost: {len(xgb_results)}')
print()
print('Files saved:')
print(f'  - {results_file} (ALL models)')
print(f'  - results/model_logs_h2o_automl_{timestamp_str}.csv')
print(f'  - results/model_logs_xgboost_{timestamp_str}.csv')

print()
print('*** TOP 15 MODELS OVERALL (by AUC) ***')
display(all_results_combined[['overall_rank', 'source', 'model_type', 'model_name', 'auc', 'accuracy', 'f1_score']].head(15))

print()
print('*** BEST MODEL FROM EACH SOURCE ***')
for source in ['baseline', 'H2O_AutoML', 'XGBoost_Optuna']:
    source_results = all_results_combined[all_results_combined['source'] == source]
    if len(source_results) > 0:
        best = source_results.iloc[0]
        print()
        print(f"{source}:")
        print(f"  Rank: #{best['overall_rank']}")
        print(f"  Model: {best['model_name']}")
        print(f"  AUC: {best['auc']:.4f}")
        print(f"  Accuracy: {best['accuracy']:.4f}")

all_results_combined.head(20)


COMPREHENSIVE MODEL EVALUATION

*** Evaluating H2O AutoML Models ***
Models from H2O AutoML: 22
  Model 1/22: StackedEnsemble_AllModels_1_AutoML_1_20260201_163031
  Model 2/22: StackedEnsemble_BestOfFamily_1_AutoML_1_20260201_163031
  Model 3/22: GBM_5_AutoML_1_20260201_163031
  Model 4/22: GBM_1_AutoML_1_20260201_163031
  Model 5/22: GBM_2_AutoML_1_20260201_163031
  ... processing 12 models ...





  Model 18/22: DeepLearning_grid_2_AutoML_1_20260201_163031_model_2
  Model 19/22: DeepLearning_grid_3_AutoML_1_20260201_163031_model_2
  Model 20/22: GBM_grid_1_AutoML_1_20260201_163031_model_3
  Model 21/22: XRT_1_AutoML_1_20260201_163031
  Model 22/22: GLM_1_AutoML_1_20260201_163031

*** Adding XGBoost Model ***
  XGBoost AUC: 0.9304

RESULTS SUMMARY
Total models evaluated: 26
  - Baseline: 3
  - H2O AutoML: 22
  - XGBoost: 1

Files saved:
  - results/model_logs_complete_20260201_163020.csv (ALL models)
  - results/model_logs_h2o_automl_20260201_163020.csv
  - results/model_logs_xgboost_20260201_163020.csv

*** TOP 15 MODELS OVERALL (by AUC) ***


Unnamed: 0,overall_rank,source,model_type,model_name,auc,accuracy,f1_score
0,1,H2O_AutoML,StackedEnsemble,StackedEnsemble_AllModels_1_AutoML_1_20260201_...,0.936405,0.887529,0.911656
1,2,H2O_AutoML,GBM,GBM_2_AutoML_1_20260201_163031,0.935718,0.887529,0.910512
2,3,baseline,GBM,GBM_baseline,0.935325,0.883364,0.908981
3,4,H2O_AutoML,GBM,GBM_grid_1_AutoML_1_20260201_163031_model_1,0.934196,0.882583,0.907453
4,5,H2O_AutoML,StackedEnsemble,StackedEnsemble_BestOfFamily_1_AutoML_1_202602...,0.933759,0.884405,0.909571
5,6,H2O_AutoML,GBM,GBM_1_AutoML_1_20260201_163031,0.933149,0.882843,0.908833
6,7,H2O_AutoML,GBM,GBM_5_AutoML_1_20260201_163031,0.932727,0.886228,0.909863
7,8,H2O_AutoML,GBM,GBM_4_AutoML_1_20260201_163031,0.931311,0.885446,0.911075
8,9,H2O_AutoML,GBM,GBM_grid_1_AutoML_1_20260201_163031_model_5,0.931226,0.885707,0.910277
9,10,H2O_AutoML,GBM,GBM_3_AutoML_1_20260201_163031,0.930746,0.884145,0.910264



*** BEST MODEL FROM EACH SOURCE ***

baseline:
  Rank: #3
  Model: GBM_baseline
  AUC: 0.9353
  Accuracy: 0.8834

H2O_AutoML:
  Rank: #1
  Model: StackedEnsemble_AllModels_1_AutoML_1_20260201_163031
  AUC: 0.9364
  Accuracy: 0.8875

XGBoost_Optuna:
  Rank: #11
  Model: XGBoost_Optimized
  AUC: 0.9304
  Accuracy: 0.8813


Unnamed: 0,timestamp,source,model_type,model_name,auc,accuracy,f1_score,precision,recall,logloss,rank_in_source,overall_rank
0,2026-02-01 16:54:35,H2O_AutoML,StackedEnsemble,StackedEnsemble_AllModels_1_AutoML_1_20260201_...,0.936405,0.887529,0.911656,1.0,1.0,0.280824,1.0,1
1,2026-02-01 16:54:35,H2O_AutoML,GBM,GBM_2_AutoML_1_20260201_163031,0.935718,0.887529,0.910512,1.0,1.0,0.285363,5.0,2
2,2026-02-01 16:30:26,baseline,GBM,GBM_baseline,0.935325,0.883364,0.908981,1.0,1.0,0.287731,,3
3,2026-02-01 16:54:35,H2O_AutoML,GBM,GBM_grid_1_AutoML_1_20260201_163031_model_1,0.934196,0.882583,0.907453,1.0,1.0,0.29235,6.0,4
4,2026-02-01 16:54:35,H2O_AutoML,StackedEnsemble,StackedEnsemble_BestOfFamily_1_AutoML_1_202602...,0.933759,0.884405,0.909571,1.0,1.0,0.286647,2.0,5
5,2026-02-01 16:54:35,H2O_AutoML,GBM,GBM_1_AutoML_1_20260201_163031,0.933149,0.882843,0.908833,1.0,1.0,0.292054,4.0,6
6,2026-02-01 16:54:35,H2O_AutoML,GBM,GBM_5_AutoML_1_20260201_163031,0.932727,0.886228,0.909863,1.0,1.0,0.291905,3.0,7
7,2026-02-01 16:54:35,H2O_AutoML,GBM,GBM_4_AutoML_1_20260201_163031,0.931311,0.885446,0.911075,1.0,1.0,0.298327,7.0,8
8,2026-02-01 16:54:35,H2O_AutoML,GBM,GBM_grid_1_AutoML_1_20260201_163031_model_5,0.931226,0.885707,0.910277,1.0,1.0,0.295183,8.0,9
9,2026-02-01 16:54:35,H2O_AutoML,GBM,GBM_3_AutoML_1_20260201_163031,0.930746,0.884145,0.910264,1.0,1.0,0.29404,10.0,10
