In [1]:
import torch
import pandas as pd
import numpy as np
import glob
import os

# Fix for PyTorch 2.6+ compatibility with OGB dataset loading.
# torch.load is wrapped so that weights_only defaults to False, which the
# torch_geometric / OGB loaders need in order to unpickle their cached objects.
#
# SECURITY NOTE: weights_only=False lets torch.load unpickle arbitrary Python
# objects, so only load files that come from a source you trust.
#
# The real, unpatched function is recovered from the `_unpatched` marker so
# that re-running this cell does not wrap an already-wrapped torch.load.
_original_torch_load = getattr(torch.load, '_unpatched', torch.load)

def _patched_torch_load(*args, **kwargs):
    """Forward to the original torch.load, defaulting weights_only to False.

    An explicit weights_only value supplied by the caller is left untouched.
    """
    kwargs.setdefault('weights_only', False)
    return _original_torch_load(*args, **kwargs)

# Marker used on re-execution to find the real torch.load again.
_patched_torch_load._unpatched = _original_torch_load
torch.load = _patched_torch_load

# Now import OGB after patching torch.load
from ogb.nodeproppred import PygNodePropPredDataset

# Configuration: dataset identifier and the input locations read by the cells below
dataset_name = 'products'  # OGB dataset suffix; loaded as f'ogbn-{dataset_name}' and used in folder names
predictions_dir = 'predictions'  # root folder holding the per-method run*_predictions.csv files
models_dir = 'models'  # root folder searched for GCN .pt output files
labels_csv = 'gcn_predictions_2.csv'  # CSV read for its 'node_id' and 'true_label' columns
gcn_pt_file = 'gcn_outputs_2.pt'  # Fallback GCN predictions as .pt tensor file (used when no .pt file is found under models_dir)


  from pkg_resources import parse_version


In [2]:
# Load dataset and get split indices (like run_experiments.py)
print("Loading dataset and split indices...")
dataset = PygNodePropPredDataset(name=f'ogbn-{dataset_name}')
data = dataset[0]
split_idx = dataset.get_idx_split()
test_idx = split_idx['test'].numpy()
print(f"Test set size: {len(test_idx)} nodes")

# Load true labels, ordered by node id so that row position can stand in for node id
print("\nLoading true labels...")
labels_df = pd.read_csv(labels_csv).sort_values('node_id').reset_index(drop=True)
true_labels = labels_df['true_label'].values
num_nodes = len(true_labels)

# Every later cell indexes label/prediction arrays positionally with test_idx.
# That is only valid when the sorted node ids are exactly 0..num_nodes-1 and
# every test index falls inside that range, so fail loudly here otherwise.
assert np.array_equal(labels_df['node_id'].to_numpy(), np.arange(num_nodes)), (
    f"{labels_csv}: 'node_id' must be exactly 0..{num_nodes - 1} with no gaps or duplicates"
)
assert len(test_idx) > 0, "test split is empty"
assert test_idx.min() >= 0 and test_idx.max() < num_nodes, (
    f"test indices span {test_idx.min()}..{test_idx.max()} but only {num_nodes} labels were loaded"
)

print(f"Loaded {num_nodes} total nodes")
print(f"True labels shape: {true_labels.shape}")
print(f"Test indices range: {test_idx.min()} to {test_idx.max()}")


Loading dataset and split indices...
Test set size: 2213091 nodes

Loading true labels...
Loaded 2449029 total nodes
True labels shape: (2449029,)
Test indices range: 235938 to 2449028


In [3]:
# Collect the per-run prediction CSVs of each baseline method into one dict
methods = ['plain', 'linear', 'mlp']
all_predictions = {}


def _run_number(csv_path):
    """Return the integer run index encoded in a 'run<N>_predictions.csv' path."""
    stem, _ = os.path.splitext(os.path.basename(csv_path))
    return int(stem.replace('run', '').replace('_predictions', ''))


for method in methods:
    method_dir = f'{predictions_dir}/{dataset_name}_{method}'
    # Visit runs in numeric order so keys are inserted as run0, run1, ...
    ordered_files = sorted(glob.glob(f'{method_dir}/run*_predictions.csv'), key=_run_number)

    for pred_file in ordered_files:
        key = f'{method}_run{_run_number(pred_file)}'
        # Rows are ordered by node id and re-indexed from 0
        run_df = pd.read_csv(pred_file).sort_values('node_id').reset_index(drop=True)
        all_predictions[key] = run_df
        print(f"Loaded {key}: {len(run_df)} predictions")

# ---------------------------------------------------------------------------
# Helpers shared by the three .pt loaders below (GCN, GAMLP + C&S, GAMLP)
# ---------------------------------------------------------------------------
def _top_k_prediction_frame(pt_path, truth, k=3):
    """Load a saved score tensor and return its top-k class indices per row.

    Args:
        pt_path: path to a .pt file readable by torch.load. The loaders below
            treat its content as a 2-D tensor of shape [num_nodes, num_classes].
        truth: array of true labels, one entry per tensor row.
        k: number of ranked predictions to keep per row.

    Returns:
        DataFrame with columns node_id, prediction_1..prediction_k, truth
        (the same layout as the CSV-based models).

    Raises:
        ValueError: if the tensor's row count differs from len(truth).
    """
    # The score tensor stays local to this function so it can be freed as soon
    # as the top-k indices have been extracted.
    scores = torch.load(pt_path, map_location='cpu')
    if len(scores) != len(truth):
        raise ValueError(
            f"{pt_path}: {len(scores)} rows of predictions but {len(truth)} true labels"
        )
    _, top_idx = torch.topk(scores, k=k, dim=1)
    top_idx = top_idx.numpy()

    columns = {'node_id': np.arange(len(scores))}
    for rank in range(k):
        columns[f'prediction_{rank + 1}'] = top_idx[:, rank]
    columns['truth'] = truth
    return pd.DataFrame(columns)


def _store_predictions(key, frame):
    """Register a model's predictions in all_predictions, flagging key clashes."""
    if key in all_predictions:
        # Two different files mapped to the same key; the later one wins.
        print(f"WARNING: duplicate model key {key!r}; overwriting earlier predictions")
    all_predictions[key] = frame
    print(f"Loaded {key}: {len(frame)} predictions (with top 3 predictions)")


def _numeric_stem_or_zero(path):
    """Sort key: the file stem as an int when it is all digits, otherwise 0."""
    stem = os.path.splitext(os.path.basename(path))[0]
    return int(stem) if stem.isdigit() else 0


# Load GCN predictions from .pt files (similar to other models)
print("\nLoading GCN predictions from .pt files...")
gcn_model_dir = f'{models_dir}/{dataset_name}_gcn'
gcn_pt_files = []

# Try to load from models directory first
if os.path.exists(gcn_model_dir):
    gcn_pt_files = sorted(glob.glob(f'{gcn_model_dir}/*.pt'), key=_numeric_stem_or_zero)
    print(f"Found {len(gcn_pt_files)} GCN .pt files in {gcn_model_dir}")
else:
    print(f"Models directory {gcn_model_dir} not found, checking for fallback file...")

# If no files in models directory, try fallback file
if len(gcn_pt_files) == 0 and os.path.exists(gcn_pt_file):
    print(f"Using fallback file: {gcn_pt_file}")
    gcn_pt_files = [gcn_pt_file]

# Keep at most the first 3 files in sorted order (they are not ranked by accuracy)
num_gcn_runs = min(3, len(gcn_pt_files))
gcn_pt_files = gcn_pt_files[:num_gcn_runs]

# The loop variable is deliberately NOT named gcn_pt_file: reusing that name
# would overwrite the configured fallback path and change what a re-run of
# this cell checks for.
for i, gcn_path in enumerate(gcn_pt_files):
    # Extract run number from filename, or use the position in the list
    filename_base = os.path.splitext(os.path.basename(gcn_path))[0]
    try:
        run_num = int(filename_base)
    except ValueError:
        # Filename is not a number (e.g. "gcn_outputs_2"), so use the index
        run_num = i
    _store_predictions(f'gcn_run{run_num}', _top_k_prediction_frame(gcn_path, true_labels))

# Load C&S (Correct and Smooth) predictions from ogbn-products_final directory
cs_dir = 'ogbn-products_final'
print(f"\nLoading C&S predictions from {cs_dir}/...")
cs_pt_files = sorted(glob.glob(f'{cs_dir}/*_cs.pt'))  # alphabetical for consistent ordering

print(f"Found {len(cs_pt_files)} C&S .pt files")

for cs_path in cs_pt_files:
    # Identifier = first 8 characters of the filename part before the first '_'
    hash_part = os.path.basename(cs_path).split('_')[0]
    _store_predictions(f'gamlp_cs_{hash_part[:8]}', _top_k_prediction_frame(cs_path, true_labels))

# Load non-C&S GAMLP predictions from ogbn-products_final directory
print(f"\nLoading GAMLP (non-C&S) predictions from {cs_dir}/...")
# Everything the C&S glob above did not match, i.e. files not ending in '_cs.pt'
gamlp_pt_files = sorted(
    path for path in glob.glob(f'{cs_dir}/*.pt') if not path.endswith('_cs.pt')
)

print(f"Found {len(gamlp_pt_files)} GAMLP (non-C&S) .pt files")

for gamlp_path in gamlp_pt_files:
    # Identifier = first 8 characters of the filename part before the first '_'
    hash_part = os.path.basename(gamlp_path).split('_')[0]
    _store_predictions(f'gamlp_{hash_part[:8]}', _top_k_prediction_frame(gamlp_path, true_labels))

print(f"\nTotal models loaded: {len(all_predictions)}")


Loaded plain_run0: 2449029 predictions
Loaded plain_run1: 2449029 predictions
Loaded plain_run2: 2449029 predictions
Loaded plain_run3: 2449029 predictions
Loaded plain_run4: 2449029 predictions
Loaded linear_run0: 2449029 predictions
Loaded linear_run1: 2449029 predictions
Loaded linear_run2: 2449029 predictions
Loaded linear_run3: 2449029 predictions
Loaded linear_run4: 2449029 predictions
Loaded mlp_run0: 2449029 predictions
Loaded mlp_run1: 2449029 predictions
Loaded mlp_run2: 2449029 predictions
Loaded mlp_run3: 2449029 predictions
Loaded mlp_run4: 2449029 predictions

Loading GCN predictions from .pt files...
Models directory models/products_gcn not found, checking for fallback file...
Using fallback file: gcn_outputs_2.pt
Loaded gcn_run0: 2449029 predictions (with top 3 predictions)

Loading C&S predictions from ogbn-products_final/...
Found 5 C&S .pt files
Loaded gamlp_cs_0919abde: 2449029 predictions (with top 3 predictions)
Loaded gamlp_cs_09883da0: 2449029 predictions (with 

In [4]:
# Calculate individual accuracy scores (ONLY ON TEST SET)
# This must be done before selecting best models
print("=" * 60)
print("INDIVIDUAL MODEL ACCURACIES (TEST SET ONLY)")
print("=" * 60)

# Always recompute from the currently loaded predictions. A cached dict left
# over from an earlier execution would silently describe a stale model set
# after the loading cell is re-run, and the computation is one vectorised
# comparison per model.
test_labels = true_labels[test_idx]
individual_accuracies = {}

for key, df in all_predictions.items():
    # Top-ranked prediction (prediction_1), restricted to the test nodes
    test_predictions = df['prediction_1'].values[test_idx]
    correct = (test_predictions == test_labels).sum()
    accuracy = correct / len(test_idx)
    individual_accuracies[key] = accuracy
    print(f"{key:20s}: {accuracy:.4f} ({accuracy*100:.2f}%)")

accuracy_values = list(individual_accuracies.values())
mean_accuracy = np.mean(accuracy_values)
print("=" * 60)
print(f"Average individual accuracy: {mean_accuracy:.4f} ({mean_accuracy*100:.2f}%)")
print(f"Std dev: {np.std(accuracy_values):.4f}")
print(f"Test set size: {len(test_idx)} nodes")

# Select best model from each category
print("\n" + "=" * 60)
print("SELECTING BEST MODEL FROM EACH CATEGORY")
print("=" * 60)

# Define categories and their key prefixes
categories = {
    'plain': 'plain_run',
    'linear': 'linear_run',
    'mlp': 'mlp_run',
    'gcn': 'gcn_run',
    'gamlp_cs': 'gamlp_cs_',
    'gamlp': 'gamlp_'
}

best_models = {}
best_model_keys = []

for category, prefix in categories.items():
    # A key belongs to the category with the most specific matching prefix:
    # 'gamlp_cs_...' keys also start with 'gamlp_', so they are excluded from
    # 'gamlp' because the longer 'gamlp_cs_' prefix claims them.
    more_specific = [p for p in categories.values() if p != prefix and p.startswith(prefix)]
    category_models = {
        k: v for k, v in individual_accuracies.items()
        if k.startswith(prefix) and not any(k.startswith(p) for p in more_specific)
    }

    if len(category_models) > 0:
        # Highest accuracy wins; ties keep the first key in insertion order
        best_key = max(category_models, key=category_models.get)
        best_accuracy = category_models[best_key]
        best_models[category] = best_key
        best_model_keys.append(best_key)
        print(f"{category:15s}: {best_key:25s} (accuracy: {best_accuracy:.4f} ({best_accuracy*100:.2f}%))")
    else:
        print(f"{category:15s}: No models found")

print(f"\nSelected {len(best_model_keys)} best models: {best_model_keys}")

# Create filtered predictions dictionary with only best models
best_predictions = {key: all_predictions[key] for key in best_model_keys}
print(f"Created filtered predictions dictionary with {len(best_predictions)} models")


INDIVIDUAL MODEL ACCURACIES (TEST SET ONLY)
plain_run0          : 0.8253 (82.53%)
plain_run1          : 0.8254 (82.54%)
plain_run2          : 0.8248 (82.48%)
plain_run3          : 0.8250 (82.50%)
plain_run4          : 0.8249 (82.49%)
linear_run0         : 0.8296 (82.96%)
linear_run1         : 0.8297 (82.97%)
linear_run2         : 0.8294 (82.94%)
linear_run3         : 0.8297 (82.97%)
linear_run4         : 0.8297 (82.97%)
mlp_run0            : 0.8411 (84.11%)
mlp_run1            : 0.8421 (84.21%)
mlp_run2            : 0.8422 (84.22%)
mlp_run3            : 0.8422 (84.22%)
mlp_run4            : 0.8411 (84.11%)
gcn_run0            : 0.8539 (85.39%)
gamlp_cs_0919abde   : 0.8515 (85.15%)
gamlp_cs_09883da0   : 0.8519 (85.19%)
gamlp_cs_9cd3b467   : 0.8510 (85.10%)
gamlp_cs_c24dfceb   : 0.8514 (85.14%)
gamlp_cs_eddd8eb7   : 0.8508 (85.08%)
gamlp_0919abde      : 0.8505 (85.05%)
gamlp_09883da0      : 0.8508 (85.08%)
gamlp_9cd3b467      : 0.8497 (84.97%)
gamlp_c24dfceb      : 0.8501 (85.01%)
gamlp_

In [5]:
# Placeholder cell: nothing is computed here. The individual-accuracy
# calculation was merged into the cell that also selects the best models
# (referred to as "Cell 3" in the messages), so this cell only prints a pointer.
for note in (
    "Note: Individual accuracies are now calculated in Cell 3 before best model selection.",
    "If you need to recalculate, run Cell 3 again.",
):
    print(note)


Note: Individual accuracies are now calculated in Cell 3 before best model selection.
If you need to recalculate, run Cell 3 again.


In [6]:
# "At least one model correct" accuracy over ALL loaded models - TEST SET ONLY.
# A node counts as a hit when any model's top prediction equals the true label,
# so the figure depends on the true labels to decide which nodes were solved.
print("\n" + "=" * 60)
print("ENSEMBLE ACCURACY (At least one model correct) - TEST SET ONLY")
print("=" * 60)

test_labels = true_labels[test_idx]
num_test_nodes = len(test_idx)
model_keys = sorted(all_predictions.keys())

# Boolean matrix [num_test_nodes, num_models]: True where that model's
# prediction_1 matches the true label of that test node
correct_matrix = np.zeros((num_test_nodes, len(all_predictions)), dtype=bool)
for column, model_name in enumerate(model_keys):
    top_prediction = all_predictions[model_name]['prediction_1'].values
    correct_matrix[:, column] = top_prediction[test_idx] == test_labels

# A test node is solved when any column of its row is True
ensemble_correct = correct_matrix.any(axis=1)
num_solved = ensemble_correct.sum()
ensemble_accuracy = num_solved / num_test_nodes

print(f"Ensemble accuracy: {ensemble_accuracy:.4f} ({ensemble_accuracy*100:.2f}%)")
print(f"Number of test nodes where at least one model is correct: {num_solved} / {num_test_nodes}")
print(f"Number of test nodes where ALL models are wrong: {(~ensemble_correct).sum()} / {num_test_nodes}")



ENSEMBLE ACCURACY (At least one model correct) - TEST SET ONLY
Ensemble accuracy: 0.9167 (91.67%)
Number of test nodes where at least one model is correct: 2028837 / 2213091
Number of test nodes where ALL models are wrong: 184254 / 2213091


In [7]:
# Distribution of "how many models were right" per test node, over all models
print("\n" + "=" * 60)
print("MODEL AGREEMENT ANALYSIS (TEST SET ONLY)")
print("=" * 60)

# Row sums of the boolean matrix = number of correct models for each test node
num_models_correct_per_node = correct_matrix.sum(axis=1)
num_models = len(all_predictions)

hits = num_models_correct_per_node
# (label, mask) pairs in the order they are reported
buckets = [
    ("0 models", hits == 0),
    ("1 model", hits == 1),
    ("2-5 models", (hits >= 2) & (hits <= 5)),
    ("6-10 models", (hits >= 6) & (hits <= 10)),
    ("11-15 models", (hits >= 11) & (hits <= 15)),
]
if num_models > 15:
    # Only meaningful when more than 15 models were loaded
    buckets.append(("16+ models", hits >= 16))
buckets.append((f"ALL {num_models} models", hits == num_models))

for label, mask in buckets:
    print(f"Test nodes where {label} correct: {mask.sum()}")

print(f"\nAverage number of models correct per test node: {hits.mean():.2f}")
print(f"Median number of models correct per test node: {np.median(hits):.0f}")
print(f"Total test nodes analyzed: {len(test_idx)}")



MODEL AGREEMENT ANALYSIS (TEST SET ONLY)
Test nodes where 0 models correct: 184254
Test nodes where 1 model correct: 62714
Test nodes where 2-5 models correct: 26715
Test nodes where 6-10 models correct: 39304
Test nodes where 11-15 models correct: 57258
Test nodes where 16+ models correct: 1842846
Test nodes where ALL 26 models correct: 1665271

Average number of models correct per test node: 21.84
Median number of models correct per test node: 26
Total test nodes analyzed: 2213091


In [8]:
# ============================================================================
# ANALYSIS WITH BEST MODELS ONLY (one model per category that had any models)
# ============================================================================

print("\n\n" + "=" * 80)
print("=" * 80)
# The count is taken from best_predictions rather than hard-coded: a category
# with no loaded models contributes nothing, so it is not always 6.
print(f"ANALYSIS WITH BEST MODELS ONLY ({len(best_predictions)} MODELS)")
print("=" * 80)
print("=" * 80)

# Calculate individual accuracy scores for best models (ONLY ON TEST SET)
print("\n" + "=" * 60)
print("INDIVIDUAL MODEL ACCURACIES - BEST MODELS (TEST SET ONLY)")
print("=" * 60)

best_individual_accuracies = {}

# Get test set labels
test_labels = true_labels[test_idx]

for key, df in best_predictions.items():
    # Top-ranked prediction (prediction_1), restricted to the test nodes
    test_predictions = df['prediction_1'].values[test_idx]
    correct = (test_predictions == test_labels).sum()
    accuracy = correct / len(test_idx)
    best_individual_accuracies[key] = accuracy
    print(f"{key:25s}: {accuracy:.4f} ({accuracy*100:.2f}%)")

best_accuracy_values = list(best_individual_accuracies.values())
best_mean_accuracy = np.mean(best_accuracy_values)
print("=" * 60)
print(f"Average individual accuracy: {best_mean_accuracy:.4f} ({best_mean_accuracy*100:.2f}%)")
print(f"Std dev: {np.std(best_accuracy_values):.4f}")
print(f"Test set size: {len(test_idx)} nodes")




ANALYSIS WITH BEST MODELS ONLY (6 MODELS)

INDIVIDUAL MODEL ACCURACIES - BEST MODELS (TEST SET ONLY)
plain_run1               : 0.8254 (82.54%)
linear_run3              : 0.8297 (82.97%)
mlp_run2                 : 0.8422 (84.22%)
gcn_run0                 : 0.8539 (85.39%)
gamlp_cs_09883da0        : 0.8519 (85.19%)
gamlp_09883da0           : 0.8508 (85.08%)
Average individual accuracy: 0.8423 (84.23%)
Std dev: 0.0111
Test set size: 2213091 nodes


In [9]:
# "At least one model correct" accuracy over the BEST models - TEST SET ONLY.
# A node counts as a hit when any selected model's top prediction equals the
# true label.
print("\n" + "=" * 60)
print("ENSEMBLE ACCURACY - BEST MODELS (At least one model correct) - TEST SET ONLY")
print("=" * 60)

test_labels = true_labels[test_idx]
num_test_nodes = len(test_idx)
best_model_keys_sorted = sorted(best_predictions.keys())

# Boolean matrix [num_test_nodes, num_models]: True where that model's
# prediction_1 matches the true label of that test node
best_correct_matrix = np.zeros((num_test_nodes, len(best_predictions)), dtype=bool)
for column, model_name in enumerate(best_model_keys_sorted):
    top_prediction = best_predictions[model_name]['prediction_1'].values
    best_correct_matrix[:, column] = top_prediction[test_idx] == test_labels

# A test node is solved when any column of its row is True
best_ensemble_correct = best_correct_matrix.any(axis=1)
best_num_solved = best_ensemble_correct.sum()
best_ensemble_accuracy = best_num_solved / num_test_nodes

print(f"Ensemble accuracy: {best_ensemble_accuracy:.4f} ({best_ensemble_accuracy*100:.2f}%)")
print(f"Number of test nodes where at least one model is correct: {best_num_solved} / {num_test_nodes}")
print(f"Number of test nodes where ALL models are wrong: {(~best_ensemble_correct).sum()} / {num_test_nodes}")



ENSEMBLE ACCURACY - BEST MODELS (At least one model correct) - TEST SET ONLY
Ensemble accuracy: 0.9120 (91.20%)
Number of test nodes where at least one model is correct: 2018351 / 2213091
Number of test nodes where ALL models are wrong: 194740 / 2213091


In [10]:
# Additional analysis: How many models agree on each test node? - BEST MODELS
print("\n" + "=" * 60)
print("MODEL AGREEMENT ANALYSIS - BEST MODELS (TEST SET ONLY)")
print("=" * 60)

# Count how many models got each test node correct
best_num_models_correct_per_node = best_correct_matrix.sum(axis=1)
best_num_models = len(best_predictions)

# One histogram pass gives the number of nodes for every possible count.
# minlength guarantees an entry for each value 0..best_num_models even when a
# count never occurs, so the rows below follow the actual number of best
# models instead of assuming there are exactly six.
nodes_per_count = np.bincount(best_num_models_correct_per_node, minlength=best_num_models + 1)

for n_correct in range(best_num_models):
    noun = "model" if n_correct == 1 else "models"
    print(f"Test nodes where {n_correct} {noun} correct: {nodes_per_count[n_correct]}")
print(f"Test nodes where ALL {best_num_models} models correct: {nodes_per_count[best_num_models]}")

print(f"\nAverage number of models correct per test node: {best_num_models_correct_per_node.mean():.2f}")
print(f"Median number of models correct per test node: {np.median(best_num_models_correct_per_node):.0f}")
print(f"Total test nodes analyzed: {len(test_idx)}")



MODEL AGREEMENT ANALYSIS - BEST MODELS (TEST SET ONLY)
Test nodes where 0 models correct: 194740
Test nodes where 1 model correct: 77975
Test nodes where 2 models correct: 39385
Test nodes where 3 models correct: 51817
Test nodes where 4 models correct: 62596
Test nodes where 5 models correct: 97146
Test nodes where ALL 6 models correct: 1689432

Average number of models correct per test node: 5.05
Median number of models correct per test node: 6
Total test nodes analyzed: 2213091


In [11]:
# Recap for the best-model subset: individual accuracy spread versus the
# "at least one model correct" figure
best_acc_values = list(best_individual_accuracies.values())
best_acc_mean = np.mean(best_acc_values)
best_acc_top = max(best_acc_values)

# Absolute gain and relative gain (in percent) against the mean and the best model
best_gain_vs_mean = best_ensemble_accuracy - best_acc_mean
best_rel_vs_mean = (best_ensemble_accuracy / best_acc_mean - 1) * 100
best_gain_vs_top = best_ensemble_accuracy - best_acc_top
best_rel_vs_top = (best_ensemble_accuracy / best_acc_top - 1) * 100

print("\n" + "=" * 60)
print("SUMMARY - BEST MODELS (TEST SET ONLY)")
print("=" * 60)
print(f"Total number of models: {len(best_predictions)}")
print(f"Test set size: {len(test_idx)} nodes")
print(f"Individual model accuracy range: {min(best_acc_values):.4f} - {best_acc_top:.4f}")
print(f"Average individual accuracy: {best_acc_mean:.4f}")
print(f"Ensemble accuracy (at least one correct): {best_ensemble_accuracy:.4f}")
print(f"Improvement over average: {best_gain_vs_mean:.4f} ({best_rel_vs_mean:.2f}%)")
print(f"Improvement over best individual: {best_gain_vs_top:.4f} ({best_rel_vs_top:.2f}%)")



SUMMARY - BEST MODELS (TEST SET ONLY)
Total number of models: 6
Test set size: 2213091 nodes
Individual model accuracy range: 0.8254 - 0.8539
Average individual accuracy: 0.8423
Ensemble accuracy (at least one correct): 0.9120
Improvement over average: 0.0697 (8.27%)
Improvement over best individual: 0.0581 (6.81%)


In [12]:
# Recap over every loaded model: individual accuracy spread versus the
# "at least one model correct" figure
all_acc_values = list(individual_accuracies.values())
all_acc_mean = np.mean(all_acc_values)
all_acc_top = max(all_acc_values)

# Absolute gain and relative gain (in percent) against the mean and the best model
all_gain_vs_mean = ensemble_accuracy - all_acc_mean
all_rel_vs_mean = (ensemble_accuracy / all_acc_mean - 1) * 100
all_gain_vs_top = ensemble_accuracy - all_acc_top
all_rel_vs_top = (ensemble_accuracy / all_acc_top - 1) * 100

print("\n" + "=" * 60)
print("SUMMARY (TEST SET ONLY)")
print("=" * 60)
print(f"Total number of models: {len(all_predictions)}")
print(f"Test set size: {len(test_idx)} nodes")
print(f"Individual model accuracy range: {min(all_acc_values):.4f} - {all_acc_top:.4f}")
print(f"Average individual accuracy: {all_acc_mean:.4f}")
print(f"Ensemble accuracy (at least one correct): {ensemble_accuracy:.4f}")
print(f"Improvement over average: {all_gain_vs_mean:.4f} ({all_rel_vs_mean:.2f}%)")
print(f"Improvement over best individual: {all_gain_vs_top:.4f} ({all_rel_vs_top:.2f}%)")



SUMMARY (TEST SET ONLY)
Total number of models: 26
Test set size: 2213091 nodes
Individual model accuracy range: 0.8248 - 0.8539
Average individual accuracy: 0.8401
Ensemble accuracy (at least one correct): 0.9167
Improvement over average: 0.0766 (9.12%)
Improvement over best individual: 0.0629 (7.36%)
