# Ensemble Comparison: HYDRA + QUANT Stacking

**Objective**: Compare the new clean stacked ensemble (with proper cross-validation) against:
- Individual QUANT and HYDRA algorithms
- Old stacked ensemble (with data leakage)

**Methodology**:
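- Each algorithm is run under multiple hyperparameter configurations (e.g. `fast`, `balanced`)
- Summary statistics (mean, standard deviation, best) of accuracy across configurations
- Accuracy on the test split, plus training and prediction wall-clock time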

**Dataset**: Configurable via `DATASET_NAME` in the configuration cell (default: Pedestrian)

In [1]:
# CONFIGURATION
# Everything a reader is expected to tune lives in this cell.
DATASET_NAME = "Pedestrian"  # Change this to test other datasets
TRAIN_PCT = 10.0             # Percentage of training data (0-100 scale)
TEST_PCT = 50.0              # Percentage of test data (0-100 scale)
RANDOM_SEED = 42             # For reproducibility

# Hyperparameter presets: HYDRA settings (hydra_k, hydra_g), the estimator
# count, and the number of folds used by the new stacked ensemble.
FAST_CONFIG = {"hydra_k": 4, "hydra_g": 16, "n_estimators": 50, "n_folds": 3}
BALANCED_CONFIG = {"hydra_k": 8, "hydra_g": 64, "n_estimators": 100, "n_folds": 5}

# Hyperparameter configurations to test
# Note: Removed "accurate" config as it's too slow for testing
CONFIGS = {
    "fast": FAST_CONFIG,
    "balanced": BALANCED_CONFIG,
}

print(f"Configuration loaded: {DATASET_NAME} dataset")
print(f"Training: {TRAIN_PCT}%, Testing: {TEST_PCT}%")
print(f"Testing {len(CONFIGS)} hyperparameter configurations")

Configuration loaded: Pedestrian dataset
Training: 10.0%, Testing: 50.0%
Testing 2 hyperparameter configurations


In [2]:
# Setup paths BEFORE importing tsckit
import sys
import time
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple

def add_import_paths(paths):
    """Append each path to ``sys.path`` unless it is already present.

    Keeps this cell idempotent: re-running it leaves ``sys.path`` unchanged
    instead of appending the same directories again.

    Args:
        paths: Iterable of directory paths, considered in the order given.

    Returns:
        list: The paths that this call actually appended.
    """
    added = []
    for path in paths:
        if path not in sys.path:
            sys.path.append(path)
            added.append(path)
    return added


# Local source checkouts that must be importable before tsckit is imported.
add_import_paths([
    '/Users/urav/code/research',
    '/Users/urav/code/research/quant/code',
    '/Users/urav/code/research/hydra/code',
    '/Users/urav/code/research/aaltd2024/code',
])

# Seed NumPy's global random state for reproducibility
np.random.seed(seed=RANDOM_SEED)

print("✓ Libraries and paths configured")

✓ Libraries and paths configured


In [3]:
# Import algorithms
from tsckit import (
    MonsterDataset,
    QuantAALTD2024,
    HydraAALTD2024,
    HydraQuantStackedAALTD2024,  # Old ensemble (data leakage)
    HydraQuantStacked            # New clean ensemble
)

print("✓ Algorithm imports successful")

  from .autonotebook import tqdm as notebook_tqdm


✓ Algorithm imports successful


In [4]:
# Load dataset
print(f"Loading {DATASET_NAME} dataset...")
dataset = MonsterDataset(
    DATASET_NAME,
    fold=0,
    train_pct=TRAIN_PCT,
    test_pct=TEST_PCT,
)
print(dataset.info())

# Keep only the second item of the test arrays: it is the ground truth
# that predictions are compared against when computing accuracy.
_, y_test = dataset.get_arrays("test")
print(f"Test samples: {len(y_test)}, Classes: {len(np.unique(y_test))}")

Loading Pedestrian dataset...
Pedestrian (fold 0):
  Shape: 1 channels x 24 time points
  Classes: 82
  Total samples: 189621
  Train samples: 15169 (10.0%)
  Test samples: 18962 (50.0%)
Test samples: 18962, Classes: 82


In [5]:
# Helper function to run and time algorithms
def evaluate_algorithm(algorithm, name: str, config_name: str,
                       data=None, y_true=None) -> Dict:
    """Fit and score one algorithm, returning a flat record of its metrics.

    Args:
        algorithm: Object exposing ``fit(data)`` and ``predict(data)``.
        name: Label stored under ``"algorithm"`` and shown in progress output.
        config_name: Label stored under ``"config"``.
        data: Object passed to ``fit`` and ``predict``. Defaults to the
            notebook-level ``dataset``.
        y_true: Ground-truth labels the predictions are compared against.
            Defaults to the notebook-level ``y_test``.

    Returns:
        Dict with keys ``algorithm``, ``config``, ``accuracy``, ``train_time``,
        ``test_time``, ``total_time`` and ``status``. On any exception the
        status is ``"failed"``, the numeric fields are ``0.0`` and an
        ``error`` key holds the exception message; nothing is re-raised.
    """
    print(f"\n🔄 Testing {name} ({config_name})...")

    try:
        # Resolve defaults inside the try block so a missing notebook global
        # is reported as a failed run, like any other error.
        if data is None:
            data = dataset
        if y_true is None:
            y_true = y_test

        # Training. perf_counter is monotonic, so the measured interval is not
        # affected by system clock adjustments the way time.time() is.
        train_start = time.perf_counter()
        algorithm.fit(data)
        train_time = time.perf_counter() - train_start

        # Testing
        test_start = time.perf_counter()
        predictions = algorithm.predict(data)
        test_time = time.perf_counter() - test_start

        # Accuracy. Compare flattened arrays of equal size: a bare `==` on
        # mismatched shapes can broadcast or collapse to a single bool and
        # silently yield a meaningless score.
        predicted = np.asarray(predictions).ravel()
        expected = np.asarray(y_true).ravel()
        if predicted.size != expected.size:
            raise ValueError(
                f"got {predicted.size} predictions for {expected.size} labels"
            )
        accuracy = float(np.mean(predicted == expected))

        print(f"   ✅ {name}: {accuracy:.4f} accuracy, {train_time:.2f}s train, {test_time:.2f}s test")

        return {
            "algorithm": name,
            "config": config_name,
            "accuracy": accuracy,
            "train_time": train_time,
            "test_time": test_time,
            "total_time": train_time + test_time,
            "status": "success"
        }

    except Exception as e:
        # Deliberate best-effort: one failing algorithm must not abort the
        # whole comparison, so the failure is recorded and returned.
        print(f"   ❌ {name} FAILED: {str(e)}")
        return {
            "algorithm": name,
            "config": config_name,
            "accuracy": 0.0,
            "train_time": 0.0,
            "test_time": 0.0,
            "total_time": 0.0,
            "status": "failed",
            "error": str(e)
        }

print("✓ Evaluation function defined")

✓ Evaluation function defined


In [6]:
# Run comprehensive evaluation
# Every configuration is evaluated with four models, in a fixed order:
# QUANT alone, HYDRA alone, the old stacked ensemble, the new stacked ensemble.
results = []

print("🚀 Starting comprehensive evaluation...\n")
print("="*80)

for config_name, config in CONFIGS.items():
    # Banner describing the hyperparameters used for this pass
    print(f"\n📋 Configuration: {config_name.upper()}")
    print(f"   HYDRA: k={config['hydra_k']}, g={config['hydra_g']}")
    print(f"   QUANT: estimators={config['n_estimators']}")
    print(f"   Ensemble: folds={config['n_folds']}")
    print("-" * 60)

    # HYDRA keyword arguments shared by both stacked ensembles
    stacked_hydra_kwargs = {
        "hydra_k": config['hydra_k'],
        "hydra_g": config['hydra_g'],
        "hydra_seed": RANDOM_SEED,
    }

    # Individual algorithms
    quant = QuantAALTD2024(num_estimators=config['n_estimators'])
    results.append(evaluate_algorithm(quant, "QuantAALTD2024", config_name))

    hydra = HydraAALTD2024(k=config['hydra_k'], g=config['hydra_g'], seed=RANDOM_SEED)
    results.append(evaluate_algorithm(hydra, "HydraAALTD2024", config_name))

    # Old ensemble (data leakage)
    old_ensemble = HydraQuantStackedAALTD2024(
        quant_estimators=config['n_estimators'],
        **stacked_hydra_kwargs,
    )
    results.append(evaluate_algorithm(old_ensemble, "OldEnsemble", config_name))

    # New clean ensemble
    new_ensemble = HydraQuantStacked(
        n_folds=config['n_folds'],
        n_estimators=config['n_estimators'],
        **stacked_hydra_kwargs,
    )
    results.append(evaluate_algorithm(new_ensemble, "NewEnsemble", config_name))

print("\n🎯 Evaluation complete!")

🚀 Starting comprehensive evaluation...


📋 Configuration: FAST
   HYDRA: k=4, g=16
   QUANT: estimators=50
   Ensemble: folds=3
------------------------------------------------------------

🔄 Testing QuantAALTD2024 (fast)...
   ✅ QuantAALTD2024: 0.6845 accuracy, 0.78s train, 0.21s test

🔄 Testing HydraAALTD2024 (fast)...
   ✅ HydraAALTD2024: 0.4501 accuracy, 0.85s train, 1.17s test

🔄 Testing OldEnsemble (fast)...
   ✅ OldEnsemble: 0.6808 accuracy, 2.30s train, 0.94s test

🔄 Testing NewEnsemble (fast)...
Training ensemble: 15169 samples, 82 classes, 24 length
Processing fold 1/3
Fold 1: train=10112, val=5057, logits_shape=(5057, 82)
Processing fold 2/3
Fold 2: train=10113, val=5056, logits_shape=(5056, 82)
Processing fold 3/3
Fold 3: train=10113, val=5056, logits_shape=(5056, 82)
Training final HYDRA model on full dataset
Extracting QUANT features
Features: QUANT=(15169, 212), OOF_logits=(15169, 82)
Training meta-classifier: features=(15169, 294), labels=(15169,)
   ✅ NewEnsemble: 0.68

In [7]:
# Convert results to DataFrame for analysis
df = pd.DataFrame(results)

# An empty `results` list produces a frame without columns; guard the lookup
# so the summary reports "no successful runs" instead of raising KeyError.
if 'status' in df.columns:
    successful_df = df[df['status'] == 'success'].copy()
else:
    successful_df = df.copy()

print("📊 RESULTS SUMMARY")
print("=" * 80)

if len(successful_df) == 0:
    print("❌ No successful runs to analyze!")
else:
    # sort=False keeps algorithms in the order they first appear in the
    # results, which is the order they were evaluated in.
    runs_by_algorithm = successful_df.groupby('algorithm', sort=False)

    # Summary by algorithm
    print("\n🏆 PERFORMANCE BY ALGORITHM:")
    for algo, algo_data in runs_by_algorithm:
        acc_mean = algo_data['accuracy'].mean()
        # Sample standard deviation: NaN when an algorithm has only one
        # successful run.
        acc_std = algo_data['accuracy'].std()
        acc_max = algo_data['accuracy'].max()
        time_mean = algo_data['total_time'].mean()

        print(f"{algo:20s}: {acc_mean:.4f} ± {acc_std:.4f} (max: {acc_max:.4f}) | {time_mean:.1f}s avg")

    # Best configuration per algorithm (first row wins on ties)
    print("\n🎯 BEST CONFIGURATION PER ALGORITHM:")
    for algo, algo_data in runs_by_algorithm:
        best_row = algo_data.loc[algo_data['accuracy'].idxmax()]
        print(f"{algo:20s}: {best_row['config']} config → {best_row['accuracy']:.4f}")

    # Ensemble comparison: best accuracy per algorithm, in a fixed display order
    print("\n🔥 ENSEMBLE COMPARISON:")
    best_accuracy_by_algo = successful_df.groupby('algorithm')['accuracy'].max()
    ensemble_algos = ['QuantAALTD2024', 'HydraAALTD2024', 'OldEnsemble', 'NewEnsemble']
    for algo in ensemble_algos:
        if algo in best_accuracy_by_algo.index:
            best_acc = best_accuracy_by_algo[algo]
            print(f"{algo:20s}: {best_acc:.4f}")

    # Overall winner
    best_overall = successful_df.loc[successful_df['accuracy'].idxmax()]
    print(f"\n🏆 OVERALL WINNER: {best_overall['algorithm']} ({best_overall['config']}) → {best_overall['accuracy']:.4f}")

# Show detailed results table (includes failed runs)
print("\n📋 DETAILED RESULTS:")
print("-" * 80)
detail_columns = ['algorithm', 'config', 'accuracy', 'train_time', 'test_time', 'status']
if df.empty:
    # Nothing was evaluated; show an empty table with the expected columns.
    display_df = pd.DataFrame(columns=detail_columns)
else:
    display_df = df[detail_columns].copy()
print(display_df.to_string(index=False))

📊 RESULTS SUMMARY

🏆 PERFORMANCE BY ALGORITHM:
QuantAALTD2024      : 0.6902 ± 0.0081 (max: 0.6960) | 1.3s avg
HydraAALTD2024      : 0.5103 ± 0.0852 (max: 0.5706) | 103.9s avg
OldEnsemble         : 0.6907 ± 0.0140 (max: 0.7006) | 9.2s avg
NewEnsemble         : 0.6893 ± 0.0075 (max: 0.6947) | 300.5s avg

🎯 BEST CONFIGURATION PER ALGORITHM:
QuantAALTD2024      : balanced config → 0.6960
HydraAALTD2024      : balanced config → 0.5706
OldEnsemble         : balanced config → 0.7006
NewEnsemble         : balanced config → 0.6947

🔥 ENSEMBLE COMPARISON:
QuantAALTD2024      : 0.6960
HydraAALTD2024      : 0.5706
OldEnsemble         : 0.7006
NewEnsemble         : 0.6947

🏆 OVERALL WINNER: OldEnsemble (balanced) → 0.7006

📋 DETAILED RESULTS:
--------------------------------------------------------------------------------
     algorithm   config  accuracy  train_time  test_time  status
QuantAALTD2024     fast  0.684474    0.780864   0.212464 success
HydraAALTD2024     fast  0.450058    0.848194   1

In [8]:
# Analysis and insights
print("\n🔍 ANALYSIS & INSIGHTS")
print("=" * 80)

if len(successful_df) > 0:
    # Ensemble effectiveness
    # Best accuracy per algorithm across configurations; an algorithm with no
    # successful run is scored as 0.
    best_accuracy = successful_df.groupby('algorithm')['accuracy'].max()
    quant_best = best_accuracy.get('QuantAALTD2024', 0)
    hydra_best = best_accuracy.get('HydraAALTD2024', 0)
    old_best = best_accuracy.get('OldEnsemble', 0)
    new_best = best_accuracy.get('NewEnsemble', 0)

    # Baseline for the ensembles: the stronger of the two standalone models
    individual_best = max(quant_best, hydra_best)

    print(f"\n📈 Ensemble Effectiveness:")
    print(f"   Best individual algorithm: {individual_best:.4f}")
    print(f"   Old ensemble (data leak):  {old_best:.4f} ({old_best-individual_best:+.4f})")
    print(f"   New ensemble (clean CV):   {new_best:.4f} ({new_best-individual_best:+.4f})")

    # Head-to-head verdict between the two ensembles
    if new_best > old_best:
        print(f"   ✅ Clean ensemble outperforms old ensemble by {new_best-old_best:.4f}")
    elif old_best > new_best:
        print(f"   ⚠️  Old ensemble still ahead by {old_best-new_best:.4f} (but has data leakage)")
    else:
        print(f"   🔄 Both ensembles perform similarly")

    # Runtime analysis: mean and spread of total (train + test) time
    print(f"\n⏱️  Runtime Analysis:")
    runtime_summary = (
        successful_df
        .groupby('algorithm')['total_time']
        .agg(['mean', 'std'])
        .round(2)
    )
    for algo, row in runtime_summary.iterrows():
        print(f"   {algo:20s}: {row['mean']:.1f} ± {row['std']:.1f} seconds")

    # Configuration insights: accuracy pooled over all algorithms per config
    print(f"\n⚙️  Configuration Insights:")
    config_perf = (
        successful_df
        .groupby('config')['accuracy']
        .agg(['mean', 'std'])
        .round(4)
    )
    for config, row in config_perf.iterrows():
        print(f"   {config:10s}: {row['mean']:.4f} ± {row['std']:.4f} average accuracy")

print("\n✅ Analysis complete!")


🔍 ANALYSIS & INSIGHTS

📈 Ensemble Effectiveness:
   Best individual algorithm: 0.6960
   Old ensemble (data leak):  0.7006 (+0.0046)
   New ensemble (clean CV):   0.6947 (-0.0013)
   ⚠️  Old ensemble still ahead by 0.0060 (but has data leakage)

⏱️  Runtime Analysis:
   HydraAALTD2024      : 103.9 ± 144.1 seconds
   NewEnsemble         : 300.5 ± 418.6 seconds
   OldEnsemble         : 9.2 ± 8.5 seconds
   QuantAALTD2024      : 1.3 ± 0.4 seconds

⚙️  Configuration Insights:
   balanced  : 0.6655 ± 0.0633 average accuracy
   fast      : 0.6248 ± 0.1165 average accuracy

✅ Analysis complete!


In [9]:
# Export results for further analysis
import os
from datetime import datetime

# Create results directory if it doesn't exist
results_dir = "/Users/urav/code/research/experiments/results"
os.makedirs(results_dir, exist_ok=True)

# Both output files share one timestamp so they can be matched up later
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_file = f"{results_dir}/ensemble_comparison_{DATASET_NAME}_{timestamp}.csv"
config_file = f"{results_dir}/config_{DATASET_NAME}_{timestamp}.txt"

# Save results with timestamp
df.to_csv(results_file, index=False)
print(f"📁 Results saved to: {results_file}")

# Also save configuration for reproducibility
config_lines = [
    f"Dataset: {DATASET_NAME}\n",
    f"Train PCT: {TRAIN_PCT}\n",
    f"Test PCT: {TEST_PCT}\n",
    f"Random Seed: {RANDOM_SEED}\n",
    f"Configurations: {CONFIGS}\n",
]
with open(config_file, 'w') as f:
    f.writelines(config_lines)

print(f"⚙️  Configuration saved to: {config_file}")
print("\n🎉 Ensemble comparison complete!")

📁 Results saved to: /Users/urav/code/research/experiments/results/ensemble_comparison_Pedestrian_20250922_125426.csv
⚙️  Configuration saved to: /Users/urav/code/research/experiments/results/config_Pedestrian_20250922_125426.txt

🎉 Ensemble comparison complete!
