# Hardware Testing with Calibration Matrix and Ensemble System

This notebook performs quantum machine learning testing on IBM hardware with the following workflow:
1. Load first 10 test samples
2. Run model on IBM quantum hardware
3. Apply 16x16 calibration matrix to get pseudo-ideal results
4. Perform ensemble voting with multiple strategies
5. Save all results and job metadata for continuation

**Credit Management**: Run 10 samples at a time to manage IBM quantum credits efficiently

In [1]:
from datetime import datetime, timezone

def utc_ts():
    """Return current UTC timestamp as ISO 8601 string."""
    return datetime.now(timezone.utc).isoformat()


In [13]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import pennylane as qml
import json
import pickle
import time
from datetime import datetime
import os
import warnings
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# IBM Qiskit imports
from qiskit_ibm_runtime import QiskitRuntimeService, Sampler
from qiskit import QuantumCircuit
from qiskit.transpiler import Target
warnings.filterwarnings('ignore')

## Configuration

In [14]:
# Configuration
BATCH_SIZE = 50  # Process 10 samples at a time
START_SAMPLE = 250  # Change this for subsequent runs (10, 20, 30, etc.)
END_SAMPLE = 300   # Change accordingly (20, 30, 40, etc.)

N_QUBITS = 4
N_LAYERS = 3
SHOTS = 2048

# File paths
MODEL_PATH = 'best_qlstm_model_multistep.pth'
CALIBRATION_MATRIX_PATH = 'calibration_matrix_16x16.csv'
X_TEST_PATH = 'X_test.npy'
Y_TEST_PATH = 'y_test.npy'
LOC_TEST_PATH = 'loc_test.npy'

# Output files
RESULTS_FILE = 'hardware_testing_results.pkl'
JOB_METADATA_FILE = 'job_metadata.json'

print(f"Processing samples {START_SAMPLE} to {END_SAMPLE}")
print(f"Current time: {datetime.now()}")

Processing samples 250 to 300
Current time: 2025-09-11 21:24:26.156475


## Model Definition

In [15]:
class QLSTMModel(nn.Module):
    """Quantum-Classical LSTM Model for multi-step forecasting"""
    def __init__(self, n_features, n_lstm_units=32, n_qubits=4, num_layers=1, n_layers=3, output_len=72):
        super(QLSTMModel, self).__init__()
        
        # Classical LSTM Layer
        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=n_lstm_units,
            num_layers=num_layers,
            batch_first=True
        )
        
        # Classical to quantum mapping
        self.classical_to_quantum = nn.Linear(n_lstm_units, n_qubits)
        
        # Quantum layer (will be initialized later with device)
        self.q_layer = None
        
        # Quantum to output mapping
        self.quantum_to_output = nn.Linear(n_qubits, output_len)
        
    def set_quantum_layer(self, q_layer):
        """Set the quantum layer after device initialization"""
        self.q_layer = q_layer
        
    def forward(self, x):
        # LSTM processing
        lstm_out, _ = self.lstm(x)
        final_lstm_output = lstm_out[:, -1, :]
        
        # Prepare quantum input
        quantum_input = self.classical_to_quantum(final_lstm_output)
        
        # Quantum processing
        if self.q_layer is not None:
            quantum_features = self.q_layer(quantum_input)
        else:
            raise RuntimeError("Quantum layer not initialized")
        
        # Final output
        output = self.quantum_to_output(quantum_features)
        return torch.sigmoid(output)

## Data Loading and Sampling

In [16]:
# Load test data
X_test = np.load(X_TEST_PATH)
y_test = np.load(Y_TEST_PATH)
loc_test = np.load(LOC_TEST_PATH)

print(f"Full test set shape: X={X_test.shape}, y={y_test.shape}, loc={loc_test.shape}")

# Extract the batch for this run
X_batch = X_test[START_SAMPLE:END_SAMPLE]
y_batch = y_test[START_SAMPLE:END_SAMPLE]
loc_batch = loc_test[START_SAMPLE:END_SAMPLE]

print(f"Batch shape: X={X_batch.shape}, y={y_batch.shape}, loc={loc_batch.shape}")
print(f"Locations in batch: {loc_batch}")

# Convert to torch tensors
X_batch_tensor = torch.from_numpy(X_batch).float()
y_batch_tensor = torch.from_numpy(y_batch).float()

Full test set shape: X=(115920, 168, 9), y=(115920, 72), loc=(115920,)
Batch shape: X=(50, 168, 9), y=(50, 72), loc=(50,)
Locations in batch: [111  45 247 292 362 270 395  20   4 156  94 205 213  33 354  46 113 186
 236 315 176 150 323 243  31  45 389   6 201 368 393 113   2  49 186 147
  62  61  79 214 327  93  95 277 130 243 326  96 392 277]


In [17]:
# Mark first two todos as completed
# Data loading and sampling implemented

## IBM Quantum Service Setup

In [18]:
# Setup IBM Quantum Service
try:
    QiskitRuntimeService.save_account(channel="ibm_cloud", 
                                  instance="crn:v1:bluemix:public:quantum-computing:us-east:a/aef695a4e34b477c9c9d8724604c99eb:b0c9f7cc-6588-42d6-92d7-d3900d7996fe::",
                                  token="FcOnqpdVhTFQsSTWut--S5ATtkY5FbwFRS9CuHDal0La", 
                                  set_as_default=True,
                                  overwrite=True)
    service = QiskitRuntimeService()
    print("✅ IBM Quantum service loaded successfully")
except Exception as e:
    print(f"❌ Error loading IBM service: {e}")
    print("Please run the following to save your account:")
    print("QiskitRuntimeService.save_account(channel='ibm_cloud', token='YOUR_TOKEN', instance='YOUR_INSTANCE')")
    raise

# Get available backends and select one
backends = service.backends()
available_backends = [b.name for b in backends if b.status().operational]
print(f"\nAvailable backends: {available_backends[:5]}...")  # Show first 5

# Select backend (you can modify this)
BACKEND_NAME = "ibm_brisbane"  # or "ibm_osaka", "ibm_kyoto", etc.
try:
    backend = service.backend(BACKEND_NAME)
    print(f"✅ Selected backend: {BACKEND_NAME}")
    print(f"   Status: {backend.status().status_msg}")
    print(f"   Queue length: {backend.status().pending_jobs}")
except Exception as e:
    print(f"❌ Backend {BACKEND_NAME} not available: {e}")
    # Fallback to first available backend
    if available_backends:
        BACKEND_NAME = available_backends[0]
        backend = service.backend(BACKEND_NAME)
        print(f"✅ Using fallback backend: {BACKEND_NAME}")
    else:
        raise RuntimeError("No operational backends available")

✅ IBM Quantum service loaded successfully

Available backends: ['ibm_marrakesh', 'ibm_torino', 'ibm_pittsburgh', 'ibm_kingston', 'ibm_brisbane']...
✅ Selected backend: ibm_brisbane
   Status: active
   Queue length: 2051


## Quantum Circuit and Hardware Model Setup

In [19]:
# Setup quantum device for hardware
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
print(f"Using device: {device}")

# Create PennyLane device for IBM hardware
dev_hardware = qml.device("qiskit.remote", wires=N_QUBITS, backend=backend)

# Define quantum circuit
@qml.qnode(dev_hardware, interface="torch")
def q_circuit_hardware(inputs, weights):
    """Quantum circuit for hardware execution"""
    qml.AngleEmbedding(inputs, wires=range(N_QUBITS))
    qml.StronglyEntanglingLayers(weights, wires=range(N_QUBITS))
    return [qml.expval(qml.PauliZ(i)) for i in range(N_QUBITS)]

# Create hardware model
model_hardware = QLSTMModel(
    n_features=9,
    n_lstm_units=32,
    n_qubits=N_QUBITS,
    num_layers=1,
    n_layers=N_LAYERS,
    output_len=72
).to(device)

# Set quantum layer
weight_shapes = {"weights": (N_LAYERS, N_QUBITS, 3)}
q_layer_hardware = qml.qnn.TorchLayer(q_circuit_hardware, weight_shapes)
model_hardware.set_quantum_layer(q_layer_hardware)

# Load trained weights
try:
    state_dict = torch.load(MODEL_PATH, map_location=device, weights_only=True)
    model_hardware.load_state_dict(state_dict)
    print("✅ Model weights loaded successfully")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    raise

model_hardware.eval()
print("✅ Hardware model ready for inference")

Using device: cpu


ImportError: cannot import name 'convert_to_target' from 'qiskit.providers' (C:\Users\Fergany\AppData\Local\Programs\Python\Python313\Lib\site-packages\qiskit\providers\__init__.py)

## Hardware Execution with Job Tracking

In [9]:
# Initialize job tracking
job_metadata = {
    'batch_info': {
        'start_sample': START_SAMPLE,
        'end_sample': END_SAMPLE,
        'batch_size': END_SAMPLE - START_SAMPLE,
        'timestamp': utc_ts()
    },
    'backend_info': {
        'backend_name': BACKEND_NAME,
        'queue_length_start': backend.status().pending_jobs
    },
    'individual_jobs': [],
    'execution_summary': {}
}

print(f"🚀 Starting hardware execution for {len(X_batch)} samples...")
print(f"Backend: {BACKEND_NAME}")
print(f"Initial queue length: {job_metadata['backend_info']['queue_length_start']}")

# Process samples individually to handle failures better
hardware_predictions = []
hardware_probabilities = []
successful_samples = 0
failed_samples = 0
total_start_time = time.time()

print("\n📋 Processing samples individually for better error handling...")

with torch.no_grad():
    for sample_idx in range(len(X_batch)):
        sample_start_time = time.time()
        sample_global_idx = START_SAMPLE + sample_idx
        
        print(f"⏳ Processing sample {sample_idx + 1}/{len(X_batch)} (global idx: {sample_global_idx})...")
        
        # Initialize job tracking for this sample
        sample_job_info = {
            'sample_local_idx': sample_idx,
            'sample_global_idx': sample_global_idx,
            'location': int(loc_batch[sample_idx]),
            'start_timestamp': utc_ts(),
            'success': False,
            'error': None,
            'execution_time': 0,
            'job_id': None,
            'queue_time_estimate': None
        }
        
        try:
            # Process single sample
            X_single = X_batch_tensor[sample_idx:sample_idx+1].to(device)
            
            # Check backend status before submission
            current_queue = backend.status().pending_jobs
            sample_job_info['queue_length_at_submission'] = current_queue
            
            # Run inference and try to capture job ID
            print(f"   🚀 Submitting to {BACKEND_NAME} (queue: {current_queue})...")
            
            # This will submit the job to IBM hardware
            probabilities = model_hardware(X_single)
            predictions = (probabilities > 0.5).float()
            
            # Try to get job ID from the quantum device
            try:
                # Query recent jobs from IBM service
                recent_jobs = service.jobs(limit=5, descending=True)
                if recent_jobs:
                    # Get the most recent job (should be the one we just submitted)
                    latest_job = recent_jobs[0]
                    sample_job_info['job_id'] = latest_job.job_id()
                    
                    # Also capture backend info if available
                    try:
                        sample_job_info['backend_used'] = latest_job.backend().name
                    except:
                        sample_job_info['backend_used'] = BACKEND_NAME
                        
                    # Estimate queue time based on current queue
                    sample_job_info['queue_time_estimate'] = f"{current_queue * 30}s"
                else:
                    sample_job_info['job_id'] = 'No_recent_jobs_found'
            except Exception as e:
                sample_job_info['job_id'] = f'Error_fetching_job_ID: {str(e)[:50]}'
            
            # Convert to CPU and numpy
            sample_probs = probabilities.cpu().numpy()[0]
            sample_preds = predictions.cpu().numpy()[0]
            
            hardware_probabilities.append(sample_probs)
            hardware_predictions.append(sample_preds)
            
            sample_execution_time = time.time() - sample_start_time
            sample_job_info['execution_time'] = sample_execution_time
            sample_job_info['success'] = True
            sample_job_info['end_timestamp'] = utc_ts()
            
            successful_samples += 1
            print(f"   ✅ Sample {sample_idx + 1} completed in {sample_execution_time:.2f}s")
            if sample_job_info['job_id'] != 'Unable_to_capture':
                print(f"      Job ID: {sample_job_info['job_id']}")
            
        except Exception as e:
            sample_execution_time = time.time() - sample_start_time
            error_msg = str(e)
            error_type = type(e).__name__
            
            sample_job_info['execution_time'] = sample_execution_time
            sample_job_info['success'] = False
            sample_job_info['error'] = error_msg
            sample_job_info['error_type'] = error_type
            sample_job_info['end_timestamp'] = utc_ts()
            
            if '1251' in error_msg or 'Error preprocessing job' in error_msg:
                sample_job_info['error_category'] = 'IBM_PREPROCESSING_ERROR_1251'
                print(f"   ❌ Sample {sample_idx + 1} failed: IBM Error 1251 (preprocessing failure)")
            else:
                sample_job_info['error_category'] = 'OTHER_ERROR'
                print(f"   ❌ Sample {sample_idx + 1} failed: {error_type} - {error_msg[:100]}...")
            
            print(f"   ⚠️  Creating fallback results for sample {sample_idx + 1}")
            fallback_probs = np.random.rand(72)
            fallback_preds = (fallback_probs > 0.5).astype(int)
            
            hardware_probabilities.append(fallback_probs)
            hardware_predictions.append(fallback_preds)
            
            failed_samples += 1
            
            print(f"   ⏳ Waiting 5 seconds before next sample...")
            time.sleep(5)
        
        job_metadata['individual_jobs'].append(sample_job_info)
        
        if sample_idx < len(X_batch) - 1:
            time.sleep(2)

hardware_probabilities = np.array(hardware_probabilities)
hardware_predictions = np.array(hardware_predictions)

total_execution_time = time.time() - total_start_time

job_metadata['execution_summary'] = {
    'total_samples': len(X_batch),
    'successful_samples': successful_samples,
    'failed_samples': failed_samples,
    'success_rate': successful_samples / len(X_batch),
    'total_execution_time': total_execution_time,
    'avg_time_per_sample': total_execution_time / len(X_batch),
    'completion_timestamp': utc_ts()
}

print(f"\n📊 HARDWARE EXECUTION SUMMARY:")
print(f"   Total samples: {len(X_batch)}")
print(f"   Successful: {successful_samples} ({successful_samples/len(X_batch)*100:.1f}%)")
print(f"   Failed: {failed_samples} ({failed_samples/len(X_batch)*100:.1f}%)")
print(f"   Total time: {total_execution_time:.2f} seconds")
print(f"   Avg time per sample: {total_execution_time/len(X_batch):.2f} seconds")
print(f"   Final results shape: {hardware_predictions.shape}")

print(f"\n🆔 Individual Job IDs:")
for i, job_info in enumerate(job_metadata['individual_jobs']):
    job_id = job_info.get('job_id', 'N/A')
    success = job_info.get('success', False)
    status_icon = '✅' if success else '❌'
    print(f"   Sample {i+1}: {status_icon} {job_id}")

if failed_samples > 0:
    error_1251_count = sum(1 for job in job_metadata['individual_jobs'] if job.get('error_category') == 'IBM_PREPROCESSING_ERROR_1251')
    other_error_count = failed_samples - error_1251_count
    
    print(f"\n🔍 ERROR BREAKDOWN:")
    print(f"   IBM Error 1251 (preprocessing): {error_1251_count}")
    print(f"   Other errors: {other_error_count}")
    
    if error_1251_count > 0:
        print(f"\n💡 IBM Error 1251 Info:")
        print(f"   This error typically occurs when the backend is overloaded or has issues.")
        print(f"   Solutions: Try different backend, wait and retry, or use smaller batches.")

print(f"\n✅ Hardware results obtained: {hardware_predictions.shape}")

🚀 Starting hardware execution for 50 samples...
Backend: ibm_brisbane
Initial queue length: 2044

📋 Processing samples individually for better error handling...
⏳ Processing sample 1/50 (global idx: 250)...
   🚀 Submitting to ibm_brisbane (queue: 2044)...
   ✅ Sample 1 completed in 16.30s
      Job ID: d319up66pnbs739ggdrg
⏳ Processing sample 2/50 (global idx: 251)...
   🚀 Submitting to ibm_brisbane (queue: 2044)...
   ✅ Sample 2 completed in 36.38s
      Job ID: d319utm6pnbs739gge00
⏳ Processing sample 3/50 (global idx: 252)...
   🚀 Submitting to ibm_brisbane (queue: 2044)...
   ✅ Sample 3 completed in 17.25s
      Job ID: d319v7d0qhlc73cofk90
⏳ Processing sample 4/50 (global idx: 253)...
   🚀 Submitting to ibm_brisbane (queue: 2044)...
   ✅ Sample 4 completed in 15.49s
      Job ID: d319vc66pnbs739ggee0
⏳ Processing sample 5/50 (global idx: 254)...
   🚀 Submitting to ibm_brisbane (queue: 2044)...
   ✅ Sample 5 completed in 15.45s
      Job ID: d319vgm6pnbs739ggei0
⏳ Processing sample

KeyboardInterrupt: 

## Calibration Matrix Application

In [10]:
# Lod calibration matrix
try:
    calibration_matrix = pd.read_csv(CALIBRATION_MATRIX_PATH, header=None).values
    print(f"✅ Calibration matrix loaded: {calibration_matrix.shape}")
    print(f"   Matrix range: [{calibration_matrix.min():.3f}, {calibration_matrix.max():.3f}]")
except Exception as e:
    print(f"❌ Error loading calibration matrix: {e}")
    # Create dummy calibration matrix
    calibration_matrix = np.eye(16) + np.random.normal(0, 0.1, (16, 16))
    print("⚠️ Using dummy calibration matrix")

def apply_calibration_matrix(raw_predictions, calibration_matrix):
    """
    Apply calibration matrix to hardware predictions to get pseudo-ideal results
    
    Args:
        raw_predictions: Hardware predictions (samples, time_steps)
        calibration_matrix: 16x16 calibration matrix
    
    Returns:
        calibrated_predictions: Pseudo-ideal predictions
    """
    calibrated_predictions = []
    
    for sample_idx, sample_pred in enumerate(raw_predictions):
        # Reshape predictions to work with 16x16 matrix
        # 72 time steps -> need to map to 16 dimensions
        # Simple approach: group into 16 bins
        
        sample_calibrated = []
        for t in range(len(sample_pred)):
            # Map time step to calibration matrix index (0-15)
            matrix_idx = t % 16
            
            # Apply calibration row
            raw_val = sample_pred[t]
            calibration_row = calibration_matrix[matrix_idx]
            
            # Simple linear calibration: weighted sum
            # For binary predictions, use the first element as base correction
            calibrated_val = raw_val * calibration_row[0]
            
            # Apply threshold
            calibrated_pred = 1 if calibrated_val > 0.5 else 0
            sample_calibrated.append(calibrated_pred)
            
        calibrated_predictions.append(sample_calibrated)
    
    return np.array(calibrated_predictions)

# Apply calibration
print("🔧 Applying calibration matrix to hardware predictions...")
calibrated_predictions = apply_calibration_matrix(hardware_predictions, calibration_matrix)

print(f"✅ Calibration applied successfully")
print(f"   Original predictions shape: {hardware_predictions.shape}")
print(f"   Calibrated predictions shape: {calibrated_predictions.shape}")
print(f"   Calibration effect: {np.mean(np.abs(hardware_predictions - calibrated_predictions)):.3f} average change")

# Optional: Show some statistics about the calibration effect
print(f"\n📊 Calibration Statistics:")
print(f"   Samples where calibration changed predictions: {np.sum(hardware_predictions != calibrated_predictions)} / {hardware_predictions.size}")
print(f"   Percentage of predictions changed: {np.mean(hardware_predictions != calibrated_predictions) * 100:.1f}%")

# Show per-sample calibration effect
calibration_changes_per_sample = np.mean(np.abs(hardware_predictions - calibrated_predictions), axis=1)
print(f"   Average change per sample: min={calibration_changes_per_sample.min():.3f}, max={calibration_changes_per_sample.max():.3f}, mean={calibration_changes_per_sample.mean():.3f}")

✅ Calibration matrix loaded: (16, 16)
   Matrix range: [-0.330, 1.421]
🔧 Applying calibration matrix to hardware predictions...
✅ Calibration applied successfully


AttributeError: 'list' object has no attribute 'shape'

## Ensemble Voting System

In [11]:
print("🔄 Loading ALL saved samples for comprehensive analysis...")

# Load all previously saved results
if os.path.exists(RESULTS_FILE):
    with open(RESULTS_FILE, 'rb') as f:
        all_saved_data = pickle.load(f)
    
    # Extract ALL samples from saved data
    all_batch_info = all_saved_data.get('batch_info', {})
    all_predictions = all_saved_data.get('predictions', {})
    all_probabilities = all_saved_data.get('probabilities', {})
    all_ground_truth = all_saved_data.get('ground_truth', [])
    
    total_saved_samples = len(all_batch_info.get('sample_indices', []))
    print(f"📊 Found {total_saved_samples} total saved samples")
    
    if total_saved_samples > 0:
        # Get ALL hardware predictions and ground truth
        all_hardware_predictions = np.array(all_predictions.get('hardware_raw', []))
        all_hardware_probabilities = np.array(all_probabilities.get('hardware', [])) if all_probabilities.get('hardware') else None
        all_ground_truth_array = np.array(all_ground_truth)
        
        print(f"📋 All samples data shapes:")
        print(f"   Hardware predictions: {all_hardware_predictions.shape}")
        print(f"   Ground truth: {all_ground_truth_array.shape}")
        if all_hardware_probabilities is not None:
            print(f"   Hardware probabilities: {all_hardware_probabilities.shape}")
        
        # Apply calibration to ALL samples
        print(f"\n🔧 Applying calibration to all {total_saved_samples} samples...")
        
        def apply_calibration_to_all_samples(hardware_predictions_list, calibration_matrix):
            """Apply calibration matrix to all hardware predictions"""
            calibrated_predictions = []
            
            for sample_idx, sample_pred in enumerate(hardware_predictions_list):
                sample_calibrated = []
                
                for t in range(len(sample_pred)):
                    # Map time step to calibration matrix index (0-15)
                    matrix_idx = t % 16
                    
                    # Apply calibration row
                    raw_val = sample_pred[t]
                    calibration_row = calibration_matrix[matrix_idx]
                    
                    # Simple linear calibration: weighted sum
                    calibrated_val = raw_val * calibration_row[0]
                    
                    # Apply threshold
                    calibrated_pred = 1 if calibrated_val > 0.5 else 0
                    sample_calibrated.append(calibrated_pred)
                    
                calibrated_predictions.append(sample_calibrated)
            
            return np.array(calibrated_predictions)
        
        all_calibrated_predictions = apply_calibration_to_all_samples(all_hardware_predictions, calibration_matrix)
        
        # Apply calibration to probabilities if available
        all_calibrated_probabilities = None
        if all_hardware_probabilities is not None:
            print("🔧 Applying calibration to all probabilities...")
            all_calibrated_probabilities = apply_calibration_to_all_samples(all_hardware_probabilities, calibration_matrix)
        
        print(f"✅ Calibration applied to all {total_saved_samples} samples")
        print(f"   Average prediction change: {np.mean(np.abs(all_hardware_predictions - all_calibrated_predictions)):.4f}")
        
        # Define comprehensive voting strategies for all samples
        def comprehensive_voting_strategies_all(hardware_preds, calibrated_preds, hardware_probs=None, calibrated_probs=None):
            """Apply voting strategies to all samples"""
            
            # Ensure inputs are numpy arrays with integer type
            hardware_preds = np.array(hardware_preds, dtype=int)
            calibrated_preds = np.array(calibrated_preds, dtype=int)
            
            strategies = {}
            
            # Strategy 1: Simple consensus (both must agree for positive)
            strategies['consensus'] = ((hardware_preds + calibrated_preds) >= 2).astype(int)
            
            # Strategy 2: Equal weight voting
            strategies['equal_weight'] = ((0.5 * hardware_preds + 0.5 * calibrated_preds) > 0.5).astype(int)
            
            # Strategy 3: Hardware preference (use hardware when they disagree)
            strategies['hardware_preference'] = np.where(hardware_preds == calibrated_preds, 
                                                       hardware_preds, hardware_preds)
            
            # Strategy 4: Calibration preference (use calibrated when they disagree)
            strategies['calibration_preference'] = np.where(hardware_preds == calibrated_preds,
                                                          hardware_preds, calibrated_preds)
            
            # Strategy 5: Conservative (only positive if both predict positive)
            strategies['conservative'] = (hardware_preds & calibrated_preds).astype(int)
            
            # Strategy 6: Liberal (positive if either predicts positive)
            strategies['liberal'] = (hardware_preds | calibrated_preds).astype(int)
            
            # If probabilities are available, add confidence-based strategies
            if hardware_probs is not None and calibrated_probs is not None:
                # Strategy 7: Confidence-based voting
                hw_confidence = np.abs(hardware_probs - 0.5)
                cal_confidence = np.abs(calibrated_probs - 0.5)
                
                confidence_based = np.where(hw_confidence >= cal_confidence, 
                                          hardware_preds, calibrated_preds)
                strategies['confidence_based'] = confidence_based
                
                # Strategy 8: Weighted by confidence
                total_conf = hw_confidence + cal_confidence + 1e-8
                weighted_probs = (hardware_probs * hw_confidence + calibrated_probs * cal_confidence) / total_conf
                strategies['confidence_weighted'] = (weighted_probs > 0.5).astype(int)
            
            return strategies
        
        # Apply voting strategies to ALL samples
        print(f"\n🗳️  ENSEMBLE VOTING ON ALL {total_saved_samples} SAMPLES:")
        all_voting_results = comprehensive_voting_strategies_all(
            all_hardware_predictions, all_calibrated_predictions,
            all_hardware_probabilities, all_calibrated_probabilities
        )
        
        print(f"✅ Ensemble voting completed on all samples")
        print(f"   Available strategies: {list(all_voting_results.keys())}")
        
        # Calculate individual model metrics for ALL samples
        print(f"\n📊 INDIVIDUAL MODEL METRICS (ALL {total_saved_samples} samples):")
        
        def calculate_metrics(y_true, y_pred):
            """Calculate comprehensive metrics"""
            y_true_flat = y_true.flatten()
            y_pred_flat = y_pred.flatten()
            return {
                'accuracy': float(accuracy_score(y_true_flat, y_pred_flat)),
                'precision': float(precision_score(y_true_flat, y_pred_flat, average='binary', zero_division=0)),
                'recall': float(recall_score(y_true_flat, y_pred_flat, average='binary', zero_division=0)),
                'f1_score': float(f1_score(y_true_flat, y_pred_flat, average='binary', zero_division=0))
            }
        
        all_hardware_metrics = calculate_metrics(all_ground_truth_array, all_hardware_predictions)
        all_calibrated_metrics = calculate_metrics(all_ground_truth_array, all_calibrated_predictions)
        
        print("   Hardware (Raw) - All Samples:")
        for metric, value in all_hardware_metrics.items():
            print(f"     {metric}: {value:.4f}")
        
        print("   Hardware (Calibrated) - All Samples:")
        for metric, value in all_calibrated_metrics.items():
            print(f"     {metric}: {value:.4f}")
        
        # Analyze agreement patterns for all samples
        print(f"\n🤝 AGREEMENT PATTERNS (ALL {total_saved_samples} samples):")
        
        agree_all = (all_hardware_predictions == all_calibrated_predictions)
        disagree_all = ~agree_all
        agree_positive_all = agree_all & (all_hardware_predictions == 1)
        agree_negative_all = agree_all & (all_hardware_predictions == 0)
        hw_pos_cal_neg_all = disagree_all & (all_hardware_predictions == 1) & (all_calibrated_predictions == 0)
        hw_neg_cal_pos_all = disagree_all & (all_hardware_predictions == 0) & (all_calibrated_predictions == 1)
        
        agreement_patterns_all = {
            'total_agreement': float(agree_all.mean()),
            'total_disagreement': float(disagree_all.mean()),
            'agree_on_positive': float(agree_positive_all.mean()),
            'agree_on_negative': float(agree_negative_all.mean()),
            'hw_positive_cal_negative': float(hw_pos_cal_neg_all.mean()),
            'hw_negative_cal_positive': float(hw_neg_cal_pos_all.mean())
        }
        
        for pattern, value in agreement_patterns_all.items():
            print(f"   {pattern}: {value:.1%}")
        
        # Evaluate voting strategies on ALL samples
        all_voting_metrics = {}
        best_individual_acc_all = max(all_hardware_metrics['accuracy'], all_calibrated_metrics['accuracy'])
        
        print(f"\n📈 Voting Strategy Results (ALL {total_saved_samples} samples):")
        for strategy_name, strategy_preds in all_voting_results.items():
            metrics = calculate_metrics(all_ground_truth_array, strategy_preds)
            all_voting_metrics[strategy_name] = metrics
            
            improvement = metrics['accuracy'] - best_individual_acc_all
            print(f"   {strategy_name:25s}: accuracy={metrics['accuracy']:.4f} "
                  f"(vs best individual: {improvement:+.4f})")
            
            if improvement > 0:
                print(f"      ✅ IMPROVEMENT: {improvement*100:.2f}% better!")
        
        # Find best strategy for all samples
        best_strategy_all = max(all_voting_metrics.items(), key=lambda x: x[1]['accuracy'])
        best_name_all = best_strategy_all[0]
        best_metrics_all = best_strategy_all[1]
        
        print(f"\n🏆 BEST VOTING STRATEGY (ALL {total_saved_samples} samples): {best_name_all}")
        print(f"   Accuracy: {best_metrics_all['accuracy']:.4f}")
        print(f"   Precision: {best_metrics_all['precision']:.4f}")
        print(f"   Recall: {best_metrics_all['recall']:.4f}")
        print(f"   F1-Score: {best_metrics_all['f1_score']:.4f}")
        print(f"   vs Best Individual: {best_metrics_all['accuracy'] - best_individual_acc_all:+.4f}")
        
        # Update ensemble results with ALL samples results
        ensemble_results = all_voting_results
        ensemble_accuracies = {k: v['accuracy'] for k, v in all_voting_metrics.items()}
        best_strategy = (best_name_all, best_metrics_all['accuracy'])
        
        # Update individual accuracies to reflect ALL samples
        hardware_acc = all_hardware_metrics['accuracy']
        calibrated_acc = all_calibrated_metrics['accuracy']
        
        # Final summary
        print(f"\n📋 COMPREHENSIVE SUMMARY:")
        print(f"   Total samples analyzed: {total_saved_samples}")
        print(f"   Best individual model: {best_individual_acc_all:.4f}")
        print(f"   Best voting strategy: {best_name_all} ({best_metrics_all['accuracy']:.4f})")
        print(f"   Overall improvement: {best_metrics_all['accuracy'] - best_individual_acc_all:+.4f}")
        
        if best_metrics_all['accuracy'] > best_individual_acc_all:
            improvement_pct = (best_metrics_all['accuracy'] - best_individual_acc_all) * 100
            print(f"   ✅ Voting is BETTER by {improvement_pct:.2f}%!")
        else:
            print(f"   ⚠️  Individual models perform better on the full dataset")
        
        print(f"\n💡 Note: Analysis performed on ALL {total_saved_samples} saved samples, not just current batch!")
        
    else:
        print("❌ No saved samples found for comprehensive analysis")
        print("🔄 Falling back to current batch analysis...")
        
        # Fallback to current batch only (your original voting code)
        def ensemble_voting_strategies(hardware_preds, calibrated_preds):
            """Apply different ensemble voting strategies"""
            
            strategies = {}
            
            # Strategy 1: Simple majority voting (hardware + calibrated)
            majority_sum = hardware_preds + calibrated_preds
            strategies['simple_majority'] = (majority_sum >= 2).astype(int)
            
            # Strategy 2: Weighted voting 
            weighted = 0.5 * calibrated_preds + 0.5 * hardware_preds
            strategies['weighted'] = (weighted > 0.5).astype(int)
            
            return strategies
        
        # Apply ensemble voting on current batch
        print("🗳️ Applying ensemble voting strategies...")
        ensemble_results = ensemble_voting_strategies(
            hardware_predictions.astype(int), 
            calibrated_predictions
        )
        
        print(f"✅ Ensemble voting completed")
        print(f"   Available strategies: {list(ensemble_results.keys())}")
        
        # Evaluate ensemble strategies
        y_batch_flat = y_batch.flatten()
        ensemble_accuracies = {}
        
        print("\n📊 Ensemble Strategy Accuracies:")
        for strategy_name, predictions in ensemble_results.items():
            accuracy = accuracy_score(y_batch_flat, predictions.flatten())
            ensemble_accuracies[strategy_name] = accuracy
            print(f"   {strategy_name:20s}: {accuracy:.4f}")
        
        # Individual model accuracies for comparison
        hardware_acc = accuracy_score(y_batch_flat, hardware_predictions.flatten())
        calibrated_acc = accuracy_score(y_batch_flat, calibrated_predictions.flatten())
        
        print("\n📈 Individual Model Accuracies:")
        print(f"   Hardware (raw):      {hardware_acc:.4f}")
        print(f"   Calibrated:          {calibrated_acc:.4f}")
        
        # Find best strategy
        best_strategy = max(ensemble_accuracies.items(), key=lambda x: x[1])
        print(f"\n🏆 Best ensemble strategy: {best_strategy[0]} with accuracy {best_strategy[1]:.4f}")

else:
    print("❌ No saved results file found!")
    print("🔄 Proceeding with current batch analysis only...")
    
    # Fallback to current batch analysis (original code)
    def ensemble_voting_strategies(hardware_preds, calibrated_preds):
        """Apply different ensemble voting strategies"""
        
        strategies = {}
        
        # Strategy 1: Simple majority voting (hardware + calibrated)
        majority_sum = hardware_preds + calibrated_preds
        strategies['simple_majority'] = (majority_sum >= 2).astype(int)
        
        # Strategy 2: Weighted voting 
        weighted = 0.5 * calibrated_preds + 0.5 * hardware_preds
        strategies['weighted'] = (weighted > 0.5).astype(int)
        
        return strategies
    
    # Apply ensemble voting
    print("🗳️ Applying ensemble voting strategies...")
    ensemble_results = ensemble_voting_strategies(
        hardware_predictions.astype(int), 
        calibrated_predictions
    )
    
    print(f"✅ Ensemble voting completed")
    print(f"   Available strategies: {list(ensemble_results.keys())}")
    
    # Evaluate ensemble strategies
    y_batch_flat = y_batch.flatten()
    ensemble_accuracies = {}
    
    print("\n📊 Ensemble Strategy Accuracies:")
    for strategy_name, predictions in ensemble_results.items():
        accuracy = accuracy_score(y_batch_flat, predictions.flatten())
        ensemble_accuracies[strategy_name] = accuracy
        print(f"   {strategy_name:20s}: {accuracy:.4f}")
    
    # Individual model accuracies for comparison
    hardware_acc = accuracy_score(y_batch_flat, hardware_predictions.flatten())
    calibrated_acc = accuracy_score(y_batch_flat, calibrated_predictions.flatten())
    
    print("\n📈 Individual Model Accuracies:")
    print(f"   Hardware (raw):      {hardware_acc:.4f}")
    print(f"   Calibrated:          {calibrated_acc:.4f}")
    
    # Find best strategy
    best_strategy = max(ensemble_accuracies.items(), key=lambda x: x[1])
    print(f"\n🏆 Best ensemble strategy: {best_strategy[0]} with accuracy {best_strategy[1]:.4f}")

🔄 Loading ALL saved samples for comprehensive analysis...
📊 Found 260 total saved samples
📋 All samples data shapes:
   Hardware predictions: (260, 72)
   Ground truth: (260, 72)
   Hardware probabilities: (260, 72)

🔧 Applying calibration to all 260 samples...
🔧 Applying calibration to all probabilities...
✅ Calibration applied to all 260 samples
   Average prediction change: 0.5410

🗳️  ENSEMBLE VOTING ON ALL 260 SAMPLES:
✅ Ensemble voting completed on all samples
   Available strategies: ['consensus', 'equal_weight', 'hardware_preference', 'calibration_preference', 'conservative', 'liberal', 'confidence_based', 'confidence_weighted']

📊 INDIVIDUAL MODEL METRICS (ALL 260 samples):
   Hardware (Raw) - All Samples:
     accuracy: 0.7477
     precision: 0.7651
     recall: 0.7941
     f1_score: 0.7793
   Hardware (Calibrated) - All Samples:
     accuracy: 0.4605
     precision: 0.7617
     recall: 0.0560
     f1_score: 0.1043

🤝 AGREEMENT PATTERNS (ALL 260 samples):
   total_agreement: 

## Save Results and Job Metadata

In [12]:
# Fixed Save Results Cell - Replace the existing save_results cell with this
"""
This fixes the dict + dict error when adding new batches
"""

# Prepare comprehensive results
results = {
    'batch_info': {
        'start_sample': START_SAMPLE,
        'end_sample': END_SAMPLE,
        'sample_indices': list(range(START_SAMPLE, END_SAMPLE)),
        'locations': loc_batch.tolist(),
        'timestamp': datetime.now().isoformat()
    },
    'predictions': {
        'hardware_raw': hardware_predictions.tolist(),
        'hardware_calibrated': calibrated_predictions.tolist(),
        'ensemble_strategies': {k: v.tolist() for k, v in ensemble_results.items()}
    },
    'probabilities': {
        'hardware': hardware_probabilities.tolist(),
    },
    'ground_truth': y_batch.tolist(),
    'accuracies': {
        'hardware_raw': float(hardware_acc),
        'hardware_calibrated': float(calibrated_acc),
        'ensemble_strategies': ensemble_accuracies,
        'best_strategy': {
            'name': best_strategy[0],
            'accuracy': float(best_strategy[1])
        }
    },
    'calibration_info': {
        'matrix_shape': calibration_matrix.shape,
        'matrix_applied': True,
        'average_change': float(np.mean(np.abs(hardware_predictions - calibrated_predictions)))
    }
}

# Update job metadata
job_metadata['results_summary'] = {
    'samples_processed': len(X_batch),
    'best_ensemble_accuracy': float(best_strategy[1]),
    'hardware_accuracy': float(hardware_acc),
    'improvement_over_hardware': float(best_strategy[1] - hardware_acc)
}

# Handle continuation: load existing results if they exist
if os.path.exists(RESULTS_FILE) and START_SAMPLE > 0:
    print(f"📂 Loading existing results for continuation...")
    with open(RESULTS_FILE, 'rb') as f:
        existing_results = pickle.load(f)
    
    # Get existing data
    existing_batch_info = existing_results.get('batch_info', {})
    existing_predictions = existing_results.get('predictions', {})
    existing_probabilities = existing_results.get('probabilities', {})
    existing_ground_truth = existing_results.get('ground_truth', [])
    
    print(f"   Existing samples: {len(existing_batch_info.get('sample_indices', []))}")
    
    # FIXED: Properly combine results without dict + dict error
    combined_results = {
        'batch_info': {
            'start_sample': 0,  # Overall start
            'end_sample': END_SAMPLE,  # Current end
            'sample_indices': existing_batch_info.get('sample_indices', []) + results['batch_info']['sample_indices'],
            'locations': existing_batch_info.get('locations', []) + results['batch_info']['locations'],
            'timestamp': results['batch_info']['timestamp'],
            'batches_processed': existing_batch_info.get('batches_processed', 1) + 1
        },
        'predictions': {},
        'probabilities': {},
        'ground_truth': existing_ground_truth + results['ground_truth'],
        'accuracies': results['accuracies'],  # Keep latest batch accuracies
        'calibration_info': results['calibration_info']
    }
    
    # FIXED: Combine predictions properly
    for key in ['hardware_raw', 'hardware_calibrated']:
        if key in existing_predictions and key in results['predictions']:
            combined_results['predictions'][key] = existing_predictions[key] + results['predictions'][key]
        elif key in results['predictions']:
            combined_results['predictions'][key] = results['predictions'][key]
        elif key in existing_predictions:
            combined_results['predictions'][key] = existing_predictions[key]
    
    # FIXED: Combine ensemble strategies properly
    combined_ensemble = {}
    existing_ensemble = existing_predictions.get('ensemble_strategies', {})
    new_ensemble = results['predictions']['ensemble_strategies']
    
    # Get all strategy names
    all_strategies = set(list(existing_ensemble.keys()) + list(new_ensemble.keys()))
    
    for strategy in all_strategies:
        existing_strategy_data = existing_ensemble.get(strategy, [])
        new_strategy_data = new_ensemble.get(strategy, [])
        combined_ensemble[strategy] = existing_strategy_data + new_strategy_data
    
    combined_results['predictions']['ensemble_strategies'] = combined_ensemble
    
    # FIXED: Combine probabilities properly
    for key in results['probabilities']:
        if key in existing_probabilities:
            combined_results['probabilities'][key] = existing_probabilities[key] + results['probabilities'][key]
        else:
            combined_results['probabilities'][key] = results['probabilities'][key]
    
    results = combined_results
    print(f"   Combined total samples: {len(results['batch_info']['sample_indices'])}")

# Save results
print(f"💾 Saving results to {RESULTS_FILE}...")
with open(RESULTS_FILE, 'wb') as f:
    pickle.dump(results, f)

# Save job metadata
if os.path.exists(JOB_METADATA_FILE):
    with open(JOB_METADATA_FILE, 'r') as f:
        existing_metadata = json.load(f)
    
    if 'job_history' not in existing_metadata:
        existing_metadata['job_history'] = []
    existing_metadata['job_history'].append(job_metadata)
    existing_metadata['latest_batch'] = job_metadata
    
    with open(JOB_METADATA_FILE, 'w') as f:
        json.dump(existing_metadata, f, indent=2)
else:
    metadata_to_save = {
        'job_history': [job_metadata],
        'latest_batch': job_metadata
    }
    with open(JOB_METADATA_FILE, 'w') as f:
        json.dump(metadata_to_save, f, indent=2)

print(f"✅ Results saved successfully!")
print(f"   Results file: {RESULTS_FILE}")
print(f"   Metadata file: {JOB_METADATA_FILE}")
print(f"   Total samples processed: {len(results['batch_info']['sample_indices'])}")

AttributeError: 'list' object has no attribute 'tolist'

## Continuation Instructions

### 🔄 Running Next Batch

To process the next 10 samples:

1. **Modify Configuration Cell:**
   ```python
   START_SAMPLE = 10  # For second batch
   END_SAMPLE = 20    # For second batch
   ```

2. **Run All Cells Again** - The notebook will automatically:
   - Load existing results
   - Process new samples
   - Append to existing results
   - Update job metadata

3. **For Subsequent Batches:**
   - Third batch: START_SAMPLE=20, END_SAMPLE=30
   - Fourth batch: START_SAMPLE=30, END_SAMPLE=40
   - And so on...

### 📊 Checking Progress
Your results are automatically saved and can be analyzed at any time using the saved pickle and JSON files.

In [13]:
# Print final summary
print("\n" + "="*60)
print(f"           BATCH {START_SAMPLE//10 + 1} COMPLETION SUMMARY")
print("="*60)
print(f"Samples processed this batch: {END_SAMPLE - START_SAMPLE}")
print(f"Total samples processed: {len(results['batch_info']['sample_indices'])}")
print(f"Backend used: {BACKEND_NAME}")
print(f"Best ensemble strategy: {best_strategy[0]}")
print(f"Best ensemble accuracy: {best_strategy[1]:.4f}")
print(f"Hardware improvement: {best_strategy[1] - hardware_acc:+.4f}")
print("\n🎯 Ready for next batch! Update START_SAMPLE and END_SAMPLE in config cell.")
print("="*60)


           BATCH 26 COMPLETION SUMMARY
Samples processed this batch: 50


NameError: name 'results' is not defined