In [None]:
import pandas as pd
# Classification datasets (medical), read from CSV paths relative to the working directory
df_class_simple = pd.read_csv('breast_cancer.csv')
df_class_med = pd.read_csv('heart_disease.csv')
df_class_complex = pd.read_csv('diabetes.csv')

# Regression datasets, read from CSV paths relative to the working directory.
# NOTE(review): 'housing.csv' and 'Housing.csv' differ only in letter case; on a
# case-insensitive filesystem both names would presumably resolve to the same file —
# confirm that two distinct files exist before relying on df_reg_simple vs df_reg_complex.
df_reg_simple = pd.read_csv('housing.csv')
df_reg_med = pd.read_csv('real_estate_valuation.csv')
df_reg_complex = pd.read_csv('Housing.csv')

In [None]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility: seeds PyTorch's global RNG and NumPy's legacy
# global RNG with the same value. Python's built-in `random` module is not seeded here.
torch.manual_seed(42)
np.random.seed(42)

In [None]:
# Data preprocessing for different classification datasets
def preprocess_classification_data(df, dataset_type='auto'):
    """Prepare one of the three supported classification datasets for modelling.

    The input frame is copied, the target column is converted to a binary 0/1
    label where needed, and (for the diabetes data) zero-coded missing
    measurements are imputed with the column median.

    Args:
        df: Raw pandas DataFrame for one of the supported datasets.
        dataset_type: 'simple' (breast cancer, target 'Diagnosis'),
            'medium' (heart disease, target 'num'), 'complex' (diabetes,
            target 'Outcome'), or 'auto' to detect the type from the columns.

    Returns:
        Tuple ``(data, label_encoders, target_column)``. ``label_encoders`` is
        always an empty dict here (no column is label-encoded); it is kept so
        existing callers that unpack three values keep working.

    Raises:
        ValueError: If the type cannot be auto-detected or is not one of the
            supported names.
    """
    # Work on a copy so the caller's frame is never modified
    data = df.copy()

    # Auto-detect dataset type from signature columns (checked in a fixed order)
    if dataset_type == 'auto':
        columns = set(data.columns)
        if {'Diagnosis', 'radius1'} <= columns:
            dataset_type = 'simple'  # Breast cancer dataset
        elif {'num', 'age', 'sex'} <= columns:
            dataset_type = 'medium'  # Heart disease dataset
        elif {'Outcome', 'Pregnancies'} <= columns:
            dataset_type = 'complex'  # Diabetes dataset
        else:
            raise ValueError("Cannot auto-detect dataset type. Please specify manually.")

    label_encoders = {}

    if dataset_type == 'simple':
        # Simple dataset - Breast Cancer Wisconsin dataset
        target_column = 'Diagnosis'

        # 'M' becomes 1; every other value becomes 0
        data[target_column] = (data[target_column] == 'M').astype(int)

        print(f"Breast cancer dataset: {data.shape[1]-1} numeric features")

    elif dataset_type == 'medium':
        # Medium dataset - Heart Disease dataset
        target_column = 'num'

        # Collapse the multi-valued target to binary (0 vs >0)
        data[target_column] = (data[target_column] > 0).astype(int)

        print(f"Heart disease dataset: {data.shape[1]-1} numeric features")

    elif dataset_type == 'complex':
        # Complex dataset - Diabetes dataset
        target_column = 'Outcome'

        print("Handling missing values (represented as 0s in medical measurements)")

        # Columns where a 0 is treated as a missing measurement
        zero_not_acceptable = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

        for col in zero_not_acceptable:
            if col in data.columns:
                with_gaps = data[col].replace(0, np.nan)
                # Assign the result back instead of calling fillna(inplace=True) on
                # data[col]: the chained in-place form is deprecated in pandas and
                # does not update the frame under Copy-on-Write.
                data[col] = with_gaps.fillna(with_gaps.median())

        print(f"Diabetes dataset: {data.shape[1]-1} numeric features")

    else:
        raise ValueError(f"Unknown dataset type: {dataset_type}")

    # Final check: drop any rows that still contain missing values
    missing_count = data.isnull().sum().sum()
    if missing_count > 0:
        print(f"Warning: Found {missing_count} remaining missing values. Removing rows with missing data.")
        data = data.dropna()

    return data, label_encoders, target_column

# Function to select and preprocess classification dataset
def select_classification_dataset(choice='simple'):
    """Pick one of the module-level classification frames, preprocess it, and print a summary.

    Args:
        choice: 'simple', 'medium', or 'complex'.

    Returns:
        Tuple ``(processed_data, label_encoders, target_column, dataset_name)``.

    Raises:
        ValueError: If ``choice`` is not one of the three supported names.
    """
    # Only the global belonging to the requested choice is looked up
    if choice == 'simple':
        df, dataset_name = df_class_simple, "Breast Cancer Wisconsin Dataset"
    elif choice == 'medium':
        df, dataset_name = df_class_med, "Heart Disease Dataset"
    elif choice == 'complex':
        df, dataset_name = df_class_complex, "Pima Indian Diabetes Dataset"
    else:
        raise ValueError("Choice must be 'simple', 'medium', or 'complex'")

    processed_data, label_encoders, target_column = preprocess_classification_data(df, choice)

    # Human-readable meaning of the binary target for each dataset
    target_meanings = {
        'simple': "Target meaning: 0=Benign, 1=Malignant",
        'medium': "Target meaning: 0=No Heart Disease, 1=Heart Disease",
        'complex': "Target meaning: 0=No Diabetes, 1=Diabetes",
    }

    print(f"\n=== {dataset_name} ===")
    print(f"Original shape: {df.shape}")
    print(f"Processed shape: {processed_data.shape}")
    print(f"Target column: {target_column}")
    print(f"Target distribution: {processed_data[target_column].value_counts().to_dict()}")
    print(target_meanings[choice])
    feature_count = sum(1 for col in processed_data.columns if col != target_column)
    print(f"Number of features: {feature_count}")

    return processed_data, label_encoders, target_column, dataset_name

# Dataset to run: 'simple' (Breast Cancer), 'medium' (Heart Disease), or 'complex' (Diabetes)
DATASET_CHOICE = 'medium'

# Preprocess the chosen dataset and keep the results as notebook-level variables
(processed_data,
 label_encoders,
 target_column,
 dataset_name) = select_classification_dataset(DATASET_CHOICE)

print(f"\nSelected dataset: {dataset_name}")
print(f"Data types: {processed_data.dtypes.value_counts()}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Data preprocessing for different regression datasets
def preprocess_regression_data(df, dataset_type='auto'):
    """Prepare one of the three supported regression datasets for modelling.

    The input frame is copied, categorical columns are encoded as integers,
    the 'medium' dataset's columns are renamed to short names, and any
    remaining missing values are imputed (mean for numeric columns, mode
    otherwise).

    Args:
        df: Raw pandas DataFrame for one of the supported datasets.
        dataset_type: 'simple' (target 'median_house_value'), 'medium'
            (target 'Y house price of unit area', renamed to
            'price_per_unit_area'), 'complex' (target 'price'), or 'auto' to
            detect the type from the columns.

    Returns:
        Tuple ``(data, scalers, target_column)``. ``scalers`` maps
        ``'<column>_encoder'`` to the fitted LabelEncoder of each
        label-encoded column (it holds encoders, not scalers).

    Raises:
        ValueError: If the type cannot be auto-detected or is not one of the
            supported names.
    """
    # Work on a copy so the caller's frame is never modified
    data = df.copy()

    # Auto-detect dataset type from signature columns
    if dataset_type == 'auto':
        if 'median_house_value' in data.columns:
            dataset_type = 'simple'
        elif 'Y house price of unit area' in data.columns:
            dataset_type = 'medium'
        elif 'price' in data.columns and 'furnishingstatus' in data.columns:
            dataset_type = 'complex'
        else:
            raise ValueError("Cannot auto-detect dataset type. Please specify manually.")

    scalers = {}

    print(f"Processing {dataset_type} regression dataset...")

    if dataset_type == 'simple':
        # Simple dataset (California Housing - housing.csv)
        target_column = 'median_house_value'

        categorical_cols = ['ocean_proximity']
        for col in categorical_cols:
            if col in data.columns:
                le = LabelEncoder()
                # astype(str) turns missing values into the literal category 'nan'
                data[col] = le.fit_transform(data[col].astype(str))
                scalers[f'{col}_encoder'] = le

        print(f"Encoded categorical columns: {categorical_cols}")

    elif dataset_type == 'medium':
        # Medium dataset (Real Estate Valuation - real_estate_valuation.csv)
        source_target = 'Y house price of unit area'

        # Short, readable replacements for the original column headers
        column_mapping = {
            'X1 transaction date': 'transaction_date',
            'X2 house age': 'house_age',
            'X3 distance to the nearest MRT station': 'distance_to_MRT',
            'X4 number of convenience stores': 'convenience_stores',
            'X5 latitude': 'latitude',
            'X6 longitude': 'longitude',
            source_target: 'price_per_unit_area',
        }

        # The target follows the rename only when the original header is present
        if source_target in data.columns:
            target_column = column_mapping[source_target]
        else:
            target_column = source_target

        # Only rename columns that exist
        rename_dict = {old: new for old, new in column_mapping.items() if old in data.columns}
        data = data.rename(columns=rename_dict)

    elif dataset_type == 'complex':
        # Complex dataset (Housing Sales - Housing.csv)
        target_column = 'price'

        # Binary encode yes/no columns; any other value becomes NaN and is imputed below
        binary_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating',
                       'airconditioning', 'prefarea']
        for col in binary_cols:
            if col in data.columns:
                data[col] = data[col].map({'yes': 1, 'no': 0})
                print(f"Binary encoded: {col}")

        # Label encode furnishingstatus
        if 'furnishingstatus' in data.columns:
            le = LabelEncoder()
            data['furnishingstatus'] = le.fit_transform(data['furnishingstatus'].astype(str))
            scalers['furnishingstatus_encoder'] = le

    else:
        raise ValueError(f"Unknown dataset type: {dataset_type}")

    # Impute whatever is still missing
    missing_count = data.isnull().sum().sum()
    if missing_count > 0:
        print(f"\nWarning: Found {missing_count} missing values. Filling appropriately...")
        for col in data.columns:
            if not data[col].isnull().any():
                continue
            # is_numeric_dtype covers every numeric width (float32, int32, ...), not
            # just float64/int64; booleans are excluded so they take the mode branch.
            is_numeric = (pd.api.types.is_numeric_dtype(data[col])
                          and not pd.api.types.is_bool_dtype(data[col]))
            if is_numeric:
                # Assign back rather than fillna(inplace=True) on data[col]: the chained
                # in-place form is deprecated and is a no-op under Copy-on-Write.
                data[col] = data[col].fillna(data[col].mean())
                print(f"  - Filled {col} with mean")
            else:
                modes = data[col].mode()
                mode_val = modes.iloc[0] if not modes.empty else 'Unknown'
                data[col] = data[col].fillna(mode_val)
                print(f"  - Filled {col} with mode")

    return data, scalers, target_column


# Function to select and preprocess regression dataset
def select_regression_dataset(choice='simple', df_reg_simple=None, df_reg_med=None, df_reg_complex=None):
    """Pick one of the supplied regression frames, preprocess it, and print a summary.

    Args:
        choice: 'simple', 'medium', or 'complex'.
        df_reg_simple: Frame used when ``choice == 'simple'``.
        df_reg_med: Frame used when ``choice == 'medium'``.
        df_reg_complex: Frame used when ``choice == 'complex'``.

    Returns:
        Tuple ``(processed_data, scalers, target_column, dataset_name)``.

    Raises:
        ValueError: If ``choice`` is not a supported name, or the frame for the
            requested choice was not supplied.
    """
    catalog = {
        'simple': (df_reg_simple, "Simple California Housing Dataset"),
        'medium': (df_reg_med, "Medium Real Estate Valuation Dataset"),
        'complex': (df_reg_complex, "Complex Housing Sales Dataset"),
    }
    if choice not in catalog:
        raise ValueError("Choice must be 'simple', 'medium', or 'complex'")

    df, dataset_name = catalog[choice]
    if df is None:
        raise ValueError(f"Dataset for choice '{choice}' is None. Please provide the dataframe.")

    processed_data, scalers, target_column = preprocess_regression_data(df, choice)

    banner = '=' * 60
    print(f"\n{banner}")
    print(f"{dataset_name}")
    print(f"{banner}")
    print(f"Original shape: {df.shape}")
    print(f"Processed shape: {processed_data.shape}")
    print(f"Target column: {target_column}")

    # Summary statistics of the target, printed to two decimals
    target_values = processed_data[target_column]
    print(f"Target statistics:")
    print(f"  - Mean: {target_values.mean():.2f}")
    print(f"  - Std: {target_values.std():.2f}")
    print(f"  - Min: {target_values.min():.2f}")
    print(f"  - Max: {target_values.max():.2f}")

    # Every column except the target counts as a feature
    features = [col for col in processed_data.columns if col != target_column]
    print(f"\nFeatures ({len(features)}):")
    for position, feature_name in enumerate(features, start=1):
        print(f"  {position}. {feature_name}")
    print(f"{banner}\n")

    return processed_data, scalers, target_column, dataset_name


In [None]:
class SimpleNN(nn.Module):
    """Feed-forward network with a single LeakyReLU hidden layer.

    For ``task_type == 'classification'`` the output layer is followed by a
    sigmoid; for any other task type the raw output of the last linear layer
    is returned.
    """

    def __init__(self, input_size, hidden_size, output_size=1, task_type='classification'):
        """Build the layers.

        Args:
            input_size: Number of input features.
            hidden_size: Width of the hidden layer.
            output_size: Number of output units.
            task_type: 'classification' enables the final sigmoid.
        """
        super().__init__()
        self.task_type = task_type
        self.output_size = output_size

        # Layers are registered in the same order as before: hidden, activation, output, sigmoid
        self.hidden = nn.Linear(input_size, hidden_size)
        self.hidden_act = nn.LeakyReLU()
        self.output = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run the network on ``x`` and return its output tensor."""
        raw_output = self.output(self.hidden_act(self.hidden(x)))
        if self.task_type != 'classification':
            return raw_output
        return self.sigmoid(raw_output)

    def predict_proba(self, x):
        """Return gradient-free predictions; switches the module to eval mode.

        With a single output unit the result has two columns,
        ``[1 - out, out]``; otherwise the forward output is returned as is.
        """
        self.eval()
        with torch.no_grad():
            scores = self.forward(x)
            if self.output_size != 1:
                return scores
            # Single unit: pair each score with its complement
            return torch.cat([1 - scores, scores], dim=1)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, mean_squared_error, r2_score
import math

class ActiveLearner:
    """Bundles a model, optimizer and loss with training, evaluation and query helpers.

    The training and evaluation methods flatten both the model output and the
    targets to 1-D before calling the loss, so they assume one output value per
    sample.
    """

    def __init__(self, model, optimizer, criterion, device='cpu'):
        """Store the components and move the model to ``device``."""
        self.model = model.to(device)
        self.optimizer = optimizer
        self.criterion = criterion
        self.device = device

    @staticmethod
    def _flatten_pair(outputs, batch_y):
        """Return outputs and targets as 1-D tensors, with the targets cast to float.

        Flattening turns an ``(N, 1)`` output into ``(N,)`` and a 0-dim target
        into a one-element vector, so both sides reach the loss with the same
        shape.
        """
        return outputs.reshape(-1), batch_y.reshape(-1).float()

    def train_epoch(self, train_loader):
        """Run one optimisation pass over ``train_loader``.

        Returns:
            Mean of the per-batch losses (0.0 when the loader yields no batch).
        """
        self.model.train()
        total_loss = 0.0
        n_batches = 0
        for batch_x, batch_y in train_loader:
            batch_x = batch_x.to(self.device)
            batch_y = batch_y.to(self.device)

            self.optimizer.zero_grad()
            outputs, batch_y = self._flatten_pair(self.model(batch_x), batch_y)

            loss = self.criterion(outputs, batch_y)
            loss.backward()
            self.optimizer.step()

            total_loss += loss.item()
            n_batches += 1

        return total_loss / max(1, n_batches)

    def evaluate_classification(self, val_loader):
        """Evaluate binary classification on ``val_loader``.

        Outputs above 0.5 are counted as class 1, which presumes the model
        emits probabilities. Precision, recall and F1 are support-weighted
        averages over the classes.

        Returns:
            Tuple ``(accuracy, avg_loss, precision, recall, f1)``.
        """
        self.model.eval()
        all_preds = []
        all_true = []
        val_loss = 0.0

        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x = batch_x.to(self.device)
                batch_y = batch_y.to(self.device)
                outputs, batch_y = self._flatten_pair(self.model(batch_x), batch_y)

                val_loss += self.criterion(outputs, batch_y).item()

                preds = (outputs > 0.5).long().cpu().numpy()
                all_preds.extend(preds)
                all_true.extend(batch_y.cpu().numpy().astype(int))

        # Calculate metrics using sklearn
        accuracy = accuracy_score(all_true, all_preds)
        precision = precision_score(all_true, all_preds, average='weighted', zero_division=0)
        recall = recall_score(all_true, all_preds, average='weighted', zero_division=0)
        f1 = f1_score(all_true, all_preds, average='weighted', zero_division=0)

        # Same empty-loader guard as train_epoch and evaluate_regression
        avg_loss = val_loss / max(1, len(val_loader))
        return accuracy, avg_loss, precision, recall, f1

    def evaluate_regression(self, val_loader):
        """Evaluate regression on ``val_loader``.

        Returns:
            Tuple ``(mse, rmse, r2, avg_loss)``: ``mse``/``rmse``/``r2`` are
            computed over all samples, ``avg_loss`` is the mean of the
            per-batch criterion values.
        """
        self.model.eval()
        val_loss = 0.0
        all_preds = []
        all_true = []
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x = batch_x.to(self.device)
                batch_y = batch_y.to(self.device)
                outputs, batch_y = self._flatten_pair(self.model(batch_x), batch_y)

                val_loss += self.criterion(outputs, batch_y).item()
                all_preds.extend(outputs.cpu().numpy().tolist())
                all_true.extend(batch_y.cpu().numpy().tolist())

        avg_loss = val_loss / max(1, len(val_loader))
        mse = mean_squared_error(all_true, all_preds)
        rmse = math.sqrt(mse)
        r2 = r2_score(all_true, all_preds)
        return mse, rmse, r2, avg_loss

    def uncertainty_sampling_least_confidence(self, pool_x, n_samples):
        """Return positions in ``pool_x`` of the samples the model is least confident about.

        Confidence is the largest value in each row returned by the model's
        ``predict_proba``. At most ``len(pool_x)`` positions are returned.

        Returns:
            NumPy array of row positions into ``pool_x``.
        """
        self.model.eval()
        with torch.no_grad():
            probs = self.model.predict_proba(pool_x.to(self.device))
            confidence, _ = torch.max(probs, dim=1)
            # Never ask for more samples than the pool holds
            actual_n_samples = min(n_samples, len(pool_x))
            # top-k of the negated confidence == the k lowest confidences
            _, idx = torch.topk(-confidence, actual_n_samples)
            return idx.cpu().numpy()
        


   

In [None]:
import copy

def run_active_learning_experiment(uncertainty_method='least_confidence', initial_size=1000, query_size=500, 
                                 n_iterations=10, epochs_per_iteration=10, learning_rate=0.001, 
                                 weight_decay=1e-5, hidden_size=64, type='classification'):
    """Run a pool-based active-learning loop with a SimpleNN model.

    Reads the module-level tensors ``X_train_pool_tensor``,
    ``y_train_pool_tensor``, ``X_val_tensor`` and ``y_val_tensor``, which must
    be defined before this function is called.

    Each iteration trains on the current labelled subset, evaluates on the
    validation set, and (except after the last iteration) moves up to
    ``query_size`` samples from the unlabelled pool to the labelled subset.

    Args:
        uncertainty_method: Name used in the log output. Only least-confidence
            sampling is implemented; the value does not change the strategy.
        initial_size: Number of randomly chosen samples labelled at the start.
        query_size: Number of samples queried per iteration.
        n_iterations: Number of train/evaluate/query rounds.
        epochs_per_iteration: Training epochs per round.
        learning_rate: SGD learning rate.
        weight_decay: SGD weight decay.
        hidden_size: Width of the model's hidden layer.
        type: 'classification' (BCE loss, least-confidence querying) or any
            other value for regression (MSE loss, random querying, because
            least-confidence needs class probabilities).

    Returns:
        Tuple ``(model, history)``. ``history`` holds one dict per iteration
        with 'iteration', 'labeled_size', 'val_accuracy' (negative MSE for
        regression), 'val_loss' and 'train_loss' (list of per-epoch losses).

    Raises:
        ValueError: If ``initial_size`` exceeds the size of the training pool.
    """
    # Validate parameters against dataset size
    pool_size = len(X_train_pool_tensor)
    if initial_size > pool_size:
        raise ValueError(f"initial_size ({initial_size}) cannot be larger than training pool size ({pool_size}). "
                        f"Recommended initial_size: {pool_size // 2}")

    # Initialize with a small random labeled set (re-seeds NumPy's global RNG)
    np.random.seed(42)
    initial_indices = np.random.choice(pool_size, initial_size, replace=False)

    # Split into labeled and unlabeled pools, kept as lists for indexing
    labeled_indices = list(set(initial_indices))
    unlabeled_indices = list(set(range(pool_size)) - set(labeled_indices))

    input_size = X_train_pool_tensor.shape[1]
    # Honour the hidden_size argument (it used to be ignored in favour of a fixed 64)
    model = SimpleNN(input_size, hidden_size=hidden_size, output_size=1, task_type=type)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay, momentum=0.9)
    if type == 'classification':
        criterion = nn.BCELoss()
    else:
        criterion = nn.MSELoss()

    learner = ActiveLearner(model, optimizer, criterion)

    # The validation loader does not change between iterations, so build it once
    val_loader = DataLoader(TensorDataset(X_val_tensor, y_val_tensor), batch_size=64, shuffle=False)

    # Track performance
    history = []
    train_loader = None
    val_loss = None

    print(f"Starting Active Learning with {uncertainty_method} uncertainty sampling")
    print(f"Hyperparameters:")
    print(f"  Learning Rate: {learning_rate}")
    print(f"  Weight Decay: {weight_decay}")
    print(f"  Hidden Size: {hidden_size}")
    print(f"Initial labeled set size: {len(labeled_indices)}")
    print(f"Initial unlabeled pool size: {len(unlabeled_indices)}")

    for iteration in range(n_iterations):
        print(f"\n--- Iteration {iteration + 1}/{n_iterations} ---")

        # Create current training set
        labeled_X = X_train_pool_tensor[labeled_indices]
        labeled_y = y_train_pool_tensor[labeled_indices]
        train_loader = DataLoader(TensorDataset(labeled_X, labeled_y), batch_size=64, shuffle=True)

        epoch_losses = []
        for epoch in range(epochs_per_iteration):
            train_loss = learner.train_epoch(train_loader)
            epoch_losses.append(train_loss)
            print(f"  Epoch {epoch+1}/{epochs_per_iteration}, Loss: {train_loss:.6f}")

        # Average training loss across epochs (NaN when no epoch was run)
        avg_train_loss = sum(epoch_losses) / len(epoch_losses) if epoch_losses else float('nan')
        print(f"Avg Training Loss: {avg_train_loss:.4f}")

        # Evaluate on validation set
        if type == 'regression':
            val_mse, val_rmse, val_r2, val_loss = learner.evaluate_regression(val_loader)
            print(f"Val MSE: {val_mse:.4f}, Val RMSE: {val_rmse:.4f}, Val R2: {val_r2:.4f}, Val Loss: {val_loss:.4f}")
            # For regression, store negative MSE so that higher is still better
            val_accuracy = -val_mse
            print(f"Labeled samples: {len(labeled_indices)}, Val Performance (-MSE): {val_accuracy:.4f}, Val Loss: {val_loss:.4f}")
        else:
            val_accuracy, val_loss, val_precision, val_recall, val_f1 = learner.evaluate_classification(val_loader)
            print(f"Labeled samples: {len(labeled_indices)}, Val Accuracy: {val_accuracy:.4f}, Val Loss: {val_loss:.4f}")
            print(f"Val Precision: {val_precision:.4f}, Val Recall: {val_recall:.4f}, Val F1: {val_f1:.4f}")

        # Store results
        history.append({
            'iteration': iteration + 1,
            'labeled_size': len(labeled_indices),
            'val_accuracy': val_accuracy,
            'val_loss': val_loss,
            'train_loss': epoch_losses,
        })

        if iteration < n_iterations - 1 and unlabeled_indices:
            unlabeled_X = X_train_pool_tensor[unlabeled_indices]

            # Adjust query_size if there aren't enough unlabeled samples
            actual_query_size = min(query_size, len(unlabeled_indices))
            if actual_query_size < query_size:
                print(f"  Warning: Only {len(unlabeled_indices)} unlabeled samples remaining, querying {actual_query_size} instead of {query_size}")

            if type == 'classification':
                query_indices = learner.uncertainty_sampling_least_confidence(unlabeled_X, actual_query_size)
            else:
                # No probability output to rank by for regression, so query at random
                # instead of leaving query_indices undefined.
                query_indices = np.random.choice(len(unlabeled_indices), actual_query_size, replace=False)

            if isinstance(query_indices, torch.Tensor):
                query_indices = query_indices.cpu().numpy()
            elif not isinstance(query_indices, np.ndarray):
                query_indices = np.array(query_indices)

            # Convert positions within the unlabeled subset back to pool indices
            selected_global_indices = [unlabeled_indices[i] for i in query_indices]

            labeled_indices.extend(selected_global_indices)
            unlabeled_indices = list(set(unlabeled_indices) - set(selected_global_indices))

            print(f"Queried {len(selected_global_indices)} new samples")
            print(f"Remaining unlabeled samples: {len(unlabeled_indices)}")

    # Final diagnostic: training loss measured in eval mode, next to the last validation loss.
    # Skipped when no iteration ran, since there is then no loader or validation loss.
    if train_loader is not None:
        model.eval()
        train_eval_loss = 0.0
        batches = 0
        with torch.no_grad():
            for bx, by in train_loader:
                out = model(bx)
                # Flatten both sides: squeeze() would yield a 0-dim tensor for a
                # single-sample batch and no longer match the target's shape.
                train_eval_loss += criterion(out.view(-1), by.view(-1).float()).item()
                batches += 1
        train_eval_loss = train_eval_loss / max(1, batches)
        print('Train loss (eval-mode):', train_eval_loss)
        print('Val loss (from your evaluate()):', val_loss)

    return model, history

In [None]:
# Working ensemble regression active learning
def ensemble_regression_active_learning(initial_size=600, query_size=120, n_iterations=3, 
                                        epochs_per_iteration=50, learning_rate=0.001):
    """Run pool-based active learning for regression with ensemble-variance querying.

    Reads the module-level tensors ``X_train_pool_tensor``,
    ``y_train_pool_tensor``, ``X_val_tensor`` and ``y_val_tensor``, which must
    be defined before this function is called.

    Each iteration trains the main model on the labelled subset and evaluates
    it on the validation set. Except after the last iteration, it then trains
    a small ensemble of freshly initialised models and labels the unlabelled
    samples on which the ensemble's predictions vary most. If fewer than two
    ensemble members produce NaN-free predictions, samples are chosen at
    random instead.

    Args:
        initial_size: Number of randomly chosen samples labelled at the start.
        query_size: Number of samples queried per iteration (clamped to the
            number of unlabelled samples that remain).
        n_iterations: Number of train/evaluate/query rounds.
        epochs_per_iteration: Training epochs of the main model per round.
        learning_rate: SGD learning rate for the main and ensemble models.

    Returns:
        Tuple ``(model, history)``. ``history`` holds one dict per iteration
        with 'iteration', 'labeled_size', 'val_mse', 'val_r2' and
        'train_loss' (list of per-epoch mean batch losses).

    Raises:
        ValueError: If ``initial_size`` exceeds the size of the training pool.
    """
    print("Starting ensemble regression active learning...")

    pool_size = len(X_train_pool_tensor)
    if initial_size > pool_size:
        raise ValueError(f"initial_size ({initial_size}) cannot be larger than training pool size ({pool_size}).")

    # Initialize (re-seeds NumPy's global RNG)
    np.random.seed(42)
    initial_indices = np.random.choice(pool_size, initial_size, replace=False)

    labeled_indices = list(initial_indices)
    unlabeled_indices = list(set(range(pool_size)) - set(labeled_indices))

    # Create main model
    input_size = X_train_pool_tensor.shape[1]
    model = SimpleNN(input_size, hidden_size=64, output_size=1, task_type='regression')
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=0.001, momentum=0.9)
    criterion = nn.MSELoss()

    history = []

    for iteration in range(n_iterations):
        print(f"\n--- Iteration {iteration + 1}/{n_iterations} ---")

        # Create training data
        labeled_X = X_train_pool_tensor[labeled_indices]
        labeled_y = y_train_pool_tensor[labeled_indices]
        train_loader = DataLoader(TensorDataset(labeled_X, labeled_y), batch_size=32, shuffle=True)

        # Training main model
        model.train()
        epoch_losses = []  # Track epoch losses for history
        for epoch in range(epochs_per_iteration):
            epoch_loss = 0.0
            for batch_x, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = model(batch_x)
                # view(-1) keeps a 1-D shape even for a single-sample batch,
                # where squeeze() would collapse the output to 0-dim
                loss = criterion(outputs.view(-1), batch_y.view(-1).float())

                # A NaN loss ends the current epoch early; later epochs still run
                if torch.isnan(loss):
                    print(f"Warning: NaN loss detected at epoch {epoch}")
                    break

                loss.backward()

                # Clip gradients to prevent explosion
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

                optimizer.step()
                epoch_loss += loss.item()

            avg_epoch_loss = epoch_loss / max(1, len(train_loader))
            epoch_losses.append(avg_epoch_loss)

            if epoch % 10 == 0:
                print(f"  Epoch {epoch}, Loss: {avg_epoch_loss:.6f}")

        # Evaluation: accumulate the squared error over all samples so the MSE is
        # exact regardless of how the validation set divides into batches
        model.eval()
        squared_error_sum = 0.0
        all_preds = []
        all_true = []
        with torch.no_grad():
            for start in range(0, len(X_val_tensor), 32):
                batch_x = X_val_tensor[start:start + 32]
                batch_y = y_val_tensor[start:start + 32].view(-1).float()
                preds = model(batch_x).view(-1)
                squared_error_sum += torch.sum((preds - batch_y) ** 2).item()
                all_preds.extend(preds.tolist())
                all_true.extend(batch_y.tolist())

        val_mse = squared_error_sum / max(1, len(all_true))
        val_r2 = r2_score(all_true, all_preds)

        print(f"Labeled: {len(labeled_indices)}, Val MSE: {val_mse:.4f}, Val R²: {val_r2:.4f}")

        history.append({
            'iteration': iteration + 1,
            'labeled_size': len(labeled_indices),
            'val_mse': val_mse,
            'val_r2': val_r2,
            'train_loss': epoch_losses  # Add epoch losses to match expected format
        })

        # Ensemble-based uncertainty sampling
        if iteration < n_iterations - 1 and unlabeled_indices:
            print("  Computing ensemble predictions for uncertainty sampling...")

            # Query what is left when fewer than query_size samples remain
            n_query = min(query_size, len(unlabeled_indices))

            unlabeled_X = X_train_pool_tensor[unlabeled_indices]
            n_models = 3  # Smaller ensemble to avoid issues
            ensemble_preds = []

            for m in range(n_models):
                # Create a new model with different initialization
                ensemble_model = SimpleNN(input_size, hidden_size=64, output_size=1, task_type='regression')
                ensemble_optimizer = optim.SGD(ensemble_model.parameters(), lr=learning_rate, momentum=0.9)

                # Quick training: a fixed 20 epochs per ensemble member
                ensemble_model.train()
                for _ in range(20):
                    for batch_x, batch_y in train_loader:
                        ensemble_optimizer.zero_grad()
                        outputs = ensemble_model(batch_x)
                        loss = criterion(outputs.view(-1), batch_y.view(-1).float())

                        # Batches with a NaN loss are skipped without an update
                        if not torch.isnan(loss):
                            loss.backward()
                            torch.nn.utils.clip_grad_norm_(ensemble_model.parameters(), max_norm=1.0)
                            ensemble_optimizer.step()

                # Get predictions for unlabeled data
                ensemble_model.eval()
                with torch.no_grad():
                    preds = ensemble_model(unlabeled_X).view(-1).numpy()

                # Members that produced NaN predictions are left out of the ensemble
                if not np.isnan(preds).any():
                    ensemble_preds.append(preds)
                else:
                    print(f"    Warning: Model {m} produced NaN, skipping")

                # Drop references so the member can be garbage-collected
                del ensemble_model
                del ensemble_optimizer

            if len(ensemble_preds) >= 2:  # Need at least 2 valid models
                pred_variance = np.var(np.array(ensemble_preds), axis=0)

                # Select samples with highest prediction variance
                selected_idx = np.argsort(-pred_variance)[:n_query]
                selected_global_idx = [unlabeled_indices[i] for i in selected_idx]
                print(f"  Added {n_query} samples using ensemble uncertainty")
            else:
                # Fallback to random sampling if ensemble failed
                selected_global_idx = [int(i) for i in
                                       np.random.choice(unlabeled_indices, n_query, replace=False)]
                print(f"  Fallback: Added {n_query} random samples")

            labeled_indices.extend(selected_global_idx)
            unlabeled_indices = list(set(unlabeled_indices) - set(selected_global_idx))

    return model, history


In [None]:
# Active Learning with Uncertainty Sampling - Complete Implementation
def run_active_learning_with_visualization(uncertainty_method='least_confidence', initial_size=500,
                                         query_size=200, n_iterations=6, epochs_per_iteration=5,
                                         learning_rate=0.001, weight_decay=1e-5, hidden_size=64, classification=True):
    """Run an active-learning experiment, evaluate it on the test split and plot its progress.

    This function reads the notebook globals ``X_test_tensor``, ``y_test_tensor``
    and ``target_column``; the data-preparation cell for the desired dataset must
    have been executed first.

    Args:
        uncertainty_method: Query strategy forwarded to
            ``run_active_learning_experiment`` (classification). For regression it
            is only echoed in the progress message.
        initial_size: Number of initially labeled samples.
        query_size: Number of samples added per iteration.
        n_iterations: Number of active-learning iterations.
        epochs_per_iteration: Training epochs per iteration.
        learning_rate: Learning rate for the learner and for the evaluation optimizer.
        weight_decay: Forwarded to the classification experiment. In the regression
            path it is only used for the evaluation optimizer built here.
        hidden_size: Forwarded to the classification experiment; not forwarded in
            the regression path.
        classification: ``True`` for classification, ``False`` to use
            ``ensemble_regression_active_learning``.

    Returns:
        ``(model, history)``. ``history`` is the list of per-iteration dicts returned
        by the experiment; in the regression case each dict additionally receives
        ``'val_accuracy'`` (copy of ``'val_r2'``) and ``'val_loss'`` (copy of
        ``'val_mse'``) so the plotting code below can treat both tasks uniformly.
    """
    print(f"Running Active Learning with {uncertainty_method} uncertainty sampling...")

    if classification:
        # Use general active learning for classification
        model, history = run_active_learning_experiment(
            uncertainty_method=uncertainty_method,
            initial_size=initial_size,
            query_size=query_size,
            n_iterations=n_iterations,
            epochs_per_iteration=epochs_per_iteration,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            hidden_size=hidden_size,
            type='classification',
        )
    else:
        # Use ensemble regression active learning for regression
        model, history = ensemble_regression_active_learning(
            initial_size=initial_size,
            query_size=query_size,
            n_iterations=n_iterations,
            epochs_per_iteration=epochs_per_iteration,
            learning_rate=learning_rate
        )
        # Mirror the regression metrics under the keys the plots expect.
        # 'train_loss' is left untouched.
        for item in history:
            item['val_accuracy'] = item['val_r2']  # Use R² as accuracy
            item['val_loss'] = item['val_mse']     # Use MSE as loss

    # Evaluate on test set
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
    # The optimizer is only needed to construct the ActiveLearner used for evaluation.
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay, momentum=0.9)

    all_predictions = []
    all_true_labels = []

    if classification:
        learner = ActiveLearner(model, optimizer, nn.BCELoss())
        test_accuracy, test_loss, test_precision, test_recall, test_f1 = learner.evaluate_classification(test_loader)

        # Get predictions for classification report
        model.eval()
        with torch.no_grad():
            for batch_x, batch_y in test_loader:
                outputs = model(batch_x)
                predicted = (outputs.squeeze() > 0.5).long()
                # squeeze() turns a final batch of size 1 into a 0-d array, which
                # list.extend cannot iterate; atleast_1d keeps it a 1-element array.
                all_predictions.extend(np.atleast_1d(predicted.cpu().numpy()))
                all_true_labels.extend(batch_y.cpu().numpy())

        test_metrics = {
            'accuracy': test_accuracy,
            'precision': test_precision,
            'recall': test_recall,
            'f1': test_f1
        }
    else:
        learner = ActiveLearner(model, optimizer, nn.MSELoss())
        test_mse, test_rmse, test_r2, test_loss = learner.evaluate_regression(test_loader)

        # Get predictions for regression
        model.eval()
        with torch.no_grad():
            for batch_x, batch_y in test_loader:
                outputs = model(batch_x)
                # Same 0-d guard as in the classification branch.
                all_predictions.extend(np.atleast_1d(outputs.squeeze().cpu().numpy()))
                all_true_labels.extend(batch_y.cpu().numpy())

        test_metrics = {
            'mse': test_mse,
            'rmse': test_rmse,
            'r2': test_r2
        }

    # Plot 1: validation metric per iteration
    plt.figure(figsize=(10, 6))
    iteration_ids = [item['iteration'] for item in history]
    metric_values = [item['val_accuracy'] for item in history]
    plt.plot(iteration_ids, metric_values, 'b-o', linewidth=3, markersize=8)
    plt.xlabel('Iteration')
    if classification:
        plt.ylabel('Validation Accuracy')
        plt.title('Active Learning Progress')
    else:
        plt.ylabel('Validation Performance (R²)')
        plt.title('Active Learning Progress (Higher = Better)')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Plot 2: Validation loss trend across iterations
    plt.figure(figsize=(10, 6))
    val_losses = [item['val_loss'] for item in history]
    plt.plot(iteration_ids, val_losses, 'r-o', linewidth=2, markersize=6)
    plt.xlabel('Iteration')
    plt.ylabel('Validation Loss')
    plt.title('Validation Loss (per iteration)')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Plot 3: Per-iteration epoch losses placed in [i, i+1] intervals
    plt.figure(figsize=(12, 6))
    total_iters = len(history)
    cmap = plt.get_cmap('tab10')

    for idx, item in enumerate(history):
        epoch_losses = item['train_loss']
        m = len(epoch_losses)
        if m > 1:
            # map epochs into interval [idx, idx+1) so entry 0 -> [0,1), entry 1 -> [1,2), etc.
            xs = np.linspace(idx, idx + 1, m, endpoint=False)
        else:
            xs = np.array([idx + 0.5])
        plt.plot(xs, epoch_losses, color=cmap(idx % 10), alpha=0.8)
        # Mark validation loss at the right boundary of the interval (idx+1)
        plt.scatter([idx + 1], [item['val_loss']], color='k', marker='x', s=50)

    # Draw faint vertical separators to indicate iteration boundaries
    for k in range(total_iters + 1):
        plt.axvline(k, color='gray', alpha=0.2, linestyle='--')

    plt.xlim(0, max(1, total_iters))
    plt.xlabel('Iteration (each integer interval contains that iteration\'s epochs)')
    plt.ylabel('Training Loss')
    plt.title('Per-iteration Epoch Training Losses (0-1,1-2,...) with Validation markers')
    plt.grid(True, alpha=0.3)

    # Legend: custom elements, because the per-iteration lines use varying colours
    from matplotlib.lines import Line2D
    legend_elements = [Line2D([0], [0], color='gray', lw=2, label='Training (per-iteration lines)'),
                       Line2D([0], [0], marker='x', color='k', label='Validation (interval boundary)', linestyle='None')]
    plt.legend(handles=legend_elements)

    plt.tight_layout()
    plt.show()

    # Print results
    print(f"\nFinal Results:")
    if classification:
        print(f"Test Accuracy: {test_accuracy:.4f}")
        print(f"Test Precision: {test_metrics['precision']:.4f}")
        print(f"Test Recall: {test_metrics['recall']:.4f}")
        print(f"Test F1-Score: {test_metrics['f1']:.4f}")
        print(f"Test Loss: {test_loss:.4f}")

        # Dynamic target names based on dataset (index 0 -> label 0, index 1 -> label 1)
        if target_column == 'Diagnosis':
            # Breast-cancer preprocessing encodes Benign as 0 and Malignant as 1.
            target_names = ['Benign', 'Malignant']
        elif target_column == 'Default':
            target_names = ['No Default', 'Default']
        elif target_column == 'loan_status':
            target_names = ['Approved', 'Rejected']
        elif target_column == 'LoanApproved':
            target_names = ['Rejected', 'Approved']
        else:
            target_names = ['Class 0', 'Class 1']

        print(f"\nClassification Report for {target_column}:")
        print(classification_report(all_true_labels, all_predictions, target_names=target_names))
    else:
        print(f"Test MSE: {test_metrics['mse']:.4f}")
        print(f"Test RMSE: {test_metrics['rmse']:.4f}")
        print(f"Test R²: {test_metrics['r2']:.4f}")
        print(f"Test Loss: {test_loss:.4f}")

        print(f"\nRegression Results for {target_column}:")
        print(f"Mean Absolute Error: {np.mean(np.abs(np.array(all_predictions) - np.array(all_true_labels))):.4f}")
        print(f"Target Range: {np.min(all_true_labels):.2f} to {np.max(all_true_labels):.2f}")
        print(f"Prediction Range: {np.min(all_predictions):.2f} to {np.max(all_predictions):.2f}")

    return model, history

In [None]:
def analyze_stopping_points(history, task_type='classification'):
    """
    Analyze training history to suggest optimal stopping points.

    Args:
        history: Non-empty list of per-iteration dicts. Each dict must contain
            'iteration' and 'val_accuracy' (for regression runs 'val_accuracy'
            holds the validation R²). Higher values are treated as better for
            both task types.
        task_type: 'classification' or anything else for regression; it only
            selects the metric name used in the printed report.

    Returns:
        (best_iteration, best_metric, plateau_detected) where best_iteration is the
        'iteration' value of the entry with the highest metric and plateau_detected
        is True when the metric changed by less than 1% (relative) between the
        third-to-last and the last entry.

    Raises:
        ValueError: if history is empty.
    """
    if len(history) == 0:
        raise ValueError("history is empty; run at least one iteration before analyzing stopping points")

    iterations = [h['iteration'] for h in history]
    val_metric = [h['val_accuracy'] for h in history]
    metric_name = 'Validation Accuracy' if task_type == 'classification' else 'Validation R²'

    # Find best performance (accuracy and R² are both "higher is better")
    best_idx = int(np.argmax(val_metric))
    best_iteration = iterations[best_idx]
    best_metric = val_metric[best_idx]

    # Check for plateau (less than 1% relative change over the last 3 entries)
    if len(val_metric) >= 3:
        baseline = abs(val_metric[-3])
        delta = val_metric[-1] - val_metric[-3]
        if baseline == 0:
            # Relative change is undefined for a zero baseline (e.g. R² == 0);
            # only an unchanged metric counts as a plateau in that case.
            plateau_detected = bool(delta == 0)
        else:
            plateau_detected = bool(abs(delta / baseline) < 0.01)
    else:
        plateau_detected = False

    print(f"\n=== Stopping Point Analysis ===")
    print(f"Best {metric_name}: {best_metric:.4f} at iteration {best_iteration}")
    print(f"Current {metric_name}: {val_metric[-1]:.4f} at iteration {iterations[-1]}")
    print(f"Plateau detected: {plateau_detected}")

    if plateau_detected:
        print("RECOMMENDATION: Consider stopping - performance has plateaued")
    elif best_iteration == iterations[-1]:
        print("RECOMMENDATION: Continue training - still improving")
    else:
        print(f"RECOMMENDATION: Consider stopping - best performance was at iteration {best_iteration}")

    return best_iteration, best_metric, plateau_detected

# best_iter, best_perf, plateau = analyze_stopping_points(history, 'classification')

In [None]:
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Classification dataset prepared by this cell: 'simple', 'medium', or 'complex'
DATASET_CHOICE = 'simple'
(processed_data,
 label_encoders,
 target_column,
 dataset_name) = select_classification_dataset(DATASET_CHOICE)

# Label vector and feature matrix as NumPy arrays
y = processed_data[target_column].values
X = processed_data.drop(target_column, axis=1).values

# Hold out 20% for testing, then 25% of the remainder for validation
# (60% pool / 20% validation / 20% test overall), stratified on the label.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train_pool, X_val, y_train_pool, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

# Min-max scaling: statistics come from the training pool only and are then
# re-used for the validation and test features.
scaler = MinMaxScaler()
X_train_pool_scaled = scaler.fit_transform(X_train_pool)
X_val_scaled, X_test_scaled = (scaler.transform(part) for part in (X_val, X_test))

# Float tensors for both features and labels (BCELoss needs float targets)
X_train_pool_tensor, X_val_tensor, X_test_tensor = (
    torch.FloatTensor(arr) for arr in (X_train_pool_scaled, X_val_scaled, X_test_scaled)
)
y_train_pool_tensor, y_val_tensor, y_test_tensor = (
    torch.FloatTensor(arr) for arr in (y_train_pool, y_val, y_test)
)

# Metadata about the prepared dataset, kept for later cells
dataset_info = dict(
    name=dataset_name,
    choice=DATASET_CHOICE,
    target_column=target_column,
    features=X.shape[1],
    samples=X.shape[0],
    classes=len(np.unique(y)),
)


In [None]:
# Active-learning schedule: start from 5% of the pool and add `query_size`
# samples per iteration until roughly 80% of the pool is labeled.
training_pool_size = len(X_train_pool_tensor)
target_samples = int(0.8 * training_pool_size)
initial_size = int(0.05 * training_pool_size)
query_size = 5
n_iterations = (target_samples - initial_size) // query_size + 1
# Spread a budget of 1500 epochs over all iterations. Clamp to 1 so that a large
# pool (n_iterations > 1500) cannot silently produce epochs_per_iteration == 0.
epochs_per_iteration = max(1, 1500 // n_iterations)
print(f"Training pool size: {training_pool_size}")
print(f"Target samples (80%): {target_samples}")
print(f"Initial size (5%): {initial_size}")
print(f"Query size: {query_size}")
print(f"Number of iterations: {n_iterations}")
print(f"Final labeled size: {initial_size + (n_iterations-1) * query_size}")

# Hyperparameters shared by the exploratory run and the re-run below, kept in
# one place so the two runs cannot drift apart.
run_kwargs = dict(
    uncertainty_method='least_confidence',
    initial_size=initial_size,
    query_size=query_size,
    epochs_per_iteration=epochs_per_iteration,
    learning_rate=0.01,
    weight_decay=0.0001,
    hidden_size=32,
    classification=True,
)

# Exploratory run over the full schedule
model, history = run_active_learning_with_visualization(n_iterations=n_iterations, **run_kwargs)

best_iter, best_perf, plateau = analyze_stopping_points(history, 'classification')
print(f"Best iteration: {best_iter}, Best performance: {best_perf:.4f}, Plateau: {plateau}")

# Re-run with the iteration count set to the best iteration found above.
# NOTE(review): best_iter is the 'iteration' value stored in history; whether it
# is 0- or 1-based is not visible here - confirm this is not off by one (or 0).
model, history = run_active_learning_with_visualization(n_iterations=best_iter, **run_kwargs)


In [None]:
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Classification dataset prepared by this cell: 'simple', 'medium', or 'complex'
DATASET_CHOICE = 'medium'
(processed_data,
 label_encoders,
 target_column,
 dataset_name) = select_classification_dataset(DATASET_CHOICE)

# Label vector and feature matrix as NumPy arrays
y = processed_data[target_column].values
X = processed_data.drop(target_column, axis=1).values

# Hold out 20% for testing, then 25% of the remainder for validation
# (60% pool / 20% validation / 20% test overall), stratified on the label.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train_pool, X_val, y_train_pool, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

# Min-max scaling: statistics come from the training pool only and are then
# re-used for the validation and test features.
scaler = MinMaxScaler()
X_train_pool_scaled = scaler.fit_transform(X_train_pool)
X_val_scaled, X_test_scaled = (scaler.transform(part) for part in (X_val, X_test))

# Float tensors for both features and labels (BCELoss needs float targets)
X_train_pool_tensor, X_val_tensor, X_test_tensor = (
    torch.FloatTensor(arr) for arr in (X_train_pool_scaled, X_val_scaled, X_test_scaled)
)
y_train_pool_tensor, y_val_tensor, y_test_tensor = (
    torch.FloatTensor(arr) for arr in (y_train_pool, y_val, y_test)
)

# Metadata about the prepared dataset, kept for later cells
dataset_info = dict(
    name=dataset_name,
    choice=DATASET_CHOICE,
    target_column=target_column,
    features=X.shape[1],
    samples=X.shape[0],
    classes=len(np.unique(y)),
)

# Active-learning schedule: start from 5% of the pool and add `query_size`
# samples per iteration until roughly 80% of the pool is labeled.
training_pool_size = len(X_train_pool_tensor)
target_samples = int(0.8 * training_pool_size)
initial_size = int(0.05 * training_pool_size)
query_size = 5
n_iterations = (target_samples - initial_size) // query_size + 1
# Spread a budget of 1000 epochs over all iterations. Clamp to 1 so that a large
# pool (n_iterations > 1000) cannot silently produce epochs_per_iteration == 0.
epochs_per_iteration = max(1, 1000 // n_iterations)
print(f"Training pool size: {training_pool_size}")
print(f"Target samples (80%): {target_samples}")
print(f"Initial size (5%): {initial_size}")
print(f"Query size: {query_size}")
print(f"Number of iterations: {n_iterations}")
print(f"Final labeled size: {initial_size + (n_iterations-1) * query_size}")

# Hyperparameters shared by the exploratory run and the re-run below, kept in
# one place so the two runs cannot drift apart.
run_kwargs = dict(
    uncertainty_method='least_confidence',
    initial_size=initial_size,
    query_size=query_size,
    epochs_per_iteration=epochs_per_iteration,
    learning_rate=0.01,
    weight_decay=0.01,
    hidden_size=256,
    classification=True,
)

# Exploratory run over the full schedule
model, history = run_active_learning_with_visualization(n_iterations=n_iterations, **run_kwargs)

best_iter, best_perf, plateau = analyze_stopping_points(history, 'classification')
print(f"Best iteration: {best_iter}, Best performance: {best_perf:.4f}, Plateau: {plateau}")

# Re-run with the iteration count set to the best iteration found above.
# NOTE(review): best_iter is the 'iteration' value stored in history; whether it
# is 0- or 1-based is not visible here - confirm this is not off by one (or 0).
model, history = run_active_learning_with_visualization(n_iterations=best_iter, **run_kwargs)

In [None]:
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Classification dataset prepared by this cell: 'simple', 'medium', or 'complex'
DATASET_CHOICE = 'complex'
(processed_data,
 label_encoders,
 target_column,
 dataset_name) = select_classification_dataset(DATASET_CHOICE)

# Label vector and feature matrix as NumPy arrays
y = processed_data[target_column].values
X = processed_data.drop(target_column, axis=1).values

# Hold out 20% for testing, then 25% of the remainder for validation
# (60% pool / 20% validation / 20% test overall), stratified on the label.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train_pool, X_val, y_train_pool, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

# Min-max scaling: statistics come from the training pool only and are then
# re-used for the validation and test features.
scaler = MinMaxScaler()
X_train_pool_scaled = scaler.fit_transform(X_train_pool)
X_val_scaled, X_test_scaled = (scaler.transform(part) for part in (X_val, X_test))

# Float tensors for both features and labels (BCELoss needs float targets)
X_train_pool_tensor, X_val_tensor, X_test_tensor = (
    torch.FloatTensor(arr) for arr in (X_train_pool_scaled, X_val_scaled, X_test_scaled)
)
y_train_pool_tensor, y_val_tensor, y_test_tensor = (
    torch.FloatTensor(arr) for arr in (y_train_pool, y_val, y_test)
)

# Metadata about the prepared dataset, kept for later cells
dataset_info = dict(
    name=dataset_name,
    choice=DATASET_CHOICE,
    target_column=target_column,
    features=X.shape[1],
    samples=X.shape[0],
    classes=len(np.unique(y)),
)

# Active-learning schedule: start from 5% of the pool and add `query_size`
# samples per iteration until roughly 80% of the pool is labeled.
training_pool_size = len(X_train_pool_tensor)
target_samples = int(0.8 * training_pool_size)
initial_size = int(0.05 * training_pool_size)
query_size = 5
n_iterations = (target_samples - initial_size) // query_size + 1
# Spread a budget of 1000 epochs over all iterations. Clamp to 1 so that a large
# pool (n_iterations > 1000) cannot silently produce epochs_per_iteration == 0.
epochs_per_iteration = max(1, 1000 // n_iterations)
print(f"Training pool size: {training_pool_size}")
print(f"Target samples (80%): {target_samples}")
print(f"Initial size (5%): {initial_size}")
print(f"Query size: {query_size}")
print(f"Number of iterations: {n_iterations}")
print(f"Final labeled size: {initial_size + (n_iterations-1) * query_size}")

# Hyperparameters shared by the exploratory run and the re-run below, kept in
# one place so the two runs cannot drift apart.
run_kwargs = dict(
    uncertainty_method='least_confidence',
    initial_size=initial_size,
    query_size=query_size,
    epochs_per_iteration=epochs_per_iteration,
    learning_rate=0.005,
    weight_decay=0.0001,
    hidden_size=128,
    classification=True,
)

# Exploratory run over the full schedule
model, history = run_active_learning_with_visualization(n_iterations=n_iterations, **run_kwargs)

best_iter, best_perf, plateau = analyze_stopping_points(history, 'classification')
print(f"Best iteration: {best_iter}, Best performance: {best_perf:.4f}, Plateau: {plateau}")

# Re-run with the iteration count set to the best iteration found above.
# NOTE(review): best_iter is the 'iteration' value stored in history; whether it
# is 0- or 1-based is not visible here - confirm this is not off by one (or 0).
model, history = run_active_learning_with_visualization(n_iterations=best_iter, **run_kwargs)

In [None]:
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Regression dataset prepared by this cell
DATASET_CHOICE = 'simple'
(processed_data,
 label_encoders,
 target_column,
 dataset_name) = select_regression_dataset(
    choice='simple',
    df_reg_simple=df_reg_simple,
    df_reg_med=df_reg_med,
    df_reg_complex=df_reg_complex,
)

# Target vector and feature matrix as NumPy arrays
y = processed_data[target_column].values
X = processed_data.drop(target_column, axis=1).values

# Hold out 20% for testing, then 25% of the remainder for validation
# (60% pool / 20% validation / 20% test overall).
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train_pool, X_val, y_train_pool, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42
)

# Feature scaling: min-max statistics are taken from the training pool only
feature_scaler = MinMaxScaler()
X_train_pool_scaled = feature_scaler.fit_transform(X_train_pool)
X_val_scaled, X_test_scaled = (feature_scaler.transform(part) for part in (X_val, X_test))

# Target scaling: same approach; the scaler expects a 2-D column, the learner a flat vector
target_scaler = MinMaxScaler()
y_train_pool_scaled = target_scaler.fit_transform(y_train_pool.reshape(-1, 1)).flatten()
y_val_scaled, y_test_scaled = (
    target_scaler.transform(vec.reshape(-1, 1)).flatten() for vec in (y_val, y_test)
)

print(f"Original target range: {y_train_pool.min():.2f} to {y_train_pool.max():.2f}")
print(f"Scaled target range: {y_train_pool_scaled.min():.3f} to {y_train_pool_scaled.max():.3f}")

# Float tensors for scaled features and scaled targets
X_train_pool_tensor, X_val_tensor, X_test_tensor = (
    torch.FloatTensor(arr) for arr in (X_train_pool_scaled, X_val_scaled, X_test_scaled)
)
y_train_pool_tensor, y_val_tensor, y_test_tensor = (
    torch.FloatTensor(arr) for arr in (y_train_pool_scaled, y_val_scaled, y_test_scaled)
)

print("Data shapes:")
print(f"X_train_pool: {X_train_pool_tensor.shape}")
print(f"y_train_pool: {y_train_pool_tensor.shape} (dtype: {y_train_pool_tensor.dtype})")
print(f"Target values range: {y_train_pool_tensor.min():.3f} to {y_train_pool_tensor.max():.3f}")

# Metadata for later cells; both scalers are kept so predictions can be inverse-transformed
dataset_info = dict(
    name=dataset_name,
    choice=DATASET_CHOICE,
    target_column=target_column,
    features=X.shape[1],
    samples=X.shape[0],
    target_range=f"{y.min():.2f} - {y.max():.2f}",
    feature_scaler=feature_scaler,
    target_scaler=target_scaler,
)

In [None]:
# Active-learning schedule: start from 10% of the pool and add 1% of the pool
# per iteration until roughly 80% of the pool is labeled.
training_pool_size = len(X_train_pool_tensor)
target_samples = int(0.8 * training_pool_size)
initial_size = int(0.10 * training_pool_size)
# int(0.01 * pool) is 0 for pools with fewer than 100 rows, which would make the
# division below raise ZeroDivisionError; always query at least one sample.
query_size = max(1, int(0.01 * training_pool_size))
n_iterations = (target_samples - initial_size) // query_size + 1
# Spread a budget of 1500 epochs over all iterations, never fewer than 1 each.
epochs_per_iteration = max(1, 1500 // n_iterations)
print(f"Training pool size: {training_pool_size}")
print(f"Target samples (80%): {target_samples}")
print(f"Initial size (10%): {initial_size}")
print(f"Query size: {query_size}")
print(f"Number of iterations: {n_iterations}")
print(f"Final labeled size: {initial_size + (n_iterations-1) * query_size}")

# Hyperparameters shared by the exploratory run and the re-run below.
# With classification=False, run_active_learning_with_visualization only echoes
# uncertainty_method and does not forward weight_decay / hidden_size to the
# ensemble regression learner (weight_decay is used for the evaluation optimizer only).
run_kwargs = dict(
    uncertainty_method='ensemble_regression',
    initial_size=initial_size,
    query_size=query_size,
    epochs_per_iteration=epochs_per_iteration,
    learning_rate=0.01,
    weight_decay=0.01,
    hidden_size=64,
    classification=False,
)

# Exploratory run over the full schedule
model, history = run_active_learning_with_visualization(n_iterations=n_iterations, **run_kwargs)

best_iter, best_perf, plateau = analyze_stopping_points(history, 'regression')
print(f"Best iteration: {best_iter}, Best performance: {best_perf:.4f}, Plateau: {plateau}")

# Re-run with the iteration count set to the best iteration found above.
# NOTE(review): best_iter is the 'iteration' value stored in history; whether it
# is 0- or 1-based is not visible here - confirm this is not off by one (or 0).
model, history = run_active_learning_with_visualization(n_iterations=best_iter, **run_kwargs)


In [None]:
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Regression dataset prepared by this cell
DATASET_CHOICE = 'medium'
(processed_data,
 label_encoders,
 target_column,
 dataset_name) = select_regression_dataset(
    choice='medium',
    df_reg_simple=df_reg_simple,
    df_reg_med=df_reg_med,
    df_reg_complex=df_reg_complex,
)

# Target vector and feature matrix as NumPy arrays
y = processed_data[target_column].values
X = processed_data.drop(target_column, axis=1).values

# Hold out 20% for testing, then 25% of the remainder for validation
# (60% pool / 20% validation / 20% test overall).
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train_pool, X_val, y_train_pool, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42
)

# Feature scaling: min-max statistics are taken from the training pool only
feature_scaler = MinMaxScaler()
X_train_pool_scaled = feature_scaler.fit_transform(X_train_pool)
X_val_scaled, X_test_scaled = (feature_scaler.transform(part) for part in (X_val, X_test))

# Target scaling: same approach; the scaler expects a 2-D column, the learner a flat vector
target_scaler = MinMaxScaler()
y_train_pool_scaled = target_scaler.fit_transform(y_train_pool.reshape(-1, 1)).flatten()
y_val_scaled, y_test_scaled = (
    target_scaler.transform(vec.reshape(-1, 1)).flatten() for vec in (y_val, y_test)
)

print(f"Original target range: {y_train_pool.min():.2f} to {y_train_pool.max():.2f}")
print(f"Scaled target range: {y_train_pool_scaled.min():.3f} to {y_train_pool_scaled.max():.3f}")

# Float tensors for scaled features and scaled targets
X_train_pool_tensor, X_val_tensor, X_test_tensor = (
    torch.FloatTensor(arr) for arr in (X_train_pool_scaled, X_val_scaled, X_test_scaled)
)
y_train_pool_tensor, y_val_tensor, y_test_tensor = (
    torch.FloatTensor(arr) for arr in (y_train_pool_scaled, y_val_scaled, y_test_scaled)
)

print("Data shapes:")
print(f"X_train_pool: {X_train_pool_tensor.shape}")
print(f"y_train_pool: {y_train_pool_tensor.shape} (dtype: {y_train_pool_tensor.dtype})")
print(f"Target values range: {y_train_pool_tensor.min():.3f} to {y_train_pool_tensor.max():.3f}")

# Metadata for later cells; both scalers are kept so predictions can be inverse-transformed
dataset_info = dict(
    name=dataset_name,
    choice=DATASET_CHOICE,
    target_column=target_column,
    features=X.shape[1],
    samples=X.shape[0],
    target_range=f"{y.min():.2f} - {y.max():.2f}",
    feature_scaler=feature_scaler,
    target_scaler=target_scaler,
)

# Active-learning schedule: start from 5% of the pool and add `query_size`
# samples per iteration until roughly 80% of the pool is labeled.
training_pool_size = len(X_train_pool_tensor)
target_samples = int(0.8 * training_pool_size)
initial_size = int(0.05 * training_pool_size)
query_size = 5
n_iterations = (target_samples - initial_size) // query_size + 1
# Spread a budget of 1500 epochs over all iterations. Clamp to 1 so that a large
# pool (n_iterations > 1500) cannot silently produce epochs_per_iteration == 0.
epochs_per_iteration = max(1, 1500 // n_iterations)
print(f"Training pool size: {training_pool_size}")
print(f"Target samples (80%): {target_samples}")
print(f"Initial size (5%): {initial_size}")
print(f"Query size: {query_size}")
print(f"Number of iterations: {n_iterations}")
print(f"Final labeled size: {initial_size + (n_iterations-1) * query_size}")

# Hyperparameters shared by the exploratory run and the re-run below.
# With classification=False, run_active_learning_with_visualization only echoes
# uncertainty_method and does not forward weight_decay / hidden_size to the
# ensemble regression learner (weight_decay is used for the evaluation optimizer only).
run_kwargs = dict(
    uncertainty_method='ensemble_regression',
    initial_size=initial_size,
    query_size=query_size,
    epochs_per_iteration=epochs_per_iteration,
    learning_rate=0.01,
    weight_decay=0.01,
    hidden_size=64,
    classification=False,
)

# Exploratory run over the full schedule
model, history = run_active_learning_with_visualization(n_iterations=n_iterations, **run_kwargs)

best_iter, best_perf, plateau = analyze_stopping_points(history, 'regression')
print(f"Best iteration: {best_iter}, Best performance: {best_perf:.4f}, Plateau: {plateau}")

# Re-run with the iteration count set to the best iteration found above.
# NOTE(review): best_iter is the 'iteration' value stored in history; whether it
# is 0- or 1-based is not visible here - confirm this is not off by one (or 0).
model, history = run_active_learning_with_visualization(n_iterations=best_iter, **run_kwargs)

In [None]:
from sklearn.preprocessing import MinMaxScaler
DATASET_CHOICE = 'complex'  
# Get the selected dataset
processed_data, label_encoders, target_column, dataset_name = select_regression_dataset(choice='complex', df_reg_simple=df_reg_simple, df_reg_med=df_reg_med, df_reg_complex=df_reg_complex)

# Prepare features and target
X = processed_data.drop(target_column, axis=1).values
y = processed_data[target_column].values

# Split the data
from sklearn.model_selection import train_test_split

# Hold out 20% as the test set, then take 25% of the remaining 80% (20% of the
# whole) for validation. What is left (60%) is the active-learning pool.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train_pool, X_val, y_train_pool, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42
)

# Scale features to the [0, 1] range. The scaler is fitted on the pool only and
# then applied unchanged to all three splits.
feature_scaler = MinMaxScaler().fit(X_train_pool)
X_train_pool_scaled, X_val_scaled, X_test_scaled = (
    feature_scaler.transform(split) for split in (X_train_pool, X_val, X_test)
)

# Scale targets the same way. MinMaxScaler expects a 2-D input, so each target
# vector is reshaped to a single column and flattened back afterwards.
target_scaler = MinMaxScaler().fit(y_train_pool.reshape(-1, 1))
y_train_pool_scaled, y_val_scaled, y_test_scaled = (
    target_scaler.transform(targets.reshape(-1, 1)).flatten()
    for targets in (y_train_pool, y_val, y_test)
)

print(f"Original target range: {y_train_pool.min():.2f} to {y_train_pool.max():.2f}")
print(f"Scaled target range: {y_train_pool_scaled.min():.3f} to {y_train_pool_scaled.max():.3f}")

# Convert to PyTorch tensors. Regression needs float tensors for both the
# features and the (scaled) targets.
import torch
X_train_pool_tensor, X_val_tensor, X_test_tensor = map(
    torch.FloatTensor, (X_train_pool_scaled, X_val_scaled, X_test_scaled)
)
y_train_pool_tensor, y_val_tensor, y_test_tensor = map(
    torch.FloatTensor, (y_train_pool_scaled, y_val_scaled, y_test_scaled)
)

print(f"Data shapes:")
print(f"X_train_pool: {X_train_pool_tensor.shape}")
print(f"y_train_pool: {y_train_pool_tensor.shape} (dtype: {y_train_pool_tensor.dtype})")
print(f"Target values range: {y_train_pool_tensor.min():.3f} to {y_train_pool_tensor.max():.3f}")

# Keep a record of what was loaded, together with the fitted scalers so that
# scaled values can be mapped back to the original units later on.
dataset_info = dict(
    name=dataset_name,
    choice=DATASET_CHOICE,
    target_column=target_column,
    features=X.shape[1],   # number of feature columns
    samples=X.shape[0],    # number of rows before splitting
    target_range=f"{y.min():.2f} - {y.max():.2f}",
    feature_scaler=feature_scaler,
    target_scaler=target_scaler,
)

# Active-learning budget, all derived from the size of the training pool.
training_pool_size = len(X_train_pool_tensor)
target_samples = int(0.8 * training_pool_size)  # label at most ~80% of the pool
# Start from ~5% of the pool. int() floors to 0 for pools smaller than 20
# samples, so keep at least one initial sample.
initial_size = max(1, int(0.05 * training_pool_size))
query_size = 5  # samples added to the labeled set per iteration
# Number of iterations needed to grow from initial_size to target_samples in
# steps of query_size; clamped to 1 so the division below is always defined.
n_iterations = max(1, (target_samples - initial_size) // query_size + 1)
# Spread a fixed budget of 1500 epochs across the iterations. Floor division
# yields 0 once n_iterations exceeds 1500, so train for at least one epoch.
epochs_per_iteration = max(1, 1500 // n_iterations)
print(f"Training pool size: {training_pool_size}")
print(f"Target samples (80%): {target_samples}")
print(f"Initial size (5%): {initial_size}")
print(f"Query size: {query_size}")
print(f"Number of iterations: {n_iterations}")
print(f"Final labeled size: {initial_size + (n_iterations-1) * query_size}")



# Hyperparameters shared by both runs below; only n_iterations differs between them.
_regression_al_kwargs = dict(
    uncertainty_method='ensemble_regression',
    initial_size=initial_size,
    query_size=query_size,
    epochs_per_iteration=epochs_per_iteration,
    learning_rate=0.05,
    weight_decay=0.001,
    hidden_size=64,
    classification=False,
)

# First pass: run for the full iteration budget.
model, history = run_active_learning_with_visualization(
    n_iterations=n_iterations, **_regression_al_kwargs
)

# Inspect the history of the first pass to pick a stopping point.
best_iter, best_perf, plateau = analyze_stopping_points(history, 'regression')
print(f"Best iteration: {best_iter}, Best performance: {best_perf:.4f}, Plateau: {plateau}")

# Second pass: repeat the run with n_iterations set to best_iter.
# Note that this overwrites model and history from the first pass.
model, history = run_active_learning_with_visualization(
    n_iterations=best_iter, **_regression_al_kwargs
)
