# Active Learning Experiments

This notebook demonstrates the core active learning experiments comparing different strategies on our biomedical datasets.

## Experimental Design:

### Datasets:
1. **BBB (Blood-Brain Barrier)**: 1,976 molecular samples, PCA-reduced features
2. **Breast Cancer**: 569 clinical samples, original 30 features

### Active Learning Strategies:
1. **Random Forest (RF)** with uncertainty sampling
2. **Query-by-Committee (QBC)** with vote entropy

### Sampling Methods:
- **First 5**: Start with the first 5 samples
- **Stratified 5**: Start with 5 samples chosen by stratified sampling

### Evaluation:
- Matthews Correlation Coefficient (MCC)
- F1 Score  
- ROC AUC
- Delta MCC: difference in MCC relative to the model trained on the full training set

In [ ]:
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import multiprocessing
from joblib import Parallel, delayed

# Machine learning
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import matthews_corrcoef, f1_score, roc_auc_score
from scipy.stats import mode, entropy

# Custom modules
from active_learning.strategies import UncertaintySampling, QBCVoteEntropy
from active_learning.learners import RandomForestAL, QueryByCommitteeAL
from active_learning.experiments import ALExperiment
from evaluation.metrics import ModelEvaluator

print("Libraries imported successfully!")

## 1. Load Processed Data

In [ ]:
# Location of the preprocessed .npy arrays, relative to this notebook
processed_dir = "../data/processed"

# BBB features: try the PCA-reduced arrays first; if either file is missing,
# load the un-reduced feature arrays for BOTH splits so train/test stay consistent.
try:
    X_bbb_train = np.load(f"{processed_dir}/X_bbb_train_pca.npy")
    X_bbb_test = np.load(f"{processed_dir}/X_bbb_test_pca.npy")
except FileNotFoundError:
    X_bbb_train = np.load(f"{processed_dir}/X_bbb_train.npy")
    X_bbb_test = np.load(f"{processed_dir}/X_bbb_test.npy")
    print("Loaded original BBB data (PCA not found)")
else:
    print("Loaded PCA-reduced BBB data")

# BBB labels are the same files regardless of which feature set was loaded
y_bbb_train = np.load(f"{processed_dir}/y_bbb_train.npy")
y_bbb_test = np.load(f"{processed_dir}/y_bbb_test.npy")

# Breast Cancer features and labels
X_bc_train = np.load(f"{processed_dir}/X_bc_train.npy")
X_bc_test = np.load(f"{processed_dir}/X_bc_test.npy")
y_bc_train = np.load(f"{processed_dir}/y_bc_train.npy")
y_bc_test = np.load(f"{processed_dir}/y_bc_test.npy")

# Summary: array shapes per split
print("Dataset shapes:")
print(f"BBB - Train: {X_bbb_train.shape}, Test: {X_bbb_test.shape}")
print(f"BC  - Train: {X_bc_train.shape}, Test: {X_bc_test.shape}")

# Summary: per-class sample counts
# NOTE(review): np.bincount needs non-negative integer labels - confirm the
# saved y arrays have an integer dtype.
print("Class distributions:")
print(f"BBB train: {np.bincount(y_bbb_train)}, test: {np.bincount(y_bbb_test)}")
print(f"BC train:  {np.bincount(y_bc_train)}, test: {np.bincount(y_bc_test)}")

## 2. Experimental Configuration

In [ ]:
# Per-dataset active learning settings. Both dicts carry the same keys in the
# same order; in this cell they differ only in 'batch_size'.
# NOTE(review): 'stratified_seeds' lists 4 seeds while 'n_runs' is 5 - confirm
# how the experiment code pairs seeds with runs.
AL_CONFIG_BBB = dict(
    max_queries=-1,                      # -1: no query limit
    stop_ratio=1.0,                      # use 100% of the pool
    batch_size=20,                       # samples queried per iteration
    n_runs=5,                            # reduced for the notebook demo (original: 10)
    stratified_seeds=[42, 10, 50, 100],
    rf_params=dict(n_estimators=100, random_state=42),
    qbc_params=dict(),
    n_jobs=min(4, multiprocessing.cpu_count()),  # cap parallel jobs at 4 for notebook use
)

AL_CONFIG_BC = dict(
    max_queries=-1,
    stop_ratio=1.0,
    batch_size=10,                       # samples queried per iteration
    n_runs=5,                            # reduced for the notebook demo
    stratified_seeds=[42, 10, 50, 100],
    rf_params=dict(n_estimators=100, random_state=42),
    qbc_params=dict(),
    n_jobs=min(4, multiprocessing.cpu_count()),
)

# Echo the effective settings so they are recorded in the notebook output
print("Active Learning Configuration:")
print(f"BBB Config: {AL_CONFIG_BBB}")
print(f"BC Config:  {AL_CONFIG_BC}")
print(f"Available CPU cores: {multiprocessing.cpu_count()}")

## 3. Helper Functions for Active Learning

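Because full active learning experiments are complex and slow to run, this notebook only illustrates the underlying idea: the cell below compares a random forest trained on the full training set with one trained on just the first 10 training samples, without running any query strategy. For the full experiments, use the scripts in the `scripts/` directory.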

In [ ]:
# Quick demonstration of active learning concept
def _fit_rf_and_score(X_fit, y_fit, X_test, y_test):
    """Fit a fixed-seed random forest on (X_fit, y_fit); return (MCC, weighted F1) on the test set."""
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_test)
    return (
        matthews_corrcoef(y_test, predictions),
        f1_score(y_test, predictions, average='weighted'),
    )


def demonstrate_al_concept(X_train, y_train, X_test, y_test, dataset_name, initial_size=10):
    """Compare a random forest trained on all data with one trained on a small seed set.

    This is a simplified illustration of the motivation for active learning:
    no query strategy is run. The "active learning" model is simply fitted on
    the first ``initial_size`` rows of the training data, and its test-set
    scores are compared with those of a model fitted on the whole training set.
    Results are printed and also returned.

    Args:
        X_train: Training features; must support ``len()`` and row slicing.
        y_train: Training labels aligned with ``X_train``.
        X_test: Test features.
        y_test: Test labels aligned with ``X_test``.
        dataset_name: Label used in the printed header.
        initial_size: Number of leading training rows used for the small
            model. Defaults to 10; clamped to ``len(X_train)`` so a short
            training set no longer raises ``IndexError``.

    Returns:
        dict with keys ``'full_mcc'``, ``'al_mcc'``, ``'full_f1'``,
        ``'al_f1'``, ``'samples_used'`` and ``'total_samples'``.

    Raises:
        ValueError: If ``initial_size`` is smaller than 1.
    """
    if initial_size < 1:
        raise ValueError(f"initial_size must be >= 1, got {initial_size}")

    print(f"\n=== {dataset_name} Active Learning Demo ===")

    # 1. Baseline: model trained on the complete training set
    full_mcc, full_f1 = _fit_rf_and_score(X_train, y_train, X_test, y_test)

    print(f"Full Model (trained on {len(X_train)} samples):")
    print(f"  MCC: {full_mcc:.4f}")
    print(f"  F1:  {full_f1:.4f}")

    # 2. Small model: trained on the leading rows only.
    # NOTE(review): the first rows are taken as-is, so the seed set is not
    # guaranteed to contain every class.
    n_initial = min(initial_size, len(X_train))
    al_mcc, al_f1 = _fit_rf_and_score(
        X_train[:n_initial], y_train[:n_initial], X_test, y_test
    )

    print(f"Active Learning Model (trained on {n_initial} samples):")
    print(f"  MCC: {al_mcc:.4f}")
    print(f"  F1:  {al_f1:.4f}")

    # Performance comparison (negative means the small model is worse)
    mcc_diff = al_mcc - full_mcc
    print(f"Performance difference (AL - Full): {mcc_diff:.4f}")

    # The "comparable" band (|diff| < 0.05) is checked first, so a small
    # positive difference is reported as comparable rather than outperforming.
    if abs(mcc_diff) < 0.05:
        print("✓ Active learning achieved comparable performance with much less data!")
    elif mcc_diff > 0:
        print("✓ Active learning outperformed the full model!")
    else:
        print("• Active learning performance below full model (expected with limited data)")

    return {
        'full_mcc': full_mcc,
        'al_mcc': al_mcc,
        'full_f1': full_f1,
        'al_f1': al_f1,
        'samples_used': n_initial,
        'total_samples': len(X_train)
    }

# Execute the demo once per dataset; each call prints its report and returns a summary dict
bbb_demo = demonstrate_al_concept(
    X_train=X_bbb_train,
    y_train=y_bbb_train,
    X_test=X_bbb_test,
    y_test=y_bbb_test,
    dataset_name="BBB Dataset",
)
bc_demo = demonstrate_al_concept(
    X_train=X_bc_train,
    y_train=y_bc_train,
    X_test=X_bc_test,
    y_test=y_bc_test,
    dataset_name="Breast Cancer Dataset",
)

## 4. For Complete Experiments

For full active learning experiments with multiple runs, parallel processing, and comprehensive evaluation, use the command-line scripts:

```bash
# Run complete experiments
cd ../scripts

# Prepare data
python prepare_data.py --datasets bbb bc --output-dir ../data/processed

# Run active learning experiments
python run_experiments.py --datasets bbb bc --strategies rf qbc --sampling first5 stratified --runs 10

# Evaluate results
python evaluate.py --input-dir ../results --output-dir ../results/analysis

# Generate reports
python generate_report.py --input-dir ../results --output-dir ../results/reports
```

This notebook demonstrates the core concepts. The scripts provide the full experimental pipeline used in the research.