# Better version: augmentation first, then the rest of the work (Final Solution)

In [5]:
# Medical Image Classification using Logistic Regression with VGG16 Feature Extraction
# This code classifies 4 skin diseases: Monkeypox, Pemphigus, Seborrheic keratosis, Squamous cell carcinoma

# !pip install opencv-contrib-python scikit-learn
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
from tensorflow.keras.applications import VGG16
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from PIL import Image
import shutil
import cv2
import pickle
import gc  # For garbage collection
import math
import time
from sklearn.neighbors import KNeighborsClassifier
# ========================================================================================
# SECTION 1: BASIC SETUP AND PATHS
# ========================================================================================

# The four disease classes; each name is also the class sub-folder name on disk
diseases = [
    'Monkeypox',
    'Pemphigus',
    'Seborrheic keratosis',
    'Squamous cell carcinoma',
]

# Image geometry and batching
img_size = (224, 224)  # every image is resized to 224x224 (the VGG16 input size used below)
batch_size = 32        # number of images handled per batch

# Dataset locations: read-only input and the writable folder for the balanced copy
original_dataset_path = '/kaggle/input/four-diseases-dataset/Dataset'
augmented_dataset_path = '/kaggle/working/augmented_dataset'

# ========================================================================================
# SECTION 2: DATA AUGMENTATION TO BALANCE DATASET
# ========================================================================================

print("🚀 Starting data augmentation process...")

# Create output directory
os.makedirs(augmented_dataset_path, exist_ok=True)

# Random transforms used to synthesise extra images for under-represented classes
augmenter = ImageDataGenerator(
    rotation_range=20,        # Rotate images by up to 20 degrees
    width_shift_range=0.1,    # Shift images horizontally by up to 10%
    height_shift_range=0.1,   # Shift images vertically by up to 10%
    zoom_range=0.2,           # Zoom in/out by up to 20%
    horizontal_flip=True,     # Randomly flip images horizontally
    fill_mode='nearest'       # Fill empty pixels with nearest neighbor values
)

target_total = 500  # Target number of images per class

# File extensions (lower-case) that count as images when scanning a class folder
image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')

# Process each disease class: copy the originals, then top up with augmented copies
for disease in diseases:
    print(f"\n📂 Processing {disease}...")

    # Set up input and output directories for this disease
    input_class_dir = os.path.join(original_dataset_path, disease)
    output_class_dir = os.path.join(augmented_dataset_path, disease)
    os.makedirs(output_class_dir, exist_ok=True)

    # Get list of all image files in this class
    image_files = [f for f in os.listdir(input_class_dir)
                   if f.lower().endswith(image_extensions)]
    original_count = len(image_files)

    print(f"   📊 Found {original_count} original images")

    if original_count == 0:
        print(f"   ⚠️  No images found in {disease} folder!")
        continue

    # Copy all original images to output directory (best effort: a failed copy is reported, not fatal)
    print(f"   📦 Copying {original_count} original images...")
    for img_file in image_files:
        src = os.path.join(input_class_dir, img_file)
        dst = os.path.join(output_class_dir, img_file)
        try:
            shutil.copy2(src, dst)  # copy2 preserves metadata
        except Exception as e:
            print(f"   ❌ Error copying {img_file}: {e}")

    # Calculate how many augmented images we need
    extra_needed = max(0, target_total - original_count)

    if extra_needed == 0:
        print(f"   ✅ {disease} already has {original_count} images (>= target)")
        continue

    print(f"   🔄 Need to generate {extra_needed} augmented images...")

    # Spread the work evenly: every original gets `augs_per_image` variants and the
    # first `remaining_augs` originals get one more, so the per-image counts sum to
    # exactly `extra_needed`.  (Forcing a minimum of 1 here would double-count the
    # remainder whenever extra_needed < original_count.)
    augs_per_image, remaining_augs = divmod(extra_needed, original_count)

    count = 0

    # Generate augmented images
    for i, img_file in enumerate(image_files):
        if count >= extra_needed:
            break

        # Determine how many augmentations this image contributes
        num_augs = augs_per_image + (1 if i < remaining_augs else 0)
        if num_augs == 0:
            # No variants required from this image; skip the load entirely
            continue

        img_path = os.path.join(input_class_dir, img_file)

        try:
            # Load original image and add the batch dimension expected by flow()
            img = load_img(img_path, target_size=img_size)
            x = img_to_array(img)
            x = np.expand_dims(x, axis=0)

            # Iterator yielding one randomly transformed copy per next() call
            aug_iter = augmenter.flow(x, batch_size=1)

            for j in range(num_augs):
                if count >= extra_needed:
                    break

                try:
                    aug_img = next(aug_iter)[0].astype('uint8')
                    aug_pil = Image.fromarray(aug_img)

                    # File name derived from the original's stem plus the variant number
                    base_name = os.path.splitext(img_file)[0]
                    aug_name = f"{base_name}_aug_{j+1}.jpg"
                    aug_path = os.path.join(output_class_dir, aug_name)

                    aug_pil.save(aug_path, 'JPEG', quality=95)
                    count += 1

                    if count % 50 == 0:  # Progress update
                        print(f"   📈 Generated {count}/{extra_needed} augmented images...")

                except Exception as e:
                    print(f"   ❌ Error augmenting {img_file} (version {j+1}): {e}")
                    continue

        except Exception as e:
            print(f"   ❌ Error loading {img_file}: {e}")
            continue

    # Final count verification (counts every image file now present in the output folder)
    final_count = len([f for f in os.listdir(output_class_dir)
                       if f.lower().endswith(image_extensions)])
    print(f"   ✅ {disease} final count: {final_count} images")

    # Clear memory
    gc.collect()

print("\n🎉 Data augmentation completed!")

# ========================================================================================
# SECTION 3: CREATE DATA GENERATORS WITH AUGMENTED DATASET
# ========================================================================================

print("\n🔄 Creating data generators with augmented dataset...")

# Training-side generator: normalisation plus light random augmentation
datagen = ImageDataGenerator(
    rescale=1./255,         # Normalize pixel values from [0,255] to [0,1]
    validation_split=0.2,   # Reserve 20% of data for validation
    horizontal_flip=True,   # Additional augmentation for training
    zoom_range=0.1          # Additional augmentation for training
)

# Validation-side generator: normalisation only.  It uses the same validation_split
# on the same directory so it selects the complementary subset, but it must not
# flip/zoom, otherwise validation metrics are computed on randomly altered images.
val_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2
)

# Create training generator with augmented dataset
train_generator = datagen.flow_from_directory(
    augmented_dataset_path,     # Use augmented dataset
    target_size=img_size,
    batch_size=batch_size,
    class_mode='categorical',
    subset='training',          # 80% for training
    shuffle=True
)

# Create validation generator with augmented dataset (no random transforms, fixed order)
val_generator = val_datagen.flow_from_directory(
    augmented_dataset_path,     # Use augmented dataset
    target_size=img_size,
    batch_size=batch_size,
    class_mode='categorical',
    subset='validation',        # 20% for validation
    shuffle=False
)

# Print class information
print("\n📋 Class labels (folder name → label index):")
print("Training:", train_generator.class_indices)
print("Validation:", val_generator.class_indices)

print(f"\n📊 Dataset Summary:")
print(f"Training samples: {train_generator.samples}")
print(f"Validation samples: {val_generator.samples}")
print(f"Number of classes: {train_generator.num_classes}")

# ========================================================================================
# SECTION 4: VERIFY AUGMENTED DATASET
# ========================================================================================

# Report how many image files each class folder of the augmented dataset holds
print("\n🔍 Verifying augmented dataset balance:")
for disease in diseases:
    disease_dir = os.path.join(augmented_dataset_path, disease)

    # Guard clause: a missing class folder is reported and skipped
    if not os.path.exists(disease_dir):
        print(f"   {disease}: Directory not found!")
        continue

    count = sum(
        1
        for entry in os.listdir(disease_dir)
        if entry.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.tiff'))
    )
    print(f"   {disease}: {count} images")

print("\n✅ Setup complete! Ready for feature extraction and model training.")

# ========================================================================================
# SECTION 5: SAMPLE VISUALIZATION (Optional)
# ========================================================================================

def visualize_samples(generator, num_samples=8):
    """Show up to ``num_samples`` images from the next batch of ``generator``.

    Args:
        generator: Iterator whose ``next()`` yields ``(images, labels)`` and which
            exposes a ``class_indices`` mapping of class name -> label index.
            Labels are reduced with ``argmax``, i.e. treated as one-hot / score rows.
        num_samples: Maximum number of images to draw. Fewer are drawn when the
            batch is smaller.

    The images are laid out four per row; the number of rows grows with the number
    of images, so values above 8 no longer overflow a fixed 2x4 grid.
    """
    # Get a batch of images and labels
    images, labels = next(generator)

    n_shown = min(num_samples, len(images))
    if n_shown <= 0:
        # Nothing to draw (empty batch or non-positive request)
        return

    # Invert name -> index once so the lookup does not depend on dict ordering
    index_to_name = {idx: name for name, idx in generator.class_indices.items()}

    n_cols = 4
    n_rows = max(2, math.ceil(n_shown / n_cols))  # keep the original 2-row minimum
    plt.figure(figsize=(15, 4 * n_rows))

    for i in range(n_shown):
        plt.subplot(n_rows, n_cols, i + 1)
        plt.imshow(images[i])

        # Get class name from label
        class_idx = int(np.argmax(labels[i]))
        plt.title(f'{index_to_name[class_idx]}')
        plt.axis('off')

    plt.tight_layout()
    plt.show()

# Uncomment to visualize samples
# print("\n🖼️ Sample images from training set:")
# visualize_samples(train_generator)

# Announce the remaining pipeline stages, one per line
print(
    "\n🎯 Next steps:",
    "1. Extract features using VGG16",
    "2. Train logistic regression classifier",
    "3. Evaluate model performance",
    sep="\n",
)


# ========================================================================================
# SECTION 6: FEATURE EXTRACTION USING VGG16
# ========================================================================================

# Section banner: blank line, rule, title, rule
print("\n" + "=" * 50, "🔬 FEATURE EXTRACTION WITH VGG16", "=" * 50, sep="\n")

print("📥 Loading pre-trained VGG16 model for feature extraction...")

# Convolutional part of VGG16 used as a fixed feature extractor:
#   include_top=False  -> drop the network's own classification head (a separate
#                         classifier is trained on the extracted features instead)
#   weights='imagenet' -> start from the ImageNet pre-trained weights
#   input_shape        -> 224x224 RGB images
base_model = VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)

def extract_features(directory, sample_size=None):
    """
    Extract flattened VGG16 feature vectors for the images of every disease class.

    Images are pushed through ``base_model`` in batches of ``batch_size`` instead
    of one ``predict`` call per image. When ``sample_size`` limits a class, the
    subset is a seeded random sample of the sorted file list, so it is reproducible
    and not tied to the arbitrary order returned by ``os.listdir``.

    Args:
        directory: Path to folder containing one sub-folder per entry of ``diseases``
        sample_size: Maximum number of images to process per class (None = all)

    Returns:
        features: Numpy array with one flattened feature vector per processed image
        labels: Numpy array with the class (folder) name of each feature vector

    Notes:
        Images that fail to load are reported and skipped. A failure inside the
        model's ``predict`` call is not swallowed and propagates to the caller.
    """
    import random  # local import: stdlib helper used only for the reproducible sample

    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')

    # Use the spatial size the loaded model was built with rather than a second
    # hard-coded 224; assumes a channels-last input shape (None, height, width, channels)
    target_size = tuple(base_model.input_shape[1:3])

    features = []  # Store extracted features
    labels = []    # Store corresponding labels

    # Process each disease class
    for class_name in diseases:
        class_dir = os.path.join(directory, class_name)
        if not os.path.exists(class_dir):  # Skip if folder doesn't exist
            print(f"⚠️  Warning: {class_dir} does not exist!")
            continue

        # Sorted so that the sample below is the same on every run
        image_files = sorted(
            f for f in os.listdir(class_dir)
            if f.lower().endswith(image_extensions)
        )

        # Limit sample size if specified (for faster processing)
        if sample_size is not None and len(image_files) > sample_size:
            image_files = random.Random(42).sample(image_files, sample_size)

        print(f"🔍 Processing {len(image_files)} images from {class_name}...")

        # Walk the class in chunks so each predict() call handles a whole batch
        for start in range(0, len(image_files), batch_size):
            batch = []
            for img_file in image_files[start:start + batch_size]:
                img_path = os.path.join(class_dir, img_file)
                try:
                    img = load_img(img_path, target_size=target_size)
                    batch.append(img_to_array(img))
                except Exception as e:
                    print(f"❌ Error processing {img_file}: {e}")

            if not batch:
                # Every image of this chunk failed to load
                continue

            x = tf.keras.applications.vgg16.preprocess_input(np.stack(batch))  # VGG16 specific preprocessing
            batch_features = base_model.predict(x, verbose=0)

            # One flattened row per image, labelled with the folder name
            features.extend(batch_features.reshape(len(batch), -1))
            labels.extend([class_name] * len(batch))

    return np.array(features), np.array(labels)

# Extract features from augmented dataset
print("\n📊 Extracting features from augmented dataset...")

# Use 200 samples per class for faster processing (you can increase this)
X, y = extract_features(augmented_dataset_path, sample_size=200)

print(f"✅ Features extracted: {X.shape}")  # Shape: (total_samples, feature_dimensions)
print(f"✅ Labels: {len(y)}")               # Total number of labels

# Check if we have data
if len(X) == 0:
    print("❌ No features extracted! Check your dataset paths and image files.")
    # Abort with a non-zero status.  `exit()` is the interactive helper installed by
    # the `site` module and would report success (status 0) for this failure.
    raise SystemExit(1)

# Encode labels - Convert text labels to numerical labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"📋 Label mapping:")
for i, class_name in enumerate(label_encoder.classes_):
    print(f"   {class_name} → {i}")

# NOTE(review): the augmented folder mixes each original image with its `_aug_N`
# variants, and this split is purely random, so variants of one original can land on
# both sides of the split and make the test score optimistic.  Consider splitting by
# original image (group-aware split) - confirm before relying on these metrics.

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,           # Features and labels
    test_size=0.2,          # 20% for testing
    random_state=42,        # For reproducible results
    stratify=y_encoded      # Keep the class proportions equal in train and test
)

print(f"📈 Training set: {X_train.shape[0]} samples")
print(f"📊 Testing set: {X_test.shape[0]} samples")



🚀 Starting data augmentation process...

📂 Processing Monkeypox...
   📊 Found 100 original images
   📦 Copying 100 original images...
   🔄 Need to generate 400 augmented images...
   📈 Generated 50/400 augmented images...
   📈 Generated 100/400 augmented images...
   📈 Generated 150/400 augmented images...
   📈 Generated 200/400 augmented images...
   📈 Generated 250/400 augmented images...
   📈 Generated 300/400 augmented images...
   📈 Generated 350/400 augmented images...
   📈 Generated 400/400 augmented images...
   ✅ Monkeypox final count: 500 images

📂 Processing Pemphigus...
   📊 Found 100 original images
   📦 Copying 100 original images...
   🔄 Need to generate 400 augmented images...
   📈 Generated 50/400 augmented images...
   📈 Generated 100/400 augmented images...
   📈 Generated 150/400 augmented images...
   📈 Generated 200/400 augmented images...
   📈 Generated 250/400 augmented images...
   📈 Generated 300/400 augmented images...
   📈 Generated 350/400 augmented images..

# Compact and easy to understand model code

In [None]:
# Enhanced ML Models for Medical Image Classification with VGG16 Features
# Diseases: Monkeypox, Pemphigus, Seborrheic keratosis, Squamous cell carcinoma

# Thread-count limits for the numeric back-ends.  These only take effect if they are
# set before numpy/sklearn are imported, which is why this section comes first.
import os
import sys

# Request a single thread from every BLAS/OpenMP-style runtime that reads one of these
os.environ.update(dict.fromkeys(
    (
        'OMP_NUM_THREADS',
        'OPENBLAS_NUM_THREADS',
        'MKL_NUM_THREADS',
        'VECLIB_MAXIMUM_THREADS',
        'NUMEXPR_NUM_THREADS',
        'NUMBA_NUM_THREADS',
        'BLAS_NUM_THREADS',
        'LAPACK_NUM_THREADS',
        'ATLAS_NUM_THREADS',
        'GOTO_NUM_THREADS',
        'ACCELERATE_NUM_THREADS',
    ),
    '1',
))

# Additional OpenBLAS specific settings
os.environ.update({
    'OPENBLAS_VERBOSE': '0',
    'OPENBLAS_MAIN_FREE': '1',
})

# Label the current thread as 'MainThread' (this renames the thread object only)
import threading
threading.current_thread().name = 'MainThread'

# When the optional `mkl` module is installed, cap its thread pool at one as well
try:
    import mkl
except ImportError:
    pass
else:
    mkl.set_num_threads(1)

# Silence the noisiest warning categories before the heavy imports below
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc, f1_score
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.model_selection import cross_val_score, GridSearchCV
import time
import joblib
import psutil
from contextlib import contextmanager
import subprocess

# Additional warning suppression
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

@contextmanager
def suppress_blas_warnings():
    """Swap ``sys.stderr`` for an in-memory buffer for the duration of the block.

    Anything written through ``sys.stderr`` inside the ``with`` body goes to a
    throwaway ``StringIO``; the previous stream is put back on exit, including
    when the body raises.
    """
    import io
    import sys

    saved_stream = sys.stderr
    sys.stderr = io.StringIO()  # discard buffer; its contents are never read
    try:
        yield
    finally:
        # Always restore the stream that was active on entry
        sys.stderr = saved_stream

def validate_data(X_train, X_test, y_train, y_test):
    """Sanity-check the train/test arrays and print a short summary.

    Raises:
        ValueError: if any argument is ``None``, if train and test features differ
            in their second dimension, or if a feature array and its label array
            differ in length.

    NaN values in the features are reported with a warning only.
    """
    print("🔍 Validating data...")

    # All four inputs must be present
    for dataset in (X_train, X_test, y_train, y_test):
        if dataset is None:
            raise ValueError("❌ Missing required data. Please ensure X_train, X_test, y_train, y_test are defined.")

    # Train and test must share the feature dimension
    n_features = X_train.shape[1]
    if n_features != X_test.shape[1]:
        raise ValueError("❌ Feature dimension mismatch between train and test sets.")

    # Each feature matrix needs exactly one label per row
    n_train = len(X_train)
    n_test = len(X_test)
    if n_train != len(y_train) or n_test != len(y_test):
        raise ValueError("❌ Sample count mismatch between features and labels.")

    # NaNs are tolerated but flagged
    if any(np.isnan(matrix).any() for matrix in (X_train, X_test)):
        print("⚠️ Warning: NaN values detected in features. Consider preprocessing.")

    print("✅ Data validation passed")
    print(f"   Training samples: {n_train}")
    print(f"   Test samples: {n_test}")
    print(f"   Features: {n_features}")
    print(f"   Classes: {len(np.unique(y_train))}")

def get_memory_usage():
    """Return the resident set size (RSS) of the current process in megabytes."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / 1024 / 1024  # bytes -> KB -> MB

def logistic_regression_model(X_train, X_test, y_train, y_test, tune_hyperparams=False):
    """
    Fit a logistic regression on standardised features and score it on the test set.

    With ``tune_hyperparams=True`` the estimator is wrapped in a 3-fold
    ``GridSearchCV`` over ``C`` and ``solver``.

    Returns:
        ``(model, y_pred, accuracy, weighted_f1, elapsed_seconds, memory_delta_mb,
        scaler)``; on any exception the error is printed and
        ``(None, None, 0, 0, 0, 0, None)`` is returned instead.
    """
    print("🔵 Training Logistic Regression...")
    t0 = time.time()
    mem0 = get_memory_usage()

    try:
        # Standardise using statistics from the training split only
        scaler = StandardScaler()
        train_scaled = scaler.fit_transform(X_train)
        test_scaled = scaler.transform(X_test)

        if not tune_hyperparams:
            model = LogisticRegression(random_state=42, max_iter=1000, C=1.0)
        else:
            model = GridSearchCV(
                LogisticRegression(random_state=42, max_iter=1000),
                {'C': [0.1, 1.0, 10.0], 'solver': ['liblinear', 'lbfgs']},
                cv=3,
                n_jobs=1,
            )

        model.fit(train_scaled, y_train)

        # Score on the held-out split
        y_pred = model.predict(test_scaled)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        # Elapsed time covers scaling, fitting and prediction
        elapsed = time.time() - t0
        mem_delta = get_memory_usage() - mem0

    except Exception as exc:
        print(f"❌ Error in Logistic Regression: {str(exc)}")
        return None, None, 0, 0, 0, 0, None

    return model, y_pred, accuracy, f1, elapsed, mem_delta, scaler

def random_forest_model(X_train, X_test, y_train, y_test, tune_hyperparams=False):
    """
    Fit a random forest on the raw (unscaled) features and score it on the test set.

    With ``tune_hyperparams=True`` the estimator is wrapped in a 3-fold
    ``GridSearchCV`` over ``n_estimators`` and ``max_depth``.

    Returns:
        ``(model, y_pred, accuracy, weighted_f1, elapsed_seconds, memory_delta_mb,
        None)`` - the last slot is the scaler position, unused here. On any
        exception the error is printed and ``(None, None, 0, 0, 0, 0, None)`` is
        returned.
    """
    print("🌲 Training Random Forest...")
    t0 = time.time()
    mem0 = get_memory_usage()

    try:
        if not tune_hyperparams:
            model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=1)
        else:
            model = GridSearchCV(
                RandomForestClassifier(random_state=42, n_jobs=1),
                {'n_estimators': [50, 100], 'max_depth': [10, 20, None]},
                cv=3,
                n_jobs=1,
            )

        model.fit(X_train, y_train)

        # Score on the held-out split
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        elapsed = time.time() - t0
        mem_delta = get_memory_usage() - mem0

    except Exception as exc:
        print(f"❌ Error in Random Forest: {str(exc)}")
        return None, None, 0, 0, 0, 0, None

    return model, y_pred, accuracy, f1, elapsed, mem_delta, None

def svm_model(X_train, X_test, y_train, y_test, tune_hyperparams=False):
    """
    Fit a support vector classifier on standardised features and score it.

    The default model uses an RBF kernel with probability estimates enabled. With
    ``tune_hyperparams=True`` it is wrapped in a 3-fold ``GridSearchCV`` over ``C``
    and ``kernel``.

    Returns:
        ``(model, y_pred, accuracy, weighted_f1, elapsed_seconds, memory_delta_mb,
        scaler)``; on any exception the error is printed and
        ``(None, None, 0, 0, 0, 0, None)`` is returned instead.
    """
    print("⚡ Training SVM...")
    t0 = time.time()
    mem0 = get_memory_usage()

    try:
        # Standardise using statistics from the training split only
        scaler = StandardScaler()
        train_scaled = scaler.fit_transform(X_train)
        test_scaled = scaler.transform(X_test)

        if not tune_hyperparams:
            model = SVC(kernel='rbf', random_state=42, probability=True)
        else:
            model = GridSearchCV(
                SVC(random_state=42, probability=True),
                {'C': [0.1, 1.0, 10.0], 'kernel': ['rbf', 'linear']},
                cv=3,
                n_jobs=1,
            )

        model.fit(train_scaled, y_train)

        # Score on the held-out split
        y_pred = model.predict(test_scaled)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        elapsed = time.time() - t0
        mem_delta = get_memory_usage() - mem0

    except Exception as exc:
        print(f"❌ Error in SVM: {str(exc)}")
        return None, None, 0, 0, 0, 0, None

    return model, y_pred, accuracy, f1, elapsed, mem_delta, scaler

def knn_model(X_train, X_test, y_train, y_test, tune_hyperparams=False):
    """
    Fit a k-nearest-neighbours classifier on standardised features and score it.

    Model construction, fitting and prediction all run inside
    ``suppress_blas_warnings()``. The default model uses 5 neighbours with the
    ``ball_tree`` algorithm; with ``tune_hyperparams=True`` a 3-fold
    ``GridSearchCV`` searches ``n_neighbors`` and ``algorithm``.

    Returns:
        ``(model, y_pred, accuracy, weighted_f1, elapsed_seconds, memory_delta_mb,
        scaler)``; on any exception the error is printed and
        ``(None, None, 0, 0, 0, 0, None)`` is returned instead.
    """
    print("🎯 Training K-Nearest Neighbors...")
    t0 = time.time()
    mem0 = get_memory_usage()

    try:
        # Standardise using statistics from the training split only
        scaler = StandardScaler()
        train_scaled = scaler.fit_transform(X_train)
        test_scaled = scaler.transform(X_test)

        with suppress_blas_warnings():
            if not tune_hyperparams:
                # Fixed configuration: tree-based neighbour search, single job
                model = KNeighborsClassifier(
                    n_neighbors=5,
                    n_jobs=1,
                    algorithm='ball_tree',
                )
            else:
                model = GridSearchCV(
                    KNeighborsClassifier(n_jobs=1),
                    {'n_neighbors': [3, 5, 7], 'algorithm': ['ball_tree', 'kd_tree']},
                    cv=3,
                    n_jobs=1,
                    verbose=0,
                )

            # Fit and predict while stderr output is being discarded
            model.fit(train_scaled, y_train)
            y_pred = model.predict(test_scaled)

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        elapsed = time.time() - t0
        mem_delta = get_memory_usage() - mem0

    except Exception as exc:
        print(f"❌ Error in KNN: {str(exc)}")
        return None, None, 0, 0, 0, 0, None

    return model, y_pred, accuracy, f1, elapsed, mem_delta, scaler

def gradient_boosting_model(X_train, X_test, y_train, y_test, tune_hyperparams=False):
    """
    Fit a gradient boosting classifier on the raw features and score it.

    With ``tune_hyperparams=True`` the estimator is wrapped in a 3-fold
    ``GridSearchCV`` over ``n_estimators`` and ``learning_rate``.

    Returns:
        ``(model, y_pred, accuracy, weighted_f1, elapsed_seconds, memory_delta_mb,
        None)`` - the last slot is the scaler position, unused here. On any
        exception the error is printed and ``(None, None, 0, 0, 0, 0, None)`` is
        returned.
    """
    print("🚀 Training Gradient Boosting...")
    t0 = time.time()
    mem0 = get_memory_usage()

    try:
        if not tune_hyperparams:
            model = GradientBoostingClassifier(n_estimators=100, random_state=42)
        else:
            model = GridSearchCV(
                GradientBoostingClassifier(random_state=42),
                {'n_estimators': [50, 100], 'learning_rate': [0.1, 0.01]},
                cv=3,
                n_jobs=1,
            )

        model.fit(X_train, y_train)

        # Score on the held-out split
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        elapsed = time.time() - t0
        mem_delta = get_memory_usage() - mem0

    except Exception as exc:
        print(f"❌ Error in Gradient Boosting: {str(exc)}")
        return None, None, 0, 0, 0, 0, None

    return model, y_pred, accuracy, f1, elapsed, mem_delta, None

def naive_bayes_model(X_train, X_test, y_train, y_test, tune_hyperparams=False):
    """
    Fit a Gaussian naive Bayes classifier on the raw features and score it.

    With ``tune_hyperparams=True`` the estimator is wrapped in a 3-fold
    ``GridSearchCV`` over ``var_smoothing``.

    Returns:
        ``(model, y_pred, accuracy, weighted_f1, elapsed_seconds, memory_delta_mb,
        None)`` - the last slot is the scaler position, unused here. On any
        exception the error is printed and ``(None, None, 0, 0, 0, 0, None)`` is
        returned.
    """
    print("📊 Training Naive Bayes...")
    t0 = time.time()
    mem0 = get_memory_usage()

    try:
        if not tune_hyperparams:
            model = GaussianNB()
        else:
            model = GridSearchCV(
                GaussianNB(),
                {'var_smoothing': [1e-9, 1e-8, 1e-7]},
                cv=3,
                n_jobs=1,
            )

        model.fit(X_train, y_train)

        # Score on the held-out split
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        elapsed = time.time() - t0
        mem_delta = get_memory_usage() - mem0

    except Exception as exc:
        print(f"❌ Error in Naive Bayes: {str(exc)}")
        return None, None, 0, 0, 0, 0, None

    return model, y_pred, accuracy, f1, elapsed, mem_delta, None

def decision_tree_model(X_train, X_test, y_train, y_test, tune_hyperparams=False):
    """
    Decision Tree - Tree-based classifier
    Best for: Interpretable decisions, feature selection

    Trains on the raw (unscaled) features. With ``tune_hyperparams=True`` the
    estimator is wrapped in a 3-fold ``GridSearchCV`` over ``max_depth`` and
    ``min_samples_split``; otherwise a depth-10 tree is used.

    Returns:
        ``(model, y_pred, accuracy, weighted_f1, elapsed_seconds, memory_delta_mb,
        None)`` - the last slot is the scaler position, unused here. On any
        exception the error is printed and ``(None, None, 0, 0, 0, 0, None)`` is
        returned.
    """
    print("🌳 Training Decision Tree...")
    start_time = time.time()
    start_memory = get_memory_usage()

    try:
        # Hyperparameter tuning
        if tune_hyperparams:
            param_grid = {'max_depth': [10, 20, None], 'min_samples_split': [2, 5]}
            model = GridSearchCV(DecisionTreeClassifier(random_state=42),
                                 param_grid, cv=3, n_jobs=1)
        else:
            model = DecisionTreeClassifier(random_state=42, max_depth=10)

        model.fit(X_train, y_train)

        # Predictions
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        training_time = time.time() - start_time
        memory_used = get_memory_usage() - start_memory

        return model, y_pred, accuracy, f1, training_time, memory_used, None

    except Exception as e:
        print(f"❌ Error in Decision Tree: {str(e)}")
        return None, None, 0, 0, 0, 0, None


# NOTE(review): the `def` line of this function was missing, which left its body as
# unreachable code at the end of decision_tree_model.  The name follows the naming
# pattern of the sibling trainers - confirm it matches what the caller expects.
def neural_network_model(X_train, X_test, y_train, y_test, tune_hyperparams=False):
    """
    Multi-Layer Perceptron - Neural network classifier
    Best for: Complex non-linear patterns, large datasets

    Trains on standardised features. With ``tune_hyperparams=True`` the estimator
    is wrapped in a 3-fold ``GridSearchCV`` over ``hidden_layer_sizes`` and
    ``learning_rate_init``; otherwise a (100, 50) network is used.

    Returns:
        ``(model, y_pred, accuracy, weighted_f1, elapsed_seconds, memory_delta_mb,
        scaler)``; on any exception the error is printed and
        ``(None, None, 0, 0, 0, 0, None)`` is returned instead.
    """
    print("🧠 Training Neural Network...")
    start_time = time.time()
    start_memory = get_memory_usage()

    try:
        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Hyperparameter tuning
        if tune_hyperparams:
            param_grid = {'hidden_layer_sizes': [(50,), (100,), (100, 50)],
                          'learning_rate_init': [0.01, 0.001]}
            model = GridSearchCV(MLPClassifier(max_iter=500, random_state=42),
                                 param_grid, cv=3, n_jobs=1)
        else:
            model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)

        model.fit(X_train_scaled, y_train)

        # Predictions
        y_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        training_time = time.time() - start_time
        memory_used = get_memory_usage() - start_memory

        return model, y_pred, accuracy, f1, training_time, memory_used, scaler

    except Exception as e:
        print(f"❌ Error in Neural Network: {str(e)}")
        return None, None, 0, 0, 0, 0, None

def save_model(model, scaler, model_name, save_dir="models"):
    """Save a trained model, and its scaler if there is one, with joblib.

    Args:
        model: Fitted estimator to persist.
        scaler: Fitted scaler that belongs to the model, or ``None``.
        model_name: Display name; spaces are replaced by underscores in file names.
        save_dir: Target directory, created when missing.

    Files written: ``<name>_model.pkl`` and, when a scaler is given,
    ``<name>_scaler.pkl``. When no scaler is given, a scaler file left over from an
    earlier save under the same name is removed, because ``load_model`` decides
    whether to load a scaler purely from that file's existence.
    """
    os.makedirs(save_dir, exist_ok=True)

    file_stem = model_name.replace(' ', '_')
    model_path = os.path.join(save_dir, f"{file_stem}_model.pkl")
    scaler_path = os.path.join(save_dir, f"{file_stem}_scaler.pkl")

    joblib.dump(model, model_path)

    # Compare against None explicitly instead of relying on the object's truthiness
    if scaler is not None:
        joblib.dump(scaler, scaler_path)
    elif os.path.exists(scaler_path):
        # Drop the stale scaler so it is not paired with this scaler-less model
        os.remove(scaler_path)

    print(f"💾 Model saved: {model_path}")

def load_model(model_name, save_dir="models"):
    """Load a model saved under ``model_name`` plus its scaler when one exists.

    Returns:
        ``(model, scaler)`` where ``scaler`` is ``None`` if no
        ``<name>_scaler.pkl`` file is present in ``save_dir``.

    NOTE: joblib files are pickle-based; only load files from a trusted source.
    """
    file_stem = model_name.replace(' ', '_')
    model_path = os.path.join(save_dir, f"{file_stem}_model.pkl")
    scaler_path = os.path.join(save_dir, f"{file_stem}_scaler.pkl")

    model = joblib.load(model_path)

    # The scaler is optional: its presence on disk decides whether it is loaded
    scaler = None
    if os.path.exists(scaler_path):
        scaler = joblib.load(scaler_path)

    print(f"📂 Model loaded: {model_path}")
    return model, scaler

def plot_feature_importance(model, model_name, feature_names=None):
    """Plot the 20 largest feature importances of a tree-based model.

    Args:
        model: A fitted estimator, or a search wrapper exposing ``best_estimator_``.
            The wrapper is unwrapped first, because the importances live on the
            inner estimator rather than on the wrapper itself.
        model_name: Name shown in the plot title.
        feature_names: Optional sequence indexed by feature position; when given,
            its entries label the x-axis instead of the raw feature indices.

    Does nothing when the (unwrapped) estimator has no ``feature_importances_``.
    """
    # Tuned models arrive wrapped; look at the refitted inner estimator in that case
    estimator = getattr(model, 'best_estimator_', model)
    if not hasattr(estimator, 'feature_importances_'):
        return

    importances = np.asarray(estimator.feature_importances_)

    # Get top 20 features (indices sorted by descending importance)
    indices = np.argsort(importances)[::-1][:20]

    if feature_names is not None:
        tick_labels = [feature_names[i] for i in indices]
    else:
        tick_labels = indices

    plt.figure(figsize=(10, 6))
    plt.bar(range(len(indices)), importances[indices])
    plt.title(f'Top 20 Feature Importance - {model_name}')
    plt.xlabel('Feature Index')
    plt.ylabel('Importance')
    plt.xticks(range(len(indices)), tick_labels)
    plt.tight_layout()
    plt.show()

def enhanced_performance_summary(results):
    """Print and return a table of per-model metrics, best accuracy first.

    Args:
        results: Mapping of model name to a dict holding the keys
            ``'accuracy'``, ``'f1'``, ``'time'`` and ``'memory'``.
            Entries whose accuracy is not greater than 0 are left out.

    Returns:
        pandas.DataFrame with the string-formatted columns ``Model``,
        ``Accuracy``, ``F1-Score``, ``Time (s)`` and ``Memory (MB)``,
        ordered by descending accuracy. The frame is empty, but keeps
        those columns, when no entry qualifies.
    """
    print("\n📊 ENHANCED PERFORMANCE SUMMARY")
    print("="*60)

    # pandas is only needed for this table, so it is imported locally.
    import pandas as pd

    columns = ['Model', 'Accuracy', 'F1-Score', 'Time (s)', 'Memory (MB)']

    # Only include successful models
    successful = [
        (model_name, result)
        for model_name, result in results.items()
        if result['accuracy'] > 0
    ]
    # Rank on the numeric accuracy rather than on its formatted text, so
    # the ordering never depends on how the number happens to be printed.
    successful.sort(key=lambda item: item[1]['accuracy'], reverse=True)

    rows = [
        {
            'Model': model_name,
            'Accuracy': f"{result['accuracy']:.4f}",
            'F1-Score': f"{result['f1']:.4f}",
            'Time (s)': f"{result['time']:.2f}",
            'Memory (MB)': f"{result['memory']:.1f}",
        }
        for model_name, result in successful
    ]

    # Passing ``columns`` keeps the schema even when ``rows`` is empty;
    # a column-less frame would make any later column lookup fail.
    df = pd.DataFrame(rows, columns=columns)

    if df.empty:
        print("⚠️ No successful models to summarize")
    else:
        print(df.to_string(index=False))

    return df

def completely_disable_blas_warnings():
    """Blanket-silence Python warnings and NumPy floating-point errors.

    Side effects, all process-wide:
      * sets ``PYTHONHASHSEED`` to ``'0'`` and ``CUDA_VISIBLE_DEVICES`` to
        ``''`` in ``os.environ``;
      * installs an "ignore" filter for every Python warning;
      * tells NumPy (when importable) to ignore all floating-point error
        conditions.

    Returns:
        None.
    """
    import warnings

    # Set additional environment variables
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    # setting it here presumably only reaches child processes - confirm.
    os.environ['PYTHONHASHSEED'] = '0'
    # NOTE(review): an empty CUDA_VISIBLE_DEVICES hides every GPU from CUDA
    # libraries initialised after this point - confirm that is intended for
    # a function whose stated purpose is warning suppression.
    os.environ['CUDA_VISIBLE_DEVICES'] = ''

    # Suppress all warnings
    warnings.filterwarnings("ignore")
    warnings.simplefilter("ignore")

    # Silence NumPy's floating-point error reporting. Only a missing NumPy
    # is tolerated; any other failure is allowed to surface.
    try:
        import numpy as np
    except ImportError:
        pass
    else:
        np.seterr(all='ignore')

    print("🔇 All BLAS warnings completely disabled")

def main_evaluation(tune_hyperparams=False, save_models=False):
    """Main function to train and evaluate all models.

    Reads the module-level names ``X_train``, ``X_test``, ``y_train``,
    ``y_test`` and ``label_encoder``; they are not passed as arguments.

    Args:
        tune_hyperparams: Forwarded to every model function as its fifth
            positional argument.
        save_models: When true, each successfully trained model (and its
            scaler) is handed to ``save_model``.

    Returns:
        ``(models_data, results)``. ``models_data`` maps model name to a
        ``(model, scaler)`` tuple; ``results`` maps model name to a dict
        with the keys ``'accuracy'``, ``'f1'``, ``'time'``, ``'memory'``
        and ``'predictions'``. Models whose function returned ``None`` as
        the model are absent from both. Returns ``(None, None)`` when one
        of the required module-level names is undefined.
    """
    print("🎯 ENHANCED ML MODEL EVALUATION")
    print("="*50)

    # Completely disable warnings first
    completely_disable_blas_warnings()

    # Validate input data. ``label_encoder`` is looked up inside the same
    # ``try`` so that a missing encoder yields the message below (which
    # names it) instead of an uncaught NameError further down.
    try:
        validate_data(X_train, X_test, y_train, y_test)
        # Not used further down; the lookup doubles as a presence check.
        class_names = label_encoder.classes_
    except NameError as exc:
        print("❌ Please ensure X_train, X_test, y_train, y_test and label_encoder are defined")
        # Show which name was actually missing.
        print(f"   ({exc})")
        return None, None

    # Dictionary to store all models
    models_data = {}
    results = {}

    # Define ALL models to train
    model_functions = {
        'Logistic Regression': logistic_regression_model,
        'Random Forest': random_forest_model,
        'SVM': svm_model,
        'K-Nearest Neighbors': knn_model,
        # 'Gradient Boosting': gradient_boosting_model,
        'Naive Bayes': naive_bayes_model,
        'Decision Tree': decision_tree_model,
        'Neural Network': neural_network_model
    }

    # Train all models
    for model_name, model_func in model_functions.items():
        print(f"\n{'='*20} {model_name} {'='*20}")

        # Train model
        model, y_pred, accuracy, f1, training_time, memory_used, scaler = model_func(
            X_train, X_test, y_train, y_test, tune_hyperparams
        )

        # A ``None`` model marks a failed run; it is left out of both dicts.
        if model is None:
            continue

        # Store results
        models_data[model_name] = (model, scaler)
        results[model_name] = {
            'accuracy': accuracy,
            'f1': f1,
            'time': training_time,
            'memory': memory_used,
            'predictions': y_pred
        }

        # Print results
        print(f"✅ Accuracy: {accuracy:.4f}")
        print(f"✅ F1-Score: {f1:.4f}")
        print(f"⏱️ Training Time: {training_time:.2f} seconds")
        print(f"💾 Memory Used: {memory_used:.1f} MB")

        # Save model if requested
        if save_models:
            save_model(model, scaler, model_name)

        # Plot feature importance for applicable models
        if model_name in ['Random Forest', 'Gradient Boosting', 'Decision Tree']:
            plot_feature_importance(model, model_name)

    # Enhanced performance summary. Skipped when every model failed, since
    # there is nothing to tabulate.
    if results:
        enhanced_performance_summary(results)
    else:
        print("⚠️ No model trained successfully; skipping performance summary")

    return models_data, results

# Script entry point: silence warnings up front, then run the full evaluation
if __name__ == "__main__":
    # Suppress warnings before any model code gets a chance to emit them
    completely_disable_blas_warnings()

    print("🚀 Starting model evaluation with all warnings suppressed...")

    # Hyperparameter tuning and model saving are both switched on
    models_data, results = main_evaluation(
        tune_hyperparams=True,
        save_models=True,
    )

🎯 ENHANCED ML MODEL EVALUATION
🔍 Validating data...
✅ Data validation passed
   Training samples: 640
   Test samples: 160
   Features: 25088
   Classes: 4

🔵 Training Logistic Regression...


# Older code