# Data normalization and feature extraction

In [None]:
import os
import numpy as np
import cv2
from PIL import Image
import matplotlib.pyplot as plt
from tqdm import tqdm

def load_and_normalize_dataset(base_dir):
    """Load every class image under ``base_dir`` and run ventricle normalization.

    The expected layout is ``<base_dir>/<split>/<class_name>/<image>`` where
    ``split`` is one of ``train``, ``test`` or ``val`` and ``class_name`` is one
    of the four dementia classes listed below. Missing split folders are
    reported and skipped; missing class folders are skipped silently.

    Each image is converted to single-channel grayscale, scaled by 1/255 and
    passed to ``normalize_mri_for_ventricles`` with ``contours=True``.

    Args:
        base_dir: Root directory that contains the split folders.

    Returns:
        A ``(train_data, test_data, val_data)`` tuple of lists. Every element
        is the item dict returned by ``normalize_mri_for_ventricles``.
    """
    # Class mapping (folder name -> integer label)
    classes = {
        'Mild_Demented': 0,
        'Moderate_Demented': 1,
        'Non_Demented': 2,
        'Very_Mild_Demented': 3
    }
    image_extensions = ('.png', '.jpg', '.jpeg')

    # Collect items per split directly instead of filtering one big list later.
    items_by_split = {'train': [], 'test': [], 'val': []}

    for split, split_items in items_by_split.items():
        split_dir = os.path.join(base_dir, split)

        if not os.path.isdir(split_dir):
            print(f"Warning: {split_dir} does not exist, skipping...")
            continue

        for class_name, label in classes.items():
            class_dir = os.path.join(split_dir, class_name)

            # isdir (not exists) so a stray file with a class name cannot
            # make os.listdir raise.
            if not os.path.isdir(class_dir):
                continue

            # os.listdir order is OS-dependent; sort so runs are reproducible.
            for img_file in sorted(os.listdir(class_dir)):
                if not img_file.lower().endswith(image_extensions):
                    continue

                img_path = os.path.join(class_dir, img_file)

                try:
                    # The context manager releases the file handle as soon as
                    # the pixels have been copied into the array.
                    with Image.open(img_path) as img:
                        img_array = np.array(img.convert('L')) / 255.0  # scale to [0, 1]

                    item = {
                        'original_path': img_path,
                        'label': label,
                        'class_name': class_name,
                        'split': split,
                        'image': img_array,
                        # Source tag derived from the file name only.
                        'dataset': 'folder_dataset' if 'folder' in img_file else 'parquet_dataset'
                    }

                    # contours=True also stores the contour overlay and the
                    # isolated ventricle image on the item.
                    split_items.append(normalize_mri_for_ventricles(item, contours=True))

                except Exception as e:
                    # Deliberate best-effort: one unreadable image must not
                    # abort the whole run, so report it and move on.
                    print(f"Error processing {img_path}: {e}")

    return items_by_split['train'], items_by_split['test'], items_by_split['val']



In [None]:
def normalize_mri_for_ventricles(item, contours=False):
    """Add normalized, enhanced and ventricle-oriented images to ``item``.

    The dict is modified in place and also returned. Keys written:

    * ``image_normalized`` - ``item['image']`` clipped to the 2nd/98th
      percentile of its pixels above 0.05 and rescaled to [0, 1]. The input
      image is stored unchanged when there are no such pixels or when both
      percentiles coincide.
    * ``image_enhanced`` - CLAHE applied to the normalized image, as floats.
    * ``image_ventricle_focus`` - ``1 - image_normalized``.
    * ``ventricle_mask`` - rough dark-region mask (0.0 / 1.0) after
      morphological open and close.
    * ``contour_img`` and ``ventricles`` - only when ``contours`` is true: the
      enhanced image with the two largest mask contours drawn in green, and
      the enhanced image blacked out everywhere outside those two contours.

    Args:
        item: Dict with an ``'image'`` entry holding a 2-D float array
            (presumably in [0, 1], as produced by the dataset loader).
        contours: When true, also extract the two largest mask contours.

    Returns:
        The same ``item`` dict with the keys above added.
    """
    image = item['image']

    # 1. Intensity normalization (robust)
    brain_pixels = image[image > 0.05]

    normalized = image
    if brain_pixels.size > 0:
        p2, p98 = np.percentile(brain_pixels, [2, 98])
        # Equal percentiles (flat image) would divide by zero and fill the
        # result with NaN, so keep the unscaled image in that case.
        if p98 > p2:
            normalized = (np.clip(image, p2, p98) - p2) / (p98 - p2)
    item['image_normalized'] = normalized

    # 2. Ventricle enhancement
    # Local contrast enhancement on an 8-bit copy of the normalized image.
    img_uint8 = (normalized * 255).astype(np.uint8)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(img_uint8)
    item['image_enhanced'] = enhanced / 255.0

    # Plain inversion so dark regions become bright; no thresholding here.
    item['image_ventricle_focus'] = 1 - normalized

    # 3. Ventricle segmentation (rough approximation)
    # Everything at or below half of the mean intensity is marked as "dark".
    # NOTE(review): the mean and the threshold are taken over the whole image,
    # so dark background outside the head ends up in this mask too - confirm
    # the largest contours below really are ventricles.
    _, threshold = cv2.threshold(
        img_uint8,
        int(np.mean(img_uint8) * 0.5),
        255,
        cv2.THRESH_BINARY_INV
    )

    # Clean up mask: open removes specks, close fills small holes.
    kernel = np.ones((3, 3), np.uint8)
    cleaned = cv2.morphologyEx(threshold, cv2.MORPH_OPEN, kernel)
    cleaned = cv2.morphologyEx(cleaned, cv2.MORPH_CLOSE, kernel)  # ventricle mask

    item['ventricle_mask'] = cleaned / 255.0

    if contours:
        # [-2] picks the contour list regardless of whether this OpenCV
        # version returns (contours, hierarchy) or (image, contours, hierarchy).
        found = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
        # Keep the two largest regions, assumed to be the ventricles.
        # TODO: verify this assumption visually via item['contour_img'].
        largest = sorted(found, key=cv2.contourArea, reverse=True)[:2]

        # cvtColor returns a new array, so drawing on it leaves `enhanced` intact.
        contour_img = cv2.cvtColor(enhanced, cv2.COLOR_GRAY2BGR)
        cv2.drawContours(contour_img, largest, -1, (0, 255, 0), 2)
        item['contour_img'] = contour_img

        # Fill the kept contours on a blank mask and apply it to the enhanced
        # image: everything outside the contours becomes black.
        fill_mask = np.zeros_like(enhanced, dtype=np.uint8)
        cv2.drawContours(fill_mask, largest, -1, 255, thickness=cv2.FILLED)
        ventricles = cv2.bitwise_and(enhanced, enhanced, mask=fill_mask)
        # `enhanced` is 8-bit, so always rescale; this keeps dtype and scale
        # consistent with the other float outputs.
        item['ventricles'] = ventricles / 255.0

    return item
        


I haven't tested this code yet, and it hasn't been added to the visualization step. We should add it so we can check whether the contours correctly map out the ventricles, i.e. visualize `contour_img`.

In [3]:
def visualize_normalizations(data, num_samples=4):
    """Plot original / normalized / enhanced / inverted views of sample items.

    One item is taken per class (the first one found in ``data``) up to
    ``num_samples``; if that is not enough, the remainder is drawn at random
    without replacement from the other items. Each sample becomes one row of
    four panels.

    Args:
        data: Non-empty list of item dicts carrying ``class_name``, ``image``,
            ``image_normalized``, ``image_enhanced`` and
            ``image_ventricle_focus``.
        num_samples: Desired number of rows. At least one row is always drawn.

    Returns:
        The matplotlib figure holding the panel grid.

    Raises:
        ValueError: If ``data`` is empty.
    """
    if not data:
        raise ValueError("visualize_normalizations needs at least one item to plot")

    # Index of the first item seen for each class, in data order.
    first_by_class = {}
    for idx, item in enumerate(data):
        first_by_class.setdefault(item['class_name'], idx)
    chosen = list(first_by_class.values())[:max(num_samples, 1)]

    # Top up with random extra items when there are fewer classes than rows.
    # Work with indices: comparing the dicts themselves would compare their
    # numpy arrays element-wise, which is slow and can raise.
    shortfall = num_samples - len(chosen)
    if shortfall > 0:
        taken = set(chosen)
        remaining = [idx for idx in range(len(data)) if idx not in taken]
        if remaining:
            extra = np.random.choice(
                remaining,
                size=min(shortfall, len(remaining)),
                replace=False
            )
            chosen.extend(int(idx) for idx in extra)

    samples = [data[idx] for idx in chosen]

    # squeeze=False keeps `axes` two-dimensional even for a single row, so the
    # axes[row, col] indexing below always works.
    fig, axes = plt.subplots(
        len(samples), 4, figsize=(16, 4 * len(samples)), squeeze=False
    )

    panels = [
        ('image', None),
        ('image_normalized', 'Normalized'),
        ('image_enhanced', 'Enhanced'),
        ('image_ventricle_focus', 'Ventricle Focus'),
    ]

    for row, item in enumerate(samples):
        for col, (key, title) in enumerate(panels):
            ax = axes[row, col]
            ax.imshow(item[key], cmap='gray')
            # The first panel also carries the class name.
            ax.set_title(title if title is not None else f"{item['class_name']}\nOriginal")
            ax.axis('off')

    plt.tight_layout()
    return fig



In [9]:
def save_normalized_dataset(base_dir, train_data, test_data, val_data):
    """Write normalized and ventricle-focus PNGs under ``<base_dir>/normalized``.

    Output layout is ``normalized/<split>/<class_name>/<stem>_norm.png`` and
    ``.../<stem>_ventricle.png``, where ``stem`` is the original file name
    without its extension. Directories are created as needed.

    Args:
        base_dir: Root directory; the ``normalized`` folder is created inside it.
        train_data: Items to store under the ``train`` split.
        test_data: Items to store under the ``test`` split.
        val_data: Items to store under the ``val`` split.
    """
    out_root = os.path.join(base_dir, 'normalized')
    os.makedirs(out_root, exist_ok=True)

    def _write_png(unit_image, destination):
        # Scale by 255 and cast to uint8 before handing the array to Pillow.
        Image.fromarray((unit_image * 255).astype(np.uint8)).save(destination)

    partitions = {'train': train_data, 'test': test_data, 'val': val_data}

    for split, data in partitions.items():
        split_root = os.path.join(out_root, split)
        os.makedirs(split_root, exist_ok=True)

        # Make one folder per class present in this split before writing.
        for label_name in {entry['class_name'] for entry in data}:
            os.makedirs(os.path.join(split_root, label_name), exist_ok=True)

        for entry in tqdm(data, desc=f"Saving {split} images"):
            stem, _ = os.path.splitext(os.path.basename(entry['original_path']))
            target_dir = os.path.join(split_root, entry['class_name'])

            # Percentile-normalized image
            _write_png(
                entry['image_normalized'],
                os.path.join(target_dir, f"{stem}_norm.png"),
            )

            # Inverted image used for ventricle analysis
            _write_png(
                entry['image_ventricle_focus'],
                os.path.join(target_dir, f"{stem}_ventricle.png"),
            )

# Main execution
# Runs the whole pipeline: load + normalize, save one overview figure, then
# write the normalized images to disk.
if __name__ == "__main__":
    # Set the base directory to your combined dataset
    # (expected to contain the train/, test/ and val/ folders)
    base_dir = "Combined_MRI_Dataset"
    
    # Load and normalize the dataset
    print("Loading and normalizing dataset...")
    train_data, test_data, val_data = load_and_normalize_dataset(base_dir)
    
    # Display summary
    print(f"Processed {len(train_data)} training images, {len(test_data)} test images, and {len(val_data)} validation images")
    
    # Visualize normalizations
    # Samples are drawn from all three splits combined.
    print("Generating visualization...")
    fig = visualize_normalizations(train_data + test_data + val_data)
    # plt.savefig writes the current pyplot figure, which is the one just created.
    plt.savefig(os.path.join(base_dir, "normalization_visualization.png"))
    # Close the figure explicitly so it is released once saved.
    plt.close(fig)
    
    # Write the normalized and ventricle-focus PNGs under <base_dir>/normalized
    save_normalized_dataset(base_dir, train_data, test_data, val_data)
    
    print("TA-DA!")

Loading and normalizing dataset...
Processed 15360 training images, 3584 test images, and 1152 validation images
Generating visualization...


Saving train images: 100%|██████████████| 15360/15360 [00:13<00:00, 1143.40it/s]
Saving test images: 100%|█████████████████| 3584/3584 [00:03<00:00, 1145.97it/s]
Saving val images: 100%|██████████████████| 1152/1152 [00:01<00:00, 1051.34it/s]

TA-DA!





Next steps:
Feature extraction - ventricles:
The plan is to layer these handcrafted features with the original image/CNN features to increase accuracy if possible. Below are possible features we can choose from.

Size-related features:
    Pixel count (area of ventricles) → Measures how large the ventricles are.
    Perimeter (boundary length) → Tells how irregular the ventricle shape is.
    Major/Minor axis lengths → Captures the elongation of the ventricles.

Shape-related features:
    Eccentricity → Measures how oval the ventricles are.
    Circularity → Helps differentiate between normal and abnormal ventricle shapes.

Texture-related features:
    Entropy → Measures randomness in intensity distribution (higher entropy may indicate pathology).
    Gray-Level Co-occurrence Matrix (GLCM) → Captures patterns of pixel intensities (e.g., roughness).

What about other areas of the scan, e.g. the hippocampus?
