# Data Preprocessing 

Based on the findings from `01_explore_data.ipynb`, this notebook implements a comprehensive preprocessing pipeline.

## Preprocessing Steps

1. **Convert pseudo-RGB to grayscale** (automatic with cv2.IMREAD_GRAYSCALE)
2. **Resize to 224×224 pixels** (standardize for CNN input)
3. **Apply CLAHE** (enhance contrast + normalize quality variations)
4. **Remove duplicates** (if found during processing)
5. **Create train/val/test splits** (70/15/15)
6. **Address class imbalance** (augmentation for no_tumor class)

**Note:** Outlier removal skipped - CLAHE normalizes quality variations.

## Convert pseudo-RGB to grayscale

In [21]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import os
import pandas as pd
import hashlib
from sklearn.model_selection import train_test_split
import json
from tqdm import tqdm  # For progress bars

In [None]:
# Class labels handled by the pipeline
classes = ['glioma', 'meningioma', 'pituitary', 'no_tumor']

# Input (raw) and output (processed) dataset locations
raw_dataset = Path("../data/brain_tumor_dataset")
processed_dataset = Path("../data/processed")

# Parameters shared by the preprocessing steps
config = {
    'target_size': (224, 224),      # output image size
    'apply_clahe': True,            # toggle CLAHE contrast enhancement
    'clahe_clip_limit': 2.0,        # CLAHE clip limit
    'clahe_tile_size': (8, 8),      # CLAHE tile grid
    'random_seed': 42,              # seed for reproducible runs
}

# Echo the active settings, one "key: value" pair per line
print("Configuration:")
for key, value in config.items():
    print(f"  {key}: {value}")

In [20]:
def preprocess_image(img_path, target_size=(224,224), equalization=True,
                     clip_limit=2.0, tile_grid_size=(8, 8)):
    """
    Apply all preprocessing steps to a single image.

    The image is loaded as single-channel grayscale, resized to
    ``target_size`` and, optionally, contrast-enhanced with CLAHE.

    Args:
        img_path: Path to input image
        target_size: Target image size (height, width)
        equalization: Whether to apply CLAHE histogram equalization
        clip_limit: CLAHE contrast clip limit (``clipLimit``)
        tile_grid_size: CLAHE tile grid (``tileGridSize``)

    Returns:
        Preprocessed grayscale image (numpy array) with shape ``target_size``

    Raises:
        ValueError: If the image cannot be read
    """

    # read image as grayscale; cv2.imread signals failure by returning None
    img = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)

    if img is None:
        raise ValueError(f"Failed to read image: {img_path}")

    # resize so all images are the same size
    # cv2.resize expects dsize as (width, height), the reverse of target_size
    height, width = target_size
    src_height, src_width = img.shape[:2]
    # INTER_AREA is the recommended filter for shrinking; fall back to
    # INTER_LINEAR when any dimension has to be enlarged
    shrinking = src_height >= height and src_width >= width
    interpolation = cv2.INTER_AREA if shrinking else cv2.INTER_LINEAR
    img = cv2.resize(img, (width, height), interpolation=interpolation)

    # apply clahe equalization
    if equalization:
        clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)
        img = clahe.apply(img)

    return img

    

In [None]:
def preprocess_and_save_dataset(raw_dir, target_dir, config):
    """
    Resolve the source and destination folders for a full-dataset run.

    Work in progress: the function currently only wraps both directory
    arguments in ``Path`` objects. Nothing is read, processed or written,
    ``config`` is not consulted, and the call returns None.

    Args:
        raw_dir: Location of the raw dataset
        target_dir: Location where the preprocessed dataset should be saved
        config: Dictionary with preprocessing parameters (currently unused)

    Returns:
        None for now.
        NOTE(review): a dictionary with processing statistics appears to be
        the intended return value -- TODO implement.
    """
    # Normalise both locations to Path objects (raw first, then target)
    raw_path, target_path = Path(raw_dir), Path(target_dir)


In [12]:
def find_duplicates():
    """
    Placeholder: not yet implemented.

    Takes no arguments and has no body beyond this docstring, so calling it
    returns None.

    NOTE(review): presumably intended to locate duplicate images for the
    "Remove duplicates" preprocessing step -- TODO confirm and implement.
    """

In [13]:
def remove_duplicates():
    """
    Placeholder: not yet implemented.

    Takes no arguments and has no body beyond this docstring, so calling it
    returns None.

    NOTE(review): presumably intended to drop duplicate images for the
    "Remove duplicates" preprocessing step -- TODO confirm and implement.
    """

In [None]:
def visualize_preprocessing_effects():
    """
    Placeholder: not yet implemented.

    Takes no arguments and has no body beyond this docstring, so calling it
    returns None.

    NOTE(review): presumably intended to plot images before and after
    preprocessing -- TODO confirm and implement.
    """