In [1]:
pip install opencv-python

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Import necessary libraries
import os
import cv2  # OpenCV for image processing
import numpy as np
from tqdm import tqdm  # For progress bars

# Define the path to the already downloaded dataset.
# Set the BRAIN_TUMOR_DATASET_PATH environment variable to point the notebook at
# another location without editing this cell; when it is unset the original
# local path is used.
DATASET_PATH = os.environ.get('BRAIN_TUMOR_DATASET_PATH', 'C:/Users/User/Downloads/archive')

# Define categories based on the dataset. Each name is used both as the
# sub-directory to read and as the label stored for the images found in it.
CATEGORIES = ['glioma_tumor', 'no_tumor', 'meningioma_tumor', 'pituitary_tumor']

# Define target image size (matching the main notebook); passed to cv2.resize
IMAGE_SIZE = (256, 256)

# Function to load images from a directory (Training or Testing) with progress bar
def load_images_from_folder(base_dir, categories, subfolder, image_size=None):
    """Load and resize every image found under ``base_dir/subfolder/<category>``.

    Args:
        base_dir: Root directory of the dataset.
        categories: Iterable of category names. Each name is the sub-directory
            that is read and also the label recorded for its images.
        subfolder: Directory under ``base_dir`` that holds the category
            folders (for example ``'Training'`` or ``'Testing'``).
        image_size: Size tuple handed to ``cv2.resize``. When omitted, the
            module-level ``IMAGE_SIZE`` is used.

    Returns:
        A tuple ``(images, labels)`` of two equally long lists: the resized
        images as returned by OpenCV and the category name of each image.

    A category directory that is missing is reported and skipped, and a file
    that OpenCV cannot decode is reported with a warning and skipped.
    """
    target_size = IMAGE_SIZE if image_size is None else image_size
    image_extensions = ('.jpg', '.jpeg', '.png')

    images = []
    labels = []
    for category in categories:
        folder = os.path.join(base_dir, subfolder, category)
        print(f"Loading {category} ({subfolder}):")
        # isdir (not exists) so a stray regular file with this name is also rejected
        if not os.path.isdir(folder):
            print(f"Error: Directory {folder} does not exist.")
            continue
        # os.listdir returns entries in arbitrary order; sort for a reproducible sample order
        for filename in tqdm(sorted(os.listdir(folder))):
            # Compare in lower case so files such as 'scan.JPG' are not silently skipped
            if not filename.lower().endswith(image_extensions):
                continue
            img_path = os.path.join(folder, filename)
            img = cv2.imread(img_path)  # Returns None when the file cannot be decoded
            if img is None:
                print(f"Warning: Could not load image {img_path}")
                continue
            # Resize so every image has the same shape and can be stacked later
            images.append(cv2.resize(img, target_size))
            labels.append(category)
    return images, labels

# Preprocessing function with filtering
def preprocess_image(img, target_size=(256, 256), apply_filter=True, kernel_size=3):
    """Resize, optionally denoise, and scale a single image.

    Args:
        img: Image array as loaded by OpenCV.
            NOTE(review): the division by 255 assumes 8-bit pixel data (as
            produced by cv2.imread in this notebook) - confirm for other sources.
        target_size: Size tuple handed to ``cv2.resize``.
        apply_filter: When True, a median filter is applied for noise reduction.
        kernel_size: Aperture size of the median filter. OpenCV requires an odd
            value greater than 1; 3 gives mild filtering, 5 is stronger.

    Returns:
        The processed image as a ``float32`` array with values divided by 255.
    """
    # Images are normally resized during loading already; resize again so the
    # function also works on images that did not go through the loader
    img = cv2.resize(img, target_size)

    # Apply median filter for noise reduction (on color image)
    if apply_filter:
        img = cv2.medianBlur(img, kernel_size)

    # Normalize pixel values to [0, 1]
    return img.astype('float32') / 255.0

# Load all data (combining Training and Testing as in the main notebook)
print("Loading all data...")
X = []
y = []

# Load from Training
train_images, train_labels = load_images_from_folder(DATASET_PATH, CATEGORIES, 'Training')
X.extend(train_images)
y.extend(train_labels)

# Load from Testing
test_images, test_labels = load_images_from_folder(DATASET_PATH, CATEGORIES, 'Testing')
X.extend(test_images)
y.extend(test_labels)

# Fail fast instead of silently preprocessing and saving empty arrays when the
# dataset path or folder layout is wrong
if not X:
    raise RuntimeError(
        f"No images were loaded from {DATASET_PATH}; check the dataset path and folder layout."
    )

# Convert to NumPy arrays (every image was resized to the same shape while loading)
print("Converting to NumPy arrays...")
X = np.array(X)
y = np.array(y)

# Verify shapes before preprocessing
print(f"Raw data shape: {X.shape}, Labels shape: {y.shape}")

# Apply preprocessing and filtering to all images.
# The float32 result is written into one preallocated array rather than being
# collected in a Python list and copied by np.array, which would hold two full
# float32 copies of the dataset in memory at the same time.
print("Preprocessing all images...")
preprocessed_X = np.empty(X.shape, dtype=np.float32)
for i, img in enumerate(tqdm(X)):
    # Pass IMAGE_SIZE explicitly so preprocessing follows the configured size
    preprocessed_X[i] = preprocess_image(img, target_size=IMAGE_SIZE)

# Optional: Balance classes by undersampling majority classes
def balance_classes(images, labels, target_size=500, seed=None):  # Target size ~min class size (no_tumor)
    """Undersample every class so that it keeps at most ``target_size`` samples.

    Args:
        images: Array-like of samples; the first axis indexes the samples.
        labels: Array-like of class labels, one per entry of ``images``.
        target_size: Maximum number of samples kept per class. Classes with
            this many samples or fewer are kept unchanged.
        seed: Optional seed for reproducible sampling. When None, sampling
            uses NumPy's global random state (so ``np.random.seed`` still
            applies); otherwise a dedicated generator seeded with ``seed``
            is used.

    Returns:
        A tuple ``(balanced_images, balanced_labels)`` of new NumPy arrays.
        Classes appear in the order given by ``np.unique(labels)`` and the
        samples of each class keep their original relative order.

    Raises:
        ValueError: If ``images`` and ``labels`` differ in length.
    """
    # Accept plain lists as well as arrays; element-wise comparison needs an ndarray
    images = np.asarray(images)
    labels = np.asarray(labels)
    if len(images) != len(labels):
        raise ValueError(
            f"images and labels must have the same length, got {len(images)} and {len(labels)}"
        )

    rng = np.random if seed is None else np.random.default_rng(seed)

    kept = []
    for lbl in np.unique(labels):
        idx = np.flatnonzero(labels == lbl)
        if len(idx) > target_size:
            # Randomly sample to target_size, then restore the original sample order
            idx = np.sort(rng.choice(idx, target_size, replace=False))
        kept.append(idx)

    # Index once at the end instead of rebuilding the arrays element by element;
    # this also preserves dtype and trailing shape when the input is empty
    keep_idx = np.concatenate(kept) if kept else np.empty(0, dtype=np.intp)
    return images[keep_idx], labels[keep_idx]

# Optional class balancing: set BALANCE_CLASSES to True to undersample every
# class to at most 500 images (about the size of the smallest class, no_tumor)
BALANCE_CLASSES = False
if BALANCE_CLASSES:
    preprocessed_X, y = balance_classes(preprocessed_X, y, target_size=500)

# Every image must still have exactly one label before anything is written to disk
assert len(preprocessed_X) == len(y), "images and labels are out of sync"

# Print shapes to verify
print(f"Preprocessed data shape: {preprocessed_X.shape}, Labels shape: {y.shape}")

# Save preprocessed data to avoid reloading
np.save('preprocessed_X.npy', preprocessed_X)
np.save('y.npy', y)

print("Preprocessing complete. Data is ready for model training.")

Loading all data...
Loading glioma_tumor (Training):


100%|███████████████████████████████████████████████████████████████████████████████| 826/826 [00:00<00:00, 880.50it/s]


Loading no_tumor (Training):


100%|███████████████████████████████████████████████████████████████████████████████| 395/395 [00:00<00:00, 714.56it/s]


Loading meningioma_tumor (Training):


100%|███████████████████████████████████████████████████████████████████████████████| 822/822 [00:00<00:00, 873.82it/s]


Loading pituitary_tumor (Training):


100%|███████████████████████████████████████████████████████████████████████████████| 827/827 [00:01<00:00, 795.70it/s]


Loading glioma_tumor (Testing):


100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 769.50it/s]


Loading no_tumor (Testing):


100%|██████████████████████████████████████████████████████████████████████████████| 105/105 [00:00<00:00, 1394.10it/s]


Loading meningioma_tumor (Testing):


100%|███████████████████████████████████████████████████████████████████████████████| 115/115 [00:00<00:00, 934.80it/s]


Loading pituitary_tumor (Testing):


100%|█████████████████████████████████████████████████████████████████████████████████| 74/74 [00:00<00:00, 323.00it/s]


Converting to NumPy arrays...
Raw data shape: (3264, 256, 256, 3), Labels shape: (3264,)
Preprocessing all images...


100%|████████████████████████████████████████████████████████████████████████████| 3264/3264 [00:01<00:00, 1930.33it/s]


Preprocessed data shape: (3264, 256, 256, 3), Labels shape: (3264,)
Preprocessing complete. Data is ready for model training.
