<a href="https://colab.research.google.com/github/vimesh630/ML_CW/blob/main/Preprocessing_for_Faster_R_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Libraries

In [1]:
!pip install opencv-python-headless matplotlib pandas SimpleITK
import os
import cv2
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from xml.dom import minidom
import matplotlib.pyplot as plt
import SimpleITK as sitk
from google.colab import drive
from sklearn.model_selection import train_test_split
import zipfile
import gc

Collecting SimpleITK
  Downloading SimpleITK-2.4.1-cp311-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Downloading SimpleITK-2.4.1-cp311-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (52.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SimpleITK
Successfully installed SimpleITK-2.4.1


Mount Google Drive

In [2]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Folder Structure Setup

In [3]:
# Paths
dataset_zip_path = "/content/drive/My Drive/DSGP/DSGP_Dataset.zip"
extracted_dataset_path = "/content/drive/My Drive/DSGP/original_dataset"
base_dir = "/content/drive/My Drive/DSGP/Preprocessed Dataset"

# Unzip dataset if not already extracted
if not os.path.exists(extracted_dataset_path):
    with zipfile.ZipFile(dataset_zip_path, 'r') as zip_ref:
        zip_ref.extractall(extracted_dataset_path)

# Create required folder structure
folders = [
    'Images/Train', 'Images/Val', 'Images/Test',
    'Annotations/Train', 'Annotations/Val', 'Annotations/Test'
]
for folder in folders:
    os.makedirs(os.path.join(base_dir, folder), exist_ok=True)

Annotation Creation

In [4]:
def create_xml(image_path, class_name, box, size):
    root = ET.Element("annotation")
    ET.SubElement(root, "filename").text = os.path.basename(image_path)
    size_elem = ET.SubElement(root, "size")
    ET.SubElement(size_elem, "width").text = str(size[1])
    ET.SubElement(size_elem, "height").text = str(size[0])
    ET.SubElement(size_elem, "depth").text = "3"
    obj = ET.SubElement(root, "object")
    ET.SubElement(obj, "name").text = class_name
    ET.SubElement(obj, "pose").text = "Unspecified"
    bndbox = ET.SubElement(obj, "bndbox")
    ET.SubElement(bndbox, "xmin").text = str(box[0])
    ET.SubElement(bndbox, "ymin").text = str(box[1])
    ET.SubElement(bndbox, "xmax").text = str(box[2])
    ET.SubElement(bndbox, "ymax").text = str(box[3])
    return minidom.parseString(ET.tostring(root)).toprettyxml()

def find_tumor_bbox(img):
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if contours:
        largest = max(contours, key=cv2.contourArea)
        x, y, w, h = cv2.boundingRect(largest)
        return [x, y, x + w, y + h]
    return [0, 0, 0, 0]


Preprocessing Functions

In [5]:
# Skull Stripping
def skull_stripping(img):
    sitk_img = sitk.GetImageFromArray(img)
    sitk_img = sitk.Cast(sitk_img, sitk.sitkFloat32)
    mask = sitk.OtsuThreshold(sitk_img)
    stripped_img = sitk.GetArrayFromImage(sitk.Mask(sitk_img, mask))
    return cv2.normalize(stripped_img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

In [6]:
# Normalization
def normalize_image(img):
    p2, p98 = np.percentile(img, (2, 98))
    img = np.clip(img, p2, p98)
    return cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX)

In [7]:

def preprocess_pipeline(img_path, target_size=(256, 256)):
    img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)  # Read image without forcing color conversion

    if img is None:
        print(f"Warning: Could not read image {img_path}. Skipping...")
        return None

    if img.dtype == np.float64:
        img = (img * 255).astype(np.uint8)

    if len(img.shape) == 2:  # Convert grayscale to RGB
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    else:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    skull_free = skull_stripping(img)
    normalized = normalize_image(skull_free)

    final_img = (normalized * 255).astype(np.uint8)
    return cv2.resize(final_img, target_size)

Data Augmentation

In [8]:
def augment_image(img):
    h, w = img.shape[:2]
    M = cv2.getRotationMatrix2D((w//2, h//2), np.random.randint(-15, 15), np.random.uniform(0.9, 1.1))
    return cv2.warpAffine(img, M, (w, h))

Preprocessing Function

In [9]:
def process_dataset():
    data = []
    for root, dirs, files in os.walk(extracted_dataset_path):
        for file in files:
            if file.lower().endswith(('.png', '.jpg', '.jpeg')):
                class_label = os.path.basename(root)  # Correctly extract folder name
                class_label = "No_Tumor" if "no tumor" in class_label.lower() else class_label
                data.append({"path": os.path.join(root, file), "class": class_label})

    df = pd.DataFrame(data)

    # Class Balancing
    tumor_df = df[df['class'] != 'No_Tumor']
    no_tumor_df = df[df['class'] == 'No_Tumor']

    min_samples = min(len(tumor_df), len(no_tumor_df))

    tumor_df = tumor_df.sample(min_samples, replace=True)
    no_tumor_df = no_tumor_df.sample(min_samples, replace=True)

    balanced_df = pd.concat([tumor_df, no_tumor_df])

    # Train/Val/Test Split
    train_df, temp_df = train_test_split(balanced_df, test_size=0.3, stratify=balanced_df['class'])
    val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['class'])

    # Processing
    for split_df, split_name in zip([train_df, val_df, test_df], ['Train', 'Val', 'Test']):
        for idx, row in split_df.iterrows():
            try:
                processed_img = preprocess_pipeline(row['path'])
                if processed_img is None:
                    continue

                if row['class'] != "No_Tumor" and split_name == "Train":
                    processed_img = augment_image(processed_img)

                img_filename = f"{split_name.lower()}_{idx}.png"
                class_folder = f"{base_dir}/Images/{split_name}/{row['class']}"
                os.makedirs(class_folder, exist_ok=True)
                cv2.imwrite(os.path.join(class_folder, img_filename), cv2.cvtColor(processed_img, cv2.COLOR_RGB2BGR))

                annotation_folder = f"{base_dir}/Annotations/{split_name}/{row['class']}"
                os.makedirs(annotation_folder, exist_ok=True)
                bbox = find_tumor_bbox(processed_img) if row['class'] != "No_Tumor" else [0, 0, 0, 0]
                with open(f"{annotation_folder}/{img_filename.split('.')[0]}.xml", 'w') as f:
                    f.write(create_xml(img_filename, row['class'], bbox, processed_img.shape))

            except Exception as e:
                print(f"Error processing {row['path']}: {e}")

Run Preprocessing

In [10]:
process_dataset()
print("Preprocessing complete! All files saved in:", base_dir)

ValueError: With n_samples=0, test_size=0.3 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.