In [1]:
import torch
import os
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
from PIL import Image
from torchvision import transforms
from sklearn.model_selection import train_test_split

In [2]:
def get_shuffled_images(data_dir, seed=None):
    """
    Load every image below ``data_dir`` and return them, shuffled, with binary labels.

    ``data_dir`` is expected to contain one sub-directory per class. Images found in a
    sub-directory named ``'Malignant'`` are labelled 1; images from every other
    sub-directory are labelled 0. Hidden entries (names starting with ``.``) and
    anything that is not a class folder / regular file are ignored.

    Args:
        data_dir (str): The directory containing one sub-directory per class.
        seed (int, optional): Seed for the shuffle. With the default ``None`` the
            global NumPy random state is used, so existing callers behave as before.

    Returns:
        list: A list of ``(image, label)`` tuples in shuffled order, where ``image`` is
        the RGB pixel data as a NumPy array and ``label`` is 0 or 1.
    """
    # Keep only real class folders. Slicing the listing (``[1:]``) to drop a stray
    # entry such as ``.DS_Store`` is unsafe: os.listdir returns entries in arbitrary
    # order, so the slice can silently drop a whole class instead.
    classes = sorted(
        entry for entry in os.listdir(data_dir)
        if not entry.startswith('.') and os.path.isdir(os.path.join(data_dir, entry))
    )
    data = []
    for class_name in classes:
        class_dir = os.path.join(data_dir, class_name)
        class_label = 1 if class_name == 'Malignant' else 0
        # Sorted so the pre-shuffle order does not depend on the filesystem.
        for img_name in sorted(os.listdir(class_dir)):
            image_path = os.path.join(class_dir, img_name)
            if img_name.startswith('.') or not os.path.isfile(image_path):
                continue
            # The context manager closes the file handle once the pixels are copied.
            with Image.open(image_path) as img:
                # Force three channels: grayscale/palette/RGBA files would otherwise
                # produce arrays of a different shape than the RGB ones.
                image = np.array(img.convert('RGB'))
            data.append((image, class_label))
    if seed is None:
        np.random.shuffle(data)
    else:
        np.random.default_rng(seed).shuffle(data)
    return data

In [3]:
# Image folders, relative to this notebook: training data first, testing data second
data_train_dir, data_test_dir = "../data/train", "../data/test"

In [4]:
# Load both splits from disk (training first, then testing); each result is the
# shuffled list of (image, label) tuples returned by get_shuffled_images
train_data, test_data = (
    get_shuffled_images(split_dir) for split_dir in (data_train_dir, data_test_dir)
)

In [5]:
# Sanity check: print the label of each of the first 15 training samples
first_samples = train_data[:15]
for image, label in first_samples:
    print(f"label: {label}")

label: 0
label: 0
label: 1
label: 1
label: 1
label: 0
label: 0
label: 0
label: 0
label: 0
label: 0
label: 1
label: 1
label: 0
label: 0


In [6]:
# Preprocessing pipeline: convert each image to a tensor, then normalize all three
# channels with mean 0.5 and std 0.5
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]
)

In [7]:
def _transform_samples(samples):
    """Run ``transform`` on every image and wrap every label in a tensor."""
    return [(transform(image), torch.tensor(label)) for image, label in samples]


transformed_train_data = _transform_samples(train_data)  # training split
transformed_test_data = _transform_samples(test_data)    # testing split

In [8]:
# Collapse the per-sample (image, label) pairs into one tensor per column and split
train_images = torch.stack([image for image, _ in transformed_train_data])       # all training images
train_labels = torch.stack([label for _, label in transformed_train_data])       # all training labels
test_images_tensor = torch.stack([image for image, _ in transformed_test_data])  # all testing images
test_labels_tensor = torch.stack([label for _, label in transformed_test_data])  # all testing labels

In [9]:
# Convert the stacked training tensors to NumPy arrays before handing them to train_test_split
train_val_images_numpy, train_val_labels_numpy = train_images.numpy(), train_labels.numpy()

# Hold out 20% of the training samples for validation. Stratifying on the labels
# maintains class balance between the two parts; random_state is fixed at 42.
(
    train_images_numpy,
    val_images_numpy,
    train_labels_numpy,
    val_labels_numpy,
) = train_test_split(
    train_val_images_numpy,
    train_val_labels_numpy,
    stratify=train_val_labels_numpy,
    test_size=0.20,
    random_state=42,
)

In [10]:
# Turn the four NumPy arrays produced by the split back into tensors
# (order: training images, training labels, validation images, validation labels)
train_images_tensor, train_labels_tensor, val_images_tensor, val_labels_tensor = map(
    torch.tensor,
    (train_images_numpy, train_labels_numpy, val_images_numpy, val_labels_numpy),
)

In [11]:
batch_size = 64

# Pair each image tensor with its label tensor, one dataset per split
train_dataset = TensorDataset(train_images_tensor, train_labels_tensor)
val_dataset = TensorDataset(val_images_tensor, val_labels_tensor)
test_dataset = TensorDataset(test_images_tensor, test_labels_tensor)

# Batched loaders for each split; shuffling is enabled for the training loader only
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size)


In [12]:
# Write the training, validation, and testing DataLoader objects to disk.
# NOTE(review): this serializes the whole DataLoader objects rather than plain
# tensors; recent PyTorch releases may require torch.load(..., weights_only=False)
# to read such files back — confirm against the consuming notebook.
for loader, target_path in (
    (train_loader, "../data/train_loader.pt"),
    (val_loader, "../data/val_loader.pt"),
    (test_loader, "../data/test_loader.pt"),
):
    torch.save(loader, target_path)