In [45]:
import os
import torch
import torch.nn as nn
from torch import optim
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.functional import softmax
import PIL
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from tqdm import tqdm

PIL.ImageFile.LOAD_TRUNCATED_IMAGES = True


# Loading Data

In [2]:
class CancerDataset(Dataset):
    def __init__(self, data_dir, transform=None):
        self.data_dir = data_dir
        self.transform = transform
        self.classes = ["Normal cases", "Benign cases", "Malignant cases"]  # List the directories (class folders)
        self.image_paths = []  # Store the paths of images
        self.labels = []       # Store the labels corresponding to images

        for label, class_name in enumerate(self.classes): # for each class (Normal = 0, Benign = 1, Malignant = 2)
            class_folder = os.path.join(data_dir, class_name)
            if os.path.isdir(class_folder):
                for file_name in os.listdir(class_folder):
                    if file_name.endswith(('.jpg', '.png', '.jpeg')):  # image datatypes
                        self.image_paths.append(os.path.join(class_folder, file_name)) # append images
                        self.labels.append(label) # append associated labels

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Load the image
        img_path = self.image_paths[idx]
        image = PIL.Image.open(img_path)

        # Apply any transformations (if defined)
        if self.transform:
            image = self.transform(image)

        # Return the image and the label
        label = self.labels[idx]
        return image, label



In [3]:
img_transform = transforms.Compose([
    transforms.Resize((512, 512)), # resizing. some images are not 512x512
    transforms.Grayscale(),
    transforms.ToTensor()   # convert image to a tensor
])


cancer_dataset = CancerDataset("../data", img_transform)
print(f'Img Shape: {cancer_dataset[0][0].shape}')

print(f'Dataset Size: {len(cancer_dataset)}')
label_map = {0 : "Normal", 1 : "Benign", 2 : "Malignant"}
cts = np.unique(cancer_dataset.labels, return_counts = True)
for i in cts[0]:
    label = label_map[i]
    print(f'{label} Count: {cts[1][i]}, Percent: {100*cts[1][i]/len(cancer_dataset):.2f}')

Img Shape: torch.Size([1, 512, 512])
Dataset Size: 1097
Normal Count: 416, Percent: 37.92
Benign Count: 120, Percent: 10.94
Malignant Count: 561, Percent: 51.14


In [4]:
torch.manual_seed(35)  # set seed

train_proportion = 0.7 # percent training set

train_size = int(train_proportion * len(cancer_dataset)) 
test_size = len(cancer_dataset) - train_size 

train_dataset, test_dataset = random_split(cancer_dataset, [train_size, test_size]) # split dataset into train and test

In [5]:
print(f'Test Set Size: {len(test_dataset.indices)}')
cts = np.unique([cancer_dataset[i][1] for i in test_dataset.indices], return_counts = True)
for i in cts[0]:
    label = label_map[i]
    print(f'Test {label} Count: {cts[1][i]}, Percent: {100*cts[1][i]/len(test_dataset.indices):.2f}')

Test Set Size: 330
Test Normal Count: 130, Percent: 39.39
Test Benign Count: 37, Percent: 11.21
Test Malignant Count: 163, Percent: 49.39


In [6]:
# initialize dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Modeling

In [30]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()

        #max pooling layer
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)

        # convolutional layers
        self.conv1 = nn.Conv2d(1, 8, kernel_size=3, stride=1, padding=1)        
        self.conv2 = nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)

        self.flatten = nn.Flatten()
        self.linear = nn.Linear(32*64*64, 3)

        
        self.relu = nn.ReLU()
    def forward(self, x):
        # 512 x 512
        x = self.pool(self.relu(self.conv1(x))) # 256x256
        x = self.pool(self.relu(self.conv2(x))) # 128x128
        x = self.pool(self.relu(self.conv3(x))) # 64x64

        x = self.flatten(x)
        x = self.linear(x)
        return x

In [55]:
model = CNN()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

num_epochs = 1
optimizer = optim.Adam(model.parameters(), lr = 0.01)
criterion = nn.CrossEntropyLoss()

cpu


In [56]:
for epoch in range(1, num_epochs+1):
    model.train()  # set model to train
    running_loss = 0.0
    correct = 0
    total = 0
    
    for images, labels in tqdm(train_dataloader, desc=f"Epoch {epoch}/{num_epochs}"):
        images, labels = images.to(device), labels.to(device).long()

        # zero out gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        #predicted = (outputs > 0.5).float()  # Convert to 0 or 1 based on threshold
        #correct += (predicted.squeeze() == labels).sum().item()
        #total += labels.size(0)
        
    model.eval() # set model to evaluation mode
    
    avg_loss = running_loss / len(train_dataloader)
    #accuracy = 100 * correct / total

    avg_valid_loss = valid_running_loss / len(test_dataloader)
    #valid_accuracy = 100 * valid_correct / valid_total
    print(f"Train Loss: {avg_loss:.4f}") # , Train Accuracy: {accuracy:.2f}%")
    #print(f"Valid Loss: {avg_valid_loss:.4f}, Valid Accuracy: {valid_accuracy:.2f}%")


Epoch 1/10: 100%|███████████████████████████████| 24/24 [00:19<00:00,  1.22it/s]


Train Loss: 2.5566


Epoch 2/10: 100%|███████████████████████████████| 24/24 [00:19<00:00,  1.20it/s]


Train Loss: 1.0282


Epoch 3/10: 100%|███████████████████████████████| 24/24 [00:20<00:00,  1.18it/s]


Train Loss: 0.9910


Epoch 4/10: 100%|███████████████████████████████| 24/24 [00:20<00:00,  1.16it/s]


Train Loss: 0.9702


Epoch 5/10: 100%|███████████████████████████████| 24/24 [00:21<00:00,  1.13it/s]


Train Loss: 0.9606


Epoch 6/10: 100%|███████████████████████████████| 24/24 [00:21<00:00,  1.13it/s]


Train Loss: 0.9548


Epoch 7/10: 100%|███████████████████████████████| 24/24 [00:21<00:00,  1.13it/s]


Train Loss: 0.9520


Epoch 8/10: 100%|███████████████████████████████| 24/24 [00:21<00:00,  1.13it/s]


Train Loss: 0.9509


Epoch 9/10: 100%|███████████████████████████████| 24/24 [00:21<00:00,  1.13it/s]


Train Loss: 0.9500


Epoch 10/10: 100%|██████████████████████████████| 24/24 [00:21<00:00,  1.12it/s]

Train Loss: 0.9495





In [73]:
for images, labels in test_dataloader:
    scores = model(images)
    print(scores)

tensor([[ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
        [ 0.2732, -0.8875,  0.6042],
 