#### 25/50/75% Data splits (stratified)

#### 25%

In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import cv2
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import models, transforms

# Custom Dataset for loading images and extracting patches based on nuclear density
class HistologyDataset(Dataset):
    """Image-level dataset that yields the accepted patches of each image.

    One item corresponds to one row of the CSV. ``__getitem__`` loads the
    image, tiles it into overlapping square patches, keeps the patches
    accepted by :meth:`is_nucleus_dense` and returns them together with one
    label per kept patch.

    Args:
        image_dir: Directory that is joined with the file names from the CSV.
        csv_path: CSV file whose first column holds image file names and whose
            second column holds the class labels.
        transform: Optional callable applied to every kept patch.
        patch_size: Side length, in pixels, of the square patches.
        overlap: Fraction of ``patch_size`` shared by neighbouring patches;
            expected to lie in ``[0, 1)``.
        threshold: Value the hue channel of a pixel must exceed for the pixel
            to count toward the patch density.
        min_blue_density: Fraction of counted pixels a patch must exceed to be
            kept.
    """

    def __init__(self, image_dir, csv_path, transform=None, patch_size=299, overlap=0.5, threshold=1.587, min_blue_density=0.02):
        self.image_dir = image_dir
        self.transform = transform
        self.patch_size = patch_size
        self.overlap = overlap
        self.threshold = threshold
        self.min_blue_density = min_blue_density

        self.data = pd.read_csv(csv_path)
        self.image_paths = [os.path.join(image_dir, fname) for fname in self.data.iloc[:, 0]]
        # Encode the raw labels as contiguous integer ids, numbered in the
        # sorted order that np.unique returns.
        raw_labels = self.data.iloc[:, 1].values
        self.label_to_idx = {label: idx for idx, label in enumerate(np.unique(raw_labels))}
        self.labels = np.array([self.label_to_idx[label] for label in raw_labels], dtype=np.int64)

    def __len__(self):
        """Return the number of images (not patches) in the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(patches, patch_labels)`` for the image at position ``idx``.

        ``patches`` is a list of kept patches (transformed when a transform
        was given) and ``patch_labels`` repeats the image label once per patch.
        Both lists are empty when no patch is accepted.
        """
        label = self.labels[idx]
        # Use a context manager so the underlying file handle is closed as
        # soon as the pixel data has been copied into the array.
        with Image.open(self.image_paths[idx]) as handle:
            image = np.array(handle.convert('RGB'))
        patches, patch_labels = self.extract_patches(image, label)
        if self.transform:
            patches = [self.transform(patch) for patch in patches]
        return patches, patch_labels

    def extract_patches(self, image, label):
        """Tile ``image`` and keep the patches accepted by ``is_nucleus_dense``.

        Args:
            image: Array of shape ``(height, width, channels)``.
            label: Label attached to every kept patch.

        Returns:
            Tuple ``(patches, patch_labels)`` of equally long lists, ordered
            row by row (``y`` outer loop, ``x`` inner loop). Images smaller
            than ``patch_size`` in either dimension yield two empty lists.
        """
        patches = []
        patch_labels = []
        height, width, _ = image.shape
        # int() truncates, so an overlap close to 1 would give a stride of 0,
        # which range() rejects; always advance by at least one pixel.
        stride = max(1, int(self.patch_size * (1 - self.overlap)))
        for y in range(0, height - self.patch_size + 1, stride):
            for x in range(0, width - self.patch_size + 1, stride):
                patch = image[y:y+self.patch_size, x:x+self.patch_size]
                if self.is_nucleus_dense(patch):
                    patches.append(patch)
                    patch_labels.append(label)
        return patches, patch_labels

    def is_nucleus_dense(self, patch):
        """Return whether enough pixels of ``patch`` exceed the hue threshold.

        The patch is converted from RGB to HSV and the fraction of pixels
        whose first (hue) channel is greater than ``self.threshold`` is
        compared against ``self.min_blue_density``.
        """
        hsv_patch = cv2.cvtColor(patch, cv2.COLOR_RGB2HSV)
        # NOTE(review): for 8-bit input OpenCV appears to store hue in the
        # range 0-179, so the default threshold of 1.587 would be exceeded by
        # almost every pixel with non-zero hue - confirm the intended
        # channel/scale of this threshold.
        blue_mask = (hsv_patch[:, :, 0] > self.threshold).astype(np.uint8)
        blue_density = np.sum(blue_mask) / (patch.shape[0] * patch.shape[1])
        return blue_density > self.min_blue_density

def create_balanced_subset(dataset, percentage, rng=None):
    """Sample the same number of item indices from every class.

    The per-class sample size is ``int(min_class_count * percentage)``, where
    ``min_class_count`` is the size of the smallest class, so every class
    contributes equally many indices.

    Args:
        dataset: Object exposing ``labels``, an array of non-negative integer
            class ids (one per item).
        percentage: Fraction of the smallest class to draw from each class.
        rng: Optional ``numpy.random.Generator`` (or any object with a
            compatible ``choice`` method) used for reproducible draws. When
            ``None`` the global NumPy random state is used.

    Returns:
        List of indices into ``dataset``, grouped by ascending class id.

    Raises:
        ValueError: If the dataset has no labels, or if the requested
            per-class size is negative or larger than the smallest class.
    """
    labels = np.asarray(dataset.labels)
    class_counts = np.bincount(labels)
    if class_counts.size == 0:
        raise ValueError("dataset has no labels to sample from")

    min_class_count = int(class_counts.min())
    subset_size_per_class = int(min_class_count * percentage)
    # Sampling is done without replacement, so the size has to fit into the
    # smallest class; fail with a clear message instead of a NumPy-internal one.
    if not 0 <= subset_size_per_class <= min_class_count:
        raise ValueError(
            f"percentage={percentage!r} requests {subset_size_per_class} samples per class, "
            f"but the smallest class only has {min_class_count}"
        )

    sampler = np.random if rng is None else rng
    indices_per_class = []
    for class_label in range(len(class_counts)):
        class_indices = np.where(labels == class_label)[0]
        selected_indices = sampler.choice(class_indices, subset_size_per_class, replace=False)
        indices_per_class.extend(selected_indices)
    return indices_per_class

# Augmentation pipeline applied to each extracted patch (passed to the dataset
# as its ``transform``).
transform = transforms.Compose(
    [
        transforms.ToPILImage(),            # convert the patch to a PIL image for the random ops
        transforms.RandomHorizontalFlip(),  # random left-right flip
        transforms.RandomVerticalFlip(),    # random up-down flip
        transforms.RandomRotation(180),     # random rotation, degrees argument 180
        transforms.ToTensor(),              # back to a tensor for the model
    ]
)

class ModifiedInceptionV3(nn.Module):
    """Pretrained Inception v3 whose classifier is replaced by a small MLP head.

    The head is ``Flatten -> Linear(in_features, 1024) -> ReLU -> Dropout(0.5)
    -> Linear(1024, num_classes)``; the average pooling layer is replaced by an
    adaptive one with a 1x1 output.

    Args:
        num_classes: Number of output logits produced by the new head.
    """

    def __init__(self, num_classes=4):
        super().__init__()
        backbone = models.inception_v3(pretrained=True)
        backbone.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # Read the width of the stock classifier before it is replaced.
        feature_dim = backbone.fc.in_features
        head_layers = [
            nn.Flatten(),
            nn.Linear(feature_dim, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1024, num_classes),
        ]
        backbone.fc = nn.Sequential(*head_layers)
        self.inception = backbone

    def forward(self, x):
        """Return the main logits, unwrapping tuple outputs of the backbone."""
        result = self.inception(x)
        # When the backbone returns a tuple, only its first element is used.
        return result[0] if isinstance(result, tuple) else result

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=50):
    """Train ``model`` and print training/validation metrics after every epoch.

    Every batch yielded by the loaders is expected to be ``(patches, labels)``
    where ``patches`` is a sequence of tensors that are concatenated along
    dim 0, and ``labels`` is either a matching sequence of tensors or a single
    tensor. Batches that contain no patches are skipped.

    Args:
        model: Network to optimise; its parameters determine the device used.
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepping the parameters of ``model``.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        None. Loss and accuracy are printed per epoch.
    """
    # Resolve the device from the model itself so the function does not rely
    # on a module-level ``device`` global being defined before it is called.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    def _to_device_batch(patches, labels):
        """Concatenate the per-patch tensors and move everything to the device."""
        patches = torch.cat([patch.to(run_device) for patch in patches], dim=0)
        if isinstance(labels, (list, tuple)):
            labels = torch.cat(labels, dim=0)
        # Always move the labels, including when the loader already yields a
        # single tensor; otherwise the loss would mix devices.
        return patches, labels.to(run_device).long()

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        num_batches = 0
        for patches, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
            # torch.cat rejects an empty list, so skip batches without patches.
            if len(patches) == 0:
                continue
            patches, labels = _to_device_batch(patches, labels)
            optimizer.zero_grad()
            outputs = model(patches)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            num_batches += 1
        # max(..., 1) keeps the report well-defined when nothing was processed.
        train_acc = 100 * correct / max(total, 1)
        print(f"Train Loss: {running_loss / max(num_batches, 1):.4f}, Train Accuracy: {train_acc:.2f}%")

        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        num_batches = 0
        with torch.no_grad():
            for patches, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
                if len(patches) == 0:
                    continue
                patches, labels = _to_device_batch(patches, labels)
                outputs = model(patches)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                num_batches += 1
        val_acc = 100 * correct / max(total, 1)
        print(f"Validation Loss: {val_loss / max(num_batches, 1):.4f}, Validation Accuracy: {val_acc:.2f}%")

def majority_voting(patch_predictions):
    """Return the class that occurs most often among ``patch_predictions``.

    Ties go to the smallest class value: ``np.unique`` returns the candidates
    sorted and ``argmax`` keeps the first maximum.
    """
    candidates, votes = np.unique(patch_predictions, return_counts=True)
    winner_position = votes.argmax()
    return candidates[winner_position]

# Locations of the BACH challenge images and their ground-truth CSV
# (machine-specific absolute paths).
image_dir = r"C:\Users\vamsv\Downloads\ICIAR2018_BACH_Challenge\ICIAR2018_BACH_Challenge\Photos\images"
csv_path = r"C:\Users\vamsv\Downloads\ICIAR2018_BACH_Challenge\ICIAR2018_BACH_Challenge\Photos\microscopy_ground_truth.csv"

# One dataset item per CSV row; `transform` is applied to every kept patch.
dataset = HistologyDataset(image_dir=image_dir, csv_path=csv_path, transform=transform)
# Class-balanced subset: int(0.25 * smallest class size) images per class.
indices_25_percent = create_balanced_subset(dataset, percentage=0.25)
subset_25_percent = torch.utils.data.Subset(dataset, indices_25_percent)

# Random 75/25 train/validation split of the subset, at image level.
# NOTE(review): both splits wrap the same `dataset`, so validation patches also
# pass through the random augmentations in `transform` - confirm this is intended.
train_size_25 = int(0.75 * len(subset_25_percent))
val_size_25 = len(subset_25_percent) - train_size_25
train_dataset_25, val_dataset_25 = random_split(subset_25_percent, [train_size_25, val_size_25])

# batch_size counts images; each image contributes a list of patches that
# train_model concatenates into a single tensor.
train_loader_25 = DataLoader(train_dataset_25, batch_size=4, shuffle=True)
val_loader_25 = DataLoader(val_dataset_25, batch_size=4, shuffle=False)

# Model, loss and optimizer; `device` is also read as a global inside train_model.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ModifiedInceptionV3(num_classes=4).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)

train_model(model, train_loader_25, val_loader_25, criterion, optimizer, num_epochs=50)


#### 50%

In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import cv2
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import models, transforms

# Custom Dataset for loading images and extracting patches based on nuclear density
class HistologyDataset(Dataset):
    """Image-level dataset that yields the accepted patches of each image.

    One item corresponds to one row of the CSV. ``__getitem__`` loads the
    image, tiles it into overlapping square patches, keeps the patches
    accepted by :meth:`is_nucleus_dense` and returns them together with one
    label per kept patch.

    Args:
        image_dir: Directory that is joined with the file names from the CSV.
        csv_path: CSV file whose first column holds image file names and whose
            second column holds the class labels.
        transform: Optional callable applied to every kept patch.
        patch_size: Side length, in pixels, of the square patches.
        overlap: Fraction of ``patch_size`` shared by neighbouring patches;
            expected to lie in ``[0, 1)``.
        threshold: Value the hue channel of a pixel must exceed for the pixel
            to count toward the patch density.
        min_blue_density: Fraction of counted pixels a patch must exceed to be
            kept.
    """

    def __init__(self, image_dir, csv_path, transform=None, patch_size=299, overlap=0.5, threshold=1.587, min_blue_density=0.02):
        self.image_dir = image_dir
        self.transform = transform
        self.patch_size = patch_size
        self.overlap = overlap
        self.threshold = threshold
        self.min_blue_density = min_blue_density

        self.data = pd.read_csv(csv_path)
        self.image_paths = [os.path.join(image_dir, fname) for fname in self.data.iloc[:, 0]]
        # Encode the raw labels as contiguous integer ids, numbered in the
        # sorted order that np.unique returns.
        raw_labels = self.data.iloc[:, 1].values
        self.label_to_idx = {label: idx for idx, label in enumerate(np.unique(raw_labels))}
        self.labels = np.array([self.label_to_idx[label] for label in raw_labels], dtype=np.int64)

    def __len__(self):
        """Return the number of images (not patches) in the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(patches, patch_labels)`` for the image at position ``idx``.

        ``patches`` is a list of kept patches (transformed when a transform
        was given) and ``patch_labels`` repeats the image label once per patch.
        Both lists are empty when no patch is accepted.
        """
        label = self.labels[idx]
        # Use a context manager so the underlying file handle is closed as
        # soon as the pixel data has been copied into the array.
        with Image.open(self.image_paths[idx]) as handle:
            image = np.array(handle.convert('RGB'))
        patches, patch_labels = self.extract_patches(image, label)
        if self.transform:
            patches = [self.transform(patch) for patch in patches]
        return patches, patch_labels

    def extract_patches(self, image, label):
        """Tile ``image`` and keep the patches accepted by ``is_nucleus_dense``.

        Args:
            image: Array of shape ``(height, width, channels)``.
            label: Label attached to every kept patch.

        Returns:
            Tuple ``(patches, patch_labels)`` of equally long lists, ordered
            row by row (``y`` outer loop, ``x`` inner loop). Images smaller
            than ``patch_size`` in either dimension yield two empty lists.
        """
        patches = []
        patch_labels = []
        height, width, _ = image.shape
        # int() truncates, so an overlap close to 1 would give a stride of 0,
        # which range() rejects; always advance by at least one pixel.
        stride = max(1, int(self.patch_size * (1 - self.overlap)))
        for y in range(0, height - self.patch_size + 1, stride):
            for x in range(0, width - self.patch_size + 1, stride):
                patch = image[y:y+self.patch_size, x:x+self.patch_size]
                if self.is_nucleus_dense(patch):
                    patches.append(patch)
                    patch_labels.append(label)
        return patches, patch_labels

    def is_nucleus_dense(self, patch):
        """Return whether enough pixels of ``patch`` exceed the hue threshold.

        The patch is converted from RGB to HSV and the fraction of pixels
        whose first (hue) channel is greater than ``self.threshold`` is
        compared against ``self.min_blue_density``.
        """
        hsv_patch = cv2.cvtColor(patch, cv2.COLOR_RGB2HSV)
        # NOTE(review): for 8-bit input OpenCV appears to store hue in the
        # range 0-179, so the default threshold of 1.587 would be exceeded by
        # almost every pixel with non-zero hue - confirm the intended
        # channel/scale of this threshold.
        blue_mask = (hsv_patch[:, :, 0] > self.threshold).astype(np.uint8)
        blue_density = np.sum(blue_mask) / (patch.shape[0] * patch.shape[1])
        return blue_density > self.min_blue_density

def create_balanced_subset(dataset, percentage, rng=None):
    """Sample the same number of item indices from every class.

    The per-class sample size is ``int(min_class_count * percentage)``, where
    ``min_class_count`` is the size of the smallest class, so every class
    contributes equally many indices.

    Args:
        dataset: Object exposing ``labels``, an array of non-negative integer
            class ids (one per item).
        percentage: Fraction of the smallest class to draw from each class.
        rng: Optional ``numpy.random.Generator`` (or any object with a
            compatible ``choice`` method) used for reproducible draws. When
            ``None`` the global NumPy random state is used.

    Returns:
        List of indices into ``dataset``, grouped by ascending class id.

    Raises:
        ValueError: If the dataset has no labels, or if the requested
            per-class size is negative or larger than the smallest class.
    """
    labels = np.asarray(dataset.labels)
    class_counts = np.bincount(labels)
    if class_counts.size == 0:
        raise ValueError("dataset has no labels to sample from")

    min_class_count = int(class_counts.min())
    subset_size_per_class = int(min_class_count * percentage)
    # Sampling is done without replacement, so the size has to fit into the
    # smallest class; fail with a clear message instead of a NumPy-internal one.
    if not 0 <= subset_size_per_class <= min_class_count:
        raise ValueError(
            f"percentage={percentage!r} requests {subset_size_per_class} samples per class, "
            f"but the smallest class only has {min_class_count}"
        )

    sampler = np.random if rng is None else rng
    indices_per_class = []
    for class_label in range(len(class_counts)):
        class_indices = np.where(labels == class_label)[0]
        selected_indices = sampler.choice(class_indices, subset_size_per_class, replace=False)
        indices_per_class.extend(selected_indices)
    return indices_per_class

# Augmentation pipeline applied to each extracted patch (passed to the dataset
# as its ``transform``).
transform = transforms.Compose(
    [
        transforms.ToPILImage(),            # convert the patch to a PIL image for the random ops
        transforms.RandomHorizontalFlip(),  # random left-right flip
        transforms.RandomVerticalFlip(),    # random up-down flip
        transforms.RandomRotation(180),     # random rotation, degrees argument 180
        transforms.ToTensor(),              # back to a tensor for the model
    ]
)

class ModifiedInceptionV3(nn.Module):
    """Pretrained Inception v3 whose classifier is replaced by a small MLP head.

    The head is ``Flatten -> Linear(in_features, 1024) -> ReLU -> Dropout(0.5)
    -> Linear(1024, num_classes)``; the average pooling layer is replaced by an
    adaptive one with a 1x1 output.

    Args:
        num_classes: Number of output logits produced by the new head.
    """

    def __init__(self, num_classes=4):
        super().__init__()
        backbone = models.inception_v3(pretrained=True)
        backbone.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # Read the width of the stock classifier before it is replaced.
        feature_dim = backbone.fc.in_features
        head_layers = [
            nn.Flatten(),
            nn.Linear(feature_dim, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1024, num_classes),
        ]
        backbone.fc = nn.Sequential(*head_layers)
        self.inception = backbone

    def forward(self, x):
        """Return the main logits, unwrapping tuple outputs of the backbone."""
        result = self.inception(x)
        # When the backbone returns a tuple, only its first element is used.
        return result[0] if isinstance(result, tuple) else result

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=50):
    """Train ``model`` and print training/validation metrics after every epoch.

    Every batch yielded by the loaders is expected to be ``(patches, labels)``
    where ``patches`` is a sequence of tensors that are concatenated along
    dim 0, and ``labels`` is either a matching sequence of tensors or a single
    tensor. Batches that contain no patches are skipped.

    Args:
        model: Network to optimise; its parameters determine the device used.
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepping the parameters of ``model``.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        None. Loss and accuracy are printed per epoch.
    """
    # Resolve the device from the model itself so the function does not rely
    # on a module-level ``device`` global being defined before it is called.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    def _to_device_batch(patches, labels):
        """Concatenate the per-patch tensors and move everything to the device."""
        patches = torch.cat([patch.to(run_device) for patch in patches], dim=0)
        if isinstance(labels, (list, tuple)):
            labels = torch.cat(labels, dim=0)
        # Always move the labels, including when the loader already yields a
        # single tensor; otherwise the loss would mix devices.
        return patches, labels.to(run_device).long()

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        num_batches = 0
        for patches, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
            # torch.cat rejects an empty list, so skip batches without patches.
            if len(patches) == 0:
                continue
            patches, labels = _to_device_batch(patches, labels)
            optimizer.zero_grad()
            outputs = model(patches)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            num_batches += 1
        # max(..., 1) keeps the report well-defined when nothing was processed.
        train_acc = 100 * correct / max(total, 1)
        print(f"Train Loss: {running_loss / max(num_batches, 1):.4f}, Train Accuracy: {train_acc:.2f}%")

        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        num_batches = 0
        with torch.no_grad():
            for patches, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
                if len(patches) == 0:
                    continue
                patches, labels = _to_device_batch(patches, labels)
                outputs = model(patches)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                num_batches += 1
        val_acc = 100 * correct / max(total, 1)
        print(f"Validation Loss: {val_loss / max(num_batches, 1):.4f}, Validation Accuracy: {val_acc:.2f}%")

def majority_voting(patch_predictions):
    """Return the class that occurs most often among ``patch_predictions``.

    Ties go to the smallest class value: ``np.unique`` returns the candidates
    sorted and ``argmax`` keeps the first maximum.
    """
    candidates, votes = np.unique(patch_predictions, return_counts=True)
    winner_position = votes.argmax()
    return candidates[winner_position]

# Locations of the BACH challenge images and their ground-truth CSV
# (machine-specific absolute paths).
image_dir = r"C:\Users\vamsv\Downloads\ICIAR2018_BACH_Challenge\ICIAR2018_BACH_Challenge\Photos\images"
csv_path = r"C:\Users\vamsv\Downloads\ICIAR2018_BACH_Challenge\ICIAR2018_BACH_Challenge\Photos\microscopy_ground_truth.csv"

# One dataset item per CSV row; `transform` is applied to every kept patch.
dataset = HistologyDataset(image_dir=image_dir, csv_path=csv_path, transform=transform)


# Fresh model, loss and optimizer for this run; `device` is also read as a
# global inside train_model.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ModifiedInceptionV3(num_classes=4).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)

# Class-balanced subset: int(0.50 * smallest class size) images per class.
indices_50_percent = create_balanced_subset(dataset, percentage=0.50)
subset_50_percent = torch.utils.data.Subset(dataset, indices_50_percent)

# Random 75/25 train/validation split of the subset, at image level.
# NOTE(review): both splits wrap the same `dataset`, so validation patches also
# pass through the random augmentations in `transform` - confirm this is intended.
train_size_50 = int(0.75 * len(subset_50_percent))
val_size_50 = len(subset_50_percent) - train_size_50
train_dataset_50, val_dataset_50 = random_split(subset_50_percent, [train_size_50, val_size_50])

# batch_size counts images; each image contributes a list of patches that
# train_model concatenates into a single tensor.
train_loader_50 = DataLoader(train_dataset_50, batch_size=4, shuffle=True)
val_loader_50 = DataLoader(val_dataset_50, batch_size=4, shuffle=False)

train_model(model, train_loader_50, val_loader_50, criterion, optimizer, num_epochs=50)


#### 75%

In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import cv2
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import models, transforms

# Custom Dataset for loading images and extracting patches based on nuclear density
class HistologyDataset(Dataset):
    """Image-level dataset that yields the accepted patches of each image.

    One item corresponds to one row of the CSV. ``__getitem__`` loads the
    image, tiles it into overlapping square patches, keeps the patches
    accepted by :meth:`is_nucleus_dense` and returns them together with one
    label per kept patch.

    Args:
        image_dir: Directory that is joined with the file names from the CSV.
        csv_path: CSV file whose first column holds image file names and whose
            second column holds the class labels.
        transform: Optional callable applied to every kept patch.
        patch_size: Side length, in pixels, of the square patches.
        overlap: Fraction of ``patch_size`` shared by neighbouring patches;
            expected to lie in ``[0, 1)``.
        threshold: Value the hue channel of a pixel must exceed for the pixel
            to count toward the patch density.
        min_blue_density: Fraction of counted pixels a patch must exceed to be
            kept.
    """

    def __init__(self, image_dir, csv_path, transform=None, patch_size=299, overlap=0.5, threshold=1.587, min_blue_density=0.02):
        self.image_dir = image_dir
        self.transform = transform
        self.patch_size = patch_size
        self.overlap = overlap
        self.threshold = threshold
        self.min_blue_density = min_blue_density

        self.data = pd.read_csv(csv_path)
        self.image_paths = [os.path.join(image_dir, fname) for fname in self.data.iloc[:, 0]]
        # Encode the raw labels as contiguous integer ids, numbered in the
        # sorted order that np.unique returns.
        raw_labels = self.data.iloc[:, 1].values
        self.label_to_idx = {label: idx for idx, label in enumerate(np.unique(raw_labels))}
        self.labels = np.array([self.label_to_idx[label] for label in raw_labels], dtype=np.int64)

    def __len__(self):
        """Return the number of images (not patches) in the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(patches, patch_labels)`` for the image at position ``idx``.

        ``patches`` is a list of kept patches (transformed when a transform
        was given) and ``patch_labels`` repeats the image label once per patch.
        Both lists are empty when no patch is accepted.
        """
        label = self.labels[idx]
        # Use a context manager so the underlying file handle is closed as
        # soon as the pixel data has been copied into the array.
        with Image.open(self.image_paths[idx]) as handle:
            image = np.array(handle.convert('RGB'))
        patches, patch_labels = self.extract_patches(image, label)
        if self.transform:
            patches = [self.transform(patch) for patch in patches]
        return patches, patch_labels

    def extract_patches(self, image, label):
        """Tile ``image`` and keep the patches accepted by ``is_nucleus_dense``.

        Args:
            image: Array of shape ``(height, width, channels)``.
            label: Label attached to every kept patch.

        Returns:
            Tuple ``(patches, patch_labels)`` of equally long lists, ordered
            row by row (``y`` outer loop, ``x`` inner loop). Images smaller
            than ``patch_size`` in either dimension yield two empty lists.
        """
        patches = []
        patch_labels = []
        height, width, _ = image.shape
        # int() truncates, so an overlap close to 1 would give a stride of 0,
        # which range() rejects; always advance by at least one pixel.
        stride = max(1, int(self.patch_size * (1 - self.overlap)))
        for y in range(0, height - self.patch_size + 1, stride):
            for x in range(0, width - self.patch_size + 1, stride):
                patch = image[y:y+self.patch_size, x:x+self.patch_size]
                if self.is_nucleus_dense(patch):
                    patches.append(patch)
                    patch_labels.append(label)
        return patches, patch_labels

    def is_nucleus_dense(self, patch):
        """Return whether enough pixels of ``patch`` exceed the hue threshold.

        The patch is converted from RGB to HSV and the fraction of pixels
        whose first (hue) channel is greater than ``self.threshold`` is
        compared against ``self.min_blue_density``.
        """
        hsv_patch = cv2.cvtColor(patch, cv2.COLOR_RGB2HSV)
        # NOTE(review): for 8-bit input OpenCV appears to store hue in the
        # range 0-179, so the default threshold of 1.587 would be exceeded by
        # almost every pixel with non-zero hue - confirm the intended
        # channel/scale of this threshold.
        blue_mask = (hsv_patch[:, :, 0] > self.threshold).astype(np.uint8)
        blue_density = np.sum(blue_mask) / (patch.shape[0] * patch.shape[1])
        return blue_density > self.min_blue_density

def create_balanced_subset(dataset, percentage, rng=None):
    """Sample the same number of item indices from every class.

    The per-class sample size is ``int(min_class_count * percentage)``, where
    ``min_class_count`` is the size of the smallest class, so every class
    contributes equally many indices.

    Args:
        dataset: Object exposing ``labels``, an array of non-negative integer
            class ids (one per item).
        percentage: Fraction of the smallest class to draw from each class.
        rng: Optional ``numpy.random.Generator`` (or any object with a
            compatible ``choice`` method) used for reproducible draws. When
            ``None`` the global NumPy random state is used.

    Returns:
        List of indices into ``dataset``, grouped by ascending class id.

    Raises:
        ValueError: If the dataset has no labels, or if the requested
            per-class size is negative or larger than the smallest class.
    """
    labels = np.asarray(dataset.labels)
    class_counts = np.bincount(labels)
    if class_counts.size == 0:
        raise ValueError("dataset has no labels to sample from")

    min_class_count = int(class_counts.min())
    subset_size_per_class = int(min_class_count * percentage)
    # Sampling is done without replacement, so the size has to fit into the
    # smallest class; fail with a clear message instead of a NumPy-internal one.
    if not 0 <= subset_size_per_class <= min_class_count:
        raise ValueError(
            f"percentage={percentage!r} requests {subset_size_per_class} samples per class, "
            f"but the smallest class only has {min_class_count}"
        )

    sampler = np.random if rng is None else rng
    indices_per_class = []
    for class_label in range(len(class_counts)):
        class_indices = np.where(labels == class_label)[0]
        selected_indices = sampler.choice(class_indices, subset_size_per_class, replace=False)
        indices_per_class.extend(selected_indices)
    return indices_per_class

# Augmentation pipeline applied to each extracted patch (passed to the dataset
# as its ``transform``).
transform = transforms.Compose(
    [
        transforms.ToPILImage(),            # convert the patch to a PIL image for the random ops
        transforms.RandomHorizontalFlip(),  # random left-right flip
        transforms.RandomVerticalFlip(),    # random up-down flip
        transforms.RandomRotation(180),     # random rotation, degrees argument 180
        transforms.ToTensor(),              # back to a tensor for the model
    ]
)

class ModifiedInceptionV3(nn.Module):
    """Pretrained Inception v3 whose classifier is replaced by a small MLP head.

    The head is ``Flatten -> Linear(in_features, 1024) -> ReLU -> Dropout(0.5)
    -> Linear(1024, num_classes)``; the average pooling layer is replaced by an
    adaptive one with a 1x1 output.

    Args:
        num_classes: Number of output logits produced by the new head.
    """

    def __init__(self, num_classes=4):
        super().__init__()
        backbone = models.inception_v3(pretrained=True)
        backbone.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # Read the width of the stock classifier before it is replaced.
        feature_dim = backbone.fc.in_features
        head_layers = [
            nn.Flatten(),
            nn.Linear(feature_dim, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1024, num_classes),
        ]
        backbone.fc = nn.Sequential(*head_layers)
        self.inception = backbone

    def forward(self, x):
        """Return the main logits, unwrapping tuple outputs of the backbone."""
        result = self.inception(x)
        # When the backbone returns a tuple, only its first element is used.
        return result[0] if isinstance(result, tuple) else result

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=50):
    """Train ``model`` and print training/validation metrics after every epoch.

    Every batch yielded by the loaders is expected to be ``(patches, labels)``
    where ``patches`` is a sequence of tensors that are concatenated along
    dim 0, and ``labels`` is either a matching sequence of tensors or a single
    tensor. Batches that contain no patches are skipped.

    Args:
        model: Network to optimise; its parameters determine the device used.
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepping the parameters of ``model``.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        None. Loss and accuracy are printed per epoch.
    """
    # Resolve the device from the model itself so the function does not rely
    # on a module-level ``device`` global being defined before it is called.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    def _to_device_batch(patches, labels):
        """Concatenate the per-patch tensors and move everything to the device."""
        patches = torch.cat([patch.to(run_device) for patch in patches], dim=0)
        if isinstance(labels, (list, tuple)):
            labels = torch.cat(labels, dim=0)
        # Always move the labels, including when the loader already yields a
        # single tensor; otherwise the loss would mix devices.
        return patches, labels.to(run_device).long()

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        num_batches = 0
        for patches, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
            # torch.cat rejects an empty list, so skip batches without patches.
            if len(patches) == 0:
                continue
            patches, labels = _to_device_batch(patches, labels)
            optimizer.zero_grad()
            outputs = model(patches)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            num_batches += 1
        # max(..., 1) keeps the report well-defined when nothing was processed.
        train_acc = 100 * correct / max(total, 1)
        print(f"Train Loss: {running_loss / max(num_batches, 1):.4f}, Train Accuracy: {train_acc:.2f}%")

        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        num_batches = 0
        with torch.no_grad():
            for patches, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
                if len(patches) == 0:
                    continue
                patches, labels = _to_device_batch(patches, labels)
                outputs = model(patches)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                num_batches += 1
        val_acc = 100 * correct / max(total, 1)
        print(f"Validation Loss: {val_loss / max(num_batches, 1):.4f}, Validation Accuracy: {val_acc:.2f}%")

def majority_voting(patch_predictions):
    """Return the class that occurs most often among ``patch_predictions``.

    Ties go to the smallest class value: ``np.unique`` returns the candidates
    sorted and ``argmax`` keeps the first maximum.
    """
    candidates, votes = np.unique(patch_predictions, return_counts=True)
    winner_position = votes.argmax()
    return candidates[winner_position]

# Locations of the BACH challenge images and their ground-truth CSV
# (machine-specific absolute paths).
image_dir = r"C:\Users\vamsv\Downloads\ICIAR2018_BACH_Challenge\ICIAR2018_BACH_Challenge\Photos\images"
csv_path = r"C:\Users\vamsv\Downloads\ICIAR2018_BACH_Challenge\ICIAR2018_BACH_Challenge\Photos\microscopy_ground_truth.csv"

# One dataset item per CSV row; `transform` is applied to every kept patch.
dataset = HistologyDataset(image_dir=image_dir, csv_path=csv_path, transform=transform)


# Fresh model, loss and optimizer for this run; `device` is also read as a
# global inside train_model.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ModifiedInceptionV3(num_classes=4).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)

# Class-balanced subset: int(0.75 * smallest class size) images per class.
indices_75_percent = create_balanced_subset(dataset, percentage=0.75)
subset_75_percent = torch.utils.data.Subset(dataset, indices_75_percent)

# Random 75/25 train/validation split of the subset, at image level.
# NOTE(review): both splits wrap the same `dataset`, so validation patches also
# pass through the random augmentations in `transform` - confirm this is intended.
train_size_75 = int(0.75 * len(subset_75_percent))
val_size_75 = len(subset_75_percent) - train_size_75
train_dataset_75, val_dataset_75 = random_split(subset_75_percent, [train_size_75, val_size_75])

# batch_size counts images; each image contributes a list of patches that
# train_model concatenates into a single tensor.
train_loader_75 = DataLoader(train_dataset_75, batch_size=4, shuffle=True)
val_loader_75 = DataLoader(val_dataset_75, batch_size=4, shuffle=False)

train_model(model, train_loader_75, val_loader_75, criterion, optimizer, num_epochs=50)

