In [9]:
from conch.open_clip_custom import create_model_from_pretrained, tokenize, get_tokenizer
import torch
import os
from PIL import Image
from pathlib import Path
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import pandas as pd

# show all jupyter output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
root = Path('../').resolve()
os.chdir(root)

In [3]:
# Load the model from "create_model_from_pretrained"
model_cfg = 'conch_ViT-B-16'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# checkpoint_path = 'checkpoints/CONCH/pytorch_model.bin'
checkpoint_path = 'C:\\Users\\Vivian\\Documents\\CONCH\\checkpoints\\conch\\pytorch_model.bin' 
model, preprocess = create_model_from_pretrained(model_cfg, checkpoint_path, device=device)
_ = model.eval()

  checkpoint = torch.load(checkpoint_path, map_location=map_location)


In [4]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [20]:
# Custom Dataset class
class HistopathologyDataset(Dataset):
    def __init__(self, csv_file, transform=None):
        self.data = pd.read_csv(csv_file)
        self.transform = transform
        self.label_map = {
            'adenosis': 0,
            'fibroadenoma': 1,
            'phyllodes_tumor': 2,
            'tubular_adenoma': 3,
            'ductal_carcinoma': 4,
            'lobular_carcinoma': 5,
            'mucinous_carcinoma': 6,
            'papillary_carcinoma': 7
        }  # Example mapping of subclasses to numerical labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path = self.data.iloc[idx]['filename']
        label_name = img_path.split('/')[-3]  # Extract the subclass from the file path
        label = self.label_map[label_name]
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, label



In [11]:
# Define transformations for train and test sets
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images
    transforms.ToTensor(),  # Convert image to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize
])


In [13]:
# Load the CSV file
csv_file = r"C:\Users\Vivian\Documents\breakhis\Folds.csv"  # Replace with the actual CSV file path
data = pd.read_csv(csv_file)


In [14]:
# Split into train and test sets based on the 'grp' column
train_data = data[data['grp'] == 'train']
test_data = data[data['grp'] == 'test']

In [15]:
# Save train and test splits (optional)
train_data.to_csv('train_split.csv', index=False)
test_data.to_csv('test_split.csv', index=False)

In [16]:
# Create datasets
train_dataset = HistopathologyDataset('train_split.csv', transform=transform)
test_dataset = HistopathologyDataset('test_split.csv', transform=transform)


In [17]:
# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [18]:
print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of testing samples: {len(test_dataset)}")

Number of training samples: 25880
Number of testing samples: 13665


In [19]:
# Load the model from "create_model_from_pretrained"
model_cfg = 'conch_ViT-B-16'
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# checkpoint_path = 'checkpoints/CONCH/pytorch_model.bin'
checkpoint_path = 'C:\\Users\\Vivian\\Documents\\CONCH\\checkpoints\\conch\\pytorch_model.bin' 
model, preprocess = create_model_from_pretrained(model_cfg, checkpoint_path, device=device)
# _ = model.eval()


  checkpoint = torch.load(checkpoint_path, map_location=map_location)


In [21]:
# Modify the classifier head for your multiclass classification task
if hasattr(model, 'head'):  # If the model has a head attribute
    model.head = nn.Linear(model.head.in_features, 8)  # 8 classes: the 8 subclasses of breast tumors
elif hasattr(model, 'fc'):  # If the model uses an `fc` attribute
    model.fc = nn.Linear(model.fc.in_features, 8)
else:
    raise AttributeError("Model does not have a modifiable classification head or fc layer.")


AttributeError: Model does not have a modifiable classification head or fc layer.

In [23]:
# Check if the visual component exists and has a head
if hasattr(model.visual, 'head'):
    # Replace the head with a linear layer for 8 classes
    model.visual.head = nn.Linear(in_features=768, out_features=8)  # Assuming input features = 768
else:
    raise AttributeError("The 'visual' component of the model does not have a 'head' attribute to modify.")


In [24]:
# Move the model to the GPU or CPU
model = model.to(device)

In [25]:
# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()  # For multiclass classification
optimizer = optim.Adam(model.parameters(), lr=1e-4)  # Use Adam optimizer
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)  # Learning rate scheduler


In [26]:
# Training function (same as before)
def train(model, train_loader, criterion, optimizer, scheduler, epochs=10):
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        correct = 0
        total = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()

            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        scheduler.step()

        print(f"Epoch {epoch+1}/{epochs}, "
              f"Train Loss: {train_loss/len(train_loader):.4f}, "
              f"Train Accuracy: {100 * correct / total:.2f}%")


In [27]:
# Test function (same as before)
def test(model, test_loader):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Test Accuracy: {100 * correct / total:.2f}%")

In [28]:
# Train the model
train(model, train_loader, criterion, optimizer, scheduler, epochs=10)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Vivian\\Documents\\BreaKHis_v1\\histology_slides\\breast\\benign\\SOB\\fibroadenoma\\SOB_B_F_14-14134\\200X\\SOB_B_F-14-14134-200-015.png'