In [1]:
from conch.open_clip_custom import create_model_from_pretrained, tokenize, get_tokenizer
import torch
import os
from PIL import Image
from pathlib import Path
from tqdm import tqdm

# show all jupyter output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [2]:
root = Path('../').resolve()
os.chdir(root)

In [3]:
# Load the model from "create_model_from_pretrained"
model_cfg = 'conch_ViT-B-16'
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# checkpoint_path = 'checkpoints/CONCH/pytorch_model.bin'
checkpoint_path = 'C:\\Users\\Vivian\\Documents\\CONCH\\checkpoints\\conch\\pytorch_model.bin' 
model, preprocess = create_model_from_pretrained(model_cfg, checkpoint_path, device=device)
_ = model.eval()

  checkpoint = torch.load(checkpoint_path, map_location=map_location)


In [13]:
# Preprocess images and save tensors
def preprocess_image(image_path):
    """
    Load and preprocess an image for the CONCH model.
    Args:
        image_path (str): Path to the input image.
    Returns:
        torch.Tensor: Preprocessed image tensor ready for CONCH model input.
    """
    image = Image.open(image_path).convert("RGB")  # Ensure 3 channels (RGB)
    return preprocess(image).unsqueeze(0)  # Add batch dimension

def preprocess_dataset(dataset_path, output_path):
    """
    Preprocess the entire BreakHis dataset and save the tensors.
    Args:
        dataset_path (str): Path to the BreakHis dataset.
        output_path (str): Path to save the processed tensors.
    """
    os.makedirs(output_path, exist_ok=True)  # Create output directory if not exist

    for root, _, files in tqdm(os.walk(dataset_path)):
        for file in files:
            if file.endswith(".png"):
                image_path = os.path.join(root, file)
                tensor = preprocess_image(image_path)

                # Save the tensor with the same name as the image
                tensor_path = os.path.join(output_path, file.replace('.png', '.pt'))
                torch.save(tensor, tensor_path)

# Example usage
dataset_path = r"C:\Users\Vivian\Documents\breakhis\BreaKHis_v1\BreaKHis_v1\histology_slides\breast"
output_path = r"C:\Users\Vivian\Documents\CONCH\data_tensors"

preprocess_dataset(dataset_path, output_path)
print("Process Complete")

423it [02:07,  3.33it/s]

Process Complete





Cell 1: Import Libraries and Define Constants


In [4]:
import os
import torch
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
from tqdm import tqdm

# Define the labels and classes
labels_list = ['B_A', 'B_F', 'B_PT', 'B_TA', 'M_DC', 'M_LC', 'M_MC', 'M_PC']
classes = ["adenosis (A)", "fibroadenoma (F)", "phyllodes tumor (PT)", "tubular adenona (TA)", "carcinoma (DC)", "lobular carcinoma (LC)", "mucinous carcinoma (MC)", "papillary carcinoma (PC)"]

# Function to extract label from filename
def extract_label_from_filename(filename, labels_list):
    for label in labels_list:
        if label in filename:
            return label
    return None

Cell 2: Perform Inference

In [5]:
def perform_inference(tensor_folder, model, classes, device):
    """
    Perform inference on preprocessed tensors and classify images.
    Args:
        tensor_folder (str): Path to the folder containing preprocessed tensors.
        model (torch.nn.Module): Pretrained model for classification.
        classes (list): List of class names.
        device (torch.device): Device to run the model on (CPU or GPU).
    
    Returns:
        dict: A dictionary with tensor filenames as keys and predicted classes as values.
    """
    predictions = {}
    image_embeddings = {}
    
    for tensor_file in tqdm(os.listdir(tensor_folder)):
        if tensor_file.endswith(".pt"):
            tensor_path = os.path.join(tensor_folder, tensor_file)
            tensor = torch.load(tensor_path).to(device)
            
            with torch.inference_mode():
                # Extract image embeddings without projection head and normalization
                image_embs = model.encode_image(tensor, proj_contrast=False, normalize=False)
                image_embeddings[tensor_file] = image_embs.cpu().numpy()
                
                # Perform image-only classification (example: nearest neighbor)
                # Here, you can implement your own classification logic using image_embs
                # For demonstration, we'll just assign a random class
                predicted_class = np.random.choice(classes)
                predictions[tensor_file] = predicted_class
    
    return predictions, image_embeddings

Cell 3: Evaluate Predictions

In [6]:
def evaluate_predictions(predictions, labels_list, classes):
    """
    Evaluate the predictions and calculate accuracy and ROC metrics.
    Args:
        predictions (dict): Dictionary with tensor filenames as keys and predicted classes as values.
        labels_list (list): List of possible labels.
        classes (list): List of class names.
    
    Returns:
        None
    """
    true_labels = []
    predicted_labels = []
    
    for filename, predicted_class in predictions.items():
        true_label = extract_label_from_filename(filename, labels_list)
        if true_label:
            true_labels.append(classes[labels_list.index(true_label)])
            predicted_labels.append(predicted_class)
    
    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predicted_labels)
    print(f"Accuracy: {accuracy:.4f}")
    
    # Calculate classification report
    report = classification_report(true_labels, predicted_labels, target_names=classes)
    print("Classification Report:")
    print(report)
    
    # Binarize the labels for ROC curve plotting
    true_labels_bin = label_binarize(true_labels, classes=classes)
    predicted_labels_bin = label_binarize(predicted_labels, classes=classes)
    
    # Calculate ROC curve and AUC
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(len(classes)):
        fpr[i], tpr[i], _ = roc_curve(true_labels_bin[:, i], predicted_labels_bin[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    
    # Plot ROC curve
    plt.figure()
    colors = ['aqua', 'darkorange', 'cornflowerblue', 'red', 'green', 'blue', 'purple', 'brown']
    for i, color in zip(range(len(classes)), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=2, label='ROC curve of class {0} (area = {1:0.2f})'.format(classes[i], roc_auc[i]))
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

Cell 4: Example Usage

In [7]:
# Example usage
tensor_folder = r"C:\Users\Vivian\Documents\CONCH\data_tensors"
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"Running on device: {device}")

# Assuming model is already defined
# model = ...

# Perform inference and image-only classification
predictions, image_embeddings = perform_inference(tensor_folder, model, classes, device)

# Evaluate predictions
evaluate_predictions(predictions, labels_list, classes)

Running on device: cpu


  tensor = torch.load(tensor_path).to(device)
  1%|▏         | 116/7909 [00:22<25:37,  5.07it/s]


KeyboardInterrupt: 

In [8]:
import torch
print(torch.cuda.is_available())  # Should return True if a GPU is available

False


In [11]:
import torch
print("PyTorch version:", torch.__version__)
print("CUDA version used by PyTorch:", torch.version.cuda)


PyTorch version: 2.5.1+cpu
CUDA version used by PyTorch: None
