In [1]:
from conch.open_clip_custom import create_model_from_pretrained, tokenize, get_tokenizer
import torch
from torch import nn
import os
from PIL import Image
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import glob
import matplotlib.pyplot as plt
import numpy as np
import tqdm
import skimage
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import timm
from huggingface_hub import login, hf_hub_download


# Show the value of every top-level expression in a cell, not only the last one.
# Side effect: calls written as bare statements later in the notebook
# (e.g. ``model.eval()`` or ``clf.fit(...)``) also echo their return value.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [None]:
class HistopathologyDataset(Dataset):
    """Histopathology patches stored as ``.npy`` files and listed in a CSV.

    The CSV must provide an ``image`` column (path to the ``.npy`` patch) and
    a ``class`` column containing ``'FA'`` or ``'PT'``.

    Args:
        csv_file: Path to the metadata CSV.
        transform: Optional callable applied to the float32 image tensor
            before it is returned.
    """

    def __init__(self, csv_file, transform=None):
        self.data = pd.read_csv(csv_file)
        self.transform = transform
        self.label_map = {'FA': 0, 'PT': 1}

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx``.

        Raises:
            KeyError: if the row's ``class`` value is not in ``label_map``.
        """
        row = self.data.iloc[idx]
        label = self.label_map[row['class']]
        image = np.load(row['image'])
        if image.shape[-1] == 3:
            # Channels-last array -> channels-first.
            image = np.transpose(image, (2, 0, 1))
        if image.shape[1:] != (224, 224):
            # Import the submodule explicitly: the notebook's first cell only
            # does a bare ``import skimage``.
            import skimage.transform
            # NOTE(review): resize may rescale integer input to floats, so
            # resized patches could end up on a different value scale than
            # patches that were already 224x224 -- confirm the stored dtype.
            image = skimage.transform.resize(image, (3, 224, 224), anti_aliasing=True)
        image = torch.tensor(image, dtype=torch.float32)
        # Previously ``transform`` was stored but silently ignored.
        if self.transform is not None:
            image = self.transform(image)
        return image, label

    def __len__(self):
        return len(self.data)


In [None]:
class UNIEncoderOnly(nn.Module):
    """UNI backbone (timm ``vit_large_patch16_224``) used as a feature extractor.

    The model is created with ``num_classes=0`` and its weights are loaded
    strictly from a local state-dict file.

    Args:
        checkpoint_path: Optional path to the state-dict file. Defaults to
            ``DEFAULT_CHECKPOINT`` so existing ``UNIEncoderOnly()`` calls
            behave as before.
    """

    DEFAULT_CHECKPOINT = r"C:\Users\Vivian\Documents\CONCH\checkpoints\uni\pytorch_model.bin"

    def __init__(self, checkpoint_path=None):
        super().__init__()
        self.model = timm.create_model(
            "vit_large_patch16_224",
            img_size=224,
            patch_size=16,
            init_values=1e-5,
            num_classes=0,
            dynamic_img_size=True,
            pretrained=False,
        )
        checkpoint_path = checkpoint_path or self.DEFAULT_CHECKPOINT
        # weights_only=True restricts unpickling to tensors/containers, which
        # is all a state dict needs and avoids torch.load's FutureWarning.
        state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
        self.model.load_state_dict(state_dict, strict=True)

    def forward(self, x):
        """Return the backbone output for the image batch ``x``."""
        return self.model(x)


In [None]:
def extract_features(dataloader, model, device):
    """Run ``model`` over ``dataloader`` and collect features and labels.

    Args:
        dataloader: Iterable yielding ``(images, labels)`` batches.
        model: Module mapping an image batch to a 2-D feature tensor.
        device: Device the image batches are moved to before the forward pass.

    Returns:
        ``(features, labels)`` as NumPy arrays, concatenated in loader order.
    """
    model.eval()
    feature_batches, label_batches = [], []
    with torch.no_grad():
        for imgs, labels in tqdm.tqdm(dataloader):
            feats = model(imgs.to(device))
            feature_batches.append(feats.cpu())
            # Keep labels as one tensor per batch; extending a list with a
            # tensor yields one 0-d tensor per sample, which is slow to
            # convert for large datasets.
            label_batches.append(torch.as_tensor(labels).cpu())
    return torch.cat(feature_batches).numpy(), torch.cat(label_batches).numpy()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Device and frozen UNI encoder used for every split.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = UNIEncoderOnly()
encoder = encoder.to(device)

# One loader per split, all with the same batch size and default ordering.
train_loader, val_loader, test_loader = (
    DataLoader(HistopathologyDataset(csv_name), batch_size=32)
    for csv_name in ("train.csv", "val.csv", "test.csv")
)

# Embed each split once; the val features are computed but not used below.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    extract_features(loader, encoder, device)
    for loader in (train_loader, val_loader, test_loader)
)

# Linear probe on the frozen embeddings.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Report held-out performance on the test split.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=["FA", "PT"]))


Extracting features for Cleanlab (CL) using UNI

In [4]:
import os
import torch
import numpy as np
import pandas as pd
import tqdm
from torch.utils.data import Dataset, DataLoader
import skimage.transform
import timm

# -----------------------------
# Dataset
# -----------------------------
class PatchDataset(Dataset):
    """Patches listed in a metadata CSV and stored as ``.npy`` arrays.

    The CSV must contain a ``path`` column (location of the ``.npy`` patch)
    and a ``label`` column holding ``'FA'`` or ``'PT'``.

    Args:
        csv_path: Path to the metadata CSV.
        transform: Optional callable applied to the float32 patch tensor
            before it is returned.
    """

    def __init__(self, csv_path, transform=None):
        self.df = pd.read_csv(csv_path)
        self.label_map = {'FA': 0, 'PT': 1}
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(patch, label, path)`` for row ``idx``."""
        row = self.df.iloc[idx]
        path = row['path']
        patch = np.load(path)

        if patch.shape[-1] == 3:
            # Channels-last array -> channels-first.
            patch = np.transpose(patch, (2, 0, 1))
        if patch.shape[1:] != (224, 224):
            # NOTE(review): resize may rescale integer input to floats, so
            # resized patches could be on a different value scale than
            # patches that were already 224x224 -- confirm the stored dtype.
            patch = skimage.transform.resize(patch, (3, 224, 224), anti_aliasing=True)
        patch = torch.tensor(patch, dtype=torch.float32)
        # Previously ``transform`` was stored but silently ignored.
        if self.transform is not None:
            patch = self.transform(patch)
        label = self.label_map[row['label']]
        return patch, label, path

# -----------------------------
# UNI Backbone
# -----------------------------
class UNIEncoderOnly(torch.nn.Module):
    """UNI backbone (timm ``vit_large_patch16_224``) used as a feature extractor.

    The model is created with ``num_classes=0`` and its weights are loaded
    strictly from a local state-dict file.

    Args:
        checkpoint_path: Optional path to the state-dict file. Defaults to
            ``DEFAULT_CHECKPOINT`` so existing ``UNIEncoderOnly()`` calls
            behave as before.
    """

    DEFAULT_CHECKPOINT = r"C:\Users\Vivian\Documents\CONCH\checkpoints\uni\pytorch_model.bin"

    def __init__(self, checkpoint_path=None):
        super().__init__()
        self.model = timm.create_model(
            "vit_large_patch16_224",
            img_size=224,
            patch_size=16,
            init_values=1e-5,
            num_classes=0,
            dynamic_img_size=True,
            pretrained=False,
        )
        ckpt = checkpoint_path or self.DEFAULT_CHECKPOINT
        # weights_only=True restricts unpickling to tensors/containers, which
        # is all a state dict needs and avoids the FutureWarning that
        # torch.load printed in this cell's output.
        state_dict = torch.load(ckpt, map_location="cpu", weights_only=True)
        self.model.load_state_dict(state_dict, strict=True)

    def forward(self, x):
        """Return the backbone output for the image batch ``x``."""
        return self.model(x)

# -----------------------------
# Extract + Save
# -----------------------------
def extract_and_save_embeddings(csv_path, output_path):
    """Embed every patch listed in ``csv_path`` with UNI and save the result.

    Relies on the module-level ``device`` being defined before the call.
    Rows are processed in CSV order (``shuffle=False``), so row ``i`` of the
    saved array corresponds to row ``i`` of the CSV.

    Args:
        csv_path: Metadata CSV understood by ``PatchDataset``.
        output_path: Destination ``.npy`` file; its parent directory is
            created when missing.
    """
    dataset = PatchDataset(csv_path)
    loader = DataLoader(dataset, batch_size=32, shuffle=False)

    encoder = UNIEncoderOnly().to(device)
    encoder.eval()

    all_embeddings = []
    with torch.no_grad():
        for images, _, _ in tqdm.tqdm(loader, desc="Extracting UNI embeddings"):
            features = encoder(images.to(device))  # [B, 1024]
            all_embeddings.append(features.cpu().numpy())

    feature_array = np.concatenate(all_embeddings, axis=0)

    # A bare file name has an empty dirname, and os.makedirs("") raises, so
    # only create the directory when there is one to create.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    np.save(output_path, feature_array)
    print(f"✅ Saved embeddings to {output_path} with shape {feature_array.shape}")

# -----------------------------
# Run
# -----------------------------
# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Embed every patch in the metadata CSV and store the UNI features.
extract_and_save_embeddings(
    r"C:\Users\Vivian\Documents\cleanlab\patch_metadata.csv",
    r"C:\Users\Vivian\Documents\CONCH\embeddings\UNI_test_features.npy",
)


  self.model.load_state_dict(torch.load(ckpt, map_location="cpu"), strict=True)
Extracting UNI embeddings: 100%|██████████| 16509/16509 [1:54:29<00:00,  2.40it/s] 


✅ Saved embeddings to C:\Users\Vivian\Documents\CONCH\embeddings\UNI_test_features.npy with shape (528284, 1024)


Extracting features for Cleanlab (CL) using CONCH

In [2]:
from conch.open_clip_custom import create_model_from_pretrained

import os
import torch
import numpy as np
import pandas as pd
import tqdm
from torch.utils.data import Dataset, DataLoader
import skimage.transform
import timm

# -----------------------------
# Dataset
# -----------------------------
class PatchDataset(Dataset):
    """Patches listed in a metadata CSV and stored as ``.npy`` arrays.

    The CSV must contain a ``path`` column (location of the ``.npy`` patch)
    and a ``label`` column holding ``'FA'`` or ``'PT'``.

    Args:
        csv_path: Path to the metadata CSV.
        transform: Optional callable applied to the float32 patch tensor
            before it is returned.
    """

    def __init__(self, csv_path, transform=None):
        self.df = pd.read_csv(csv_path)
        self.label_map = {'FA': 0, 'PT': 1}
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(patch, label, path)`` for row ``idx``."""
        row = self.df.iloc[idx]
        path = row['path']
        patch = np.load(path)

        if patch.shape[-1] == 3:
            # Channels-last array -> channels-first.
            patch = np.transpose(patch, (2, 0, 1))
        if patch.shape[1:] != (224, 224):
            # NOTE(review): resize may rescale integer input to floats, so
            # resized patches could be on a different value scale than
            # patches that were already 224x224 -- confirm the stored dtype.
            patch = skimage.transform.resize(patch, (3, 224, 224), anti_aliasing=True)
        patch = torch.tensor(patch, dtype=torch.float32)
        # Previously ``transform`` was stored but silently ignored.
        if self.transform is not None:
            patch = self.transform(patch)
        label = self.label_map[row['label']]
        return patch, label, path

# -----------------------------
# CONCH Backbone
# -----------------------------

class CONCHEncoderOnly(torch.nn.Module):
    """CONCH model wrapper that exposes only the visual tower's first output.

    The checkpoint is loaded through ``create_model_from_pretrained`` onto
    CUDA when available, otherwise onto the CPU.
    """

    def __init__(self):
        super().__init__()
        target = "cuda" if torch.cuda.is_available() else "cpu"
        conch_model, _preprocess = create_model_from_pretrained(
            "conch_ViT-B-16",
            r"C:\Users\Vivian\Documents\CONCH\checkpoints\conch\pytorch_model.bin",
            device=torch.device(target),
        )
        self.model = conch_model

    def forward(self, x):
        """Return the first element of ``self.model.visual(x)``  # [B, 512]."""
        embedding, _rest = self.model.visual(x)
        return embedding

# -----------------------------
# Extract + Save
# -----------------------------
def extract_and_save_embeddings(csv_path, output_path):
    """Embed every patch listed in ``csv_path`` with CONCH and save the result.

    Relies on the module-level ``device`` being defined before the call.
    Rows are processed in CSV order (``shuffle=False``), so row ``i`` of the
    saved array corresponds to row ``i`` of the CSV.

    Args:
        csv_path: Metadata CSV understood by ``PatchDataset``.
        output_path: Destination ``.npy`` file; its parent directory is
            created when missing.
    """
    dataset = PatchDataset(csv_path)
    loader = DataLoader(dataset, batch_size=32, shuffle=False)

    encoder = CONCHEncoderOnly().to(device)
    encoder.eval()

    all_embeddings = []
    with torch.no_grad():
        for images, _, _ in tqdm.tqdm(loader, desc="Extracting CONCH embeddings"):
            # CONCH's visual tower yields 512-d features here (the saved
            # array is (N, 512)); the old "[B, 1024]" note was left over
            # from the UNI cell.
            features = encoder(images.to(device))  # [B, 512]
            all_embeddings.append(features.cpu().numpy())

    feature_array = np.concatenate(all_embeddings, axis=0)

    # A bare file name has an empty dirname, and os.makedirs("") raises, so
    # only create the directory when there is one to create.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    np.save(output_path, feature_array)
    print(f"✅ Saved embeddings to {output_path} with shape {feature_array.shape}")

# -----------------------------
# Run
# -----------------------------
# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Embed every patch in the metadata CSV and store the CONCH features.
extract_and_save_embeddings(
    r"C:\Users\Vivian\Documents\cleanlab\patch_metadata.csv",
    r"C:\Users\Vivian\Documents\CONCH\embeddings\CONCH_test_features.npy",
)


  checkpoint = torch.load(checkpoint_path, map_location=map_location)
Extracting CONCH embeddings: 100%|██████████| 16509/16509 [59:20<00:00,  4.64it/s] 


✅ Saved embeddings to C:\Users\Vivian\Documents\CONCH\embeddings\CONCH_test_features.npy with shape (528284, 512)


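Zero-shot classification? (UNI2 embeddings, nearest class prototype by cosine similarity; prototypes come from a small labelled support set)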

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import tqdm
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from sklearn.metrics import classification_report
from conch.open_clip_custom import create_model_from_pretrained, tokenize, get_tokenizer


# -------------------------------
# PatchDataset
# -------------------------------
class PatchDataset(Dataset):
    """Patches listed in a metadata CSV and stored as ``.npy`` arrays.

    The CSV must contain a ``path`` column (location of the ``.npy`` patch)
    and a ``class`` column holding ``'FA'`` or ``'PT'``.

    Args:
        csv_path: Path to the metadata CSV.
        transform: Optional callable applied to the float32 patch tensor
            before it is returned.
    """

    def __init__(self, csv_path, transform=None):
        self.df = pd.read_csv(csv_path)
        self.label_map = {'FA': 0, 'PT': 1}
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(patch, label, path)`` for row ``idx``."""
        row = self.df.iloc[idx]
        path = row['path']
        label = self.label_map[row['class']]
        patch = np.load(path)

        if patch.shape[-1] == 3:
            # Channels-last array -> channels-first.
            patch = np.transpose(patch, (2, 0, 1))

        patch = torch.tensor(patch, dtype=torch.float32)
        if patch.shape[1:] != (224, 224):
            # interpolate expects a batch dimension; add and remove it.
            patch = F.interpolate(patch.unsqueeze(0), size=(224, 224), mode='bilinear', align_corners=False).squeeze(0)

        # Previously ``transform`` was stored but silently ignored.
        if self.transform is not None:
            patch = self.transform(patch)

        return patch, label, path

# updated dataset class for our private dataset with numpy files

import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset

class HistopathologyDataset(Dataset):
    """PyTorch dataset for histopathology patches saved as ``.npy`` files.

    The metadata CSV supplies an ``image`` column (patch path) and a
    ``class`` column whose values are mapped FA -> 0 and PT -> 1.

    Args:
        csv_file (str): Path to the dataset metadata CSV file.
        transform (callable, optional): Applied to the image tensor when truthy.
    """

    def __init__(self, csv_file, transform=None):
        self.label_map = {'FA': 0, 'PT': 1}
        self.transform = transform
        self.data = pd.read_csv(csv_file)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, label, img_path)`` for row ``idx``."""
        record = self.data.iloc[idx]
        img_path = record['image']
        image = np.load(img_path)

        # Channels-last arrays are flipped to channels-first.
        if image.shape[-1] == 3:
            image = image.transpose(2, 0, 1)

        # Anything that is not already 224x224 spatially gets resized.
        if not (image.shape[1] == 224 and image.shape[2] == 224):
            import skimage.transform
            image = skimage.transform.resize(image, (3, 224, 224), anti_aliasing=True)

        # Convert to a float32 tensor (no value rescaling happens here).
        image = torch.tensor(image, dtype=torch.float32)

        if self.transform:
            image = self.transform(image)

        label = self.label_map[record['class']]
        return image, label, img_path

# -------------------------------
import os
import torch
import torch.nn as nn
import timm
from huggingface_hub import login, hf_hub_download

# -------------------------------
# UNI Model as Feature Extractor
# -------------------------------
class UNIEncoder(nn.Module):
    """Frozen UNI backbone (timm ``vit_large_patch16_224``) as a feature extractor.

    Args:
        checkpoint_path: Optional path to a state-dict file. When omitted,
            ``pytorch_model.bin`` from the default UNI checkpoint directory
            is loaded.
    """

    def __init__(self, checkpoint_path=None):
        super().__init__()
        if checkpoint_path:
            print(f"Loading checkpoint from: {checkpoint_path}")
        self.model = self.make_uni(checkpoint_path)

        # Freeze all parameters
        for param in self.model.parameters():
            param.requires_grad = False

    def make_uni(self, checkpoint_path=None):
        """Build the ViT and strictly load its weights on the CPU.

        Only one checkpoint is read: ``checkpoint_path`` when given, else the
        default file. The earlier version always loaded the default first and
        then overwrote it, and used ``map_location='cuda'`` for the override,
        which fails on CPU-only machines.
        """
        local_dir = r"C:\Users\Vivian\Documents\CONCH\checkpoints\uni"  # Your UNI checkpoint path
        # No os.makedirs here: creating the directory on a read path only
        # hides a wrong location behind a later "file not found".
        weights_file = checkpoint_path or os.path.join(local_dir, "pytorch_model.bin")

        model = timm.create_model(
            "vit_large_patch16_224", img_size=224, patch_size=16, init_values=1e-5,
            num_classes=0, dynamic_img_size=True
        )
        # weights_only=True restricts unpickling to tensors/containers.
        state_dict = torch.load(weights_file, map_location="cpu", weights_only=True)
        model.load_state_dict(state_dict, strict=True)
        return model

    def forward(self, x):
        """Return the backbone output for ``x`` (the model has no classifier head)."""
        return self.model(x)
    
# conch

import os
import torch
import torch.nn as nn
from conch.open_clip_custom import create_model_from_pretrained

# -------------------------------
# CONCH Model as Feature Extractor
# -------------------------------
class CONCHEncoder(nn.Module):
    """Frozen CONCH model that returns only the visual tower's first output.

    Args:
        model_cfg: Model configuration name passed to
            ``create_model_from_pretrained``.
        checkpoint_path: Optional checkpoint file; a local default is used
            when it is falsy.
    """

    def __init__(self, model_cfg='conch_ViT-B-16', checkpoint_path=None):
        super().__init__()
        self.model_cfg = model_cfg
        if checkpoint_path:
            self.checkpoint_path = checkpoint_path
        else:
            self.checkpoint_path = r'C:\Users\Vivian\Documents\CONCH\checkpoints\conch\pytorch_model.bin'
        self.model = self.make_conch()

        # Gradients are never needed: the encoder is used for inference only.
        for weight in self.model.parameters():
            weight.requires_grad = False

    def make_conch(self):
        """Load the CONCH model onto CUDA when available, else the CPU."""
        target = 'cuda' if torch.cuda.is_available() else 'cpu'
        conch_model, _preprocess = create_model_from_pretrained(
            self.model_cfg,
            self.checkpoint_path,
            device=torch.device(target),
        )
        return conch_model

    def forward(self, x):
        """Return the first element of ``self.model.visual(x)``."""
        image_features, _rest = self.model.visual(x)
        return image_features

# -------------------------------
# uni2 Model as Feature Extractor
# -------------------------------
import os
import torch
import torch.nn as nn
import timm

class UNI2Encoder(nn.Module):
    """Frozen UNI2 backbone (timm ``vit_giant_patch14_224`` variant).

    Args:
        checkpoint_path: Optional *directory* that contains
            ``pytorch_model.bin`` (despite the name, it is joined with that
            file name). A local default directory is used when it is falsy.
    """

    def __init__(self, checkpoint_path=None):
        super().__init__()
        self.model = self.make_uni2(checkpoint_path)

        # Freeze all parameters
        for param in self.model.parameters():
            param.requires_grad = False

    def make_uni2(self, checkpoint_path):
        """Build the ViT and strictly load ``pytorch_model.bin`` on the CPU."""
        local_dir = checkpoint_path or r"C:\Users\Vivian\Documents\UNI2\UNI\assets\ckpts\uni2-h"
        # No os.makedirs here: creating the directory on a read path only
        # hides a wrong location behind a later "file not found".

        timm_kwargs = {
            'model_name': 'vit_giant_patch14_224',
            'img_size': 224,
            'patch_size': 14,
            'depth': 24,
            'num_heads': 24,
            'init_values': 1e-5,
            'embed_dim': 1536,
            'mlp_ratio': 2.66667 * 2,
            'num_classes': 0,
            'no_embed_class': True,
            'mlp_layer': timm.layers.SwiGLUPacked,
            'act_layer': torch.nn.SiLU,
            'reg_tokens': 8,
            'dynamic_img_size': True
        }

        model = timm.create_model(**timm_kwargs)
        ckpt_path = os.path.join(local_dir, "pytorch_model.bin")
        # Load on the CPU: map_location="cuda" fails on machines without a
        # GPU, and the caller moves the module with ``.to(device)`` anyway.
        # weights_only=True restricts unpickling to tensors/containers.
        state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)
        model.load_state_dict(state_dict, strict=True)

        return model

    def forward(self, x):
        """Return the raw image embedding for ``x``  # [B, 1536]."""
        return self.model(x)


# -------------------------------
# Load Model for Evaluation
# -------------------------------
# Pick the device name once; everything below reuses it.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# UNI2 is the active encoder for this cell (swap in UNIEncoder() to compare).
model = UNI2Encoder()
model = model.to(device)

model.eval()


# -------------------------------
# Load Support Set & Build Prototypes
# -------------------------------
def build_class_prototypes(support_loader, model):
    """Average support-set embeddings per class into prototypes.

    Args:
        support_loader: Iterable yielding ``(images, labels, paths)`` batches.
        model: ``nn.Module`` mapping an image batch to ``[B, D]`` embeddings.

    Returns:
        Tensor of shape ``[num_classes, D]`` on the model's device. Row ``i``
        belongs to the i-th smallest label present in the support set, so
        every class must be represented for row indices to equal labels.

    Raises:
        ValueError: if ``support_loader`` yields no batches.
    """
    model.eval()
    # Follow the model's own device instead of a notebook-level ``device``
    # global, so the function does not depend on cell execution order.
    try:
        target_device = next(model.parameters()).device
    except StopIteration:
        target_device = torch.device("cpu")

    features, labels = [], []
    with torch.no_grad():
        for images, lbls, _ in tqdm.tqdm(support_loader, desc="Extracting support embeddings"):
            emb = model(images.to(target_device))  # [B, D]
            features.append(emb.cpu())
            labels.append(torch.as_tensor(lbls).cpu())

    if not features:
        raise ValueError("support_loader yielded no batches; cannot build class prototypes")

    features = torch.cat(features)
    labels = torch.cat(labels)

    # torch.unique returns sorted values by default, so no extra sort is needed.
    class_prototypes = [features[labels == cls].mean(dim=0) for cls in torch.unique(labels)]

    return torch.stack(class_prototypes).to(target_device)  # shape: [num_classes, D]


# -------------------------------
# Load Test Set
# -------------------------------
# Support CSV: a few labelled examples per class. Test CSV: patches to classify.
support_csv = "metadata\\patient_split_annotate\\patch_csv\\ref_features_try1.csv"
test_csv = "metadata\\patient_split_annotate\\patch_csv\\test_patches.csv"

# Both loaders keep CSV order so predictions can be matched back to rows.
support_loader, test_loader = (
    DataLoader(HistopathologyDataset(csv_file), batch_size=32, shuffle=False)
    for csv_file in (support_csv, test_csv)
)

# One prototype per class from the support-set embeddings.
print("📌 Building class prototypes...")
prototypes = build_class_prototypes(support_loader=support_loader, model=model)


# -------------------------------
# Predict Test Patches
# -------------------------------
def predict_with_prototypes(test_loader, model, prototypes):
    """Assign each test patch to the class whose prototype is most similar.

    Args:
        test_loader: Iterable yielding ``(images, labels, paths)`` batches.
        model: ``nn.Module`` mapping an image batch to ``[B, D]`` embeddings.
        prototypes: ``[num_classes, D]`` tensor of class prototypes.

    Returns:
        ``(preds, labels, paths)`` as flat Python lists in loader order.
    """
    model.eval()
    # Follow the model's own device instead of a notebook-level ``device``
    # global; fall back to where the prototypes live for parameter-free models.
    try:
        target_device = next(model.parameters()).device
    except StopIteration:
        target_device = prototypes.device

    # The prototypes do not change between batches, so normalise them once.
    proto_norm = F.normalize(prototypes.to(target_device), dim=1)

    all_preds, all_labels, all_paths = [], [], []
    with torch.no_grad():
        for images, labels, paths in tqdm.tqdm(test_loader, desc="Classifying test set"):
            feats = F.normalize(model(images.to(target_device)), dim=1)

            sim = feats @ proto_norm.T  # cosine similarity
            pred = torch.argmax(sim, dim=1)

            all_preds.extend(pred.cpu().tolist())
            all_labels.extend(torch.as_tensor(labels).cpu().tolist())
            all_paths.extend(paths)

    return all_preds, all_labels, all_paths


# Classify every test patch against the class prototypes.
preds, true_labels, file_paths = predict_with_prototypes(
    test_loader=test_loader, model=model, prototypes=prototypes
)

# Persist per-patch predictions, then print the summary metrics.
df = pd.DataFrame(
    data=dict(
        zip(
            ("Patch Path", "Predicted", "True Label"),
            (file_paths, preds, true_labels),
        )
    )
)
df.to_csv("uni2_zero_shot_predictions_run1.csv", index=False)

print("📊 Classification Report:")
print(classification_report(y_true=true_labels, y_pred=preds, target_names=["FA", "PT"]))


  model.load_state_dict(torch.load(ckpt_path, map_location="cuda"), strict=True)


📌 Building class prototypes...


Extracting support embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.04it/s]
Classifying test set: 100%|██████████| 3310/3310 [52:19<00:00,  1.05it/s]

📊 Classification Report:
              precision    recall  f1-score   support

          FA       0.75      0.49      0.59     81825
          PT       0.20      0.44      0.28     24077

    accuracy                           0.48    105902
   macro avg       0.48      0.46      0.44    105902
weighted avg       0.62      0.48      0.52    105902






: 

CONCH zero-shot (prototype-based) classification – test set

In [4]:
import os
import torch
import numpy as np
import pandas as pd
import tqdm
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from sklearn.metrics import classification_report
from conch.open_clip_custom import create_model_from_pretrained, tokenize, get_tokenizer


# -------------------------------
# PatchDataset
# -------------------------------
class PatchDataset(Dataset):
    """Patches listed in a metadata CSV and stored as ``.npy`` arrays.

    The CSV must contain a ``path`` column (location of the ``.npy`` patch)
    and a ``class`` column holding ``'FA'`` or ``'PT'``.

    Args:
        csv_path: Path to the metadata CSV.
        transform: Optional callable applied to the float32 patch tensor
            before it is returned.
    """

    def __init__(self, csv_path, transform=None):
        self.df = pd.read_csv(csv_path)
        self.label_map = {'FA': 0, 'PT': 1}
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(patch, label, path)`` for row ``idx``."""
        row = self.df.iloc[idx]
        path = row['path']
        label = self.label_map[row['class']]
        patch = np.load(path)

        if patch.shape[-1] == 3:
            # Channels-last array -> channels-first.
            patch = np.transpose(patch, (2, 0, 1))

        patch = torch.tensor(patch, dtype=torch.float32)
        if patch.shape[1:] != (224, 224):
            # interpolate expects a batch dimension; add and remove it.
            patch = F.interpolate(patch.unsqueeze(0), size=(224, 224), mode='bilinear', align_corners=False).squeeze(0)

        # Previously ``transform`` was stored but silently ignored.
        if self.transform is not None:
            patch = self.transform(patch)

        return patch, label, path

# updated dataset class for our private dataset with numpy files

import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset

class HistopathologyDataset(Dataset):
    """PyTorch dataset for histopathology patches saved as ``.npy`` files.

    The metadata CSV supplies an ``image`` column (patch path) and a
    ``class`` column whose values are mapped FA -> 0 and PT -> 1.

    Args:
        csv_file (str): Path to the dataset metadata CSV file.
        transform (callable, optional): Applied to the image tensor when truthy.
    """

    def __init__(self, csv_file, transform=None):
        self.label_map = {'FA': 0, 'PT': 1}
        self.transform = transform
        self.data = pd.read_csv(csv_file)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, label, img_path)`` for row ``idx``."""
        record = self.data.iloc[idx]
        img_path = record['image']
        image = np.load(img_path)

        # Channels-last arrays are flipped to channels-first.
        if image.shape[-1] == 3:
            image = image.transpose(2, 0, 1)

        # Anything that is not already 224x224 spatially gets resized.
        if not (image.shape[1] == 224 and image.shape[2] == 224):
            import skimage.transform
            image = skimage.transform.resize(image, (3, 224, 224), anti_aliasing=True)

        # Convert to a float32 tensor (no value rescaling happens here).
        image = torch.tensor(image, dtype=torch.float32)

        if self.transform:
            image = self.transform(image)

        label = self.label_map[record['class']]
        return image, label, img_path

# -------------------------------
import os
import torch
import torch.nn as nn
import timm
from huggingface_hub import login, hf_hub_download

# -------------------------------
# UNI Model as Feature Extractor
# -------------------------------
class UNIEncoder(nn.Module):
    """Frozen UNI backbone (timm ``vit_large_patch16_224``) as a feature extractor.

    Args:
        checkpoint_path: Optional path to a state-dict file. When omitted,
            ``pytorch_model.bin`` from the default UNI checkpoint directory
            is loaded.
    """

    def __init__(self, checkpoint_path=None):
        super().__init__()
        if checkpoint_path:
            print(f"Loading checkpoint from: {checkpoint_path}")
        self.model = self.make_uni(checkpoint_path)

        # Freeze all parameters
        for param in self.model.parameters():
            param.requires_grad = False

    def make_uni(self, checkpoint_path=None):
        """Build the ViT and strictly load its weights on the CPU.

        Only one checkpoint is read: ``checkpoint_path`` when given, else the
        default file. The earlier version always loaded the default first and
        then overwrote it, and used ``map_location='cuda'`` for the override,
        which fails on CPU-only machines.
        """
        local_dir = r"C:\Users\Vivian\Documents\CONCH\checkpoints\uni"  # Your UNI checkpoint path
        # No os.makedirs here: creating the directory on a read path only
        # hides a wrong location behind a later "file not found".
        weights_file = checkpoint_path or os.path.join(local_dir, "pytorch_model.bin")

        model = timm.create_model(
            "vit_large_patch16_224", img_size=224, patch_size=16, init_values=1e-5,
            num_classes=0, dynamic_img_size=True
        )
        # weights_only=True restricts unpickling to tensors/containers.
        state_dict = torch.load(weights_file, map_location="cpu", weights_only=True)
        model.load_state_dict(state_dict, strict=True)
        return model

    def forward(self, x):
        """Return the backbone output for ``x`` (the model has no classifier head)."""
        return self.model(x)
    
# conch

import os
import torch
import torch.nn as nn
from conch.open_clip_custom import create_model_from_pretrained

# -------------------------------
# CONCH Model as Feature Extractor
# -------------------------------
class CONCHEncoder(nn.Module):
    """Frozen CONCH model exposing ``encode_image`` as the forward pass.

    Args:
        model_cfg: Model configuration name passed to
            ``create_model_from_pretrained``.
        checkpoint_path: Optional checkpoint file; a local default is used
            when it is falsy.
    """

    def __init__(self, model_cfg='conch_ViT-B-16', checkpoint_path=None):
        super().__init__()
        self.model_cfg = model_cfg
        if checkpoint_path:
            self.checkpoint_path = checkpoint_path
        else:
            self.checkpoint_path = r'C:\Users\Vivian\Documents\CONCH\checkpoints\conch\pytorch_model.bin'
        self.model = self.make_conch()

        # Gradients are never needed: the encoder is used for inference only.
        for weight in self.model.parameters():
            weight.requires_grad = False

    def make_conch(self):
        """Load the CONCH model onto CUDA when available, else the CPU."""
        target = 'cuda' if torch.cuda.is_available() else 'cpu'
        conch_model, _preprocess = create_model_from_pretrained(
            self.model_cfg,
            self.checkpoint_path,
            device=torch.device(target),
        )
        return conch_model

    def forward(self, x, proj_contrast=False, normalize=False):
        """Delegate to ``self.model.encode_image`` with the given flags.

        An earlier variant of this method unpacked ``self.model.visual(x)``
        instead.
        """
        options = {"proj_contrast": proj_contrast, "normalize": normalize}
        return self.model.encode_image(x, **options)


# -------------------------------
# Load Model for Evaluation
# -------------------------------
# Pick the device name once; everything below reuses it.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# CONCH is the active encoder for this cell (swap in UNIEncoder() to compare).
model = CONCHEncoder()
model = model.to(device)

model.eval()


# -------------------------------
# Load Support Set & Build Prototypes
# -------------------------------
def build_class_prototypes(support_loader, model):
    """Average support-set embeddings per class into prototypes.

    Args:
        support_loader: Iterable yielding ``(images, labels, paths)`` batches.
        model: ``nn.Module`` mapping an image batch to ``[B, D]`` embeddings.

    Returns:
        Tensor of shape ``[num_classes, D]`` on the model's device. Row ``i``
        belongs to the i-th smallest label present in the support set, so
        every class must be represented for row indices to equal labels.

    Raises:
        ValueError: if ``support_loader`` yields no batches.
    """
    model.eval()
    # Follow the model's own device instead of a notebook-level ``device``
    # global, so the function does not depend on cell execution order.
    try:
        target_device = next(model.parameters()).device
    except StopIteration:
        target_device = torch.device("cpu")

    features, labels = [], []
    with torch.no_grad():
        for images, lbls, _ in tqdm.tqdm(support_loader, desc="Extracting support embeddings"):
            emb = model(images.to(target_device))  # [B, D]
            features.append(emb.cpu())
            labels.append(torch.as_tensor(lbls).cpu())

    if not features:
        raise ValueError("support_loader yielded no batches; cannot build class prototypes")

    features = torch.cat(features)
    labels = torch.cat(labels)

    # torch.unique returns sorted values by default, so no extra sort is needed.
    class_prototypes = [features[labels == cls].mean(dim=0) for cls in torch.unique(labels)]

    return torch.stack(class_prototypes).to(target_device)  # shape: [num_classes, D]


# -------------------------------
# Load Test Set
# -------------------------------
# Support CSV: a few labelled examples per class. Test CSV: patches to classify.
support_csv = "metadata\\patient_split_annotate\\patch_csv\\ref_features_try1.csv"
test_csv = "metadata\\patient_split_annotate\\patch_csv\\test_patches.csv"

# Both loaders keep CSV order so predictions can be matched back to rows.
support_loader, test_loader = (
    DataLoader(HistopathologyDataset(csv_file), batch_size=32, shuffle=False)
    for csv_file in (support_csv, test_csv)
)

# One prototype per class from the support-set embeddings.
print("📌 Building class prototypes...")
prototypes = build_class_prototypes(support_loader=support_loader, model=model)


# -------------------------------
# Predict Test Patches
# -------------------------------
def predict_with_prototypes(test_loader, model, prototypes):
    """Assign each test patch to the class whose prototype is most similar.

    Args:
        test_loader: Iterable yielding ``(images, labels, paths)`` batches.
        model: ``nn.Module`` whose forward accepts ``proj_contrast`` and
            ``normalize`` keyword arguments and returns ``[B, D]`` embeddings.
        prototypes: ``[num_classes, D]`` tensor of class prototypes.

    Returns:
        ``(preds, labels, paths)`` as flat Python lists in loader order.
    """
    model.eval()
    # Follow the model's own device instead of a notebook-level ``device``
    # global; fall back to where the prototypes live for parameter-free models.
    try:
        target_device = next(model.parameters()).device
    except StopIteration:
        target_device = prototypes.device

    # The prototypes do not change between batches, so normalise them once.
    proto_norm = F.normalize(prototypes.to(target_device), dim=1)

    all_preds, all_labels, all_paths = [], [], []
    with torch.no_grad():
        for images, labels, paths in tqdm.tqdm(test_loader, desc="Classifying test set"):
            feats = model(images.to(target_device), proj_contrast=False, normalize=False)
            feats = F.normalize(feats, dim=1)

            sim = feats @ proto_norm.T  # cosine similarity
            pred = torch.argmax(sim, dim=1)

            all_preds.extend(pred.cpu().tolist())
            all_labels.extend(torch.as_tensor(labels).cpu().tolist())
            all_paths.extend(paths)

    return all_preds, all_labels, all_paths


# Classify every test patch against the class prototypes.
preds, true_labels, file_paths = predict_with_prototypes(
    test_loader=test_loader, model=model, prototypes=prototypes
)

# Persist per-patch predictions, then print the summary metrics.
df = pd.DataFrame(
    data=dict(
        zip(
            ("Patch Path", "Predicted", "True Label"),
            (file_paths, preds, true_labels),
        )
    )
)
df.to_csv("conch_zero_shot_predictions_run1.csv", index=False)

print("📊 Classification Report:")
print(classification_report(y_true=true_labels, y_pred=preds, target_names=["FA", "PT"]))


  checkpoint = torch.load(checkpoint_path, map_location=map_location)


📌 Building class prototypes...


Extracting support embeddings: 100%|██████████| 1/1 [00:00<00:00, 14.26it/s]
Classifying test set: 100%|██████████| 3310/3310 [09:51<00:00,  5.59it/s]


📊 Classification Report:
              precision    recall  f1-score   support

          FA       0.79      0.68      0.73     81825
          PT       0.26      0.39      0.31     24077

    accuracy                           0.62    105902
   macro avg       0.53      0.54      0.52    105902
weighted avg       0.67      0.62      0.64    105902



CONCH zero-shot (prototype-based) classification – validation set

In [5]:
import os
import torch
import numpy as np
import pandas as pd
import tqdm
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from sklearn.metrics import classification_report
from conch.open_clip_custom import create_model_from_pretrained, tokenize, get_tokenizer


# -------------------------------
# PatchDataset
# -------------------------------
class PatchDataset(Dataset):
    """Patches listed in a metadata CSV and stored as ``.npy`` arrays.

    The CSV must contain a ``path`` column (location of the ``.npy`` patch)
    and a ``class`` column holding ``'FA'`` or ``'PT'``.

    Args:
        csv_path: Path to the metadata CSV.
        transform: Optional callable applied to the float32 patch tensor
            before it is returned.
    """

    def __init__(self, csv_path, transform=None):
        self.df = pd.read_csv(csv_path)
        self.label_map = {'FA': 0, 'PT': 1}
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(patch, label, path)`` for row ``idx``."""
        row = self.df.iloc[idx]
        path = row['path']
        label = self.label_map[row['class']]
        patch = np.load(path)

        if patch.shape[-1] == 3:
            # Channels-last array -> channels-first.
            patch = np.transpose(patch, (2, 0, 1))

        patch = torch.tensor(patch, dtype=torch.float32)
        if patch.shape[1:] != (224, 224):
            # interpolate expects a batch dimension; add and remove it.
            patch = F.interpolate(patch.unsqueeze(0), size=(224, 224), mode='bilinear', align_corners=False).squeeze(0)

        # Previously ``transform`` was stored but silently ignored.
        if self.transform is not None:
            patch = self.transform(patch)

        return patch, label, path

# updated dataset class for our private dataset with numpy files

import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset

class HistopathologyDataset(Dataset):
    """PyTorch dataset for histopathology patches saved as ``.npy`` files.

    The metadata CSV supplies an ``image`` column (patch path) and a
    ``class`` column whose values are mapped FA -> 0 and PT -> 1.

    Args:
        csv_file (str): Path to the dataset metadata CSV file.
        transform (callable, optional): Applied to the image tensor when truthy.
    """

    def __init__(self, csv_file, transform=None):
        self.label_map = {'FA': 0, 'PT': 1}
        self.transform = transform
        self.data = pd.read_csv(csv_file)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, label, img_path)`` for row ``idx``."""
        record = self.data.iloc[idx]
        img_path = record['image']
        image = np.load(img_path)

        # Channels-last arrays are flipped to channels-first.
        if image.shape[-1] == 3:
            image = image.transpose(2, 0, 1)

        # Anything that is not already 224x224 spatially gets resized.
        if not (image.shape[1] == 224 and image.shape[2] == 224):
            import skimage.transform
            image = skimage.transform.resize(image, (3, 224, 224), anti_aliasing=True)

        # Convert to a float32 tensor (no value rescaling happens here).
        image = torch.tensor(image, dtype=torch.float32)

        if self.transform:
            image = self.transform(image)

        label = self.label_map[record['class']]
        return image, label, img_path

# -------------------------------
import os
import torch
import torch.nn as nn
import timm
from huggingface_hub import login, hf_hub_download

# -------------------------------
# UNI Model as Feature Extractor
# -------------------------------
class UNIEncoder(nn.Module):
    """Frozen UNI backbone (timm ``vit_large_patch16_224``) used as a feature extractor.

    The network is created with ``num_classes=0`` and every parameter has
    ``requires_grad`` set to ``False``.

    Args:
        checkpoint_path (str, optional): State-dict file to load. When omitted,
            ``pytorch_model.bin`` inside ``DEFAULT_CHECKPOINT_DIR`` is used.
    """

    # Machine-specific default location of the UNI weights.
    DEFAULT_CHECKPOINT_DIR = r"C:\Users\Vivian\Documents\CONCH\checkpoints\uni"  # Your UNI checkpoint path
    DEFAULT_CHECKPOINT_NAME = "pytorch_model.bin"

    def __init__(self, checkpoint_path=None):
        super().__init__()
        self.model = self.make_uni(checkpoint_path)

        # Freeze all parameters
        for param in self.model.parameters():
            param.requires_grad = False

    def make_uni(self, checkpoint_path=None):
        """Build the ViT and load its weights exactly once.

        Args:
            checkpoint_path (str, optional): State-dict file to load instead of
                the default checkpoint.

        Returns:
            The timm model with the checkpoint loaded (``strict=True``).

        Raises:
            FileNotFoundError: If the selected checkpoint file does not exist.
        """
        if checkpoint_path:
            print(f"Loading checkpoint from: {checkpoint_path}")
        else:
            checkpoint_path = os.path.join(
                self.DEFAULT_CHECKPOINT_DIR, self.DEFAULT_CHECKPOINT_NAME
            )

        if not os.path.isfile(checkpoint_path):
            raise FileNotFoundError(f"UNI checkpoint not found: {checkpoint_path}")

        model = timm.create_model(
            "vit_large_patch16_224", img_size=224, patch_size=16, init_values=1e-5,
            num_classes=0, dynamic_img_size=True
        )
        # Always deserialize on the CPU so loading also works on machines
        # without CUDA; callers move the module afterwards with .to(device).
        state_dict = torch.load(checkpoint_path, map_location="cpu")
        model.load_state_dict(state_dict, strict=True)
        return model

    def forward(self, x):
        """Return the backbone output for ``x`` (encoded features, no classification)."""
        return self.model(x)
    
# conch

import os
import torch
import torch.nn as nn
from conch.open_clip_custom import create_model_from_pretrained

# -------------------------------
# CONCH Model as Feature Extractor
# -------------------------------
class CONCHEncoder(nn.Module):
    """Frozen CONCH model wrapped so that calling it encodes images.

    Args:
        model_cfg (str): Model configuration name handed to
            ``create_model_from_pretrained``.
        checkpoint_path (str, optional): Checkpoint file; a machine-specific
            default path is used when this is empty or ``None``.
    """

    def __init__(self, model_cfg='conch_ViT-B-16', checkpoint_path=None):
        super().__init__()
        fallback_checkpoint = r'C:\Users\Vivian\Documents\CONCH\checkpoints\conch\pytorch_model.bin'
        self.model_cfg = model_cfg
        self.checkpoint_path = checkpoint_path if checkpoint_path else fallback_checkpoint
        self.model = self.make_conch()

        # No parameter of the wrapped model should receive gradients.
        for weight in self.model.parameters():
            weight.requires_grad = False

    def make_conch(self):
        """Create the CONCH model on CUDA when available, otherwise on the CPU.

        The second value returned by ``create_model_from_pretrained`` is not kept.
        """
        if torch.cuda.is_available():
            target_device = torch.device('cuda')
        else:
            target_device = torch.device('cpu')

        conch_model, _unused_preprocess = create_model_from_pretrained(
            self.model_cfg,
            self.checkpoint_path,
            device=target_device,
        )
        return conch_model

    def forward(self, x, proj_contrast=False, normalize=False):
        """Delegate to ``encode_image`` of the wrapped model, forwarding both flags."""
        encode_options = {"proj_contrast": proj_contrast, "normalize": normalize}
        return self.model.encode_image(x, **encode_options)


# -------------------------------
# Load Model for Evaluation
# -------------------------------
# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# CONCH is the active encoder; replace with UNIEncoder() to evaluate UNI instead.
model = CONCHEncoder()
model = model.to(device)

# Inference only: switch the encoder to evaluation mode.
model.eval()


# -------------------------------
# Load Support Set & Build Prototypes
# -------------------------------
def build_class_prototypes(support_loader, model, device=None):
    """Average the support-set embeddings of each class into one prototype.

    Args:
        support_loader: Iterable yielding ``(images, labels, paths)`` batches.
        model: Encoder module; called as ``model(images)``.
        device (optional): Device the batches are moved to. Defaults to the
            device of the model's first parameter (CPU if it has none), so the
            function no longer depends on a notebook-level ``device`` global.

    Returns:
        torch.Tensor: One row per distinct label found in the support set,
        ordered by ascending label value, placed on ``device``. Row ``i`` only
        matches label ``i`` when the labels present are exactly ``0..K-1``.

    Raises:
        ValueError: If ``support_loader`` yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    features, labels = [], []

    with torch.no_grad():
        for images, lbls, _ in tqdm.tqdm(support_loader, desc="Extracting support embeddings"):
            # Single host-to-device transfer per batch.
            emb = model(images.to(device))
            features.append(emb.cpu())
            labels.append(torch.as_tensor(lbls).cpu())

    if not features:
        raise ValueError("support_loader yielded no batches; cannot build class prototypes")

    features = torch.cat(features)
    labels = torch.cat(labels)

    # torch.unique returns the distinct labels in ascending order.
    class_prototypes = [features[labels == cls].mean(dim=0) for cls in torch.unique(labels)]

    return torch.stack(class_prototypes).to(device)  # shape: [num_classes, D]


# -------------------------------
# Load Support & Test Sets, Build Prototypes
# -------------------------------
# Metadata CSVs: a few labeled examples per class, and the test patches to classify.
support_csv = "metadata\\patient_split_annotate\\patch_csv\\ref_features_try1.csv"
test_csv = r"C:\Users\Vivian\Documents\CONCH\metadata\patient_split_annotate\patch_csv\val_patches.csv"

# Both loaders keep the CSV order (no shuffling) and share one batch size.
loader_options = dict(batch_size=32, shuffle=False)
support_loader = DataLoader(HistopathologyDataset(support_csv), **loader_options)
test_loader = DataLoader(HistopathologyDataset(test_csv), **loader_options)

# Turn the support embeddings into one reference prototype per class.
print("📌 Building class prototypes...")
prototypes = build_class_prototypes(support_loader, model)


# -------------------------------
# Predict Test Patches
# -------------------------------
def predict_with_prototypes(test_loader, model, prototypes, device=None):
    """Assign each test sample to the prototype with the highest cosine similarity.

    Args:
        test_loader: Iterable yielding ``(images, labels, paths)`` batches.
        model: Encoder module; called as ``model(images)`` so any encoder whose
            ``forward`` accepts a single tensor can be used.
        prototypes (torch.Tensor): One row per class, as produced by
            ``build_class_prototypes``.
        device (optional): Device used for inference. Defaults to the device of
            the model's first parameter (CPU if it has none).

    Returns:
        tuple: ``(all_preds, all_labels, all_paths)``, three flat lists with one
        entry per sample. Predictions are row indices into ``prototypes``.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    all_preds, all_labels, all_paths = [], [], []

    with torch.no_grad():
        # The prototypes do not change between batches: normalise them once.
        proto_norm = torch.nn.functional.normalize(prototypes.to(device), dim=1)

        for images, labels, paths in tqdm.tqdm(test_loader, desc="Classifying test set"):
            # Same call form as build_class_prototypes, so support and test
            # embeddings come from the identical forward path.
            feats = model(images.to(device))
            feats = torch.nn.functional.normalize(feats, dim=1)

            sim = feats @ proto_norm.T  # cosine similarity
            pred = torch.argmax(sim, dim=1)

            all_preds.extend(pred.cpu().tolist())
            all_labels.extend(torch.as_tensor(labels).cpu().tolist())
            all_paths.extend(paths)

    return all_preds, all_labels, all_paths


# Classify every test patch against the class prototypes
preds, true_labels, file_paths = predict_with_prototypes(test_loader, model, prototypes)

# -------------------------------
# Save & Evaluate
# -------------------------------
# One row per patch: where it came from, what was predicted, what was expected.
prediction_columns = {
    "Patch Path": file_paths,
    "Predicted": preds,
    "True Label": true_labels,
}
df = pd.DataFrame(prediction_columns)
df.to_csv("conch_val_zero_shot_predictions_run1.csv", index=False)

print("📊 Classification Report:")
report_text = classification_report(true_labels, preds, target_names=["FA", "PT"])
print(report_text)


  checkpoint = torch.load(checkpoint_path, map_location=map_location)


📌 Building class prototypes...


Extracting support embeddings: 100%|██████████| 1/1 [00:00<00:00, 15.84it/s]
Classifying test set: 100%|██████████| 3307/3307 [12:10<00:00,  4.52it/s]


📊 Classification Report:
              precision    recall  f1-score   support

          FA       0.50      0.74      0.60     49412
          PT       0.61      0.35      0.45     56396

    accuracy                           0.54    105808
   macro avg       0.56      0.55      0.52    105808
weighted avg       0.56      0.54      0.52    105808



LR classifier: logistic regression on precomputed embeddings

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Locations of the precomputed embeddings and their labels
features_path = "C:/Users/Vivian/Documents/CONCH/embeddings/UNI_test_features.npy"
labels_path = "C:/Users/Vivian/Documents/CONCH/embeddings/labels.npy"

X = np.load(features_path)  # expected shape [N, 1024]
y = np.load(labels_path)    # expected shape [N,]

print("Loaded:", X.shape, y.shape)


In [None]:
# Optional: hold out 20% of the samples for validation, stratified by label,
# with a fixed seed so the split is repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# Fit the classifier; class_weight='balanced' helps if the classes are imbalanced.
clf = LogisticRegression(max_iter=1000, class_weight='balanced')
clf.fit(X_train, y_train)

# Predictions for the held-out validation part
y_pred = clf.predict(X_val)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Per-class metrics for the validation predictions
print("Classification Report:")
val_report = classification_report(y_val, y_pred, target_names=["FA", "PT"])
print(val_report)

# Confusion matrix for the same predictions
print("Confusion Matrix:")
val_confusion = confusion_matrix(y_val, y_pred)
print(val_confusion)


In [None]:
# Score the fitted classifier on the separately stored test embeddings.
# NOTE(review): the feature file name here (UNI_test_features.npy) matches the
# file loaded for training earlier in this notebook; confirm that this relative
# path resolves to a different, held-out file.
X_test = np.load("UNI_test_features.npy")
y_test = np.load("test_labels.npy")

# y_pred is overwritten here with the test-set predictions.
y_pred = clf.predict(X_test)
test_report = classification_report(y_test, y_pred, target_names=["FA", "PT"])
print(test_report)
