# Original OpenAI CLIP
From https://github.com/openai/CLIP.

```
conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0
pip install ftfy regex tqdm
pip install git+https://github.com/openai/CLIP.git
```

## Simple usage

In [3]:
import torch
import clip
from PIL import Image

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# One preprocessed image (unsqueeze adds the leading batch axis) and the
# tokenized candidate captions, both moved to the chosen device.
pil_image = Image.open("CLIP.png")
image = preprocess(pil_image).unsqueeze(0).to(device)
captions = ["a diagram", "a dog", "a cat"]
text = clip.tokenize(captions).to(device)

with torch.no_grad():
    # Stand-alone embeddings; the probabilities below do not use them.
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # The joint forward pass returns both logit matrices; softmax over the
    # caption axis turns the per-image logits into label probabilities.
    logits_per_image, logits_per_text = model(image, text)
    image_probs = logits_per_image.softmax(dim=-1)
    probs = image_probs.cpu().numpy()

print("Label probs:", probs)  # prints roughly: [[0.993 0.004 0.003]]

Label probs: [[0.9927   0.004185 0.002968]]


## Zero shot prediction

The code below performs zero-shot prediction using CLIP, as shown in Appendix B in the paper. This example takes an image from the CIFAR-100 dataset, and predicts the most likely labels among the 100 textual labels from the dataset.

Note that this example uses the `encode_image()` and `encode_text()` methods that return the encoded features of given inputs.

In [7]:
import os
import clip
import torch
from torchvision.datasets import CIFAR100

# Model and its matching image preprocessing transform
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# CIFAR-100 test split, downloaded into ~/.cache when missing
cache_dir = os.path.expanduser("~/.cache")
cifar100 = CIFAR100(root=cache_dir, download=True, train=False)

# One sample image, plus one text prompt per class name
image, class_id = cifar100[3637]
image_input = preprocess(image).unsqueeze(0).to(device)
prompts = [f"a photo of a {c}" for c in cifar100.classes]
text_inputs = torch.cat([clip.tokenize(prompt) for prompt in prompts]).to(device)

# Encode both modalities without tracking gradients
with torch.no_grad():
    image_features = model.encode_image(image_input)
    text_features = model.encode_text(text_inputs)

# Scale each embedding to unit length, then softmax the scaled similarities
# and keep the five best-scoring classes for the single image in the batch
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
values, indices = similarity[0].topk(5)

# Report class name and probability (as a percentage) for each of the five
print("\nTop predictions:\n")
for value, index in zip(values.tolist(), indices.tolist()):
    print(f"{cifar100.classes[index]:>16s}: {100 * value:.2f}%")

100%|██████████| 169M/169M [00:11<00:00, 15.2MB/s] 



Top predictions:

           snake: 65.53%
          turtle: 12.12%
    sweet_pepper: 3.87%
          lizard: 1.89%
       crocodile: 1.72%


## Linear-probe evaluation

The example below uses scikit-learn to perform logistic regression on image features.

Note that the `C` value should be determined via a hyperparameter sweep using a validation split.

In [10]:
import os
import clip
import torch

import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
from tqdm import tqdm

# Model and its matching image preprocessing transform
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# CIFAR-100 train and test splits (in that order), both cached under ~/.cache
# and both passing every image through the CLIP preprocessing transform
root = os.path.expanduser("~/.cache")
train, test = (
    CIFAR100(root, download=True, train=is_train, transform=preprocess)
    for is_train in (True, False)
)


def get_features(dataset):
    """Encode every image of ``dataset`` with the module-level CLIP ``model``.

    The dataset is read in batches of 100 with gradient tracking disabled.

    Returns:
        A ``(features, labels)`` pair of NumPy arrays, each concatenated over
        all batches in iteration order.
    """
    feature_batches, label_batches = [], []
    loader = DataLoader(dataset, batch_size=100)

    with torch.no_grad():
        for images, labels in tqdm(loader):
            feature_batches.append(model.encode_image(images.to(device)))
            label_batches.append(labels)

    features = torch.cat(feature_batches).cpu().numpy()
    targets = torch.cat(label_batches).cpu().numpy()
    return features, targets

# Encode both splits into feature/label arrays
train_features, train_labels = get_features(train)
test_features, test_labels = get_features(test)

# Fit the logistic-regression probe on the training features
# (fit() returns the estimator itself, so the call can be chained)
classifier = LogisticRegression(
    random_state=0, C=0.316, max_iter=1000, verbose=1
).fit(train_features, train_labels)

# Accuracy = percentage of test labels the probe predicts correctly
predictions = classifier.predict(test_features)
is_correct = (test_labels == predictions).astype(float)
accuracy = np.mean(is_correct) * 100.
print(f"Accuracy = {accuracy:.3f}")

100%|██████████| 500/500 [01:18<00:00,  6.33it/s]
100%|██████████| 100/100 [00:15<00:00,  6.31it/s]


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =        51300     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.60517D+00    |proj g|=  1.53246D-02


 This problem is unconstrained.



At iterate   50    f=  6.75614D-01    |proj g|=  8.49750D-03

At iterate  100    f=  5.84321D-01    |proj g|=  1.01679D-02

At iterate  150    f=  5.66909D-01    |proj g|=  3.72494D-03

At iterate  200    f=  5.63269D-01    |proj g|=  4.89657D-04

At iterate  250    f=  5.62606D-01    |proj g|=  2.58750D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
51300    290    300      1     0     0   8.677D-05   5.625D-01
  F =  0.56246845550390634     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            
Accuracy = 80.030


# Hugging Face CLIP

From https://huggingface.co/docs/transformers/model_doc/clip.

```
pip install transformers
```

In [14]:
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel

# Model weights and the matching processor come from the same checkpoint.
checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(checkpoint)
processor = CLIPProcessor.from_pretrained(checkpoint)

image = Image.open("lung_cancer.png")
text = ["female smoker", "male nonsmoker", "adenocarcinoma", "squamous cell carcinoma"]

# The processor prepares the padded text and the image as PyTorch tensors.
inputs = processor(text=text, images=image, return_tensors="pt", padding=True)
# NOTE(review): this forward pass is not wrapped in torch.no_grad(), so the
# printed tensor carries a grad_fn; consider no_grad() for pure inference.
outputs = model(**inputs)

# Image-to-text similarity scores, turned into probabilities over the labels.
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
print(probs)

tensor([[0.0009, 0.0070, 0.6385, 0.3536]], grad_fn=<SoftmaxBackward0>)


# OpenCLIP (open-source CLIP)

From https://github.com/mlfoundations/open_clip.

```
pip install open_clip_torch
pip install 'open_clip_torch[training]'
```

In [16]:
import torch
from PIL import Image
import open_clip

# The middle return value is discarded here; only the first (model) and the
# third (bound to `preprocess`) are used below.
model, _train_transform, preprocess = open_clip.create_model_and_transforms(
    'ViT-B-32', pretrained='laion2b_s34b_b79k'
)
model.eval()  # model in train mode by default, impacts some models with BatchNorm or stochastic depth active
tokenizer = open_clip.get_tokenizer('ViT-B-32')

pil_image = Image.open("CLIP.png")
image = preprocess(pil_image).unsqueeze(0)
captions = ["a diagram", "a dog", "a cat"]
text = tokenizer(captions)

with torch.no_grad():
    with torch.autocast("cuda"):
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)

        # Scale both embeddings to unit length before comparing them.
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        scaled_similarity = 100.0 * image_features @ text_features.T
        text_probs = scaled_similarity.softmax(dim=-1)

print("Label probs:", text_probs)  # prints: [[1., 0., 0.]]

Label probs: tensor([[9.9950e-01, 4.1207e-04, 8.5317e-05]])


## Training CLIP


In [1]:
import h5py

file_path = "/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/hdf5_TCGAFFPE_LUADLUSC_5x_60pc_250K_he_train.h5"

# Open read-only, list the top-level keys, then copy the whole 'slides'
# dataset into memory so it is still usable after the file is closed.
with h5py.File(file_path, 'r') as h5_file:
    top_level_keys = list(h5_file.keys())
    print(top_level_keys)

    combined_slides_dataset = h5_file['slides'][:]
    print(combined_slides_dataset)

# Number of entries in the 'slides' dataset
n_slides = len(combined_slides_dataset)
print(n_slides)

['img', 'labels', 'patterns', 'slides', 'tiles']
[b'TCGA-49-6745-01Z-00-DX7' b'TCGA-68-A59J-01Z-00-DX1'
 b'TCGA-95-A4VN-01Z-00-DX1' ... b'TCGA-63-5131-01Z-00-DX1'
 b'TCGA-49-AARE-01Z-00-DX1' b'TCGA-73-7498-01Z-00-DX1']
250000


In [None]:
import open_clip
import torch

# create_model_and_transforms returns (model, train transform, eval transform).
# It does NOT return a tokenizer, so the third value is kept as the eval-time
# image transform and the tokenizer is built separately with get_tokenizer.
model, preprocess, preprocess_val = open_clip.create_model_and_transforms(
    model_name="ViT-B-32",
    pretrained=None  # or use "laion2b_s34b_b79k" if you want to fine-tune
)
tokenizer = open_clip.get_tokenizer("ViT-B-32")

# Train on the GPU when PyTorch can see one, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [None]:
import h5py
import pandas as pd
from torch.utils.data import Dataset
from PIL import Image
import numpy as np

class HistopathologyCLIPDataset(Dataset):
    """Image/caption pairs read from an HDF5 file and a caption CSV.

    Row ``idx`` of the CSV's ``caption`` column is paired with entry ``idx``
    of the HDF5 image dataset.

    Args:
        h5_path: Path of the HDF5 file holding the image tiles.
        caption_csv: Path of a CSV file with a ``caption`` column.
        transform: Callable applied to each PIL image.
        tokenizer: Callable applied to each caption string.
    """

    def __init__(self, h5_path, caption_csv, transform, tokenizer):
        self.h5_file = h5py.File(h5_path, 'r')
        # The pixel data is read from 'img'. The 'slides' dataset of this file
        # holds slide-ID byte strings (e.g. b'TCGA-49-6745-01Z-00-DX7'), which
        # cannot be turned into images.
        self.images = self.h5_file['img']
        self.captions_df = pd.read_csv(caption_csv)
        self.transform = transform
        self.tokenizer = tokenizer

    def __len__(self):
        # NOTE(review): assumes the CSV has no more rows than the HDF5 file
        # has images; confirm the two sources are aligned row-for-row.
        return len(self.captions_df)

    def __getitem__(self, idx):
        """Return ``(transformed image, tokenized caption)`` for row ``idx``."""
        img = np.asarray(self.images[idx])  # (H, W, C) or (C, H, W)
        if img.shape[-1] != 3:  # not channels-last, so assume (C, H, W)
            img = np.moveaxis(img, 0, -1)
        # NOTE(review): the uint8 cast assumes pixel values are already in
        # 0-255; confirm the dtype/range stored in the file.
        img = Image.fromarray(img.astype(np.uint8))

        caption = self.captions_df.iloc[idx]['caption']
        img = self.transform(img)
        text = self.tokenizer(caption)
        # A single caption is tokenized with a leading axis of length 1; drop
        # it so that default batching yields (batch, context) rather than
        # (batch, 1, context).
        if getattr(text, "ndim", None) == 2 and text.shape[0] == 1:
            text = text[0]

        return img, text

    def close(self):
        """Close the underlying HDF5 file handle."""
        self.h5_file.close()

In [None]:
from torch.nn import functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm

dataset = HistopathologyCLIPDataset(
    h5_path="/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/hdf5_TCGAFFPE_LUADLUSC_5x_60pc_250K_he_train.h5",
    caption_csv="/gpfs/home/yb2612/dl4med_25/dl_project/data/lung_images_captions.csv",
    transform=preprocess,
    tokenizer=tokenizer
)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
epochs = 10

for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for images, texts in tqdm(dataloader):
        images = images.to(device)
        texts = texts.to(device)
        # Per-sample tokenization can leave a singleton axis, giving
        # (batch, 1, context); the text encoder needs (batch, context).
        if texts.dim() == 3:
            texts = texts.squeeze(1)

        # Unit-length embeddings, so the dot products are cosine similarities.
        image_features = F.normalize(model.encode_image(images), dim=-1)
        text_features = F.normalize(model.encode_text(texts), dim=-1)

        # Cosine similarities lie in [-1, 1], which is too flat for
        # cross-entropy; scale them by the model's learned temperature,
        # capped at 100 as in the CLIP paper.
        logit_scale = model.logit_scale.exp().clamp(max=100.0)
        logits_per_image = logit_scale * image_features @ text_features.T
        logits_per_text = logits_per_image.T

        # The matching caption for image i is at index i of the same batch.
        labels = torch.arange(len(images), device=device)
        loss = (F.cross_entropy(logits_per_image, labels) + F.cross_entropy(logits_per_text, labels)) / 2

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean per-batch loss so the number does not depend on the
    # dataset size or the batch size.
    mean_loss = total_loss / max(len(dataloader), 1)
    print(f"Epoch {epoch+1}, Loss: {mean_loss:.4f}")