<a href="https://colab.research.google.com/github/ziijiecheng/VLM_robustness/blob/main/VLMs_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install required packages

In [None]:
!pip install torch pillow openai anthropic open_clip_torch transformers requests

Collecting anthropic
  Downloading anthropic-0.49.0-py3-none-any.whl.metadata (24 kB)
Collecting open_clip_torch
  Downloading open_clip_torch-2.31.0-py3-none-any.whl.metadata (31 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  D

In [None]:
import base64
import mimetypes
import time
from io import BytesIO
from pathlib import Path

import requests
import torch
from PIL import Image

# Import model-specific libraries
from openai import OpenAI
import anthropic
from open_clip import create_model_from_pretrained, get_tokenizer
from transformers import AutoModel, AutoTokenizer, MllamaForConditionalGeneration, AutoProcessor

In [None]:
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as a base64 text string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

MODEL FUNCTIONS

In [None]:
#GPT
def evaluate_image_chatgpt(client, model, image_path, role, content, max_retries=2):
    """Evaluate the image using ChatGPT with retry logic.

    The image file is base64-encoded and sent as a data URL next to the text
    prompt; ``role`` is sent as the system message.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (an ``OpenAI``
            client in this notebook).
        model: Model name forwarded to the API.
        image_path: Path of the image file to send.
        role: Text of the system message.
        content: Text of the user prompt.
        max_retries: Number of extra attempts after the first failure.
            Negative values are treated as 0.

    Returns:
        ``response.choices[0].message.content`` from the first successful call.

    Raises:
        Exception: If every attempt fails. The last underlying error is
            attached as ``__cause__``.
    """
    base64_image = encode_image(image_path)

    # Label the payload with the MIME type implied by the file extension and
    # fall back to PNG (the previous hard-coded value) when it cannot be guessed.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/png"
    image_url = f"data:{mime_type};base64,{base64_image}"

    # Always make at least one attempt, even if max_retries is negative.
    total_attempts = max(max_retries, 0) + 1
    for attempt in range(total_attempts):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": role},
                    {"role": "user", "content": [
                        {"type": "text", "text": content},
                        {"type": "image_url", "image_url": {"url": image_url}}
                    ]}
                ],
                temperature=0.0,
            )
            return response.choices[0].message.content
        except Exception as e:
            if attempt < total_attempts - 1:
                print(f"Error occurred with ChatGPT: {e}. Retrying... (Attempt {attempt + 1} of {max_retries})")
                time.sleep(2)  # Wait for 2 seconds before retrying
            else:
                # Chain the original error so the root cause stays visible.
                raise Exception(f"Failed to evaluate image with ChatGPT after {total_attempts} attempts: {image_path}") from e

In [None]:
#Claude
def evaluate_image_claude(client, model, image_path, role, content, max_retries=2):
    """Evaluate the image using Claude with retry logic.

    The image file is base64-encoded and sent as an image content part ahead
    of the text prompt; ``role`` is sent as the system prompt.

    Args:
        client: Object exposing ``messages.create(...)`` (an
            ``anthropic.Anthropic`` client in this notebook).
        model: Model name forwarded to the API.
        image_path: Path of the image file to send.
        role: Text of the system prompt.
        content: Text of the user prompt.
        max_retries: Number of extra attempts after the first failure.
            Negative values are treated as 0.

    Returns:
        ``message.content[0].text`` from the first successful call.

    Raises:
        Exception: If every attempt fails. The last underlying error is
            attached as ``__cause__``.
    """
    with open(image_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode("utf-8")

    # Declare the media type implied by the file extension and fall back to
    # PNG (the previous hard-coded value) when it cannot be guessed.
    media_type, _ = mimetypes.guess_type(str(image_path))
    if not media_type or not media_type.startswith("image/"):
        media_type = "image/png"

    # Always make at least one attempt, even if max_retries is negative.
    total_attempts = max(max_retries, 0) + 1
    for attempt in range(total_attempts):
        try:
            message = client.messages.create(
                model=model,
                max_tokens=1024,
                temperature=0,
                system=role,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image",
                                "source": {
                                    "type": "base64",
                                    "media_type": media_type,
                                    "data": base64_image,
                                },
                            },
                            {"type": "text", "text": content},
                        ],
                    }
                ],
            )
            return message.content[0].text
        except Exception as e:
            if attempt < total_attempts - 1:
                print(f"Error occurred with Claude: {e}. Retrying... (Attempt {attempt + 1} of {max_retries})")
                time.sleep(2)  # Wait for 2 seconds before retrying
            else:
                # Chain the original error so the root cause stays visible.
                raise Exception(f"Failed to evaluate image with Claude after {total_attempts} attempts: {image_path}") from e

In [None]:
#BiomedCLIP
def evaluate_image_biomedclip(preprocess, model, tokenizer, image_path, labels, device, context_length=256):
    """Return the entry of ``labels`` that BiomedCLIP scores highest for the image.

    Each label is appended to a fixed OCT-diagnosis prompt, the image and the
    prompts are passed through ``model``, and the label with the largest
    softmax score is returned.
    """
    prompt_prefix = 'the diagnosis of this Optical Coherence Tomography is:'

    # Load the image as RGB and turn it into a single-item batch on `device`.
    rgb_image = Image.open(image_path).convert('RGB')
    image_batch = preprocess(rgb_image).unsqueeze(0).to(device)

    # One prompt per candidate label.
    prompts = [prompt_prefix + label for label in labels]
    text_tokens = tokenizer(prompts, context_length=context_length).to(device)

    with torch.no_grad():
        image_features, text_features, logit_scale = model(image_batch, text_tokens)
        scores = (logit_scale * image_features @ text_features.t()).softmax(dim=-1)

    best_index = scores.argmax().item()
    return labels[best_index]

In [None]:
#Llama
def evaluate_image_llama(model, processor, image_path, content, device, max_retries=2):
    """Evaluate the image using Llama with retry logic.

    Args:
        model: Object exposing ``generate(...)`` (a
            ``MllamaForConditionalGeneration`` in this notebook).
        processor: Matching processor, used for the chat template, for
            building the model inputs and for decoding.
        image_path: Path of the image file; it is loaded and converted to RGB.
        content: Text of the user prompt.
        device: Device the processed inputs are moved to.
        max_retries: Number of extra attempts after the first failure.
            Negative values are treated as 0.

    Returns:
        The newly generated text only, without the echoed prompt and without
        special tokens.

    Raises:
        Exception: If every attempt fails. The last underlying error is
            attached as ``__cause__``.
    """
    image = Image.open(image_path).convert('RGB')
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": content}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

    # Always make at least one attempt, even if max_retries is negative.
    total_attempts = max(max_retries, 0) + 1
    for attempt in range(total_attempts):
        try:
            inputs = processor(
                image,
                input_text,
                add_special_tokens=False,
                return_tensors="pt"
            ).to(device)
            output = model.generate(**inputs, max_new_tokens=800)
            # The generated sequence starts with the prompt tokens. Drop them so
            # the answer is not polluted by the question text, which itself
            # mentions both 'retinal diseases' and 'normal'.
            prompt_length = inputs["input_ids"].shape[-1]
            return processor.decode(output[0][prompt_length:], skip_special_tokens=True)
        except Exception as e:
            if attempt < total_attempts - 1:
                print(f"Error occurred with Llama: {e}. Retrying... (Attempt {attempt + 1} of {max_retries})")
                time.sleep(2)  # Wait for 2 seconds before retrying
            else:
                # Chain the original error so the root cause stays visible.
                raise Exception(f"Failed to evaluate image with Llama after {total_attempts} attempts: {image_path}") from e

CONFIGURATION: download the sample OCT image used by the model runs below

In [None]:
def load_image_from_url(image_url, save_path='temp_image.png', timeout=30):
    """Download an image, convert it to grayscale and save it locally.

    Args:
        image_url: URL the image is fetched from with an HTTP GET.
        save_path: Local file the converted image is written to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``save_path``, so the result can be passed straight to the evaluators.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Bound the wait so a stalled download cannot hang the notebook.
    response = requests.get(image_url, timeout=timeout)
    # Fail on HTTP errors here instead of handing an error page to PIL.
    response.raise_for_status()

    # Decode the downloaded bytes and convert to single-channel grayscale.
    image = Image.open(BytesIO(response.content)).convert('L')

    # Save locally because the evaluation functions take a file path.
    image.save(save_path)
    return save_path

# Sample OCT image shared on Google Drive; replace the file ID with your own.
DRIVE_FILE_ID = '1pmrnvXwMxgC8fpZ-sz9DJfkSHRYUkSCO'
img_url = f'https://drive.google.com/uc?export=view&id={DRIVE_FILE_ID}'

# Download once; every model run below reads the image from this local path.
image_path = load_image_from_url(img_url)
print(f"Downloaded image: {image_path}")


Downloaded image: temp_image.png


run GPT-4o

In [None]:
# Placeholder credential: substitute your own OpenAI API key before running
# (and avoid committing a real key with the notebook).
client = OpenAI(api_key='#use your own key')
model = 'gpt-4o-2024-08-06'
role = 'Medical knowledge educator'
content = (
    "How can we identify the features of choroidal neovascularization, "
    "Diabetic macular edema and early AMD in a Optical Coherence Tomography "
    "image of a person? Imagine you are an educator tasked with helping a "
    "student identify the features of an OCT image and whether or not the "
    "image shows signs of these retinal diseases as described by those "
    "features. As an educator, conclude your answer in - 'retinal diseases' "
    "or 'normal'. Describe your reasoning in steps."
)
prediction = evaluate_image_chatgpt(
    client=client,
    model=model,
    image_path=image_path,
    role=role,
    content=content,
)
print(prediction)

To identify features of choroidal neovascularization (CNV), diabetic macular edema (DME), and early age-related macular degeneration (AMD) in an Optical Coherence Tomography (OCT) image, follow these steps:

1. **Choroidal Neovascularization (CNV):**
   - **Look for:** Subretinal or intraretinal fluid, and hyperreflective material beneath the retina.
   - **In the image:** Check for any abnormal blood vessels or fluid accumulation beneath the retinal layers.

2. **Diabetic Macular Edema (DME):**
   - **Look for:** Thickening of the retina and cystoid spaces within the retina.
   - **In the image:** Identify any areas where the retina appears swollen or has cyst-like spaces.

3. **Early Age-related Macular Degeneration (AMD):**
   - **Look for:** Drusen, which are small, yellowish deposits beneath the retina.
   - **In the image:** Check for any small, round, hyperreflective spots beneath the retinal pigment epithelium.

**Analysis of the Image:**
- The image shows significant cystoid s

run Claude 3.5 Sonnet 2024.6.20

In [None]:
# Placeholder credential: substitute your own Anthropic API key before running
# (and avoid committing a real key with the notebook).
client = anthropic.Anthropic(api_key='#use your own key')
model = 'claude-3-5-sonnet-20240620'
role = 'Medical knowledge educator'
content = (
    "How can we identify the features of choroidal neovascularization, "
    "Diabetic macular edema and early AMD in a Optical Coherence Tomography "
    "image of a person? Imagine you are an educator tasked with helping a "
    "student identify the features of an OCT image and whether or not the "
    "image shows signs of these retinal diseases as described by those "
    "features. As an educator, conclude your answer in - 'retinal diseases' "
    "or 'normal'. Describe your reasoning in steps."
)
prediction = evaluate_image_claude(
    client=client,
    model=model,
    image_path=image_path,
    role=role,
    content=content,
)
print(prediction)

As an educator helping a student identify features in an OCT image, I would guide them through the following steps:

1. Orientation: This is an OCT image of the retina. The top of the image represents the inner retinal layers, while the bottom shows the choroid and sclera.

2. Normal retinal anatomy: Look for clearly defined retinal layers, a smooth retinal contour, and a uniform choroidal layer.

3. Choroidal Neovascularization (CNV) features:
   - Look for irregular elevations of the retinal pigment epithelium (RPE)
   - Check for hyper-reflective material above or below the RPE
   - Observe for any disruption of the normal retinal architecture

4. Diabetic Macular Edema (DME) features:
   - Look for retinal thickening, especially in the central macula
   - Check for intraretinal cystoid spaces (dark, fluid-filled areas)
   - Observe for loss of the normal foveal contour

5. Early Age-related Macular Degeneration (AMD) features:
   - Look for drusen (small bumps) at the level of the 

run Claude 3.5 Sonnet 2024.10.22

In [None]:
# Placeholder credential: substitute your own Anthropic API key before running
# (and avoid committing a real key with the notebook).
client = anthropic.Anthropic(api_key='#use your own key')
model = 'claude-3-5-sonnet-20241022'
role = 'Medical knowledge educator'
content = (
    "How can we identify the features of choroidal neovascularization, "
    "Diabetic macular edema and early AMD in a Optical Coherence Tomography "
    "image of a person? Imagine you are an educator tasked with helping a "
    "student identify the features of an OCT image and whether or not the "
    "image shows signs of these retinal diseases as described by those "
    "features. As an educator, conclude your answer in - 'retinal diseases' "
    "or 'normal'. Describe your reasoning in steps."
)
prediction = evaluate_image_claude(
    client=client,
    model=model,
    image_path=image_path,
    role=role,
    content=content,
)
print(prediction)

Let me help you analyze this OCT image as an educator:

Step 1 - Basic OCT Interpretation:
- This is a cross-sectional image of the retina showing distinct layers
- The retinal layers appear relatively well-organized
- The retinal pigment epithelium (RPE) layer appears as a bright continuous line

Step 2 - Looking for CNV features:
- CNV would show fluid accumulation above or below RPE
- Would see irregular RPE elevation or disruption
- No obvious signs of subretinal or sub-RPE fluid
- No obvious neovascular membranes visible

Step 3 - Checking for DME indicators:
- DME shows retinal thickening and cystoid spaces
- Would see dark fluid-filled spaces within retinal layers
- This image shows normal retinal thickness
- No obvious intraretinal fluid collections

Step 4 - Early AMD signs:
- Would look for drusen (RPE bumps/elevations)
- Check for RPE irregularities or disruption
- Look for any hyper-reflective material
- The RPE layer appears relatively smooth and continuous
- No obvious dr

run BiomedCLIP

In [None]:
# Hub identifier shared by the model weights and the tokenizer.
BIOMEDCLIP_HUB_ID = 'hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224'

# Candidate labels; the prediction is one of these strings.
labels = ["normal", "retinal diseases"]

# Use the GPU when one is available.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model, preprocess = create_model_from_pretrained(BIOMEDCLIP_HUB_ID)
tokenizer = get_tokenizer(BIOMEDCLIP_HUB_ID)
model.to(device)
model.eval()

prediction = evaluate_image_biomedclip(
    preprocess=preprocess,
    model=model,
    tokenizer=tokenizer,
    image_path=image_path,
    labels=labels,
    device=device,
)
print(prediction)

retinal diseases


run Llama 3.2 11B Vision Instruct

In [None]:
!pip install huggingface_hub



In [None]:
# Interactive Hugging Face login: prompts for an access token (see the output below).
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
The token `VLM_eva` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `VLM_eva`


In [None]:
# Download the files under "original/" of the Llama 3.2 11B Vision Instruct repo
# into the local directory Llama-3.2-11B-Vision-Instruct.
# NOTE(review): the loading cell below calls from_pretrained with the hub ID rather
# than this directory, so these files do not appear to be used. Confirm whether
# this step is still needed.
!huggingface-cli download meta-llama/Llama-3.2-11B-Vision-Instruct \
  --include "original/*" \
  --local-dir Llama-3.2-11B-Vision-Instruct

Fetching 3 files:   0% 0/3 [00:00<?, ?it/s]Fetching 3 files: 100% 3/3 [00:00<00:00, 1616.30it/s]
/content/Llama-3.2-11B-Vision-Instruct


In [None]:
# Hub identifier shared by the model weights and the processor.
LLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"

content = (
    "How can we identify the features of choroidal neovascularization, "
    "Diabetic macular edema and early AMD in a Optical Coherence Tomography "
    "image of a person? Imagine you are an educator tasked with helping a "
    "student identify the features of an OCT image and whether or not the "
    "image shows signs of these retinal diseases as described by those "
    "features. As an educator, conclude your answer in - 'retinal diseases' "
    "or 'normal'. Describe your reasoning in steps."
)

# Use the GPU when one is available.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = MllamaForConditionalGeneration.from_pretrained(
    LLAMA_MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(LLAMA_MODEL_ID)

prediction = evaluate_image_llama(
    model=model,
    processor=processor,
    image_path=image_path,
    content=content,
    device=device,
)
print(prediction)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/5.07k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/89.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]



preprocessor_config.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/5.09k [00:00<?, ?B/s]

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>How can we identify the features of choroidal neovascularization, Diabetic macular edema and early AMD in a Optical Coherence Tomography image of a person? Imagine you are an educator tasked with helping a student identify the features of an OCT image and whether or not the image shows signs of these retinal diseases as described by those features. As an educator, conclude your answer in -'retinal diseases' or 'normal'. Describe your reasoning in steps.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

**Step 1: Identify the features of choroidal neovascularization (CNV) in an OCT image.**

*   CNV is characterized by the presence of new blood vessels growing under the retina.
*   In an OCT image, CNV appears as a hyperreflective lesion with a characteristic "flower petal" or "leaf-like" pattern.
*   The lesion may also show a hyperreflective border and a hyporeflective center.

**Step 2: Identify the features of