# Preparation for Colab

Make sure you're running a GPU runtime; if not, select "GPU" as the hardware accelerator in Runtime > Change Runtime Type in the menu. The next cells will install the `clip` package and its dependencies, and check if PyTorch 1.7.1 or later is installed.

In [2]:
!ls

zero_samdd.ipynb  zero_shot_demoSF.ipynb


In [3]:
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-tdxw0tnm
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-tdxw0tnm
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25ldone


In [4]:
import numpy as np
import torch
import clip
from tqdm.notebook import tqdm
from pkg_resources import packaging

print("Torch version:", torch.__version__)


Torch version: 2.0.1+cu117


# Loading the model

Download and instantiate a CLIP model using the `clip` module that we just installed.

In [5]:
clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

# SAM-DD dataset

In [1]:
# Run zero-shotCLIP model on SAM-DD valid dataset
import torch
from torchvision.datasets import ImageFolder
from torchvision import transforms
from torch.utils.data import DataLoader
from tqdm import tqdm
import clip
import os

# Load CLIP models
model_list= ['ViT-L/14', 'ViT-L/14@336px', 'ViT-B/16', 'ViT-B/32']
for model_ in model_list:
    print(f"Running zero-shot on {model_}:")
    model, preprocess = clip.load(f"{model_}", device='cuda')

    # Define the image extensions
    def is_image_file(filename):
        valid_image_extensions = [".jpg", ".jpeg", ".png"]
        return any(filename.lower().endswith(ext) for ext in valid_image_extensions)

    class CustomImageFolder(ImageFolder):
        def __init__(self, root, transform=None, is_valid_file=None):
            self.is_valid_file = is_valid_file or is_image_file
            super(CustomImageFolder, self).__init__(root, transform=transform, is_valid_file=self.is_valid_file)
            self.samples = self.make_dataset_with_camera_views(self.root, is_valid_file=self.is_valid_file)
        
        def make_dataset_with_camera_views(self, dir, is_valid_file=None):
            instances = []
            dir = os.path.expanduser(dir)
            
            # Iterate through each subject directory
            for subject_dir in tqdm(sorted(os.listdir(dir)), desc='Subjects'):
                subject_path = os.path.join(dir, subject_dir)
                if not os.path.isdir(subject_path):
                    continue
                # create class_to_idx inside the method
                class_to_idx = {d: i for i, d in enumerate(sorted(os.listdir(subject_path)))}
                
                # Iterate through each class directory
                for class_name in sorted(class_to_idx.keys()):
                    class_index = class_to_idx[class_name]
                    class_dir = os.path.join(subject_path, class_name)
                    camera_angle_dir = os.path.join(class_dir, "side_RGB")  # Only using the "front_RGB" folder
                    
                    if os.path.isdir(camera_angle_dir):
                        for root, _, fnames in os.walk(camera_angle_dir):
                            for fname in fnames:
                                if is_valid_file(fname):
                                    path = os.path.join(root, fname)
                                    instances.append((path, class_index))
            return instances
        

        def __getitem__(self, index):
            img, label = super().__getitem__(index)
            path = self.samples[index][0]
            return img, label, path

    val_data_dir = f"../sam-dd/valid"
    val_dataset = CustomImageFolder(val_data_dir, transform=preprocess)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=2)
    print(f"Number of samples in the dataset: {len(val_dataset)}")

    # Assume class_names are defined as per your dataset classes
    class_names = [ "driving safely", 
                    "drinking water while driving",
                    "talking to the phone on left hand while driving",
                    "talking to the phone on right hand while driving",
                    "texting on the phone with left hand while driving",
                    "texting on the phone with right hand while driving",
                    "touching hairs with hand while driving",
                    "adjusting glasses with hand while driving",
                    "reaching behind while driving",
                    "dropping the head while driving"]
    templates = ["an image of a person {}."]

    # Function for zero-shot prediction
    def zeroshot_classifier(classnames, templates):
        with torch.no_grad():
            zeroshot_weights = []
            for classname in tqdm(classnames):
                texts = [template.format(classname) for template in templates] #format with class
                texts = clip.tokenize(texts).cuda() #tokenize
                class_embeddings = model.encode_text(texts) #embed with text encoder
                class_embeddings /= class_embeddings.norm(dim=-1, keepdim=True)
                # class_embedding = class_embeddings.mean(dim=0)
                # class_embedding /= class_embedding.norm()
                class_embedding = class_embeddings
                zeroshot_weights.append(class_embedding)
            zeroshot_weights = torch.stack(zeroshot_weights, dim=2).cuda()
        return zeroshot_weights

    zeroshot_weights = zeroshot_classifier(class_names, templates)

    print(zeroshot_weights.shape)
    # Perform zero-shot prediction
    def accuracy(output, target, topk=(1,)):
        pred = output.topk(max(topk), 1, True, True)[1].t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        return [float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) for k in topk]

    with torch.no_grad():
        top1, top3, n = 0., 0., 0.
        for images, target, _ in tqdm(val_loader):
            images = images.cuda()
            target = target.cuda()

            # Predict
            image_features = model.encode_image(images)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            logits = image_features @ zeroshot_weights
            logits = logits.squeeze(0)  # Removing the first singleton dimension

            # Measure accuracy
            acc1, acc3 = accuracy(logits, target, topk=(1, 3))
            top1 += acc1
            top3 += acc3
            n += images.size(0)

    # Calculate and print the top-1 and top-5 accuracy
    top1 = (top1 / n) * 100
    top3 = (top3 / n) * 100
    print(f"Top-1 accuracy_{model_}: {top1:.2f}%")
    print(f"Top-3 accuracy_{model_}: {top3:.2f}%")

Running zero-shot on ViT-L/14:


Subjects: 100%|██████████| 14/14 [00:00<00:00, 337.73it/s]


Number of samples in the dataset: 14336


100%|██████████| 10/10 [00:00<00:00, 14.35it/s]


torch.Size([1, 768, 10])


  return [float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) for k in topk]
100%|██████████| 448/448 [02:30<00:00,  2.97it/s]


Top-1 accuracy_ViT-L/14: 62.51%
Top-3 accuracy_ViT-L/14: 84.52%
Running zero-shot on ViT-L/14@336px:


100%|███████████████████████████████████████| 891M/891M [00:18<00:00, 49.2MiB/s]
Subjects: 100%|██████████| 14/14 [00:00<00:00, 310.49it/s]


Number of samples in the dataset: 14336


100%|██████████| 10/10 [00:00<00:00, 59.80it/s]


torch.Size([1, 768, 10])


100%|██████████| 448/448 [03:06<00:00,  2.40it/s]


Top-1 accuracy_ViT-L/14@336px: 66.52%
Top-3 accuracy_ViT-L/14@336px: 86.33%
Running zero-shot on ViT-B/16:


Subjects: 100%|██████████| 14/14 [00:00<00:00, 279.65it/s]


Number of samples in the dataset: 14336


100%|██████████| 10/10 [00:00<00:00, 60.66it/s]


torch.Size([1, 512, 10])


100%|██████████| 448/448 [02:37<00:00,  2.84it/s]


Top-1 accuracy_ViT-B/16: 18.67%
Top-3 accuracy_ViT-B/16: 79.18%
Running zero-shot on ViT-B/32:


Subjects: 100%|██████████| 14/14 [00:00<00:00, 267.94it/s]


Number of samples in the dataset: 14336


100%|██████████| 10/10 [00:00<00:00, 57.90it/s]


torch.Size([1, 512, 10])


100%|██████████| 448/448 [02:44<00:00,  2.72it/s]

Top-1 accuracy_ViT-B/32: 10.48%
Top-3 accuracy_ViT-B/32: 40.09%



