# Fine-tuning CLIP

The following notebook shows two different takes on fine-tuning CLIP:

1. Fine-tuning CLIP's visual backbone on the RefCOCOg dataset (images)
2. Fine-tuning CLIP with contrastive learning on the RefCOCOg dataset (text + images)

## Preliminary steps

In [None]:
#@title Import necessary packages and set correct device

import clip
import numpy as np
import torch
from PIL import Image
from torch.utils.data import random_split
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

from modules.refcocog import RefCOCOg, RefCOCOgSample

%matplotlib inline

# Pick the compute backend: CUDA first, then Apple's MPS, then CPU.
# `torch.has_mps` is deprecated and only says whether PyTorch was *built* with
# MPS; `torch.backends.mps.is_available()` is the supported runtime check.
# `getattr` keeps this cell working on PyTorch builds without the MPS backend.
_mps_backend = getattr(torch.backends, "mps", None)

if torch.cuda.is_available():
    device = torch.device("cuda")  # CUDA GPU
    print("[INFO] Using GPU.")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")  # Apple Silicon GPU
    print("[INFO] Using MPS.")
else:
    device = torch.device("cpu")
    print("[INFO] No GPU found, using CPU instead.")


In [None]:
# Lookup table from a readable optimizer name to its torch.optim class.
# The `optimizer` hyperparameter below is a key of this table and is resolved
# by `get_optimizer`.
OPTIMIZERS_TO_TRY = {
    "SGD": torch.optim.SGD,
    "RMSProp": torch.optim.RMSprop,
    "Adam": torch.optim.Adam,
    "Adamax": torch.optim.Adamax,
    "Adadelta": torch.optim.Adadelta,
    # TODO: add more
}

# HYPERPARAMETERS
# These module-level values are bound as default arguments of the training
# loops defined further down, so they must be set before those cells are run.

batch_size = 128  # 256 causes out of memory with 24GB of GPU ram
learning_rate = 0.001
# only reaches optimizers whose constructor accepts it (see get_optimizer)
momentum = 0.9
epochs = 10
# key into OPTIMIZERS_TO_TRY
optimizer = "Adam"


In [None]:
#@title Import CLIP model and show its info

# Load the "RN50" CLIP checkpoint on the selected device. `clip.load` returns
# the model and the image preprocessing callable, which is applied to PIL
# images before they are fed to the model.
clip_model, clip_prep = clip.load("RN50", device=device)

# Total parameter count, followed by the count restricted to parameters that
# have `requires_grad` set.
print("[INFO] Model params: {:,}".format(np.sum([int(np.prod(p.shape)) for p in clip_model.parameters()])))
print("[INFO] Trainable params: {:,}".format(sum(p.numel() for p in clip_model.parameters() if p.requires_grad)))
# Attributes exposed by the loaded CLIP model.
print("[INFO] Input resolution: ", clip_model.visual.input_resolution)
print("[INFO] Max prompt length:", clip_model.context_length)
print("[INFO] Vocab size:", clip_model.vocab_size)


Note that, solely for debugging purposes on local machines, we are discarding most of the dataset and using only a "toy" portion of it.

In [None]:
#@title Import RefCOCOg dataset and its train/val/test splits

# data_path = "/media/dmmp/vid+backup/Data/refcocog"
data_path = "dataset/refcocog"

# Fixed seed so the toy subsets are identical on every "Restart & Run All".
SPLIT_SEED = 42


def keep_fraction(ds, fraction, generator):
    """Return a random subset of ``ds`` with ``int(fraction * len(ds))`` samples.

    Args:
        ds: any sized, indexable dataset.
        fraction: share of the samples to keep, in ``[0, 1]``.
        generator: ``torch.Generator`` driving the random selection.

    Returns:
        The kept part of ``random_split`` (a ``torch.utils.data.Subset``); the
        discarded remainder is dropped.
    """
    n_keep = int(fraction * len(ds))
    kept, _ = random_split(ds, [n_keep, len(ds) - n_keep], generator=generator)
    return kept


dataset = RefCOCOg(ds_path=data_path)

train_ds = RefCOCOg(ds_path=data_path, split='train')
val_ds = RefCOCOg(ds_path=data_path, split='val')
test_ds = RefCOCOg(ds_path=data_path, split='test')

# keep only a toy portion of each split
keep = 0.1
split_rng = torch.Generator().manual_seed(SPLIT_SEED)
dataset = keep_fraction(dataset, keep, split_rng)
train_ds = keep_fraction(train_ds, keep, split_rng)
val_ds = keep_fraction(val_ds, keep, split_rng)
test_ds = keep_fraction(test_ds, keep, split_rng)

print(f"Dataset Size: {len(dataset)}\n")
print(f"Train size: {len(train_ds)}")
print(f"Val size:   {len(val_ds)}")
print(f"Test size:  {len(test_ds)}")


In [None]:
#@title Import RefCOCOg dataset and its train/val/test splits

# modify
# NOTE(review): this cell repeats the previous one with a machine-specific
# absolute path. When both cells are run, the values assigned here replace the
# ones from the previous cell (dataset, train_ds, val_ds, test_ds, keep).
# Consider keeping a single cell with a configurable path.
dataset_path = "/media/dmmp/vid+backup/Data/refcocog"

dataset = RefCOCOg(ds_path=dataset_path)

train_ds = RefCOCOg(ds_path=dataset_path, split='train')
val_ds = RefCOCOg(ds_path=dataset_path, split='val')
test_ds = RefCOCOg(ds_path=dataset_path, split='test')

# Keep a random 10% of every dataset; the discarded remainder of each
# random_split is thrown away. No generator/seed is passed, so the selection
# changes from run to run.
keep = 0.1
dataset, _ = random_split(dataset, [int(keep * len(dataset)), len(dataset) - int(keep * len(dataset))])
train_ds, _ = random_split(train_ds, [int(keep * len(train_ds)), len(train_ds) - int(keep * len(train_ds))])
val_ds, _ = random_split(val_ds, [int(keep * len(val_ds)), len(val_ds) - int(keep * len(val_ds))])
test_ds, _ = random_split(test_ds, [int(keep * len(test_ds)), len(test_ds) - int(keep * len(test_ds))])

print(f"Dataset Size: {len(dataset)}\n")
print(f"Train size: {len(train_ds)}")
print(f"Val size:   {len(val_ds)}")
print(f"Test size:  {len(test_ds)}")


## Image-text similarity

In [None]:
#@title An example of computing images-prompts similarity

### Useless for the actual training loop, but was in Alessandro's code.
### But functions below might be helpful later ¯\(ツ)/¯

def get_data(dataset):
    """Flatten a dataset into aligned lists of image paths and captions.

    Each raw item is wrapped in a ``RefCOCOgSample``; its ``path`` is emitted
    once for every entry of its ``sentences`` so that position ``i`` of both
    returned lists refers to the same (image, caption) pair.

    Returns:
        ``(images, texts)``: list of image paths, list of captions.
    """
    wrapped = (
        RefCOCOgSample(**raw)
        for raw in tqdm(dataset, desc="[INFO] Loading images and captions")
    )

    # one (path, caption) pair per sentence, produced in dataset order
    pairs = [(item.path, caption) for item in wrapped for caption in item.sentences]

    image_paths = [path for path, _ in pairs]
    captions = [caption for _, caption in pairs]

    return image_paths, captions


def encode_data(images_fp: list[str], texts: list[str]):
    """Encode image files and captions with the module-level CLIP model.

    Args:
        images_fp: paths of the images to encode.
        texts: captions; each one is prefixed with "This is " before tokenizing.

    Returns:
        ``(images_z, texts_z)``: float32 embeddings produced by
        ``clip_model.encode_image`` and ``clip_model.encode_text``, computed
        without gradient tracking.
    """
    # preprocess the images to transform from filenames to images to tensors;
    # the context manager closes each file once its tensor has been produced
    preprocessed = []
    for image_fp in tqdm(images_fp, desc="[INFO] Preprocessing images"):
        with Image.open(image_fp) as image:
            preprocessed.append(clip_prep(image))

    # stack directly in torch instead of round-tripping through NumPy
    images = torch.stack(preprocessed).to(device)

    # preprocess the texts to transform from text to tensors
    prompts = ["This is " + desc for desc in tqdm(texts, desc="[INFO] Preprocessing texts")]
    text_tokens = clip.tokenize(prompts).to(device)

    # encode the inputs
    with torch.no_grad():
        print("[INFO] Encoding images...")
        images_z = clip_model.encode_image(images).float()
        print("[INFO] Encoding texts...")
        texts_z = clip_model.encode_text(text_tokens).float()

    return images_z, texts_z


def cosine_similarity(images_z: torch.Tensor, texts_z: torch.Tensor):
    """Cosine similarity between every text embedding and every image embedding.

    Args:
        images_z: image embeddings, one per row.
        texts_z: text embeddings, one per row, same width as ``images_z``.

    Returns:
        CPU tensor of shape ``(len(texts_z), len(images_z))`` where entry
        ``[t, i]`` is the cosine similarity of text ``t`` and image ``i``.

    The inputs are left untouched: normalization is done out of place, so the
    caller's embeddings are not silently rescaled.
    """
    # normalise the image and the text (out of place, to avoid mutating arguments)
    images_unit = images_z / images_z.norm(dim=-1, keepdim=True)
    texts_unit = texts_z / texts_z.norm(dim=-1, keepdim=True)

    # evaluate the cosine similarity between the sets of features
    similarity = texts_unit @ images_unit.T

    return similarity.cpu()

# images_fp, texts = get_data(test_ds)

# images_z, texts_z = encode_data(images_fp, texts)

# similarity = cosine_similarity(images_z, texts_z)

# print(similarity)


# Fine-tune by classifying on RefCOCOg classes

In the following approach we're gonna:
- create a custom CLIP architecture with an additional trainable layer
- implement training and testing logics
- train the newly created CLIP model on the images from the whole dataset

This should bring some **benefits**:
- CLIP should become *better* at extracting features from our RefCOCOg images
- [add others, if any]

and some **drawbacks** too:
- The text encoder is left out of this model, so the pretrained text–image alignment is basically lost
- [add others, if any]

In [None]:
#@title Custom CLIP architecture featuring an additional fc layer

class CustomCLIP(torch.nn.Module):
    """CLIP visual encoder followed by a linear classification head.

    Args:
        num_classes: number of output logits of the classification head.
        backbone: name of the CLIP checkpoint handed to ``clip.load``.
    """

    def __init__(self, num_classes: int = 10, backbone: str = "RN50"):
        super().__init__()
        model, _ = clip.load(backbone)

        # take the visual encoder of CLIP
        # we also convert it to be 32 bit (by default CLIP is 16)
        self.encoder = model.visual.float()

        # width of the encoder output; read from the encoder so that other
        # backbones work, falling back to the value used for RN50
        feature_dim = getattr(self.encoder, "output_dim", 1024)

        # add a linear layer
        self.classifier = torch.nn.Linear(feature_dim, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class logits for a batch of preprocessed images."""
        x = self.encoder(x)
        x = self.classifier(x)

        return x


In [None]:
#@title Training and test logics

def get_optimizer(model, lr, wd, momentum, optimizer):
    """Build the optimizer for the classification head of ``model``.

    Args:
        model: module exposing a ``classifier`` sub-module; only its
            parameters are handed to the optimizer.
        lr: learning rate.
        wd: weight decay.
        momentum: momentum, used only when the chosen optimizer accepts it.
        optimizer: key of ``OPTIMIZERS_TO_TRY``.

    Returns:
        The instantiated ``torch.optim`` optimizer.

    Raises:
        KeyError: if ``optimizer`` is not a key of ``OPTIMIZERS_TO_TRY``.
    """
    import inspect

    optimizer_cls = OPTIMIZERS_TO_TRY[optimizer]

    # Decide up front whether `momentum` is supported, instead of calling the
    # constructor and retrying on TypeError (which would also hide unrelated
    # TypeErrors, e.g. one caused by a bad `lr` or `wd` value).
    kwargs = {'lr': lr, 'weight_decay': wd}
    if 'momentum' in inspect.signature(optimizer_cls).parameters:
        kwargs['momentum'] = momentum

    return optimizer_cls([
        {'params': model.classifier.parameters(), 'lr': lr}
    ], **kwargs)


def get_cost_function():
    """Return a fresh ``torch.nn.CrossEntropyLoss`` with default settings."""
    return torch.nn.CrossEntropyLoss()


def training_step(net, data_loader, optimizer, cost_function, device=device):
    """Run one training epoch of the classifier and report loss and accuracy.

    Args:
        net: model mapping a batch of preprocessed images to class logits.
        data_loader: yields batches as lists of raw sample dicts (each one is
            unpacked into a ``RefCOCOgSample``).
        optimizer: optimizer updating ``net``.
        cost_function: loss taking ``(logits, targets)``. It is assumed to
            return the batch *mean* (the default of ``CrossEntropyLoss``).
        device: device the batch tensors are moved to.

    Returns:
        ``(mean loss per sample, accuracy in percent)`` over the whole epoch.
    """
    n_samples = 0
    cumulative_loss = 0.0
    cumulative_accuracy = 0.0

    # set the network to training mode
    net.train()

    # iterate over the training set
    for batch in tqdm(data_loader, desc="[INFO] Training step"):

        inputs, targets = list(), list()

        for sample in batch:
            sample = RefCOCOgSample(**sample)

            inputs.append(clip_prep(sample.img))
            targets.append(sample.category_id - 1)  # so that category_ids will start from #0

        inputs = torch.stack(inputs).to(device)
        targets = torch.tensor(targets).to(device)

        # start from clean gradients, whatever state the optimizer was left in
        optimizer.zero_grad()

        # forward pass
        outputs = net(inputs)

        # loss computation
        loss = cost_function(outputs, targets)

        # backward pass
        loss.backward()

        # parameters update
        optimizer.step()

        # fetch prediction and loss value
        n_batch = inputs.shape[0]
        n_samples += n_batch
        # `loss` is a batch mean: weight it by the batch size so that dividing
        # by n_samples below yields a true per-sample mean (the last batch may
        # be smaller than the others)
        cumulative_loss += loss.item() * n_batch
        _, predicted = outputs.max(dim=1)  # max() returns (maximum_value, index_of_maximum_value)

        # compute training accuracy
        cumulative_accuracy += predicted.eq(targets).sum().item()

    return cumulative_loss / n_samples, cumulative_accuracy / n_samples * 100


def test_step(net, data_loader, cost_function, device=device):
    """Evaluate the classifier on a data loader without updating it.

    Args:
        net: model mapping a batch of preprocessed images to class logits.
        data_loader: yields batches as lists of raw sample dicts (each one is
            unpacked into a ``RefCOCOgSample``).
        cost_function: loss taking ``(logits, targets)``. It is assumed to
            return the batch *mean* (the default of ``CrossEntropyLoss``).
        device: device the batch tensors are moved to.

    Returns:
        ``(mean loss per sample, accuracy in percent)`` over the whole loader.
    """
    n_samples = 0
    cumulative_loss = 0.0
    cumulative_accuracy = 0.0

    # set the network to evaluation mode
    net.eval()

    # disable gradient computation (we are only testing, we do not want our model to be modified in this step!)
    with torch.no_grad():
        # iterate over the test set
        for samples in tqdm(data_loader, desc="[INFO] Test step"):

            inputs, targets = list(), list()

            for sample in samples:
                sample = RefCOCOgSample(**sample)

                inputs.append(clip_prep(sample.img))
                targets.append(sample.category_id - 1)  # so that category_ids will start from #0

            # load data into GPU
            inputs = torch.stack(inputs).to(device)
            targets = torch.tensor(targets).to(device)

            # forward pass
            outputs = net(inputs)

            # loss computation
            loss = cost_function(outputs, targets)

            # fetch prediction and loss value
            n_batch = inputs.shape[0]
            n_samples += n_batch
            # `loss` is a batch mean: weight it by the batch size so that the
            # final division by n_samples yields a true per-sample mean
            cumulative_loss += loss.item() * n_batch  # Note: the .item() is needed to extract scalars from tensors
            _, predicted = outputs.max(1)

            # compute accuracy
            cumulative_accuracy += predicted.eq(targets).sum().item()

    return cumulative_loss / n_samples, cumulative_accuracy / n_samples * 100


In [None]:
#@title Main training loop logic

# tensorboard logging utilities
def log_values(writer, step, loss, accuracy, prefix):
    """Write ``loss`` and ``accuracy`` to ``writer`` as ``<prefix>/loss`` and
    ``<prefix>/accuracy`` at the given ``step``."""
    for metric_name, metric_value in (("loss", loss), ("accuracy", accuracy)):
        writer.add_scalar(f"{prefix}/{metric_name}", metric_value, step)


def training_loop(train_ds,
                  val_ds,
                  test_ds,
                  batch_size=batch_size,
                  num_classes=90,  # 90 classes in RefCOCOg
                  device=device,
                  learning_rate=learning_rate,
                  weight_decay=0.000001,
                  momentum=momentum,
                  epochs=epochs,
                  optimizer=optimizer,
                  log_dir="runs/exp1"):
    """Fine-tune a ``CustomCLIP`` classifier and log metrics to TensorBoard.

    The model is evaluated on all three splits before training, trained for
    ``epochs`` epochs (with validation after each one), and evaluated on all
    three splits again at the end.

    Args:
        train_ds, val_ds, test_ds: datasets yielding raw sample dicts.
        batch_size: batch size of the three data loaders.
        num_classes: size of the classification head.
        device: device used for the model and for the batches.
        learning_rate, weight_decay, momentum: optimizer settings.
        epochs: number of training epochs.
        optimizer: key of ``OPTIMIZERS_TO_TRY``.
        log_dir: TensorBoard log directory.
    """
    # create a logger for the experiment
    writer = SummaryWriter(log_dir=log_dir)

    try:
        # batches are left as plain lists of sample dicts (identity collate);
        # the step functions turn them into tensors
        train_loader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=lambda x: x)
        val_loader = torch.utils.data.DataLoader(val_ds, batch_size=batch_size, shuffle=True, collate_fn=lambda x: x)
        test_loader = torch.utils.data.DataLoader(test_ds, batch_size=batch_size, shuffle=True, collate_fn=lambda x: x)

        # instantiate the network and move it to the chosen device (GPU)
        net = CustomCLIP(num_classes=num_classes).to(device)

        # instantiate the optimizer
        optimizer = get_optimizer(net, learning_rate, weight_decay, momentum, optimizer)

        # define the cost function
        cost_function = get_cost_function()

        # computes evaluation results before training
        # (`device` is forwarded so that batches land where the model lives)
        print('Before training:')
        train_loss, train_accuracy = test_step(net, train_loader, cost_function, device=device)
        val_loss, val_accuracy = test_step(net, val_loader, cost_function, device=device)
        test_loss, test_accuracy = test_step(net, test_loader, cost_function, device=device)

        # log to TensorBoard
        log_values(writer, -1, train_loss, train_accuracy, "train")
        log_values(writer, -1, val_loss, val_accuracy, "validation")
        log_values(writer, -1, test_loss, test_accuracy, "test")

        print('\tTraining loss {:.5f}, Training accuracy {:.2f}'.format(train_loss, train_accuracy))
        print('\tValidation loss {:.5f}, Validation accuracy {:.2f}'.format(val_loss, val_accuracy))
        print('\tTest loss {:.5f}, Test accuracy {:.2f}'.format(test_loss, test_accuracy))
        print('-----------------------------------------------------')

        # for each epoch, train the network and then compute evaluation results
        for e in range(epochs):
            train_loss, train_accuracy = training_step(net, train_loader, optimizer, cost_function, device=device)
            val_loss, val_accuracy = test_step(net, val_loader, cost_function, device=device)

            # logs to TensorBoard; same lowercase tag as the before/after
            # evaluations so that all validation points end up on one curve
            log_values(writer, e, val_loss, val_accuracy, "validation")

            print('Epoch: {:d}'.format(e + 1))
            print('\tTraining loss {:.5f}, Training accuracy {:.2f}'.format(train_loss, train_accuracy))
            print('\tValidation loss {:.5f}, Validation accuracy {:.2f}'.format(val_loss, val_accuracy))
            print('-----------------------------------------------------')

        # compute final evaluation results
        print('After training:')
        train_loss, train_accuracy = test_step(net, train_loader, cost_function, device=device)
        val_loss, val_accuracy = test_step(net, val_loader, cost_function, device=device)
        test_loss, test_accuracy = test_step(net, test_loader, cost_function, device=device)

        # log to TensorBoard
        log_values(writer, epochs, train_loss, train_accuracy, "train")
        log_values(writer, epochs, val_loss, val_accuracy, "validation")
        log_values(writer, epochs, test_loss, test_accuracy, "test")

        print('\tTraining loss {:.5f}, Training accuracy {:.2f}'.format(train_loss, train_accuracy))
        print('\tValidation loss {:.5f}, Validation accuracy {:.2f}'.format(val_loss, val_accuracy))
        print('\tTest loss {:.5f}, Test accuracy {:.2f}'.format(test_loss, test_accuracy))
        print('-----------------------------------------------------')
    finally:
        # closes the logger, also when training is interrupted or fails
        writer.close()


In [None]:
#@title Execute main training loop

# Run the classification fine-tuning with the default hyperparameters, which
# were bound when `training_loop` was defined.
training_loop(train_ds, val_ds, test_ds)


---

 # Fine-tune by contrastive learning on objects+texts

In this other approach we're gonna:
- import the original CLIP, without adding other layers on top
- implement the contrastive loss logic and adapt it into the training and test step defined earlier
- train the model on all images - caption pairs in the dataset, using the contrastive loss

Note that when speaking of images, we do not mean the whole sample images, but the images cropped at the ground-truth bounding box. In other words, we are training CLIP to maximize the similarity between the embedding of the image of each object referred to by the dataset and the embeddings of its corresponding caption(s).

This should bring some **benefits**:
- CLIP should become *better* at extracting embeddings for the images and captions of RefCOCOg.
- [add others, if any]

and some **drawbacks** too:
- as before, CLIP zero-shot capabilities would be basically lost
- [add others, if any]

In [None]:
#@title Import CLIP model and show its info

# Load the "RN50" CLIP checkpoint again on the selected device, rebinding
# `clip_model` and `clip_prep`; this model is the one fine-tuned by the
# contrastive loop below.
clip_model, clip_prep = clip.load("RN50", device=device)

# Total parameter count, followed by the count restricted to parameters that
# have `requires_grad` set.
print("[INFO] Model params: {:,}".format(np.sum([int(np.prod(p.shape)) for p in clip_model.parameters()])))
print("[INFO] Trainable params: {:,}".format(sum(p.numel() for p in clip_model.parameters() if p.requires_grad)))
# Attributes exposed by the loaded CLIP model.
print("[INFO] Input resolution: ", clip_model.visual.input_resolution)
print("[INFO] Max prompt length:", clip_model.context_length)
print("[INFO] Vocab size:", clip_model.vocab_size)


In [None]:
#@title Contrastive loss function definition

def contrastive_loss(image_logits, text_logits, cost_function):
    """Symmetric contrastive loss over a batch of image/text logits.

    Row ``i`` of each logits matrix is treated as a classification problem
    whose correct class is ``i`` (the matching pair sits on the diagonal).

    Args:
        image_logits: square matrix of image-to-text similarity logits.
        text_logits: square matrix of text-to-image similarity logits.
        cost_function: classification loss taking ``(logits, labels)``.

    Returns:
        The average of the image-side and the text-side loss.
    """
    # Build the diagonal labels directly in torch, on the logits' own device.
    # An explicit int64 dtype avoids the platform-dependent default of
    # np.arange (int32 on Windows), which CrossEntropyLoss does not accept.
    labels = torch.arange(image_logits.shape[0], dtype=torch.long, device=image_logits.device)

    loss_i = cost_function(image_logits, labels)
    loss_t = cost_function(text_logits, labels)

    return (loss_i + loss_t) / 2.0


def get_optimizer(model, lr, wd, momentum, optimizer):
    """Build the optimizer for the trainable part of ``model``.

    This definition replaces the ``get_optimizer`` from the classification
    section once this cell has run, so it supports both kinds of model:

    - a model with a ``classifier`` sub-module (``CustomCLIP``): only the
      classifier is optimized;
    - otherwise (plain CLIP): only ``model.visual.layer4`` is optimized.

    Args:
        model: the model to optimize.
        lr: learning rate.
        wd: weight decay.
        momentum: momentum, used only when the chosen optimizer accepts it.
        optimizer: key of ``OPTIMIZERS_TO_TRY``.

    Returns:
        The instantiated ``torch.optim`` optimizer.

    Raises:
        KeyError: if ``optimizer`` is not a key of ``OPTIMIZERS_TO_TRY``.
    """
    import inspect

    if hasattr(model, "classifier"):
        trainable = model.classifier
    else:
        trainable = model.visual.layer4

    optimizer_cls = OPTIMIZERS_TO_TRY[optimizer]

    # Decide up front whether `momentum` is supported, instead of calling the
    # constructor and retrying on TypeError (which would also hide unrelated
    # TypeErrors).
    kwargs = {'lr': lr, 'weight_decay': wd}
    if 'momentum' in inspect.signature(optimizer_cls).parameters:
        kwargs['momentum'] = momentum

    return optimizer_cls([
        {'params': trainable.parameters(), 'lr': lr}
    ], **kwargs)


In [None]:
#@title Slight modifications to the aforementioned logics


def training_step_cl(net, data_loader, optimizer, cost_function, device=device):
    """Run one contrastive training epoch and return the mean loss per pair.

    Every sample contributes one (cropped image, sentence) pair per sentence.

    Args:
        net: model called as ``net(images, texts)`` and returning
            ``(image_logits, text_logits)``.
        data_loader: yields batches as lists of raw sample dicts (each one is
            unpacked into a ``RefCOCOgSample``).
        optimizer: optimizer updating ``net``.
        cost_function: classification loss handed to ``contrastive_loss``. It
            is assumed to return a batch *mean*.
        device: device the batch tensors are moved to.

    Returns:
        Loss averaged over all (image, sentence) pairs of the epoch.
    """
    n_samples = 0
    cumulative_loss = 0.0

    # set the network to training mode
    net.train()

    for batch in tqdm(data_loader, desc="[INFO] Training step"):

        images, texts = list(), list()

        for sample in batch:
            sample = RefCOCOgSample(**sample)

            sentences = list(sample.sentences)
            if not sentences:
                continue

            # crop and preprocess once per sample: the result does not depend
            # on the sentence, so it is shared by all of the sample's pairs
            # NOTE(review): PIL's crop expects (left, upper, right, lower);
            # confirm that `sample.bbox` is in that format and not (x, y, w, h)
            prep_img = clip_prep(sample.img.crop(sample.bbox))

            images.extend([prep_img] * len(sentences))
            texts.extend(sentences)

        texts = clip.tokenize(texts).to(device)
        images = torch.stack(images).to(device)

        # start from clean gradients, whatever state the optimizer was left in
        optimizer.zero_grad()

        # forward pass
        image_logits, text_logits = net(images, texts)

        # loss computation
        loss = contrastive_loss(image_logits, text_logits, cost_function)

        # backward pass
        loss.backward()

        # parameters update
        optimizer.step()

        # fetch loss value
        n_pairs = images.shape[0]
        n_samples += n_pairs
        # `loss` is a batch mean: weight it by the number of pairs so that the
        # final division yields a true per-pair mean
        cumulative_loss += loss.item() * n_pairs

    return cumulative_loss / n_samples


def test_step_cl(net, data_loader, cost_function, device=device):
    """Evaluate the contrastive loss on a data loader without updating ``net``.

    Every sample contributes one (cropped image, sentence) pair per sentence.

    Args:
        net: model called as ``net(images, texts)`` and returning
            ``(image_logits, text_logits)``.
        data_loader: yields batches as lists of raw sample dicts (each one is
            unpacked into a ``RefCOCOgSample``).
        cost_function: classification loss handed to ``contrastive_loss``. It
            is assumed to return a batch *mean*.
        device: device the batch tensors are moved to.

    Returns:
        Loss averaged over all (image, sentence) pairs of the loader.
    """
    n_samples = 0
    cumulative_loss = 0.0

    # set the network to evaluation mode
    net.eval()

    with torch.no_grad():

        for batch in tqdm(data_loader, desc="[INFO] Test step"):

            images, texts = list(), list()

            for sample in batch:
                sample = RefCOCOgSample(**sample)

                sentences = list(sample.sentences)
                if not sentences:
                    continue

                # crop and preprocess once per sample: the result does not
                # depend on the sentence
                # NOTE(review): PIL's crop expects (left, upper, right, lower);
                # confirm that `sample.bbox` is in that format
                prep_img = clip_prep(sample.img.crop(sample.bbox))

                images.extend([prep_img] * len(sentences))
                texts.extend(sentences)

            texts = clip.tokenize(texts).to(device)
            images = torch.stack(images).to(device)

            # forward pass
            image_logits, text_logits = net(images, texts)

            # loss computation
            loss = contrastive_loss(image_logits, text_logits, cost_function)

            # fetch loss value
            n_pairs = images.shape[0]
            n_samples += n_pairs
            # `loss` is a batch mean: weight it by the number of pairs so that
            # the final division yields a true per-pair mean
            cumulative_loss += loss.item() * n_pairs

    return cumulative_loss / n_samples


def log_values_cl(writer, step, loss, prefix):
    """Write ``loss`` to ``writer`` under the tag ``<prefix>/loss`` at ``step``."""
    tag = f"{prefix}/loss"
    writer.add_scalar(tag, loss, step)


def main_loop_cl(train_ds,
                 val_ds,
                 test_ds,
                 batch_size=batch_size,
                 num_classes=90,  # 90 classes in RefCOCOg
                 device=device,
                 learning_rate=learning_rate,
                 weight_decay=0.000001,
                 momentum=momentum,
                 epochs=epochs,
                 optimizer=optimizer,
                 log_dir="runs/exp2"):
    """Fine-tune the module-level ``clip_model`` with the contrastive loss.

    The model is evaluated on all three splits before training, trained for
    ``epochs`` epochs (with validation after each one), and evaluated on all
    three splits again at the end. Losses are logged to TensorBoard.

    Args:
        train_ds, val_ds, test_ds: datasets yielding raw sample dicts.
        batch_size: batch size of the three data loaders.
        num_classes: unused here; kept so the signature mirrors
            ``training_loop``.
        device: device used for the model and for the batches.
        learning_rate, weight_decay, momentum: optimizer settings.
        epochs: number of training epochs.
        optimizer: key of ``OPTIMIZERS_TO_TRY``.
        log_dir: TensorBoard log directory. It defaults to a directory
            different from the classification experiment ("runs/exp1") so the
            two runs do not write identical tags into the same run.
    """
    # create a logger for the experiment
    writer = SummaryWriter(log_dir=log_dir)

    try:
        # batches are left as plain lists of sample dicts (identity collate);
        # the step functions turn them into tensors
        train_loader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=lambda x: x)
        val_loader = torch.utils.data.DataLoader(val_ds, batch_size=batch_size, shuffle=True, collate_fn=lambda x: x)
        test_loader = torch.utils.data.DataLoader(test_ds, batch_size=batch_size, shuffle=True, collate_fn=lambda x: x)

        # instantiate the network and move it to the chosen device (GPU).
        # On CUDA, clip.load returns half-precision weights; optimizing those
        # directly (notably with Adam-style optimizers) is numerically
        # unstable, so the model is converted to 32 bit, as CustomCLIP does
        # for its encoder. Note that this modifies the global `clip_model`.
        net = clip_model.to(device).float()

        # instantiate the optimizer
        optimizer = get_optimizer(net, learning_rate, weight_decay, momentum, optimizer)

        # define the cost function
        cost_function = get_cost_function()

        # computes evaluation results before training
        # (the training loss is measured on the training loader)
        print('Before training:')
        train_loss = test_step_cl(net, train_loader, cost_function, device=device)
        val_loss = test_step_cl(net, val_loader, cost_function, device=device)
        test_loss = test_step_cl(net, test_loader, cost_function, device=device)

        # log to TensorBoard
        log_values_cl(writer, -1, train_loss, "train")
        log_values_cl(writer, -1, val_loss, "validation")
        log_values_cl(writer, -1, test_loss, "test")

        print('\tTraining loss {:.5f}'.format(train_loss))
        print('\tValidation loss {:.5f}'.format(val_loss))
        print('\tTest loss {:.5f}'.format(test_loss))
        print('-----------------------------------------------------')

        # for each epoch, train the network and then compute evaluation results
        for e in range(epochs):
            train_loss = training_step_cl(net, train_loader, optimizer, cost_function, device=device)
            val_loss = test_step_cl(net, val_loader, cost_function, device=device)

            # logs to TensorBoard; same lowercase tag as the before/after
            # evaluations so that all validation points end up on one curve
            log_values_cl(writer, e, val_loss, "validation")

            print('Epoch: {:d}'.format(e + 1))
            print('\tTraining loss {:.5f}'.format(train_loss))
            print('\tValidation loss {:.5f}'.format(val_loss))
            print('-----------------------------------------------------')

        # compute final evaluation results
        print('After training:')
        train_loss = test_step_cl(net, train_loader, cost_function, device=device)
        val_loss = test_step_cl(net, val_loader, cost_function, device=device)
        test_loss = test_step_cl(net, test_loader, cost_function, device=device)

        # log to TensorBoard
        log_values_cl(writer, epochs, train_loss, "train")
        log_values_cl(writer, epochs, val_loss, "validation")
        log_values_cl(writer, epochs, test_loss, "test")

        print('\tTraining loss {:.5f}'.format(train_loss))
        print('\tValidation loss {:.5f}'.format(val_loss))
        print('\tTest loss {:.5f}'.format(test_loss))
        print('-----------------------------------------------------')
    finally:
        # closes the logger, also when training is interrupted or fails
        writer.close()


In [None]:
#@title Execute main training loop

# Run the contrastive fine-tuning with the default hyperparameters, which were
# bound when `main_loop_cl` was defined. It trains the global `clip_model`.
main_loop_cl(train_ds, val_ds, test_ds)


---

# Visual Grounding test

The following verifies the effect of these approaches on the main visual grounding task.

In [None]:
#@title Function definition to test visual grounding with a given pipeline

def visual_grounding_test(vg_pipeline, dataset):
    """Run a visual grounding pipeline over a dataset and print average metrics.

    The pipeline is called once per (sample, sentence) pair as
    ``vg_pipeline(sample, sentence, show=False)`` and is expected to return a
    dict of metric name -> value. Pairs on which it raises ``ValueError`` are
    skipped and counted.

    Args:
        vg_pipeline: callable implementing the grounding pipeline.
        dataset: sized iterable of raw sample dicts (each one is unpacked
            into a ``RefCOCOgSample``).
    """
    scores = list()
    skipped = 0

    for sample in tqdm(dataset, desc=f"Testing on {len(dataset)} images"):

        sample = RefCOCOgSample(**sample)

        for sentence in sample.sentences:

            try:
                sc = vg_pipeline(sample, sentence, show=False)
            except ValueError:
                # best effort: keep going, but do not hide how often it happens
                skipped += 1
                continue

            scores.append(sc)

    if skipped:
        print("[INFO] Skipped {} image-sentence pairs (ValueError in the pipeline).".format(skipped))

    # nothing scored (empty dataset, or every pair was skipped): there is no
    # metric to average, and `scores[0]` below would raise IndexError
    if not scores:
        print("[INFO] No scores collected, nothing to average.")
        return

    for metric in scores[0].keys():
        avg_metric = np.mean([score[metric] for score in scores])

        print("Avg. {}: {:.3f}".format(metric, avg_metric))


In [None]:
#@title Import the YoloClip pipeline and test it on the test dataset

# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so that all dependencies are visible in one place.
from modules.yoloclip import YoloClip

# `dataset` was rebound to the subset returned by `random_split` in the
# dataset cell, hence the extra `.dataset` hop to reach the underlying
# RefCOCOg object and its `categories`.
yoloclip = YoloClip(device=device, quiet=True, categories=dataset.dataset.categories)

# Evaluate the pipeline on the (toy) test split and print the average metrics.
visual_grounding_test(yoloclip, test_ds)
