In [1]:
import clip
import numpy as np
import torch
from PIL import Image
from torch.utils.data import random_split
from tqdm import tqdm

from refcocog import RefCOCOg, RefCOCOgSample

# Everything in this notebook runs on the CPU: the accelerator auto-detection
# that used to live here is disabled. To opt in to a GPU, replace the
# assignment below with something like
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# On Apple Silicon, check `torch.backends.mps.is_available()` (the older
# `torch.has_mps` attribute is deprecated) and use torch.device("mps").
device = torch.device("cpu")

# Load the "RN50" CLIP checkpoint onto `device`. The second returned value is
# the image preprocessing callable that the training/test steps apply to each
# cropped image.
clip_model, prep = clip.load("RN50", device=device)


In [2]:
def training_step(model, dataloader, optimizer, cost_function, device='cuda'):
    """Run one pass over ``dataloader``, updating ``model`` after every batch.

    Each batch is a list of dicts that are expanded into ``RefCOCOgSample``
    objects. For every sentence of a sample, the image region selected by
    ``sample.bbox`` is paired with that sentence. The model's image and text
    logits are scored with ``cost_function`` against the targets ``0..N-1``
    (the i-th image matches the i-th caption), and the two losses are averaged.

    Args:
        model: Callable returning ``(image_logits, text_logits)`` for a batch
            of preprocessed images and tokenized captions.
        dataloader: Iterable yielding lists of sample dicts.
        optimizer: Optimizer holding the parameters of ``model``.
        cost_function: Loss applied to each logits tensor and the targets.
        device: Device the image and caption tensors are moved to.

    Returns:
        A tuple ``(mean_loss, accuracy)``. ``mean_loss`` is the average of the
        per-batch losses as a Python float (``0.0`` when no batch was
        processed). ``accuracy`` is currently the constant placeholder ``42``.
    """
    model.train()

    total_loss = 0.0
    n_batches = 0

    for batch in tqdm(dataloader, desc="Performing training step"):

        images, captions = [], []

        for sample in batch:
            sample = RefCOCOgSample(**sample)

            sentences = list(sample.sentences)
            if not sentences:
                continue

            # Crop and preprocess once per sample (not once per caption) and
            # close the underlying file as soon as the tensor is built.
            with Image.open(sample.path) as image:
                region = prep(image.crop(sample.bbox))

            # NOTE(review): a sample with several sentences contributes the
            # same crop several times, while the diagonal targets below treat
            # those duplicates as mismatches - confirm this is intended.
            for caption in sentences:
                images.append(region)
                captions.append(caption)

        if not images:
            # Nothing to stack for this batch; torch.stack([]) would raise.
            continue

        captions = clip.tokenize(captions).to(device)
        images = torch.stack(images).to(device)

        # Start every batch from clean gradients.
        optimizer.zero_grad()

        image_logits, text_logits = model(images, captions)

        # Targets must live on the same device as the logits and be int64.
        labels = torch.arange(images.shape[0], device=image_logits.device)

        loss_i = cost_function(image_logits, labels)
        loss_t = cost_function(text_logits, labels)
        batch_loss = (loss_i + loss_t) / 2.0

        # Back-propagate and update per batch so that each batch's autograd
        # graph is released immediately instead of being kept until the end
        # of the pass.
        batch_loss.backward()
        optimizer.step()

        # Accumulate a plain float so no graph is retained across batches.
        total_loss += batch_loss.item()
        n_batches += 1

    mean_loss = total_loss / n_batches if n_batches else 0.0

    # TODO: 42 is a placeholder; no accuracy is computed yet.
    return mean_loss, 42


In [3]:
def test_step(model, dataloader, cost_function, device='cuda'):
    """Evaluate ``model`` on ``dataloader`` without updating its parameters.

    Each batch is a list of dicts that are expanded into ``RefCOCOgSample``
    objects. For every sentence of a sample, the image region selected by
    ``sample.bbox`` is paired with that sentence. The model's image and text
    logits are scored with ``cost_function`` against the targets ``0..N-1``
    (the i-th image matches the i-th caption), and the two losses are averaged.

    Args:
        model: Callable returning ``(image_logits, text_logits)`` for a batch
            of preprocessed images and tokenized captions.
        dataloader: Iterable yielding lists of sample dicts.
        cost_function: Loss applied to each logits tensor and the targets.
        device: Device the image and caption tensors are moved to.

    Returns:
        A tuple ``(mean_loss, accuracy)``. ``mean_loss`` is the average of the
        per-batch losses as a Python float (``0.0`` when no batch was
        processed). ``accuracy`` is currently the constant placeholder ``42``.
    """
    model.eval()

    total_loss = 0.0
    n_batches = 0

    with torch.no_grad():

        for batch in tqdm(dataloader, desc="Performing test step"):

            images, captions = [], []

            for sample in batch:
                sample = RefCOCOgSample(**sample)

                sentences = list(sample.sentences)
                if not sentences:
                    continue

                # Crop and preprocess once per sample (not once per caption)
                # and close the underlying file once the tensor is built.
                with Image.open(sample.path) as image:
                    region = prep(image.crop(sample.bbox))

                for caption in sentences:
                    images.append(region)
                    captions.append(caption)

            if not images:
                # Nothing to stack for this batch; torch.stack([]) would raise.
                continue

            captions = clip.tokenize(captions).to(device)
            images = torch.stack(images).to(device)

            image_logits, text_logits = model(images, captions)

            # Targets must live on the same device as the logits and be int64.
            labels = torch.arange(images.shape[0], device=image_logits.device)

            loss_i = cost_function(image_logits, labels)
            loss_t = cost_function(text_logits, labels)

            total_loss += ((loss_i + loss_t) / 2.0).item()
            n_batches += 1

    mean_loss = total_loss / n_batches if n_batches else 0.0

    # TODO: 42 is a placeholder; no accuracy is computed yet.
    return mean_loss, 42


In [4]:
# Seeded generator shared by both random_split calls below, so that the toy
# subset and the train/test partition are the same every time this cell runs.
SPLIT_SEED = 42
split_rng = torch.Generator().manual_seed(SPLIT_SEED)

dataset = RefCOCOg(ds_path='dataset/refcocog')

keep = 0.01          # fraction of the full dataset that is kept
train_split = 0.80   # fraction of the kept subset used for training

# keep only a toy portion of the dataset
n_keep = int(keep * len(dataset))
dataset, _ = random_split(dataset, [n_keep, len(dataset) - n_keep], generator=split_rng)

# split the kept subset into train and test parts
n_train = int(train_split * len(dataset))
train_ds, test_ds = random_split(dataset, [n_train, len(dataset) - n_train], generator=split_rng)

print(f"Dataset Size: {len(dataset)}\n---")
print(f"Train size {len(train_ds)}")
print(f"Test Size: {len(test_ds)}")


Dataset Size: 498
---
Train size 398
Test Size: 100


In [None]:
batch_size = 8
epochs = 3

# Both loaders shuffle and use an identity collate function, so every batch
# reaches the step functions as a plain list of samples.
loader_options = dict(batch_size=batch_size, shuffle=True, collate_fn=lambda x: x)
train_dl = torch.utils.data.DataLoader(train_ds, **loader_options)
test_dl = torch.utils.data.DataLoader(test_ds, **loader_options)

optimizer = torch.optim.Adam(clip_model.parameters(), lr=0.0001, weight_decay=0.000001)

cost_function = torch.nn.CrossEntropyLoss()


def evaluate_splits():
    """Run test_step on the train loader, then the test loader.

    Returns (train_loss, train_accuracy, test_loss, test_accuracy).
    """
    on_train = test_step(clip_model, train_dl, cost_function, device=device)
    on_test = test_step(clip_model, test_dl, cost_function, device=device)
    return (*on_train, *on_test)


def report_metrics(train_loss, train_accuracy, test_loss, test_accuracy):
    """Print the loss and accuracy of the train split, then of the test split."""
    print('TRAINING\n\tLoss {:.5f}\n\tAccuracy {:.2f}'.format(train_loss, train_accuracy))
    print('TEST\n\tLoss {:.5f}\n\tAccuracy {:.2f}'.format(test_loss, test_accuracy))


# Baseline: evaluate both splits before any parameter update.
print('Before training:')
train_loss, train_accuracy, test_loss, test_accuracy = evaluate_splits()
report_metrics(train_loss, train_accuracy, test_loss, test_accuracy)

# One training pass followed by one evaluation on the test split per epoch.
for epoch_number in range(1, epochs + 1):
    train_loss, train_accuracy = training_step(clip_model, train_dl, optimizer, cost_function, device=device)
    test_loss, test_accuracy = test_step(clip_model, test_dl, cost_function, device=device)
    print('Epoch: {:d}'.format(epoch_number))
    report_metrics(train_loss, train_accuracy, test_loss, test_accuracy)

# Final metrics: evaluate both splits once more after the last epoch.
print('After training:')
train_loss, train_accuracy, test_loss, test_accuracy = evaluate_splits()
report_metrics(train_loss, train_accuracy, test_loss, test_accuracy)


Before training:


Performing test step:   8%|▊         | 4/50 [00:05<01:01,  1.34s/it]