# Part A: Baseline

In [None]:
%pip install pyquery
%pip install online_triplet_loss
%pip install pytorch-pretrained-bert



In [None]:
import os
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/cs2770_hw3')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import copy
import skimage.io as io
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import random
import csv
from torchvision import transforms
from PIL import Image
from pycocotools.coco import COCO
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from torchvision import models
from PascalSentenceDataset.pascal_sentence_dataset import PascalSentenceDataSet
from online_triplet_loss.losses import *

In [None]:
device = 'cuda:0'
model = nn.Embedding(50, 50)
model = model.to(device)

In [None]:
class Alexnet_Feature_Extraction(torch.nn.Module):
  def __init__(self):
    super(Alexnet_Feature_Extraction, self).__init__()
    Alexnet_Pretrained = torch.hub.load('pytorch/vision:v0.9.0', 'alexnet', pretrained=True)
    self.features = Alexnet_Pretrained.features
    self.avgpool = Alexnet_Pretrained.avgpool
    self.feature_extractor = nn.Sequential(*[Alexnet_Pretrained.classifier[i] for i in range(6)])

  def forward(self, x):
    x = self.features(x)
    x = self.avgpool(x)
    x = torch.flatten(x, 1)
    x = self.feature_extractor(x)
    return x

In [None]:
alexnet_model = Alexnet_Feature_Extraction()
alexnet_model = alexnet_model.to(device)
alexnet_model.eval()

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.9.0


Alexnet_Feature_Extraction(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (feature_extractor): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_f

In [None]:
embeddings_dict = {}
with open('./glove/glove.6B.50d.txt', 'r', encoding='utf-8') as f:
  for line in f:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], 'float32')
    embeddings_dict[word] = vector

In [None]:
# Get image vector representation from Alexnet
def get_image_vector(image):
  preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
  ])

  input_tensor = preprocess(image)
  input_batch = input_tensor.unsqueeze(0)

  if torch.cuda.is_available():
      input_batch = input_batch.to('cuda')
      alexnet_model.to('cuda')

  with torch.no_grad():
      i_output = alexnet_model(input_batch)

  return i_output

In [None]:
# Get caption vector representation from Glove
def get_caption_vector(sentence):
  score = 0
  for word in sentence:
    if word in embeddings_dict:
      score += embeddings_dict[word]
  return score

In [None]:
num_epochs = 5
best_acc = 0.0
total = 0
top_1_acc = 0
acc = 0
torch.set_grad_enabled(True)

for epoch in range(num_epochs):
  for phase in ['train', 'val']:
    if phase == 'train':
      dataType = 'train2017'
      model.train()

    elif phase == 'val':
      dataType = 'val2017'
      model.eval()

    instances = './annotations_trainval2017/annotations/instances_{}.json'.format(dataType)
    captions = './annotations_trainval2017/annotations/captions_{}.json'.format(dataType)

    coco_instances = COCO(instances)
    coco_captions = COCO(captions)

    imgs = coco_instances.getImgIds()

    # for index in range(0, len(imgs)):
    for index in range(0, 100): # Just did this so it goes faster
      img = coco_instances.loadImgs(imgs[index])[0]
      I = io.imread(img['coco_url'])

      if len(I.shape) < 3:
        continue

      I = Image.fromarray(I)

      annIds = coco_captions.getAnnIds(imgIds=img['id'])
      anns = coco_captions.loadAnns(annIds)

      random_index = index
      while random_index == index:
        random_index = random.randint(0, len(imgs))

      rand_img = coco_instances.loadImgs(imgs[random_index])[0]
      rand_annIds = coco_captions.getAnnIds(imgIds=rand_img['id'])
      rand_anns = coco_captions.loadAnns(rand_annIds)

      a = get_image_vector(I)
      y_p = torch.tensor(get_caption_vector(anns[0]['caption'].split()))
      y_n = torch.tensor(get_caption_vector(rand_anns[0]['caption'].split()))

      y_p = y_p.to(device)
      y_n = y_n.to(device)

      x_a = model(a.to(device).long())

      triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)
      loss = triplet_loss(x_a, y_p, y_n)

      p_dist = torch.norm(torch.subtract(x_a, y_p)) ** 2
      n_dist = torch.norm(torch.subtract(x_a, y_n)) ** 2

      if phase == 'train':
        loss.backward()

      elif phase == 'val':
        if p_dist < n_dist:
          top_1_acc += 1

        total += 1
        acc = top_1_acc / total
        
    if acc > best_acc:
      best_acc = acc
      best_model_wts = copy.deepcopy(model.state_dict())
      torch.save(best_model_wts, 'best_model_weight.pth')

print(f'Accuracy: {acc}')

loading annotations into memory...
Done (t=28.73s)
creating index...
index created!
loading annotations into memory...
Done (t=3.39s)
creating index...
index created!
loading annotations into memory...
Done (t=2.28s)
creating index...
index created!
loading annotations into memory...
Done (t=1.53s)
creating index...
index created!
loading annotations into memory...
Done (t=16.17s)
creating index...
index created!
loading annotations into memory...
Done (t=1.05s)
creating index...
index created!
loading annotations into memory...
Done (t=0.49s)
creating index...
index created!
loading annotations into memory...
Done (t=0.05s)
creating index...
index created!
loading annotations into memory...
Done (t=16.43s)
creating index...
index created!
loading annotations into memory...
Done (t=1.02s)
creating index...
index created!
loading annotations into memory...
Done (t=0.48s)
creating index...
index created!
loading annotations into memory...
Done (t=0.05s)
creating index...
index created!
l

# Part B: Cross-dataset performance and adaptation

In [None]:
model.load_state_dict(torch.load('best_model_weight.pth'))
model.eval()

Embedding(50, 50)

In [None]:
file_dict = {}
i_path = 'PascalSentenceDataset/dataset/'
c_path = 'PascalSentenceDataset/sentence/'

top_1_acc = 0
total = 0
acc = 0

with open('PascalSentenceDataset/correspondence.csv') as csv_file:
  csv_reader = csv.reader(csv_file, delimiter=',')

  for row in csv_reader:
      if len(row) > 0:
        file_dict[row[0]] = row[1]

for id in file_dict:
  i_file = i_path + file_dict[id]
  p_file = c_path + file_dict[id].replace('.jpg', 'txt')

  rand_id = id
  while rand_id == id:
    rand_id = str(random.randint(1, len(file_dict)))

  n_file = c_path + file_dict[rand_id].replace('.jpg', 'txt')

  I = io.imread(i_file)
  I = Image.fromarray(I)
  a = get_image_vector(I)

  with open(p_file) as f:
    y_p = torch.tensor(get_caption_vector(f.readlines()[0]))
  with open(n_file) as f:
    y_n = torch.tensor(get_caption_vector(f.readlines()[0]))

  y_p = y_p.to(device)
  y_n = y_n.to(device)
  x_a = model(a.to(device).long())

  triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)
  loss = triplet_loss(x_a, y_p, y_n)

  p_dist = torch.norm(torch.subtract(x_a, y_p)) ** 2
  n_dist = torch.norm(torch.subtract(x_a, y_n)) ** 2

  if p_dist < n_dist:
    top_1_acc += 1

  total += 1
  acc = top_1_acc / total

print(f'Total accuracy: {acc}')

Total accuracy: 0.495


## Results
* Training on COCO, testing on COCO (above): 0.5
* Training on COCO, testing on PASCAL: .495

# Part C: Representations

Alternate representation #1: Alexnet & BERT

Reported accuracy: 0.473

In [None]:
def get_caption_vector_BERT(caption):
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

  marked_text = "[CLS] " + caption + " [SEP]"

  tokenized_text = tokenizer.tokenize(marked_text)
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

  segments_ids = [1] * len(tokenized_text)

  tokens_tensor = torch.tensor([indexed_tokens])
  segments_tensors = torch.tensor([segments_ids])

  model = BertModel.from_pretrained('bert-base-uncased')

  model.eval()

  score = 0
  with torch.no_grad():
    outputs = model(tokens_tensor, segments_tensors)[0]

    for word in outputs:
      score += word

    return score

In [None]:
num_epochs = 5
best_acc = 0.0
total = 0
top_1_acc = 0
acc = 0
torch.set_grad_enabled(True)

bert_model = nn.Embedding(768, 768)
bert_model = bert_model.to(device)

for epoch in range(num_epochs):
  for phase in ['train', 'val']:
    if phase == 'train':
      dataType = 'train2017'
      bert_model.train()

    elif phase == 'val':
      dataType = 'val2017'
      bert_model.eval()

    instances = './annotations_trainval2017/annotations/instances_{}.json'.format(dataType)
    captions = './annotations_trainval2017/annotations/captions_{}.json'.format(dataType)

    coco_instances = COCO(instances)
    coco_captions = COCO(captions)

    imgs = coco_instances.getImgIds()

    for index in range(0, len(imgs)):
      img = coco_instances.loadImgs(imgs[index])[0]
      I = io.imread(img['coco_url'])

      if len(I.shape) < 3:
        continue

      I = Image.fromarray(I)

      annIds = coco_captions.getAnnIds(imgIds=img['id'])
      anns = coco_captions.loadAnns(annIds)

      random_index = index
      while random_index == index:
        random_index = random.randint(0, len(imgs))

      rand_img = coco_instances.loadImgs(imgs[random_index])[0]
      rand_annIds = coco_captions.getAnnIds(imgIds=rand_img['id'])
      rand_anns = coco_captions.loadAnns(rand_annIds)

      a = get_image_vector(I)

      y_p = get_caption_vector_BERT(anns[0]['caption'])
      y_n = get_caption_vector_BERT(rand_anns[0]['caption'])

      p_len = y_p.shape[1]
      n_len = y_n.shape[1]

      y_p_t = torch.zeros(1, 4096, 768)
      y_p_t[:, :p_len, :768] = y_p

      y_n_t = torch.zeros(1, 4096, 768)
      y_n_t[:, :n_len, :768] = y_n

      y_p_t = y_p_t.to(device)
      y_n_t = y_n_t.to(device)

      x_a = bert_model(a.to(device).long())

      triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)
      print(x_a.shape)
      print(y_p.shape)
      print(y_n.shape)
      loss = triplet_loss(x_a, y_p_t, y_n_t)

      p_dist = torch.norm(torch.subtract(x_a, y_p_t)) ** 2
      n_dist = torch.norm(torch.subtract(x_a, y_n_t)) ** 2

      if phase == 'train':
        loss.backward()

      elif phase == 'val':
        if p_dist < n_dist:
          top_1_acc += 1

        total += 1
        acc = top_1_acc / total
        
    if acc > best_acc:
      best_acc = acc
      # best_model_wts = copy.deepcopy(model.state_dict())
      # torch.save(best_model_wts, 'best_model_weight.pth')

print(f'Accuracy: {acc}')

loading annotations into memory...
Done (t=24.07s)
creating index...
index created!
loading annotations into memory...
Done (t=2.31s)
creating index...
index created!
torch.Size([1, 4096, 768])
torch.Size([1, 18, 768])
torch.Size([1, 18, 768])
torch.Size([1, 4096, 768])
torch.Size([1, 14, 768])
torch.Size([1, 15, 768])
torch.Size([1, 4096, 768])
torch.Size([1, 16, 768])
torch.Size([1, 14, 768])
torch.Size([1, 4096, 768])
torch.Size([1, 13, 768])
torch.Size([1, 14, 768])
torch.Size([1, 4096, 768])
torch.Size([1, 16, 768])
torch.Size([1, 10, 768])
torch.Size([1, 4096, 768])
torch.Size([1, 11, 768])
torch.Size([1, 13, 768])
torch.Size([1, 4096, 768])
torch.Size([1, 17, 768])
torch.Size([1, 13, 768])
torch.Size([1, 4096, 768])
torch.Size([1, 13, 768])
torch.Size([1, 12, 768])
torch.Size([1, 4096, 768])
torch.Size([1, 15, 768])
torch.Size([1, 14, 768])
torch.Size([1, 4096, 768])
torch.Size([1, 11, 768])
torch.Size([1, 16, 768])
torch.Size([1, 4096, 768])
torch.Size([1, 14, 768])
torch.Size(

Alternate representation #2: VGG and Glove

Reported accuracy: 0.532


In [None]:
class VGG16_Feature_Extraction(torch.nn.Module):
  def __init__(self):
    super(VGG16_Feature_Extraction, self).__init__()
    VGG16_Pretrained = models.vgg16(pretrained=True)
    self.features = VGG16_Pretrained.features
    self.avgpool = VGG16_Pretrained.avgpool
    self.feature_extractor = nn.Sequential(*[VGG16_Pretrained.classifier[i] for i in range(6)])

  def forward(self, x):
    x = self.features(x)
    x = self.avgpool(x)
    x = torch.flatten(x, 1)
    x = self.feature_extractor(x)
    return x

In [None]:
vgg_model = VGG16_Feature_Extraction()
device = 'cuda:0'
vgg_model = vgg_model.to(device)

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth


HBox(children=(FloatProgress(value=0.0, max=553433881.0), HTML(value='')))




In [None]:
def get_image_vector_vgg(image):
  preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
  ])

  input_tensor = preprocess(image)
  input_batch = input_tensor.unsqueeze(0)

  if torch.cuda.is_available():
      input_batch = input_batch.to('cuda')
      vgg_model.to('cuda')

  with torch.no_grad():
      i_output = vgg_model(input_batch)

  return i_output

In [None]:
num_epochs = 5
best_acc = 0.0
total = 0
top_1_acc = 0
acc = 0
torch.set_grad_enabled(True)

for epoch in range(num_epochs):
  for phase in ['train', 'val']:
    if phase == 'train':
      dataType = 'train2017'
      model.train()

    elif phase == 'val':
      dataType = 'val2017'
      model.eval()

    instances = './annotations_trainval2017/annotations/instances_{}.json'.format(dataType)
    captions = './annotations_trainval2017/annotations/captions_{}.json'.format(dataType)

    coco_instances = COCO(instances)
    coco_captions = COCO(captions)

    imgs = coco_instances.getImgIds()

    for index in range(0, len(imgs)):
      img = coco_instances.loadImgs(imgs[index])[0]
      I = io.imread(img['coco_url'])

      if len(I.shape) < 3:
        continue

      I = Image.fromarray(I)

      annIds = coco_captions.getAnnIds(imgIds=img['id'])
      anns = coco_captions.loadAnns(annIds)

      random_index = index
      while random_index == index:
        random_index = random.randint(0, len(imgs))

      rand_img = coco_instances.loadImgs(imgs[random_index])[0]
      rand_annIds = coco_captions.getAnnIds(imgIds=rand_img['id'])
      rand_anns = coco_captions.loadAnns(rand_annIds)

      a = get_image_vector_vgg(I)

      y_p = torch.tensor(get_caption_vector(anns[0]['caption'].split()))
      y_n = torch.tensor(get_caption_vector(rand_anns[0]['caption'].split()))

      y_p = y_p.to(device)
      y_n = y_n.to(device)

      x_a = model(a.to(device).long())

      triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)
      loss = triplet_loss(x_a, y_p, y_n)

      p_dist = torch.norm(torch.subtract(x_a, y_p)) ** 2
      n_dist = torch.norm(torch.subtract(x_a, y_n)) ** 2

      if phase == 'train':
        loss.backward()

      elif phase == 'val':
        if p_dist < n_dist:
          top_1_acc += 1

        total += 1
        acc = top_1_acc / total
        
    if acc > best_acc:
      best_acc = acc
      # best_model_wts = copy.deepcopy(model.state_dict())
      # torch.save(best_model_wts, 'best_model_weight.pth')

print(f'Accuracy: {acc}')

loading annotations into memory...
Done (t=18.52s)
creating index...
index created!
loading annotations into memory...
Done (t=1.25s)
creating index...
index created!
torch.Size([1, 4096, 50])
torch.Size([50])
torch.Size([50])
torch.Size([1, 4096, 50])
torch.Size([50])
torch.Size([50])
torch.Size([1, 4096, 50])
torch.Size([50])
torch.Size([50])
torch.Size([1, 4096, 50])
torch.Size([50])
torch.Size([50])
torch.Size([1, 4096, 50])
torch.Size([50])
torch.Size([50])
torch.Size([1, 4096, 50])
torch.Size([50])
torch.Size([50])
torch.Size([1, 4096, 50])
torch.Size([50])
torch.Size([50])
torch.Size([1, 4096, 50])
torch.Size([50])
torch.Size([50])
torch.Size([1, 4096, 50])
torch.Size([50])
torch.Size([50])
torch.Size([1, 4096, 50])
torch.Size([50])
torch.Size([50])
torch.Size([1, 4096, 50])
torch.Size([50])
torch.Size([50])
torch.Size([1, 4096, 50])
torch.Size([50])
torch.Size([50])
torch.Size([1, 4096, 50])
torch.Size([50])
torch.Size([50])
torch.Size([1, 4096, 50])
torch.Size([50])
torch.Size