In [1]:
from pycocotools.coco import COCO

In [2]:
import os

# Root directory of the local MS-COCO download. Set the COCO_DATA_DIR
# environment variable to point the notebook at another location; the
# original machine-specific path is kept as the fallback.
dataDir = os.environ.get('COCO_DATA_DIR', '/media/alex/home/mscoco')
# Dataset split; it is interpolated into the annotation file names.
dataType = 'train2014'
# Path to the instance annotation file for the chosen split.
annFile = f'{dataDir}/annotations/instances_{dataType}.json'

In [3]:
# Load the instance annotation file into a COCO object; the call reads the
# file and builds its lookup index (see the log output below).
coco = COCO(annFile)

loading annotations into memory...
Done (t=8.11s)
creating index...
index created!


In [4]:
# Fetch the record of every category id the annotation index reports and
# keep the 'name' field of each record for display.
cats = coco.loadCats(coco.getCatIds())
nms = [entry['name'] for entry in cats]
print("Category names:", nms)

Category names: ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']


In [5]:
import numpy as np

def create_subset(count):
    """
    Pick `count` distinct categories at random and draw one image id per category.

    Categories are sampled without replacement from the module-level `cats`
    list. For each chosen category, one id is drawn uniformly from the image
    ids that `coco.getImgIds` returns for that category id.

    Returns a list with `count` image ids, one per sampled category.
    """
    chosen_categories = np.random.choice(cats, count, replace=False)
    image_ids = []
    for category in chosen_categories:
        candidates = coco.getImgIds(catIds=category['id'])
        image_ids.append(np.random.choice(candidates))
    return image_ids

In [6]:
# Draw one random image id for each of 10 randomly chosen categories and
# display the resulting list.
example_set = create_subset(10)
example_set

[283818, 489497, 117961, 577948, 563898, 37779, 288694, 188858, 341902, 368874]

In [7]:
# Load the caption annotations of the same split into a second COCO object;
# build_train_set reads captions through this object.
coco_caps = COCO(f'{dataDir}/annotations/captions_{dataType}.json')

loading annotations into memory...
Done (t=0.77s)
creating index...
index created!


In [8]:
def build_train_set(subset):
    """
    Build one caption-retrieval example from a list of image ids.

    The captions of every image id in `subset` are fetched from the
    module-level `coco_caps` object and shuffled. The images are then
    shuffled as well. The first image supplies the query (its first caption)
    and the gold answer (its second caption); each remaining image
    contributes one randomly chosen caption as a distractor.

    Args:
        subset: iterable of image ids, e.g. the list returned by
            create_subset.

    Returns:
        A tuple `(query, gold, candidates)`. `candidates` is a shuffled list
        holding the gold caption plus one caption per remaining image.

    Raises:
        IndexError: if `subset` is empty or the selected image has fewer
            than two captions.
    """
    caption_list = list()
    # Retrieve captions for each image and shuffle them.
    # Iterate over the argument: the previous version looped over the
    # notebook-global `example_set` and silently ignored `subset`.
    for img in subset:
        annIds = coco_caps.getAnnIds(img)
        anns = coco_caps.loadAnns(annIds)
        anns = [ann['caption'] for ann in anns]
        np.random.shuffle(anns)
        caption_list.append(anns)

    # Shuffle overall images so the query image is chosen at random
    np.random.shuffle(caption_list)

    # Two different captions of the same image form the (query, gold) pair
    query = caption_list[0][0]
    gold = caption_list[0][1]
    leftovers = caption_list[1:]
    distractors = [np.random.choice(leftover) for leftover in leftovers]

    # Hide the gold caption at a random position among the distractors
    distractors.append(gold)
    np.random.shuffle(distractors)

    return query, gold, distractors
# Quick manual check: build one retrieval example from the sampled image ids
# and display the (query, gold, candidates) tuple.
build_train_set(example_set)

('A young boy putting many olives onto a homemade pizza',
 'a little boy is arranging things on top of a pizza',
 ['a little boy is arranging things on top of a pizza',
  'A view of an overpass that goes over a full parking lot.',
  'A guy standing next to a woman during a meeting.',
  'A red stop sign sitting under two green street signs.',
  'A cat is sitting on top of the refrigerator.',
  'a close up of a plate of food with broccoli',
  'A monopoly game is on the clean kitchen counter.',
  'A boy holding a baseball bat on a field.',
  'View of open toaster oven, refrigerator, and shelves with food.',
  'a few people riding motorcycles on a raod'])

In [9]:
def levenshtein(seq1, seq2):
    """
    Compute the Levenshtein (edit) distance between two sequences.

    A dynamic-programming table of shape (len(seq1) + 1, len(seq2) + 1) is
    filled in, where cell (i, j) holds the distance between the first i
    items of seq1 and the first j items of seq2. Insertions, deletions and
    substitutions each cost 1.

    The result is returned as a float because the table is a float array.
    """
    rows, cols = len(seq1) + 1, len(seq2) + 1
    table = np.zeros((rows, cols))

    # Turning a prefix into the empty sequence costs one deletion per item.
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)

    for i, left in enumerate(seq1, start=1):
        for j, right in enumerate(seq2, start=1):
            # Matching items carry the diagonal value over at no cost.
            substitution = 0 if left == right else 1
            table[i, j] = min(
                table[i - 1, j] + 1,
                table[i - 1, j - 1] + substitution,
                table[i, j - 1] + 1,
            )
    return table[rows - 1, cols - 1]

In [10]:
import operator
# Build a single retrieval example and show its parts.
example_set = create_subset(10)
query, gold, subset = build_train_set(example_set)
print("Query:", query)
print("Gold:", gold)
print("Set:", subset)

# Score every candidate by its edit distance to the query and keep the
# closest one; min() returns the first candidate on ties.
dist = list(zip(subset, (levenshtein(query, caption) for caption in subset)))
best_caption = min(dist, key=lambda scored: scored[1])

print("Best caption:", best_caption)
print("Found!" if best_caption[0] == gold else "Wrong found!")

Query: A dog sits in a yard with a basketball and a Frisbee. 
Gold: A golden retriever sitting in a backyard on grass with toys.
Set: ['The table has four chairs and food on the top.', 'a close up of a pair of scissors near some material ', 'Pizza in bed is always fun but a bit dangerous for the linen.', 'A man riding a wave on top of a surfboard.', 'Several vegetables and fruits on a table and a mason jar.', 'A golden retriever sitting in a backyard on grass with toys.', 'A horse stands with his head resting over a fence.', 'A wooden table with a plate of glazed donuts sitting on top of it.', 'A microwave on a wooden shelf in a kitchen.', 'Plates of dinner are being prepared with sausages, rice, and greens.']
Best caption: ('A microwave on a wooden shelf in a kitchen.', 36.0)
Wrong found!


In [11]:
# Estimate how often the edit-distance baseline retrieves the gold caption,
# using freshly sampled retrieval examples.
n_trials = 100
acc = 0
for i in range(n_trials):
    example_set = create_subset(10)
    query, gold, subset = build_train_set(example_set)
    # Pair each candidate with its distance to the query; min() keeps the
    # first candidate on ties.
    dist = list(zip(subset, (levenshtein(query, caption) for caption in subset)))
    best_caption = min(dist, key=lambda scored: scored[1])

    # Count a hit when the closest candidate is the gold caption.
    acc += int(best_caption[0] == gold)
print(acc / n_trials)

0.43


In [12]:
# Repeat the accuracy estimate of the edit-distance baseline with a larger
# number of freshly sampled retrieval examples.
n_trials = 1000
acc = 0
for i in range(n_trials):
    example_set = create_subset(10)
    query, gold, subset = build_train_set(example_set)
    # Pair each candidate with its distance to the query; min() keeps the
    # first candidate on ties.
    dist = list(zip(subset, (levenshtein(query, caption) for caption in subset)))
    best_caption = min(dist, key=lambda scored: scored[1])

    # Count a hit when the closest candidate is the gold caption.
    acc += int(best_caption[0] == gold)
print(acc / n_trials)

0.425
