In [3]:
from pycocotools.coco import COCO

In [4]:
# Dataset root directory and the split whose annotation files are used below.
dataDir = '/media/alex/home/mscoco'
dataType = 'train2014'
# Path of the instance-annotation JSON for that split.
annFile = '{}/annotations/instances_{}.json'.format(dataDir, dataType)

In [5]:
# Load the instance annotations through the COCO API; `coco` is used by later
# cells for category lookups and for listing image ids per category.
coco = COCO(annFile)

loading annotations into memory...
Done (t=8.64s)
creating index...
index created!


In [26]:
# Load the record of every category known to the annotation file and list
# the category names.
cats = coco.loadCats(coco.getCatIds())
nms = list(map(lambda entry: entry['name'], cats))
print("Category names:", nms)

Category names: ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']


In [88]:
import numpy as np

def create_subset(count):
    """
    Draw one random image id from each of `count` distinct random categories.

    Categories are picked without replacement from the notebook-level `cats`
    list. For every picked category a single image id is drawn at random
    from the ids `coco.getImgIds` returns for that category. No
    de-duplication is performed across categories.

    Returns a list with `count` image ids, in the order the categories
    were drawn.
    """
    chosen_categories = np.random.choice(cats, count, replace=False)
    image_ids = []
    for category in chosen_categories:
        ids_for_category = coco.getImgIds(catIds=category['id'])
        image_ids.append(np.random.choice(ids_for_category))
    return image_ids

In [89]:
# Draw a sample of 10 image ids (one per randomly chosen category) and
# display it as the cell output.
example_set = create_subset(10)
example_set

[557394, 501540, 174794, 374185, 508071, 42278, 182937, 47093, 10735, 70365]

In [55]:
# Load the caption annotations of the same split; `coco_caps` is used below
# to look up the captions attached to an image id.
coco_caps = COCO(f'{dataDir}/annotations/captions_{dataType}.json')

loading annotations into memory...
Done (t=0.76s)
creating index...
index created!


In [63]:
def build_train_set(subset):
    """
    Build one caption-retrieval example from a collection of image ids.

    One image is picked at random as the target: one of its captions becomes
    the query and a different caption of the same image becomes the gold
    answer. Every other image contributes one randomly chosen caption as a
    distractor.

    Parameters
    ----------
    subset : iterable of image ids
        Ids that `coco_caps` can resolve to caption annotations.

    Returns
    -------
    tuple (query, gold, candidates)
        `candidates` is a shuffled list containing the gold caption plus one
        distractor caption per remaining image.

    Raises
    ------
    IndexError
        If `subset` is empty or the target image has fewer than two captions.
    """
    captions_per_image = []
    # Work on the `subset` argument rather than any notebook-level variable,
    # so the result always reflects what the caller passed in.
    for img in subset:
        ann_ids = coco_caps.getAnnIds(imgIds=img)
        captions = [ann['caption'] for ann in coco_caps.loadAnns(ann_ids)]
        # Shuffle so that the query/gold pair taken below is a random pair.
        np.random.shuffle(captions)
        captions_per_image.append(captions)

    # Shuffle the images so the target (first entry) is a random one.
    np.random.shuffle(captions_per_image)

    target_captions = captions_per_image[0]
    query = target_captions[0]
    gold = target_captions[1]

    # One random caption per non-target image; str() turns NumPy's string
    # scalar back into a plain str so every candidate has the same type.
    candidates = [
        str(np.random.choice(captions)) for captions in captions_per_image[1:]
    ]

    # Hide the gold caption at a random position among the distractors.
    candidates.append(gold)
    np.random.shuffle(candidates)

    return query, gold, candidates
# Try the builder on the sample drawn earlier; the returned
# (query, gold, shuffled caption list) tuple is displayed as the cell output.
build_train_set(example_set)

('a small refrigerator on a city street outdoors',
 'a open empty fridge sitting in a drive way next to a car',
 ['A street sign informing the public to walk their bikes across the street rather than ride them.',
  'a open empty fridge sitting in a drive way next to a car',
  'There is a painting by the fireplace in the living room. ',
  'a baseball player holding a bat standing on home base.',
  'This room is full of clothes and backpacks.',
  'A man in grey shirt removing a slice of pizza from a box.',
  'A woman standing in an open air market surrounded by people',
  'A couple of people on skis on a slope top.',
  'Young girl holding umbrella while wandering through store.',
  'A person wearing headphones and holding a cellular phone in each hand.'])

In [72]:
def levenshtein(seq1, seq2):
    """
    Return the Levenshtein (edit) distance between two sequences.

    Insertions, deletions and substitutions each cost 1. The full dynamic
    programming table is kept in a NumPy array, so the result comes back as
    a NumPy float (e.g. 3.0), not an int.
    """
    rows = len(seq1) + 1
    cols = len(seq2) + 1
    table = np.zeros((rows, cols))

    # Border cells: turning a prefix into the empty sequence (and vice
    # versa) costs exactly the prefix length.
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)

    for i in range(1, rows):
        left = seq1[i - 1]
        for j in range(1, cols):
            # Matching elements carry the diagonal value over for free.
            substitution = 0 if left == seq2[j - 1] else 1
            table[i, j] = min(
                table[i - 1, j] + 1,
                table[i - 1, j - 1] + substitution,
                table[i, j - 1] + 1,
            )
    return table[rows - 1, cols - 1]

In [97]:
import operator

# Draw a fresh example and show its pieces.
example_set = create_subset(10)
query, gold, subset = build_train_set(example_set)
print("Query:", query)
print("Gold:", gold)
print("Set:", subset)

# Score every candidate by its edit distance to the query; the candidate
# with the smallest distance is taken as the prediction.
dist = [(candidate, levenshtein(query, candidate)) for candidate in subset]
best_caption = min(dist, key=operator.itemgetter(1))

print("Best caption:", best_caption)
print("Found!" if best_caption[0] == gold else "Wrong found!")

Query: A group of people are skiing and snowboarding down a mountain.
Gold: a group of people that are skiing on a hill side
Set: ['A group of people playing in the grass.', 'A woman is riding a horse as an audience watches.', 'People are climbing onto two small covered boats.', 'A variety of items that have been taken out of a purse.', 'Two parking meters sit on the side walk by the cars', 'A cow nuzzles a new baby calf in a barn.', 'A street with a blue bus stopped at a bus stop.', 'The sheep are gathered under the canopy of the tree.', 'A woman on her back smiles at her friend', 'a group of people that are skiing on a hill side']
Best caption: ('a group of people that are skiing on a hill side', 31.0)
Found!


In [102]:
# Repeat the nearest-caption experiment on 100 freshly drawn examples and
# print the fraction in which the closest candidate was the gold caption.
acc = 0
for i in range(100):
    example_set = create_subset(10)
    query, gold, subset = build_train_set(example_set)
    dist = [(candidate, levenshtein(query, candidate)) for candidate in subset]
    best_caption = min(dist, key=operator.itemgetter(1))

    acc += 1 if best_caption[0] == gold else 0
print(acc / 100)

0.42
