# Image Captioning through QuD

We first transform the dataset into a form that lets us judge whether this project can succeed.

We merge MSCOCO and VQA:

```
{"Question_1": {"cat": [(Image_1, Caption_1), (Image_2, Caption_2)], "giraffe": [(Image_3, Caption_3), (Image_4, Caption_4)], …}}
```

In [4]:
MSCOCO_ANNO_PATH = "/home/anie/PragmaticVQA/data/vqa/vqa2_raw/mscoco/annotations/"

import json

def load_mscoco_captions(path):
    """Load the MSCOCO 2014 validation caption annotations.

    Only ``captions_val2014.json`` is read; the train split is not loaded.

    Arguments:
        path {str} -- directory holding the MSCOCO annotation files; a
            trailing path separator is optional

    Return:
        The parsed JSON content of ``captions_val2014.json``.
    """
    import os

    # os.path.join tolerates a directory given with or without a trailing
    # slash, unlike plain string concatenation.
    captions_file = os.path.join(path, "captions_val2014.json")
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(captions_file, encoding="utf-8") as f:
        return json.load(f)

In [5]:
val_captions = load_mscoco_captions(MSCOCO_ANNO_PATH)

In [7]:
len(val_captions['annotations'])

202654

In [8]:
val_captions['annotations'][:5]

[{'image_id': 203564,
  'id': 37,
  'caption': 'A bicycle replica with a clock as the front wheel.'},
 {'image_id': 179765,
  'id': 38,
  'caption': 'A black Honda motorcycle parked in front of a garage.'},
 {'image_id': 322141,
  'id': 49,
  'caption': 'A room with blue walls and a white sink and door.'},
 {'image_id': 16977,
  'id': 89,
  'caption': 'A car that seems to be parked illegally behind a legally parked car'},
 {'image_id': 106140,
  'id': 98,
  'caption': 'A large passenger airplane flying through the air.'}]

In [11]:
# Distinct image ids that have at least one caption annotation.
image_ids = {entry['image_id'] for entry in val_captions['annotations']}
len(image_ids)

40504

In [13]:
from collections import defaultdict

# Group every caption under the id of the image it describes.
image_id_to_cap = defaultdict(list)
for annotation in val_captions['annotations']:
    captions_for_image = image_id_to_cap[annotation['image_id']]
    captions_for_image.append(annotation['caption'])

In [14]:
list(image_id_to_cap.items())[:5]

[(203564,
  ['A bicycle replica with a clock as the front wheel.',
   'The bike has a clock as a tire.',
   'A black metal bicycle with a clock inside the front wheel.',
   'A bicycle figurine in which the front wheel is replaced with a clock\n',
   'A clock with the appearance of the wheel of a bicycle ']),
 (179765,
  ['A black Honda motorcycle parked in front of a garage.',
   'A Honda motorcycle parked in a grass driveway',
   'A black Honda motorcycle with a dark burgundy seat.',
   'Ma motorcycle parked on the gravel in front of a garage',
   'A motorcycle with its brake extended standing outside']),
 (322141,
  ['A room with blue walls and a white sink and door.',
   'Blue and white color scheme in a small bathroom.',
   'This is a blue and white bathroom with a wall sink and a lifesaver on the wall.',
   'A blue boat themed bathroom with a life preserver on the wall',
   'A bathroom with walls that are painted baby blue.']),
 (16977,
  ['A car that seems to be parked illegally 

In [9]:
val_captions['images'][:5]

[{'license': 3,
  'file_name': 'COCO_val2014_000000391895.jpg',
  'coco_url': 'http://images.cocodataset.org/val2014/COCO_val2014_000000391895.jpg',
  'height': 360,
  'width': 640,
  'date_captured': '2013-11-14 11:18:45',
  'flickr_url': 'http://farm9.staticflickr.com/8186/8119368305_4e622c8349_z.jpg',
  'id': 391895},
 {'license': 4,
  'file_name': 'COCO_val2014_000000522418.jpg',
  'coco_url': 'http://images.cocodataset.org/val2014/COCO_val2014_000000522418.jpg',
  'height': 480,
  'width': 640,
  'date_captured': '2013-11-14 11:38:44',
  'flickr_url': 'http://farm1.staticflickr.com/1/127244861_ab0c0381e7_z.jpg',
  'id': 522418},
 {'license': 3,
  'file_name': 'COCO_val2014_000000184613.jpg',
  'coco_url': 'http://images.cocodataset.org/val2014/COCO_val2014_000000184613.jpg',
  'height': 336,
  'width': 500,
  'date_captured': '2013-11-14 12:36:29',
  'flickr_url': 'http://farm3.staticflickr.com/2169/2118578392_1193aa04a0_z.jpg',
  'id': 184613},
 {'license': 3,
  'file_name': 'COC

In [15]:
import json
from tqdm import tqdm
from collections import defaultdict

def load_annotations(root='./data/vqa/vqa2_raw/'):
    """Load the VQA v2 train and val annotations, keyed by question id.

    Annotations come from a specific format of:
    {'question_type': first few words of the question, 'multiple_choice_answer': ground truth answer, 'answers': we can ignore this,
    'image_id': image associated, 'answer_type':  "yes/no", "number", and "other",
    'question_id': number}

    The 'answers' entry is removed from every annotation before it is stored.
    These files do not contain the question text itself.

    Keyword Arguments:
        root {str} -- root path of the data folder; a trailing path separator
            is optional (default: {'./data/vqa/vqa2_raw/'})

    Return:
        (train_anno_data, val_anno_data), each of the form
        {question_id: {'question_type': 'none of the above',
            'multiple_choice_answer': 'down',
            'image_id': 262148,
            'answer_type': 'other',
            'question_id': 262148000}}
    """
    import os

    def _read_split(relative_path):
        # Read one annotation file and index its entries by question id.
        # The context manager closes the file handle once parsing is done.
        with open(os.path.join(root, relative_path), encoding="utf-8") as f:
            annotations = json.load(f)['annotations']

        by_question_id = {}
        for item in annotations:
            # pop() rather than del: an entry without 'answers' is kept as-is
            # instead of raising KeyError.
            item.pop('answers', None)
            by_question_id[item['question_id']] = item
        return by_question_id

    train_anno_data = _read_split("annotations/v2_mscoco_train2014_annotations.json")
    val_anno_data = _read_split("annotations/v2_mscoco_val2014_annotations.json")

    return train_anno_data, val_anno_data

def load_questions(root='./data/vqa/vqa2_raw/'):
    """Load the VQA v2 train and val questions, keyed by question id.

    These files contain questions but not answers (answers are from
    annotations). Only the MSCOCO open-ended question files are read.

    Keyword Arguments:
        root {str} -- root path; a trailing path separator is optional
            (default: {'./data/vqa/vqa2_raw/'})

    Return:
        (train_questions, val_questions), each of the form
        {question_id: {'image_id': 262148,
            'question': 'What is he on top of?',
            'question_id': 262148002}}
    """
    import os

    def _read_split(relative_path):
        # Read one question file and index its entries by question id.
        # The context manager closes the file handle once parsing is done.
        with open(os.path.join(root, relative_path), encoding="utf-8") as f:
            questions = json.load(f)['questions']
        return {item['question_id']: item for item in questions}

    train_questions = _read_split("questions/v2_OpenEnded_mscoco_train2014_questions.json")
    val_questions = _read_split("questions/v2_OpenEnded_mscoco_val2014_questions.json")

    return train_questions, val_questions

In [16]:
train_anno_data, val_anno_data = load_annotations()
train_questions, val_questions = load_questions()

In [17]:
# {'Question1': {'answer1': [image_id, image_id,...]}}

In [23]:
# Build {question text: {answer: [image_id, ...]}} for the validation split.
val_data = {}

for q_id, question in val_questions.items():
    q = question['question']
    annotation = val_anno_data[q_id]
    a = annotation['multiple_choice_answer']
    image_id = annotation['image_id']
    # First time this question text is seen: start its answer -> images map.
    if q not in val_data:
        val_data[q] = defaultdict(list)
    val_data[q][a].append(image_id)

In [24]:
len(val_data)

81565

In [25]:
val_data

{'Where is he looking?': defaultdict(list,
             {'down': [262148, 208174], 'at camera': [43957, 388531]}),
 'What are the people in the background doing?': defaultdict(list,
             {'watching': [262148, 354174],
              'talking': [464857],
              'skateboarding': [209757],
              'taking picture': [213978],
              'nothing': [247234]}),
 'What is he on top of?': defaultdict(list,
             {'picnic table': [262148], 'chair': [477805]}),
 'What website copyrighted the picture?': defaultdict(list,
             {'foodiebakercom': [393225]}),
 'Is this a creamy soup?': defaultdict(list,
             {'no': [393225], 'yes': [484069]}),
 'Is this rice noodle soup?': defaultdict(list,
             {'yes': [393225], 'no': [92107]}),
 'What is to the right of the soup?': defaultdict(list,
             {'chopsticks': [393225], 'spoon': [327567]}),
 'What is the man doing in the street?': defaultdict(list,
             {'walking': [393226, 69366],
    

In [26]:
train_qud_data = json.load(open("./data/vqa/train_qud_data.json"))

In [27]:
train_image_id_to_cap = json.load(open("./data/vqa/train_image_id_to_cap.json"))

In [37]:
def display_qud_captions(qud_data, image_id_to_cap, ranges):
    """Print questions with their answers, image ids and one caption each.

    Arguments:
        qud_data -- mapping of question -> {answer: [image_id, ...]}
        image_id_to_cap -- mapping of str(image_id) -> list of captions;
            only the first caption is printed
        ranges -- iterable of positions into the items of qud_data
    """
    entries = list(qud_data.items())
    for position in ranges:
        question, answer_to_images = entries[position]
        print("Q: ", question)
        for answer, image_ids in answer_to_images.items():
            print("Answer: ", answer)
            for image_id in image_ids:
                # The caption mapping is looked up by the string form of the id.
                first_caption = image_id_to_cap[str(image_id)][0]
                print("Image ID", image_id, "Caption: ", first_caption)
        print()

In [None]:
display_qud_captions(train_qud_data, train_image_id_to_cap, range(10))