In [1]:
import itertools
from collections import defaultdict
from tqdm import tqdm
import _pickle as cPickle
import os

# Dumping Paths
que_split_path_dict = {
    "train": ("../../datasets/VQA/cc-re/cc_v2_OpenEnded_mscoco_train2014_questions_88_split_{}.pkl",
              "../../datasets/VQA/v2_OpenEnded_mscoco_train2014_questions.json",
              "../../datasets/VQA/cc-re/train_cc_embeddings_88_split_{}.npy"),

    "val": ("../../datasets/VQA/cc-re/cc_v2_OpenEnded_mscoco_val2014_questions_88_split_{}.pkl",
            "../../datasets/VQA/v2_OpenEnded_mscoco_val2014_questions.json",
            "../../datasets/VQA/cc-re/val_cc_embeddings_88_split_{}.npy"),

}


ans_split_path_dict = {
    "train": ("../../datasets/VQA/cc-re/cc_train_target_88_split_{}.pkl",
              "../../datasets/VQA/cache/train_target.pkl"),
    "val": ("../../datasets/VQA/cc-re/cc_val_target_88_split_{}.pkl",
              "../../datasets/VQA/cache/val_target.pkl"),
}


ans2label_path = os.path.join("../../datasets/VQA/", "cache", "trainval_ans2label.pkl")
label2ans_path = os.path.join("../../datasets/VQA/", "cache", "trainval_label2ans.pkl")
ans2label = cPickle.load(open(ans2label_path, "rb"))
label2ans = cPickle.load(open(label2ans_path, "rb"))
answer_dict = {}
# use later
answers_data = []
for key in ans_split_path_dict.keys():
    answers = []
    path_holder = ans_split_path_dict[key][0]
    qpath_holder = que_split_path_dict[key][0]

    for i in range(4):
        path = path_holder.format(i)
        qpath = qpath_holder.format(i)
        
        split_answers = cPickle.load(open(path, "rb"))
        split_questions = cPickle.load(open(qpath, "rb"))
        
        for alist, qlist in zip(split_answers, split_questions):
            for a,q in zip(alist, qlist):
                a["question_id"] = q["question_id"]
        
        cPickle.dump(split_answers, open(path, "wb"))
        answers.extend(split_answers)
    answers_data.append(answers)
    answers = list(itertools.chain.from_iterable(answers))
    for ans in answers:
        answer_dict[ans["question_id"]] = ans["labels"]
print("Read Answers")

negs_path = "../../datasets/VQA/cc-re/trainval_question_negs_88.pkl"
negs_data = cPickle.load(open(negs_path, "rb"))
negs_dict = {}
for qid, sim_scores, sim_qids in zip(negs_data["qids"], negs_data["sim_scores"], negs_data["sim_qids"]):
    negs_dict[qid] = (sim_scores, sim_qids)
print("Read Negatives Matrix")

# create dicts
image_dict = defaultdict(list)
questions_rephrasings = defaultdict(list)
question_dict = {}

# use later
questions_data = []
qids_dict = {}
for key in que_split_path_dict.keys():
    path_holder = que_split_path_dict[key][0]
    questions_list = []
    for i in range(4):
        path = path_holder.format(i)
        split_questions = cPickle.load(open(path, "rb"))
        questions_list.extend(split_questions)

    questions_data.append(questions_list)

    _dict = {}
    # add "rephrasing_of" key
    for _questions in questions_list:
        # only keep the min-qid in same-image ids
        min_qid = min([x['question_id'] for x in _questions])
        assert len(set([x['image_id'] for x in _questions])) == 1
        image_dict[_questions[0]["image_id"]].append(min_qid)
        for _que in _questions:
            question_dict[_que["question_id"]] = _que["question"]
            _dict[_que["question_id"]] = None
    qids_dict[key] = _dict
print("Read Questions")



Read Answers
Read Negatives Matrix
Read Questions


In [2]:
save_negs_dict_train = {}
save_negs_dict_val = {}
save_negs_dict_trainval = {}

def filter_negatives(sample):
    # filter same-image questions
    save_dict = {}
    same_image_ids = image_dict[sample["image_id"]]
    fil_same_image_ids = []
    ref_answers = answer_dict[sample["question_id"]]
    for qid in same_image_ids:
        if qid == sample["question_id"]:
            continue
        cand_answers = answer_dict[qid]
        if len(set(ref_answers).intersection(set(cand_answers))) == 0:
            fil_same_image_ids.append(qid)
    save_dict["same_image_questions_neg"] = fil_same_image_ids

    # filter top-k questions
    if sample["question_id"] not in negs_dict:
        return True

    top_k_sim_scores, top_k_questions = negs_dict[sample["question_id"]]
    fil_top_k_questions = []
    fil_top_k_questions_train = []
    fil_top_k_questions_val = []
    
    for qid in top_k_questions:
        cand_answers = answer_dict[qid]
        if len(set(ref_answers).intersection(set(cand_answers))) == 0:
            fil_top_k_questions.append(qid)
            if qid in qids_dict["train"]:
                fil_top_k_questions_train.append(qid)
            elif qid in qids_dict["val"]:
                fil_top_k_questions_val.append(qid)
            else:
                import pdb
                pdb.set_trace()
    
    save_dict["top_k_questions_neg"] = fil_top_k_questions
    save_dict["top_k_questions_neg_train"] = fil_top_k_questions_train
    save_dict["top_k_questions_neg_val"] = fil_top_k_questions_val

    if sample["question_id"] in qids_dict["train"]:
        del save_dict["top_k_questions_neg_val"]
        save_negs_dict_trainval[sample["question_id"]] = save_dict
        save_negs_dict_train[sample["question_id"]] = save_dict
    elif sample["question_id"] in qids_dict["val"]:
        del save_dict["top_k_questions_neg_train"]
        save_negs_dict_trainval[sample["question_id"]] = save_dict
        save_negs_dict_val[sample["question_id"]] = save_dict
    else:
        import pdb
        pdb.set_trace() 
    return False

In [3]:
for que_data, ans_data, que_path, ans_path in zip(questions_data, 
                                                  answers_data, 
                                                  que_split_path_dict.values(), 
                                                  ans_split_path_dict.values()):
    questions_list, answers, que_path, ans_path = que_data,\
                                        ans_data,\
                                        que_path[0],\
                                        ans_path[0]
    
    print("Questions Path: ", que_path)
    print("Answers Path: ", ans_path)
    # add "rephrasing_of" key
    for _questions in questions_list:
        rep_id = min([s['question_id'] for s in _questions])
        for _que in _questions:
            _que["rephrasing_of"] = rep_id
    
    assert len(questions_list) == len(answers)

    # remove questions w/o negatives
    for idx in tqdm(range(len(questions_list)), total=len(questions_list)):
        _updated_ques = []
        _updated_answers = []
        _questions = questions_list[idx]
        _answers = answers[idx]

        for _que, _ans in zip(_questions, _answers):
            delete = filter_negatives(_que)
            if not delete:
                _updated_ques.append(_que)
                _updated_answers.append(_ans)
            else:
                import pdb
                pdb.set_trace()
        questions_list[idx] = _updated_ques
        answers[idx] = _updated_answers

# This was a hacky filtering procedure for removing unused samples, not needed any further.
#     cPickle.dump(data, open(que_path, "wb"), protocol=2)
#     cPickle.dump(answers, open(ans_path, "wb"), protocol=2)
#     print(f"Dumped: {que_path}")
#     print(f"Dumped: {ans_path}")

Questions Path:  ../../datasets/VQA/cc-re/cc_v2_OpenEnded_mscoco_train2014_questions_88_split_{}.pkl
Answers Path:  ../../datasets/VQA/cc-re/cc_train_target_88_split_{}.pkl


100%|██████████| 443757/443757 [12:45<00:00, 579.41it/s] 


Questions Path:  ../../datasets/VQA/cc-re/cc_v2_OpenEnded_mscoco_val2014_questions_88_split_{}.pkl
Answers Path:  ../../datasets/VQA/cc-re/cc_val_target_88_split_{}.pkl


100%|██████████| 214354/214354 [06:39<00:00, 536.66it/s] 


In [4]:
save_negs_paths = {
   "val": "../../datasets/VQA/cc-re/cc_re_val_question_negs.pkl",
   "train": "../../datasets/VQA/cc-re/cc_re_train_question_negs.pkl",
   "trainval": "../../datasets/VQA/cc-re/cc_re_trainval_question_negs.pkl",               
}

In [5]:
values = list(save_negs_dict_val.values())
lens = [len(x["top_k_questions_neg"]) for x in values]
max(lens)

300

In [36]:
import numpy as np

# not saved "trainval" file
for key in save_negs_paths.keys():
    negs_dict = globals()[f"save_negs_dict_{key}"]
    print(f"Save Path: {save_negs_paths[key]}")

    question_ids = list(negs_dict.keys())
    question_values = list(negs_dict.values())
    question_negs = np.zeros((len(question_ids), 300), dtype=np.int64) + -1
    same_image_questions_neg = np.zeros((len(question_ids), 300), dtype=np.int64) + -1
    
    import pdb
    pdb.set_trace()
    
    for idx in tqdm(range(len(question_negs))):
        if key == "trainval":
            negs_ids = question_values[idx][f"top_k_questions_neg"]
        else:
            neg_ids = question_values[idx][f"top_k_questions_neg_{key}"]
        same_image_ids = question_values[idx]["same_image_questions_neg"]
        question_negs[idx][:len(neg_ids)] = neg_ids
        try:
            same_image_questions_neg[idx][:len(same_image_ids)] = same_image_ids
        except:
            import pdb
            pdb.set_trace()
    
#     cPickle.dump({"qids": question_ids, 
#                 "question_negs": question_negs, 
#                 "same_image_questions_neg": same_image_questions_neg}, 
#          open(save_negs_paths[key], "wb"), protocol=4)

Save Path: ../../datasets/VQA/cc-re/cc_re_val_question_negs.pkl
> <ipython-input-36-9146fd9586ac>(16)<module>()
-> for idx in tqdm(range(len(question_negs))):
(Pdb) n


  0%|          | 0/445214 [00:00<?, ?it/s]

> <ipython-input-36-9146fd9586ac>(17)<module>()
-> if key == "trainval":
(Pdb) n
> <ipython-input-36-9146fd9586ac>(20)<module>()
-> neg_ids = question_values[idx][f"top_k_questions_neg_{key}"]
(Pdb) n
> <ipython-input-36-9146fd9586ac>(21)<module>()
-> same_image_ids = question_values[idx]["same_image_questions_neg"]
(Pdb) n
> <ipython-input-36-9146fd9586ac>(22)<module>()
-> question_negs[idx][:len(neg_ids)] = neg_ids
(Pdb) neg_ids
[158118002, 156236006, 533550005, 388531008, 166344001, 9483014, 43957000, 562243002, 196836004, 218294001, 418535000, 424130020000000000, 417023009, 3737130160000000000, 505788002, 42413002, 160652004, 3807060000000000000, 302882003, 204726006, 3737130000000000000, 19712005, 1505380030000000000, 447553033, 373713000, 150538003, 122549003, 32464000, 504900003, 4778050190000000000, 405740015, 4057400150000000000, 477805019, 70791002, 122476002, 2047260060000000000, 5493270100000000000, 32609001, 1663440010000000000, 4634540030000000002, 4634540030000000001, 46

  0%|          | 1/445214 [00:11<1401:59:41, 11.34s/it]

> <ipython-input-36-9146fd9586ac>(17)<module>()
-> if key == "trainval":
(Pdb) 
> <ipython-input-36-9146fd9586ac>(20)<module>()
-> neg_ids = question_values[idx][f"top_k_questions_neg_{key}"]
(Pdb) 
> <ipython-input-36-9146fd9586ac>(21)<module>()
-> same_image_ids = question_values[idx]["same_image_questions_neg"]
(Pdb) 
> <ipython-input-36-9146fd9586ac>(22)<module>()
-> question_negs[idx][:len(neg_ids)] = neg_ids
(Pdb) 
> <ipython-input-36-9146fd9586ac>(23)<module>()
-> try:
(Pdb) 
> <ipython-input-36-9146fd9586ac>(24)<module>()
-> same_image_questions_neg[idx][:len(same_image_ids)] = same_image_ids
(Pdb) 
> <ipython-input-36-9146fd9586ac>(16)<module>()
-> for idx in tqdm(range(len(question_negs))):
(Pdb) 


  0%|          | 2/445214 [00:15<1130:17:00,  9.14s/it]

> <ipython-input-36-9146fd9586ac>(17)<module>()
-> if key == "trainval":
(Pdb) 
> <ipython-input-36-9146fd9586ac>(20)<module>()
-> neg_ids = question_values[idx][f"top_k_questions_neg_{key}"]
(Pdb) 
> <ipython-input-36-9146fd9586ac>(21)<module>()
-> same_image_ids = question_values[idx]["same_image_questions_neg"]
(Pdb) 
> <ipython-input-36-9146fd9586ac>(22)<module>()
-> question_negs[idx][:len(neg_ids)] = neg_ids
(Pdb) 
> <ipython-input-36-9146fd9586ac>(23)<module>()
-> try:
(Pdb) 
> <ipython-input-36-9146fd9586ac>(24)<module>()
-> same_image_questions_neg[idx][:len(same_image_ids)] = same_image_ids
(Pdb) 
> <ipython-input-36-9146fd9586ac>(16)<module>()
-> for idx in tqdm(range(len(question_negs))):
(Pdb) 


  0%|          | 3/445214 [00:16<844:15:47,  6.83s/it] 

> <ipython-input-36-9146fd9586ac>(17)<module>()
-> if key == "trainval":
(Pdb) 
> <ipython-input-36-9146fd9586ac>(20)<module>()
-> neg_ids = question_values[idx][f"top_k_questions_neg_{key}"]
(Pdb) 
> <ipython-input-36-9146fd9586ac>(21)<module>()
-> same_image_ids = question_values[idx]["same_image_questions_neg"]
(Pdb) question_negs[0]
array([          158118002,           156236006,           533550005,
                 388531008,           166344001,             9483014,
                  43957000,           562243002,           196836004,
                 218294001,           418535000,  424130020000000000,
                 417023009, 3737130160000000000,           505788002,
                  42413002,           160652004, 3807060000000000000,
                 302882003,           204726006, 3737130000000000000,
                  19712005, 1505380030000000000,           447553033,
                 373713000,           150538003,           122549003,
                  32464000,    

BdbQuit: 

In [58]:
idx = 2
print(f"Original: {question_dict[question_ids[idx]]} \n -----------------")
nids = question_negs[idx]
for _id in nids:
    if _id > 0:
        print(_id)
        print(question_dict[_id])

Original: What is he on top of? 
 -----------------
198782000
What is on top of his head?
566518000
How high is the man?
576248001
How high is the man?
219622000
How high off the ground is the man?
572477002
What are the people on top of?
508101018
What are the people on top of?
458750004
What are the people on top of?
153543010
What are the people on top of?
477805026
What is he on top of?
34299000
What are they sitting atop?
343076001
What is the animal on top of?
538122000
How high is he?
434576007
What is the animal on top of?
222370062
What is the man on top of?
552947004
What is on top of his head?
564163002
What is the man sitting on top of?
44579000
What is on top of his head?
125208002
What is the man sitting on top of?
219426001
What is the man on top of?
472860002
How high off the ground is the man?
1527320000000000001
What is the object on the top of the top the?
353595003
What is on top of his head?
465489024
What is the man standing on top of?
447242004
What surface is he

In [54]:
question_negs[0][:10]

array([158118002, 156236006, 533550005, 388531008, 166344001,   9483014,
        43957000, 562243002, 196836004, 218294001])

In [35]:
question_negs[3000]

array([5789720000000000000, 4478830030000000000, 4607030020000000000,
       5258070030000000000, 3308810200000000000,   46940000000000000,
        564350320000000000,           139907014,           374326010,
                 513138012, 1608660040000000000,           193047009,
                 525807003,           207620021,  635710010000000002,
       3159940140000000000,           315994014, 3308810200000000001,
                 447883003, 4478830030000000001,           573430003,
                 129068001, 5000770030000000001,           573749022,
       1354750020000000000,           367518000, 1512890040000000001,
                   4694000, 3875170180000000001, 1239211330000000001,
       1239211330000000000, 3875170180000000000,           123921133,
                 145911004, 5789720000000000001,            75521028,
                 268195000, 5040300000000000001, 5725750000000000000,
                 484476004,           578972000, 4761700090000000001,
       3260980020000