In [1]:
import itertools
from collections import defaultdict
from tqdm import tqdm
import _pickle as cPickle
import os

# decontract, depunctuate
filter_type = "dcp"

# sampling: ["top", "random"]
sampling_type = "random"

# Substituted, in this order, into every "{}" pair of the templates below.
_variant = (filter_type, sampling_type)

# Dumping Paths
# split -> (pickle path under back-translate/, json path under datasets/VQA/)
# Key order ("val" first, then "train") is kept as-is because later cells zip over .values().
que_split_path_dict = {
    "val": (
        "../../datasets/VQA/back-translate/bt_fil_{}_sampling_{}_v2_OpenEnded_mscoco_val2014_questions.pkl".format(*_variant),
        "../../datasets/VQA/v2_OpenEnded_mscoco_val2014_questions.json",
    ),
    "train": (
        "../../datasets/VQA/back-translate/bt_fil_{}_sampling_{}_v2_OpenEnded_mscoco_train2014_questions.pkl".format(*_variant),
        "../../datasets/VQA/v2_OpenEnded_mscoco_train2014_questions.json",
    ),
}


# split -> (pickle path under back-translate/, pickle path under cache/)
ans_split_path_dict = {
    "val": (
        "../../datasets/VQA/back-translate/bt_fil_{}_sampling_{}_val_target.pkl".format(*_variant),
        "../../datasets/VQA/cache/val_target.pkl",
    ),
    "train": (
        "../../datasets/VQA/back-translate/bt_fil_{}_sampling_{}_train_target.pkl".format(*_variant),
        "../../datasets/VQA/cache/train_target.pkl",
    ),
}



# Answer <-> label lookup tables (pickles under datasets/VQA/cache).
ans2label_path = os.path.join("../../datasets/VQA/", "cache", "trainval_ans2label.pkl")
label2ans_path = os.path.join("../../datasets/VQA/", "cache", "trainval_label2ans.pkl")
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
# `with` closes each handle as soon as the pickle has been read.
with open(ans2label_path, "rb") as fh:
    ans2label = cPickle.load(fh)
with open(label2ans_path, "rb") as fh:
    label2ans = cPickle.load(fh)

# question_id -> labels, merged over every split
answer_dict = {}
# use later
answers_data = []
for split_paths in ans_split_path_dict.values():
    # First tuple element is the back-translate pickle for the split.
    with open(split_paths[0], "rb") as fh:
        answers = cPickle.load(fh)
    # Keep the object exactly as loaded; it is paired with the questions later.
    answers_data.append(answers)
    # The loaded object is an iterable of iterables of answer dicts; flatten it for indexing.
    for ans in itertools.chain.from_iterable(answers):
        answer_dict[ans["question_id"]] = ans["labels"]
print("Read Answers")

negs_path = "../../datasets/VQA/back-translate/train_val_question_negs_fil_{}_sampling_{}.pkl" \
    .format(filter_type, sampling_type)
# `with` closes the handle once the pickle has been read.
with open(negs_path, "rb") as fh:
    negs_data = cPickle.load(fh)

# question_id -> (sim_scores, sim_qids), built from the three parallel sequences in the pickle
negs_dict = {
    qid: (sim_scores, sim_qids)
    for qid, sim_scores, sim_qids in zip(
        negs_data["qids"], negs_data["sim_scores"], negs_data["sim_qids"]
    )
}
print("Read Negatives Matrix")

# create dicts
# image_id -> list with the minimum question_id of every question group on that image
image_dict = defaultdict(list)
questions_rephrasings = defaultdict(list)
# question_id -> question text
question_dict = {}

# use later
questions_data = []
# split -> {question_id: None}; only used for membership tests
qids_dict = {}
for split, que_path in que_split_path_dict.items():
    # First tuple element is the back-translate pickle; `with` closes the handle after loading.
    with open(que_path[0], "rb") as fh:
        data = cPickle.load(fh)
    questions_data.append(data)
    questions_list = data["questions"]
    split_qids = {}

    # Index every group of questions. Each group must belong to a single image
    # (presumably a question plus its rephrasings -- confirm against the producer
    # of these pickles).
    for _questions in questions_list:
        # only keep the min-qid in same-image ids
        min_qid = min(x["question_id"] for x in _questions)
        assert len({x["image_id"] for x in _questions}) == 1
        image_dict[_questions[0]["image_id"]].append(min_qid)
        for _que in _questions:
            question_dict[_que["question_id"]] = _que["question"]
            split_qids[_que["question_id"]] = None
    qids_dict[split] = split_qids
print("Read Questions")



Read Answers
Read Negatives Matrix
Read Questions


In [2]:
# Accumulators filled by filter_negatives(): question_id -> dict of negative lists.
save_negs_dict_train = {}
save_negs_dict_val = {}
save_negs_dict_trainval = {}

def filter_negatives(sample):
    """Collect the answer-disjoint negatives of one question and record them.

    A candidate question counts as a negative when its labels share nothing
    with the labels of ``sample``. Two candidate pools are filtered:

    * the ids stored in ``image_dict`` for the sample's image (the sample's own
      id is skipped), saved under ``"same_image_questions_neg"``;
    * the similar-question ids stored in ``negs_dict`` for the sample, saved
      under ``"top_k_questions_neg"`` and, restricted to the sample's own
      split, under ``"top_k_questions_neg_train"`` or
      ``"top_k_questions_neg_val"``.

    The result dict is stored in ``save_negs_dict_trainval`` and in the
    accumulator of the sample's split; both entries are the same dict object.

    Args:
        sample: mapping with at least ``"question_id"`` and ``"image_id"``.

    Returns:
        True when the sample has no entry in ``negs_dict`` (nothing is
        recorded and the caller should drop it), False otherwise.

    Raises:
        KeyError: if a question id has no entry in ``answer_dict``.
        ValueError: if the sample, or one of its kept candidates, belongs to
            neither ``qids_dict["train"]`` nor ``qids_dict["val"]``.
    """
    question_id = sample["question_id"]

    # filter same-image questions
    same_image_ids = image_dict[sample["image_id"]]
    # Build the reference set once instead of once per candidate.
    ref_answers = set(answer_dict[question_id])
    fil_same_image_ids = [
        qid
        for qid in same_image_ids
        if qid != question_id and ref_answers.isdisjoint(answer_dict[qid])
    ]

    # filter top-k questions
    if question_id not in negs_dict:
        return True

    # The similarity scores are not needed here, only the candidate ids.
    _, top_k_questions = negs_dict[question_id]
    fil_top_k_questions = []
    fil_top_k_questions_train = []
    fil_top_k_questions_val = []

    for qid in top_k_questions:
        if not ref_answers.isdisjoint(answer_dict[qid]):
            continue
        if qid in qids_dict["train"]:
            fil_top_k_questions_train.append(qid)
        elif qid in qids_dict["val"]:
            fil_top_k_questions_val.append(qid)
        else:
            # Fail loudly instead of dropping into a debugger.
            raise ValueError(
                "candidate question_id {!r} is in neither the train nor the val split".format(qid)
            )
        fil_top_k_questions.append(qid)

    save_dict = {
        "same_image_questions_neg": fil_same_image_ids,
        "top_k_questions_neg": fil_top_k_questions,
    }

    # Only the split-specific list matching the sample's own split is kept.
    if question_id in qids_dict["train"]:
        save_dict["top_k_questions_neg_train"] = fil_top_k_questions_train
        save_negs_dict_train[question_id] = save_dict
    elif question_id in qids_dict["val"]:
        save_dict["top_k_questions_neg_val"] = fil_top_k_questions_val
        save_negs_dict_val[question_id] = save_dict
    else:
        raise ValueError(
            "question_id {!r} is in neither the train nor the val split".format(question_id)
        )
    save_negs_dict_trainval[question_id] = save_dict
    return False

In [3]:
# Walk the splits in the order they were loaded; the four sequences are parallel.
for data, answers, que_paths, ans_paths in zip(questions_data,
                                               answers_data,
                                               que_split_path_dict.values(),
                                               ans_split_path_dict.values()):
    # First tuple element of each path pair is the back-translate pickle.
    que_path, ans_path = que_paths[0], ans_paths[0]

    print("Questions Path: ", que_path)
    print("Answers Path: ", ans_path)
    questions_list = data["questions"]

    # add "rephrasing_of" key
    for _questions in questions_list:
        rep_id = min(s['question_id'] for s in _questions)
        for _que in _questions:
            _que["rephrasing_of"] = rep_id

    assert len(questions_list) == len(answers)

    # remove questions w/o negatives
    # filter_negatives() also records the negatives of every kept question in the
    # save_negs_dict_* accumulators, so this loop must run even if nothing is dropped.
    num_dropped = 0
    for idx in tqdm(range(len(questions_list)), total=len(questions_list)):
        _updated_ques = []
        _updated_answers = []

        for _que, _ans in zip(questions_list[idx], answers[idx]):
            if filter_negatives(_que):
                # Dropped samples are counted and reported below rather than
                # stopping in a debugger.
                num_dropped += 1
                continue
            _updated_ques.append(_que)
            _updated_answers.append(_ans)
        questions_list[idx] = _updated_ques
        answers[idx] = _updated_answers

    if num_dropped:
        print("Dropped {} questions without negatives".format(num_dropped))

# This was a hacky filtering procedure for removing unused samples, not needed any further.
#     cPickle.dump(data, open(que_path, "wb"), protocol=2)
#     cPickle.dump(answers, open(ans_path, "wb"), protocol=2)
#     print(f"Dumped: {que_path}")
#     print(f"Dumped: {ans_path}")

Questions Path:  ../../datasets/VQA/back-translate/bt_fil_dcp_sampling_random_v2_OpenEnded_mscoco_val2014_questions.pkl
Answers Path:  ../../datasets/VQA/back-translate/bt_fil_dcp_sampling_random_val_target.pkl


100%|██████████| 214354/214354 [19:32<00:00, 182.75it/s] 


Questions Path:  ../../datasets/VQA/back-translate/bt_fil_dcp_sampling_random_v2_OpenEnded_mscoco_train2014_questions.pkl
Answers Path:  ../../datasets/VQA/back-translate/bt_fil_dcp_sampling_random_train_target.pkl


100%|██████████| 443757/443757 [39:29<00:00, 187.30it/s]  


In [4]:
# Output file template per split; both "{}" slots take (filter_type, sampling_type).
_save_negs_templates = {
    "val": "../../datasets/VQA/back-translate/fil_{}_sampling_{}_val_question_negs.pkl",
    "train": "../../datasets/VQA/back-translate/fil_{}_sampling_{}_train_question_negs.pkl",
    "trainval": "../../datasets/VQA/back-translate/fil_{}_sampling_{}_trainval_question_negs.pkl",
}

# split -> path the filtered negatives of that split are dumped to
save_negs_paths = {
    split: template.format(filter_type, sampling_type)
    for split, template in _save_negs_templates.items()
}

In [5]:
# Sanity check: length of the longest filtered top-k negative list among the val questions.
values = [*save_negs_dict_val.values()]
lens = list(map(len, (entry["top_k_questions_neg"] for entry in values)))
max(lens)

300

In [6]:
import numpy as np

# not saved "trainval" file
# NOTE(review): the comment above is kept from the original, but the loop does
# write the "trainval" path as well -- confirm which is intended.

# Width of the dumped matrices; rows are right-padded with -1.
MAX_NEGS = 300

# Explicit split -> accumulator mapping instead of a globals() lookup by name.
_negs_by_split = {
    "val": save_negs_dict_val,
    "train": save_negs_dict_train,
    "trainval": save_negs_dict_trainval,
}

for key, save_path in save_negs_paths.items():
    # Named `split_negs` so the module-level `negs_dict` that filter_negatives()
    # reads is not overwritten by this cell.
    split_negs = _negs_by_split[key]
    print(f"Save Path: {save_path}")

    question_ids = list(split_negs.keys())
    question_values = list(split_negs.values())
    question_negs = np.full((len(question_ids), MAX_NEGS), -1, dtype=np.int64)
    same_image_questions_neg = np.full((len(question_ids), MAX_NEGS), -1, dtype=np.int64)

    # "trainval" uses the combined list; "train"/"val" use their split-specific list.
    # (The original assigned the combined list to a misspelled name, so the
    # "trainval" rows were filled from a stale variable.)
    if key == "trainval":
        top_k_key = "top_k_questions_neg"
    else:
        top_k_key = f"top_k_questions_neg_{key}"

    for idx in tqdm(range(len(question_values))):
        neg_ids = question_values[idx][top_k_key]
        same_image_ids = question_values[idx]["same_image_questions_neg"]
        # Fail with context instead of a bare except that opens a debugger.
        if len(neg_ids) > MAX_NEGS or len(same_image_ids) > MAX_NEGS:
            raise ValueError(
                "question_id {!r} has {} top-k and {} same-image negatives; at most {} fit".format(
                    question_ids[idx], len(neg_ids), len(same_image_ids), MAX_NEGS
                )
            )
        question_negs[idx][:len(neg_ids)] = neg_ids
        same_image_questions_neg[idx][:len(same_image_ids)] = same_image_ids

    # `with` flushes and closes the file before the next split is processed.
    with open(save_path, "wb") as fh:
        cPickle.dump({"qids": question_ids,
                      "question_negs": question_negs,
                      "same_image_questions_neg": same_image_questions_neg},
                     fh, protocol=4)

Save Path: ../../datasets/VQA/back-translate/fil_dcp_sampling_random_val_question_negs.pkl


100%|██████████| 810454/810454 [00:18<00:00, 44905.57it/s]


Save Path: ../../datasets/VQA/back-translate/fil_dcp_sampling_random_train_question_negs.pkl


100%|██████████| 1680018/1680018 [00:57<00:00, 29098.97it/s]


Save Path: ../../datasets/VQA/back-translate/fil_dcp_sampling_random_trainval_question_negs.pkl


100%|██████████| 2490472/2490472 [01:12<00:00, 34304.73it/s]
