In [1]:
import pandas as pd
from collections import defaultdict
from copy import deepcopy

import numpy as np
import json
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
import _pickle as cPickle


# Back-translated question pickles, keyed by dataset split name.
que_split_path_dict = dict(
    minval="../../datasets/VQA/back-translate/org2_bt_v2_OpenEnded_mscoco_minval2014_questions.pkl",
    train="../../datasets/VQA/back-translate/org2_bt_v2_OpenEnded_mscoco_train2014_questions.pkl",
    val="../../datasets/VQA/back-translate/org2_bt_v2_OpenEnded_mscoco_val2014_questions.pkl",
    test="../../datasets/VQA/back-translate/org2_bt_v2_OpenEnded_mscoco_test2015_questions.pkl",
    trainval="../../datasets/VQA/back-translate/org2_bt_v2_OpenEnded_mscoco_trainval2014_questions.pkl",
)

# Output locations for the per-split analysis tables, keyed by dataset split name.
dump_path_dict = dict(
    minval="../../datasets/VQA/back-translate/analyze_org2_bt_v2_OpenEnded_mscoco_minval2014_questions.pkl",
    train="../../datasets/VQA/back-translate/analyze_org2_bt_v2_OpenEnded_mscoco_train2014_questions.pkl",
    val="../../datasets/VQA/back-translate/analyze_org2_bt_v2_OpenEnded_mscoco_val2014_questions.pkl",
    test="../../datasets/VQA/back-translate/analyze_org2_bt_v2_OpenEnded_mscoco_test2015_questions.pkl",
    trainval="../../datasets/VQA/back-translate/analyze_org2_bt_v2_OpenEnded_mscoco_trainval2014_questions.pkl",
)

In [18]:
# Per-language back-translation summary for one question split.
#
# Result: `df` holds one row per question plus a final summary row. Every cell
# is a `(sim_score, position)` tuple, where `position` is the 1-based index of
# the entry inside the question's row. The last row (`df.iloc[-1]`) holds the
# per-language `(average sim_score, average position)`. Columns are the
# languages that have at least one value, ordered by descending average
# sim_score; `org_question_id` is not part of the final frame.

# get all languages
# NOTE(security): allow_pickle=True unpickles objects stored in the .npy file,
# which can execute arbitrary code. Only load files from a trusted source.
lang_seqs = np.load("lang_seqs.npy", allow_pickle=True)
all_langs = list(np.concatenate(lang_seqs))

split = "minval"
print(f"Processing: {split}")
# NOTE(security): unpickling can execute arbitrary code; the path must point to
# a trusted file. The context manager closes the handle deterministically.
with open(que_split_path_dict[split], "rb") as pkl_file:
    data = cPickle.load(pkl_file)
questions = data["questions"]

# Build one plain dict per question and create the frame once afterwards.
# Growing a DataFrame row by row is quadratic, and `DataFrame.append` no
# longer exists in pandas >= 2.0.
records = []
for row in tqdm(questions):
    record = {"org_question_id": row[0]["question_id"]}
    for idx, item in enumerate(row):
        pair = (item["sim_score"], idx + 1)
        for lang in item["languages"]:
            # A language is expected at most once per question; should it
            # repeat, the first occurrence is kept.
            record.setdefault(lang, pair)
    records.append(record)

df = pd.DataFrame(records)
# Column layout: id first, then the known languages, then any language that
# only shows up in the data (in order of first appearance).
column_order = list(
    dict.fromkeys(["org_question_id"] + all_langs + list(df.columns))
)
df = df.reindex(columns=column_order)

# Average the (sim_score, position) pairs of every language that has data.
lang_summary = {}
for col_name in df.columns:
    if col_name == "org_question_id":
        continue

    # remove NAN and empty tuples
    pairs = [pair for pair in df[col_name].dropna() if pair != ()]
    if not pairs:
        continue

    sim_scores, ranks = zip(*pairs)
    lang_summary[col_name] = (
        sum(sim_scores) / len(sim_scores),
        sum(ranks) / len(ranks),
    )

# Append the averages as the final row; rows get a clean 0..n index.
summary_row = pd.DataFrame(
    [list(lang_summary.values())], columns=list(lang_summary)
)
df = pd.concat([df, summary_row], ignore_index=True)

# Keep only the languages that received a summary value. This drops
# `org_question_id` and languages without data, whose last-row cell is NaN.
fil_langs = list(lang_summary)
df = df[fil_langs]

# sort based on descending similarity
avg_sims = [lang_summary[lang][0] for lang in fil_langs]
sorted_inds = np.argsort(avg_sims)
sorted_langs = np.flip(np.array(fil_langs)[sorted_inds])
df = df.reindex(sorted_langs, axis=1)

dump_path = dump_path_dict[split]
# Persisting the table is currently disabled:
# df.to_pickle(dump_path)
print(df.iloc[-1])
print("-"*80)
# print(f"Dumped: {dump_path}")




Processing: minval


100%|██████████| 3000/3000 [01:12<00:00, 32.80it/s]


en                                   (1.0, 1.0)
nl      (0.9647873496757869, 4.669947275922671)
sv      (0.9643575454016304, 4.868016759776537)
de      (0.9633686107722961, 4.796119929453263)
da      (0.9633143910506267, 4.902309058614565)
                         ...                   
kqn    (0.8520042200254488, 22.455696202531644)
tll    (0.8497816105683644, 23.113793103448277)
lue    (0.8469826591497212, 22.804624277456647)
tiv            (0.8464531257748604, 23.8296875)
kwy    (0.8456927291813865, 23.810699588477366)
Name: 0, Length: 85, dtype: object
--------------------------------------------------------------------------------


In [33]:
# Split the summary row (last row of `df`) into two single-row frames:
# element 0 of every cell goes to the similarity frame, element 1 to the
# rank frame. Both frames share the column labels of `df`.
summary_cells = df.iloc[-1].tolist()
langs = df.keys()

sims = [cell[0] for cell in summary_cells]
ranks = [cell[1] for cell in summary_cells]

df_sim_plot = pd.DataFrame(data=[sims], columns=langs)
df_rank_plot = pd.DataFrame(data=[ranks], columns=langs)

In [34]:
import matplotlib.pyplot as plt
import pandas as pd
# Write the similarity and rank summaries next to the back-translation data.
for summary_frame, csv_path in (
    (df_sim_plot, "../../datasets/VQA/back-translate/analyze_minval_sim.csv"),
    (df_rank_plot, "../../datasets/VQA/back-translate/analyze_minval_rank.csv"),
):
    summary_frame.to_csv(csv_path)



In [44]:
# Lift the row/column display limits only for this print, so the whole
# similarity frame is shown; further (option, value) pairs can be appended.
unlimited_display = ("display.max_rows", None, "display.max_columns", None)
with pd.option_context(*unlimited_display):
    print(df_sim_plot)


    en        nl        sv        de        da        ga        fr       it  \
0  1.0  0.964787  0.964358  0.963369  0.963314  0.963301  0.961826  0.96041   

         af   ROMANCE        fi       bg        cs        sk        et  \
0  0.958322  0.956075  0.955085  0.95468  0.953235  0.953186  0.952776   

         mk        hu        mt        id        sq        eo        ru  \
0  0.948876  0.947255  0.946773  0.946217  0.945489  0.945208  0.942523   

         tl        ca        uk        is        eu       ilo       ceb  \
0  0.942449  0.939547  0.938884  0.937591  0.931664  0.930426  0.930184   

         gl       hil        sn       bcl        ml        tn        ts  \
0  0.929355  0.916505  0.909547  0.906534  0.905973  0.905406  0.902225   

        nso        st        mg        ee       crs        to        ny  \
0  0.901855  0.899871  0.898715  0.898538  0.896045  0.895879  0.895239   

        pis       bzs        sm        cy       pag       run        ti  \
0  0.894123  