# Create Table `table-known-item-qpp-effectiveness`

In [1]:
import pandas as pd
from copy import deepcopy
import json
import numpy as np


DATA_DIR = '/mnt/ceph/storage/data-in-progress/data-research/web-search/false-memories/reddit-tomt/tomt-dataset-26-01-2023/qpp-sample/'

# Number of leading records of every category that go into the combined 'all-categories' set.
QUERIES_PER_CATEGORY_IN_COMBINED_SET = 25


def _read_jsonl(path):
    """Parse every line of the JSON-lines file at `path` and return the records as a list.

    The file is opened in a `with` block so the handle is closed as soon as
    parsing finishes instead of being left to the garbage collector.
    """
    with open(path, 'r') as lines:
        return [json.loads(line) for line in lines]


# Prediction records indexed by their 'qid' field.
qid_to_prediction = {i['qid']: i for i in _read_jsonl(DATA_DIR + '/qpptk-queries-predictions.jsonl')}

pre_retrieval_predictors = ['max-idf', 'avg-idf', 'scq', 'max-scq', 'avg-scq', 'var', 'max-var', 'avg-var']

# category name -> {qid: record}; the combined set is appended last so that the
# per-category entries keep their position in the iteration order.
test_sets = {}
all_categories = []

for test_set in ['music', 'game', 'movie', 'book']:
    records = _read_jsonl(f'{DATA_DIR}/{test_set}-test.jsonl')
    all_categories += records[:QUERIES_PER_CATEGORY_IN_COMBINED_SET]
    test_sets[test_set] = {i['qid']: i for i in records}

test_sets['all-categories'] = {i['qid']: i for i in all_categories}


In [2]:
def get_corr(df, field):
    """Correlate the 'time_to_solve' column of `df` with the column `field`.

    Args:
        df: DataFrame containing a 'time_to_solve' column and a `field` column.
        field: Name of the column to correlate against 'time_to_solve'.

    Returns:
        A dict with the keys 'pearson', 'kendall' and 'spearman' (in that
        order), each mapped to the corresponding coefficient as a float.

    Raises:
        ValueError: If the first row of the correlation matrix is not the
            'time_to_solve' row.
    """
    def _coefficient(method):
        # First row of the 2x2 correlation matrix: time_to_solve vs. both columns.
        first_row = df[['time_to_solve', field]].corr(method=method).iloc[0]
        if first_row.name != 'time_to_solve':
            raise ValueError('Df does not have expected shape')
        return float(first_row.to_dict()[field])

    return {method: _coefficient(method) for method in ('pearson', 'kendall', 'spearman')}

def report_for_test_set(data_type):
    """Build a DataFrame joining the queries of one test set with their predictions.

    Args:
        data_type: Key into the module-level `test_sets` dict.

    Returns:
        A DataFrame with one row per query of `test_sets[data_type]` whose
        stringified qid is present in `qid_to_prediction`. Queries without a
        prediction are left out.
    """
    rows = []
    for qid, query in test_sets[data_type].items():
        prediction_key = str(qid)
        if prediction_key in qid_to_prediction:
            # Work on a copy so the record stored in test_sets stays untouched.
            row = deepcopy(query)

            # Prediction fields only fill gaps; values already on the query win.
            for name, value in qid_to_prediction[prediction_key].items():
                row.setdefault(name, value)

            # A missing or None time is replaced by a large sentinel value.
            # NOTE(review): presumably this marks queries that were never solved - confirm.
            if row.get('time_to_solve') is None:
                row['time_to_solve'] = 9999999999

            rows.append(row)

    return pd.DataFrame(rows)


# Preview the joined query/prediction frame for a single test set ('music').
df = report_for_test_set('music')
df

Unnamed: 0,text,time_to_solve,qid,max-idf,avg-idf,scq,max-scq,avg-scq,var,max-var,avg-var
0,90's (2000?) music video on a beach/pool I can...,9999999999,1,12.078296,4.884315,993.816316,54.393068,36.808012,49.878323,4.301096,1.108407
1,Alternative / Alt Rock set in 40s or 50s (Grea...,9999999999,2,12.483761,4.889403,1350.791585,52.371773,35.547147,66.634028,4.252976,0.748697
2,Song with youth in either the title or band na...,9999999999,3,12.078296,5.664119,1142.117466,52.883936,36.842499,66.740376,7.893236,1.259252
3,Just recently heard this song in a mall. This ...,9999999999,4,13.176908,5.736310,1149.329142,54.696734,35.916536,66.267754,7.349427,1.142547
4,Cant find a song. So I realize this is probabl...,3105,5,12.483761,4.115772,1772.171104,54.393068,32.817983,94.980544,7.349427,0.896043
...,...,...,...,...,...,...,...,...,...,...,...
95,Looking for the original base music used in th...,9999999999,96,12.078296,6.019934,827.337322,54.393068,37.606242,57.373321,7.349427,1.303939
96,Oddly creepy carnival-esque theme I've attempt...,9999999999,97,11.790614,5.854648,467.518752,51.418904,38.959896,28.748587,4.252976,1.064762
97,Does anyone know what this piano song is calle...,9999999999,98,6.494800,4.027536,199.984704,51.323532,39.996941,9.756019,3.225641,0.975602
98,Famous classical piece A famous classical piec...,9999999999,99,12.078296,4.737665,268.675562,51.418904,33.584445,9.399138,4.252976,0.522174


In [3]:
# One entry per test set: the category label plus, for every pre-retrieval
# predictor, its three correlation coefficients keyed by (predictor, method).
report = {}

for test_type in test_sets:
    df = report_for_test_set(test_type)

    # The label is the part before the first '-', e.g. 'all-categories' -> 'all'.
    category = test_type.split('-')[0]
    entry = {'category': category}
    for predictor in pre_retrieval_predictors:
        entry.update({(predictor, method): value for method, value in get_corr(df, predictor).items()})
    report[category] = entry

df_report = pd.DataFrame(list(report.values()))
df_report

Unnamed: 0,category,"(max-idf, pearson)","(max-idf, kendall)","(max-idf, spearman)","(avg-idf, pearson)","(avg-idf, kendall)","(avg-idf, spearman)","(scq, pearson)","(scq, kendall)","(scq, spearman)",...,"(avg-scq, spearman)","(var, pearson)","(var, kendall)","(var, spearman)","(max-var, pearson)","(max-var, kendall)","(max-var, spearman)","(avg-var, pearson)","(avg-var, kendall)","(avg-var, spearman)"
0,music,0.019781,0.040471,0.0484,0.09063,0.053437,0.072288,0.015245,0.068704,0.092326,...,-0.076151,0.021953,0.065651,0.090606,-0.017064,0.007432,0.010266,-0.049554,-0.060053,-0.073598
1,game,0.179761,0.14948,0.169995,0.032462,0.043925,0.058079,0.032641,0.045066,0.059628,...,-0.085148,0.035056,0.0502,0.068319,0.04623,0.050761,0.057167,0.048934,-0.045066,-0.049925
2,movie,-0.195318,-0.063918,-0.079259,-0.16201,-0.210695,-0.308588,-0.022113,0.030099,0.050873,...,-0.038978,-0.001597,0.039983,0.058602,0.0418,0.042348,0.056683,0.014206,-0.057952,-0.074255
3,book,0.10181,0.158318,0.18156,-0.131812,-0.09421,-0.126049,0.014359,0.044062,0.061971,...,-0.156119,0.006549,0.018745,0.033077,-0.029911,0.000812,-0.000335,-0.164086,-0.108816,-0.156251
4,all,0.073893,0.055831,0.064369,0.067807,0.015574,0.022004,0.090814,0.11381,0.159226,...,-0.063914,0.1023,0.124832,0.175366,0.026852,0.079367,0.101219,0.004588,-0.061098,-0.073308


In [6]:
def f(i):
    """Format a number with two decimals and without the leading zero.

    Examples: 0.25 -> '.25', -0.25 -> '-.25', 1.0 -> '1.00', 10.5 -> '10.50'.

    Only an integer part that is exactly zero is dropped. A blanket
    replace of '0.' would also eat the last digit of integer parts ending in
    zero and turn '10.50' into '1.50'.

    Args:
        i: The number to format.

    Returns:
        The formatted string.
    """
    text = "{:.2f}".format(i)
    if text.startswith('0.'):
        return text[1:]
    if text.startswith('-0.'):
        return '-' + text[2:]
    return text

def table_line(display_name, internal_name):
    """Render one LaTeX table row for a category.

    Args:
        display_name: Label printed in the first column of the row.
        internal_name: Key of the category in the module-level `report` dict.

    Returns:
        The row as a string: the label followed by one '& value ' cell per
        (predictor, correlation method) pair, terminated by a LaTeX line break.
    """
    qpp_methods = ('max-idf', 'avg-idf', 'scq', 'avg-scq', 'var', 'max-var', 'avg-var')
    correlation_methods = ('kendall', 'spearman', 'pearson')
    scores = report[internal_name]

    cells = [
        '& ' + f(scores[(qpp_method, m)]) + ' '
        for qpp_method in qpp_methods
        for m in correlation_methods
    ]

    return display_name + ' ' + ''.join(cells) + '\\\\'

def latex_table():
    """Assemble the LaTeX source of the table as a single string.

    The result consists of a fixed table preamble and header, one row per
    category rendered by table_line (Movies, Music, Books, Games, then a
    midrule followed by All), and the closing lines of the table.

    NOTE(review): the caption still carries a red ToDo marker and speaks of
    '4 methods' while the header lists seven predictors - confirm the wording.

    Returns:
        The LaTeX source as a string.
    """
    return '''\\begin{table*}[t]%
\\centering%
\\footnotesize%
\\renewcommand{\\tabcolsep}{2.4pt}%
\\caption{Kendall rank correlation coefficient ($\\tau$), Spearman's rank correlation ($\\rho$), Pearson correlation coefficient ($r$), {\\color{red} ToDo (Maik): } The effectiveness of 4 methods in the known-item qpp effectiveness task.}%
\\label{table-known-item-qpp-effectiveness}%
\\begin{tabular}[t]{@{}l@{\\hspace*{.15cm}}rrrrrrrrrrrrrrrrrrrrr@{}}
\\toprule
\\textbf{Category} & \\multicolumn{3}{c@{\\hspace{1em}}}{\\textbf{max-idf}} & \\multicolumn{3}{c@{\\hspace{1em}}}{\\textbf{avg-idf}} & \\multicolumn{3}{c@{\\hspace{1em}}}{\\textbf{scq}} & \\multicolumn{3}{c@{\\hspace{1em}}}{\\textbf{avg-scq}} & \\multicolumn{3}{c@{\\hspace{1em}}}{\\textbf{var}} & \\multicolumn{3}{c@{\\hspace{1em}}}{\\textbf{max-var}} & \\multicolumn{3}{c@{\\hspace{1em}}}{\\textbf{avg-var}}\\\\

\\cmidrule(l@{\\tabcolsep}r@{.5em}){2-4}
\\cmidrule(l@{\\tabcolsep}r@{.5em}){5-7}
\\cmidrule(l@{\\tabcolsep}r@{.5em}){8-10}
\\cmidrule(l@{\\tabcolsep}r@{.5em}){11-13}
\\cmidrule(l@{\\tabcolsep}r@{.5em}){14-16}
\\cmidrule(l@{\\tabcolsep}r@{.5em}){17-19}
\\cmidrule{20-22}

& $\\tau$ & $\\rho$ & $r$ & $\\tau$ & $\\rho$ & $r$ & $\\tau$ & $\\rho$ & $r$ & $\\tau$ & $\\rho$ & $r$ & $\\tau$ & $\\rho$ & $r$ & $\\tau$ & $\\rho$ & $r$ & $\\tau$ & $\\rho$ & $r$  \\\\

\\midrule
''' + table_line('Movies', 'movie') + '''
''' + table_line('Music', 'music') + '''
''' + table_line('Books', 'book') + '''
''' + table_line('Games', 'game') + '''

\\midrule

''' + table_line('All', 'all') + '''

\\bottomrule
\\end{tabular}
\\end{table*} 
'''

# print() emits the LaTeX verbatim; a bare expression would display the escaped repr instead.
print(latex_table())

\begin{table*}[t]%
\centering%
\footnotesize%
\renewcommand{\tabcolsep}{2.4pt}%
\caption{Kendall rank correlation coefficient ($\tau$), Spearman's rank correlation ($\rho$), Pearson correlation coefficient ($r$), {\color{red} ToDo (Maik): } The effectiveness of 4 methods in the known-item qpp effectiveness task.}%
\label{table-known-item-qpp-effectiveness}%
\begin{tabular}[t]{@{}l@{\hspace*{.15cm}}rrrrrrrrrrrrrrrrrrrrr@{}}
\toprule
\textbf{Category} & \multicolumn{3}{c@{\hspace{1em}}}{\textbf{max-idf}} & \multicolumn{3}{c@{\hspace{1em}}}{\textbf{avg-idf}} & \multicolumn{3}{c@{\hspace{1em}}}{\textbf{scq}} & \multicolumn{3}{c@{\hspace{1em}}}{\textbf{avg-scq}} & \multicolumn{3}{c@{\hspace{1em}}}{\textbf{var}} & \multicolumn{3}{c@{\hspace{1em}}}{\textbf{max-var}} & \multicolumn{3}{c@{\hspace{1em}}}{\textbf{avg-var}}\\

\cmidrule(l@{\tabcolsep}r@{.5em}){2-4}
\cmidrule(l@{\tabcolsep}r@{.5em}){5-7}
\cmidrule(l@{\tabcolsep}r@{.5em}){8-10}
\cmidrule(l@{\tabcolsep}r@{.5em}){11-13}
\cmidrule(l@{\