In [1]:
import pandas as pd

# path to metrics files
data_path = './data/cast2020/'

# QA metrics

In [2]:
# load query_metrics file, which is generated from a trec run file using:
# `grep '^map\s\|recip_rank\|^P_1\|^P_3\|ndcg_cut_1\|ndcg_cut_3\|ndcg_cut_5' original.run | grep -v "all" > original_query_metrics.txt`
implicit_df = pd.read_csv(data_path + 'original_query_metrics.txt', delimiter='\t', header=None)
qr_df = pd.read_csv(data_path + 'quretec_qna_query_metrics.txt', delimiter='\t', header=None)  # official_rewrite_query_metrics quretec_qna_query_metrics
human_df = pd.read_csv(data_path + 'human_query_metrics.txt', delimiter='\t', header=None)
human_df.tail()

Unnamed: 0,0,1,2
1451,P_1,99_8,1.0
1452,P_3,99_8,1.0
1453,ndcg_cut_1,99_8,1.0
1454,ndcg_cut_3,99_8,1.0
1455,ndcg_cut_5,99_8,0.8688


Unnamed: 0,0,1,2
1451,P_1,99_8,1.0
1452,P_3,99_8,1.0
1453,ndcg_cut_1,99_8,1.0
1454,ndcg_cut_3,99_8,1.0
1455,ndcg_cut_5,99_8,0.8688


In [3]:
# filter out metric
metric = 'ndcg_cut_3'
implicit_df = implicit_df[implicit_df[0].str.contains(metric)]
qr_df = qr_df[qr_df[0].str.contains(metric)]
human_df = human_df[human_df[0].str.contains(metric)]
human_df.head()

Unnamed: 0,0,1,2
5,ndcg_cut_3,100_1,0.75
12,ndcg_cut_3,100_2,0.7295
19,ndcg_cut_3,100_3,0.9013
26,ndcg_cut_3,100_4,0.2705
33,ndcg_cut_3,100_5,0.2705


Unnamed: 0,0,1,2
5,ndcg_cut_3,100_1,0.75
12,ndcg_cut_3,100_2,0.7295
19,ndcg_cut_3,100_3,0.9013
26,ndcg_cut_3,100_4,0.2705
33,ndcg_cut_3,100_5,0.2705


In [4]:
# create a table to compare results for different rewrites
turn_breakdown_df = pd.DataFrame({
    'turn': implicit_df[1],
    'implicit': implicit_df[2],
    'qr': qr_df[2],
    'human': human_df[2]
})
turn_breakdown_df.head()

Unnamed: 0,turn,implicit,qr,human
5,100_1,0.148,0.75,0.75
12,100_2,0.0,0.7295,0.7295
19,100_3,0.0,0.0,0.9013
26,100_4,0.2882,0.4412,0.2705
33,100_5,0.0,0.1707,0.2705


Unnamed: 0,turn,implicit,qr,human
5,100_1,0.148,0.75,0.75
12,100_2,0.0,0.7295,0.7295
19,100_3,0.0,0.0,0.9013
26,100_4,0.2882,0.4412,0.2705
33,100_5,0.0,0.1707,0.2705


# Error analysis

In [5]:
# NDCG@3 > 0 column
print(turn_breakdown_df[~(turn_breakdown_df.implicit > 0) & ~(turn_breakdown_df.qr > 0) & ~(turn_breakdown_df.human > 0)].shape[0]) # 000
print(turn_breakdown_df[(turn_breakdown_df.implicit > 0) & ~(turn_breakdown_df.qr > 0) & ~(turn_breakdown_df.human > 0)].shape[0]) # 100
print(turn_breakdown_df[~(turn_breakdown_df.implicit > 0) & (turn_breakdown_df.qr > 0) & ~(turn_breakdown_df.human > 0)].shape[0]) # 010
print(turn_breakdown_df[(turn_breakdown_df.implicit > 0) & (turn_breakdown_df.qr > 0) & ~(turn_breakdown_df.human > 0)].shape[0]) # 110 
print(turn_breakdown_df[~(turn_breakdown_df.implicit > 0) & ~(turn_breakdown_df.qr > 0) & (turn_breakdown_df.human > 0)].shape[0]) # 001
print(turn_breakdown_df[(turn_breakdown_df.implicit > 0) & ~(turn_breakdown_df.qr > 0) & (turn_breakdown_df.human > 0)].shape[0]) # 101
print(turn_breakdown_df[~(turn_breakdown_df.implicit > 0) & (turn_breakdown_df.qr > 0) & (turn_breakdown_df.human > 0)].shape[0]) # 011
print(turn_breakdown_df[(turn_breakdown_df.implicit > 0) & (turn_breakdown_df.qr > 0) & (turn_breakdown_df.human > 0)].shape[0])  # 111

20
0
7
1
51
2
88
39
20
0
7
1
51
2
88
39


In [6]:
# NDCG@3 > 0.5 column
print(turn_breakdown_df[~(turn_breakdown_df.implicit >= 0.5) & ~(turn_breakdown_df.qr >= 0.5) & ~(turn_breakdown_df.human >= 0.5)].shape[0]) # 000
print(turn_breakdown_df[(turn_breakdown_df.implicit >= 0.5) & ~(turn_breakdown_df.qr >= 0.5) & ~(turn_breakdown_df.human >= 0.5)].shape[0]) # 100
print(turn_breakdown_df[~(turn_breakdown_df.implicit >= 0.5) & (turn_breakdown_df.qr >= 0.5) & ~(turn_breakdown_df.human >= 0.5)].shape[0]) # 010
print(turn_breakdown_df[(turn_breakdown_df.implicit >= 0.5) & (turn_breakdown_df.qr >= 0.5) & ~(turn_breakdown_df.human >= 0.5)].shape[0]) # 110 
print(turn_breakdown_df[~(turn_breakdown_df.implicit >= 0.5) & ~(turn_breakdown_df.qr >= 0.5) & (turn_breakdown_df.human >= 0.5)].shape[0]) # 001
print(turn_breakdown_df[(turn_breakdown_df.implicit >= 0.5) & ~(turn_breakdown_df.qr >= 0.5) & (turn_breakdown_df.human >= 0.5)].shape[0]) # 101
print(turn_breakdown_df[~(turn_breakdown_df.implicit >= 0.5) & (turn_breakdown_df.qr >= 0.5) & (turn_breakdown_df.human >= 0.5)].shape[0]) # 011
print(turn_breakdown_df[(turn_breakdown_df.implicit >= 0.5) & (turn_breakdown_df.qr >= 0.5) & (turn_breakdown_df.human >= 0.5)].shape[0])  # 111

88
2
3
1
42
1
65
6
88
2
3
1
42
1
65
6


In [7]:
# NDCG@3 = 1 column
print(turn_breakdown_df[~(turn_breakdown_df.implicit == 1) & ~(turn_breakdown_df.qr == 1) & ~(turn_breakdown_df.human == 1)].shape[0]) # 000
print(turn_breakdown_df[(turn_breakdown_df.implicit == 1) & ~(turn_breakdown_df.qr == 1) & ~(turn_breakdown_df.human == 1)].shape[0]) # 100
print(turn_breakdown_df[~(turn_breakdown_df.implicit == 1) & (turn_breakdown_df.qr == 1) & ~(turn_breakdown_df.human == 1)].shape[0]) # 010
print(turn_breakdown_df[(turn_breakdown_df.implicit == 1) & (turn_breakdown_df.qr == 1) & ~(turn_breakdown_df.human == 1)].shape[0]) # 110 
print(turn_breakdown_df[~(turn_breakdown_df.implicit == 1) & ~(turn_breakdown_df.qr == 1) & (turn_breakdown_df.human == 1)].shape[0]) # 001
print(turn_breakdown_df[(turn_breakdown_df.implicit == 1) & ~(turn_breakdown_df.qr == 1) & (turn_breakdown_df.human == 1)].shape[0]) # 101
print(turn_breakdown_df[~(turn_breakdown_df.implicit == 1) & (turn_breakdown_df.qr == 1) & (turn_breakdown_df.human == 1)].shape[0]) # 011
print(turn_breakdown_df[(turn_breakdown_df.implicit == 1) & (turn_breakdown_df.qr == 1) & (turn_breakdown_df.human == 1)].shape[0])  # 111

185
0
1
0
10
0
10
2
185
0
1
0
10
0
10
2


# Error Plot

In [8]:
from collections import defaultdict
import numpy as np

union_keys = list(turn_breakdown_df['turn'])
implicit_scores = {row[0]: {metric: row[1]} for row in turn_breakdown_df.values}
gen_scores = {row[0]: {metric: row[2]} for row in turn_breakdown_df.values}
human_scores = {row[0]: {metric: row[3]} for row in turn_breakdown_df.values}

vals = defaultdict(lambda: defaultdict(int))
thresholds = [0.0001] + list(np.arange(0.02, 1.02, 0.02))
thresholds = [round(t, 6) for t in thresholds]
for k in union_keys:
    for t in thresholds:
        gen_val = int(gen_scores[k][metric] >= t)
        hum_val = int(human_scores[k][metric] >= t)
        imp_val = int(implicit_scores[k][metric] >= t)
        vals[f"Original ({imp_val}) | QR ({gen_val}) | Human ({hum_val})"][t] += 1
print(vals)
# 100 missing
print(len(vals))

source = defaultdict(list)
thresholds = [0.0001] + list(np.arange(0.02, 1.02, 0.02))
thresholds = [round(t, 6) for t in thresholds]
thresholds.reverse()
print(thresholds)
for val_key, threshold_map in vals.items():
    for t in thresholds:
        freq = threshold_map[t]
        source["Score Card"].append(str(val_key))
        source["NDCG@3 Threshold"].append(t)
        source["Percentage of Test Set"].append(freq)
retrieval_qa_source = pd.DataFrame(source)

defaultdict(<function <lambda> at 0x11dacebf8>, {'Original (1) | QR (1) | Human (1)': defaultdict(<class 'int'>, {0.0001: 39, 0.02: 39, 0.04: 39, 0.06: 37, 0.08: 33, 0.1: 31, 0.12: 29, 0.14: 28, 0.16: 25, 0.18: 23, 0.2: 22, 0.22: 22, 0.24: 18, 0.26: 18, 0.28: 16, 0.3: 15, 0.32: 14, 0.34: 14, 0.36: 12, 0.38: 12, 0.4: 12, 0.42: 11, 0.44: 11, 0.46: 11, 0.48: 6, 0.5: 6, 0.52: 5, 0.54: 5, 0.56: 4, 0.58: 4, 0.6: 3, 0.62: 3, 0.64: 3, 0.66: 3, 0.68: 3, 0.7: 3, 0.72: 3, 0.74: 3, 0.76: 3, 0.78: 2, 0.8: 2, 0.82: 2, 0.84: 2, 0.86: 2, 0.88: 2, 0.9: 2, 0.92: 2, 0.94: 2, 0.96: 2, 0.98: 2, 1.0: 2}), 'Original (0) | QR (1) | Human (1)': defaultdict(<class 'int'>, {0.16: 96, 0.18: 91, 0.2: 89, 0.22: 88, 0.24: 88, 0.26: 85, 0.28: 85, 0.3: 83, 0.32: 81, 0.34: 79, 0.36: 77, 0.38: 75, 0.4: 75, 0.42: 74, 0.44: 71, 0.46: 71, 0.48: 65, 0.5: 65, 0.52: 66, 0.54: 62, 0.56: 58, 0.58: 55, 0.6: 55, 0.62: 51, 0.64: 51, 0.66: 50, 0.68: 48, 0.7: 46, 0.72: 44, 0.74: 39, 0.0001: 88, 0.02: 88, 0.04: 88, 0.06: 90, 0.08: 91

In [9]:
import altair as alt
# from altair_saver import save

error_chart = alt.Chart(retrieval_qa_source).mark_area().encode(
    x="NDCG@3 Threshold:Q",
    y=alt.Y("Percentage of Test Set:Q", stack="normalize"),
    color="Score Card:N")

error_chart
# alt.renderers.enable('png')
# save(error_chart, "chart.png")

# QR

In [None]:
# load rewrites
qr_df = pd.read_csv(data_path + 'QuReTeC_QnA.tsv', delimiter='\t')  # OfficialBaseline QuReTeC_QnA
human_df = pd.read_csv(data_path + 'Human.tsv', delimiter='\t')
qr_df['human'] = human_df['query']
qr_df.head()

# Sampling

In [None]:
# show samples with QR errors
qr_errors = turn_breakdown_df[~(turn_breakdown_df.implicit == 1) & ~(turn_breakdown_df.qr == 1) & (turn_breakdown_df.human == 1)]
qr_errors = turn_breakdown_df[~(turn_breakdown_df.implicit > 0) & ~(turn_breakdown_df.qr > 0) & (turn_breakdown_df.human > 0)]
print(qr_errors) # 001

In [None]:
pd.set_option('display.max_colwidth', -1)
error_samples = pd.merge(qr_errors, qr_df, how='left', left_on='turn', right_on='id')
error_samples[['id', 'human_y', 'human_x', 'query', 'qr']]
# error_samples

In [None]:
# show samples where QR worked better than Human
qr_wins = turn_breakdown_df[~(turn_breakdown_df.implicit == 1) & (turn_breakdown_df.qr == 1) & ~(turn_breakdown_df.human == 1)]
qr_wins = turn_breakdown_df[~(turn_breakdown_df.implicit > 0) & (turn_breakdown_df.qr > 0) & ~(turn_breakdown_df.human > 0)]
print(qr_wins) # 010

In [None]:
wins_samples = pd.merge(qr_wins, qr_df, how='left', left_on='turn', right_on='id')
wins_samples[['id', 'human_y', 'human_x', 'query', 'qr']]