In [3]:
import pandas as pd

# path to metrics files
data_path = './data/cast2020/'

# QA metrics

In [4]:
# load query_metrics file, which is generated from a trec run file using:
# `grep '^map\s\|recip_rank\|^P_1\|^P_3\|ndcg_cut_1\|ndcg_cut_3\|ndcg_cut_5' original.run | grep -v "all" > original_query_metrics.txt`
implicit_df = pd.read_csv(data_path + 'original_query_metrics.txt', delimiter='\t', header=None)
qr_df = pd.read_csv(data_path + 'quretec_qna_query_metrics.txt', delimiter='\t', header=None)  # official_rewrite_query_metrics quretec_qna_query_metrics
human_df = pd.read_csv(data_path + 'human_query_metrics.txt', delimiter='\t', header=None)
human_df.tail()

Unnamed: 0,0,1,2
1451,P_1,99_8,1.0
1452,P_3,99_8,1.0
1453,ndcg_cut_1,99_8,1.0
1454,ndcg_cut_3,99_8,1.0
1455,ndcg_cut_5,99_8,0.8688


In [5]:
# filter out metric
metric = 'ndcg_cut_3'
implicit_df = implicit_df[implicit_df[0].str.contains(metric)]
qr_df = qr_df[qr_df[0].str.contains(metric)]
human_df = human_df[human_df[0].str.contains(metric)]
human_df.head()

Unnamed: 0,0,1,2
5,ndcg_cut_3,100_1,0.75
12,ndcg_cut_3,100_2,0.7295
19,ndcg_cut_3,100_3,0.9013
26,ndcg_cut_3,100_4,0.2705
33,ndcg_cut_3,100_5,0.2705


In [6]:
# create a table to compare results for different rewrites
turn_breakdown_df = pd.DataFrame({
    'turn': implicit_df[1],
    'implicit': implicit_df[2],
    'qr': qr_df[2],
    'human': human_df[2]
})
turn_breakdown_df.head()

Unnamed: 0,turn,implicit,qr,human
5,100_1,0.148,0.75,0.75
12,100_2,0.0,0.7295,0.7295
19,100_3,0.0,0.0,0.9013
26,100_4,0.2882,0.4412,0.2705
33,100_5,0.0,0.1707,0.2705


# Error analysis

In [7]:
# NDCG@3 > 0 column
print(turn_breakdown_df[~(turn_breakdown_df.implicit > 0) & ~(turn_breakdown_df.qr > 0) & ~(turn_breakdown_df.human > 0)].shape[0]) # 000
print(turn_breakdown_df[(turn_breakdown_df.implicit > 0) & ~(turn_breakdown_df.qr > 0) & ~(turn_breakdown_df.human > 0)].shape[0]) # 100
print(turn_breakdown_df[~(turn_breakdown_df.implicit > 0) & (turn_breakdown_df.qr > 0) & ~(turn_breakdown_df.human > 0)].shape[0]) # 010
print(turn_breakdown_df[(turn_breakdown_df.implicit > 0) & (turn_breakdown_df.qr > 0) & ~(turn_breakdown_df.human > 0)].shape[0]) # 110 
print(turn_breakdown_df[~(turn_breakdown_df.implicit > 0) & ~(turn_breakdown_df.qr > 0) & (turn_breakdown_df.human > 0)].shape[0]) # 001
print(turn_breakdown_df[(turn_breakdown_df.implicit > 0) & ~(turn_breakdown_df.qr > 0) & (turn_breakdown_df.human > 0)].shape[0]) # 101
print(turn_breakdown_df[~(turn_breakdown_df.implicit > 0) & (turn_breakdown_df.qr > 0) & (turn_breakdown_df.human > 0)].shape[0]) # 011
print(turn_breakdown_df[(turn_breakdown_df.implicit > 0) & (turn_breakdown_df.qr > 0) & (turn_breakdown_df.human > 0)].shape[0])  # 111

20
0
7
1
51
2
88
39


In [8]:
# NDCG@3 > 0.5 column
print(turn_breakdown_df[~(turn_breakdown_df.implicit >= 0.5) & ~(turn_breakdown_df.qr >= 0.5) & ~(turn_breakdown_df.human >= 0.5)].shape[0]) # 000
print(turn_breakdown_df[(turn_breakdown_df.implicit >= 0.5) & ~(turn_breakdown_df.qr >= 0.5) & ~(turn_breakdown_df.human >= 0.5)].shape[0]) # 100
print(turn_breakdown_df[~(turn_breakdown_df.implicit >= 0.5) & (turn_breakdown_df.qr >= 0.5) & ~(turn_breakdown_df.human >= 0.5)].shape[0]) # 010
print(turn_breakdown_df[(turn_breakdown_df.implicit >= 0.5) & (turn_breakdown_df.qr >= 0.5) & ~(turn_breakdown_df.human >= 0.5)].shape[0]) # 110 
print(turn_breakdown_df[~(turn_breakdown_df.implicit >= 0.5) & ~(turn_breakdown_df.qr >= 0.5) & (turn_breakdown_df.human >= 0.5)].shape[0]) # 001
print(turn_breakdown_df[(turn_breakdown_df.implicit >= 0.5) & ~(turn_breakdown_df.qr >= 0.5) & (turn_breakdown_df.human >= 0.5)].shape[0]) # 101
print(turn_breakdown_df[~(turn_breakdown_df.implicit >= 0.5) & (turn_breakdown_df.qr >= 0.5) & (turn_breakdown_df.human >= 0.5)].shape[0]) # 011
print(turn_breakdown_df[(turn_breakdown_df.implicit >= 0.5) & (turn_breakdown_df.qr >= 0.5) & (turn_breakdown_df.human >= 0.5)].shape[0])  # 111

88
2
3
1
42
1
65
6


In [9]:
# NDCG@3 = 1 column
print(turn_breakdown_df[~(turn_breakdown_df.implicit == 1) & ~(turn_breakdown_df.qr == 1) & ~(turn_breakdown_df.human == 1)].shape[0]) # 000
print(turn_breakdown_df[(turn_breakdown_df.implicit == 1) & ~(turn_breakdown_df.qr == 1) & ~(turn_breakdown_df.human == 1)].shape[0]) # 100
print(turn_breakdown_df[~(turn_breakdown_df.implicit == 1) & (turn_breakdown_df.qr == 1) & ~(turn_breakdown_df.human == 1)].shape[0]) # 010
print(turn_breakdown_df[(turn_breakdown_df.implicit == 1) & (turn_breakdown_df.qr == 1) & ~(turn_breakdown_df.human == 1)].shape[0]) # 110 
print(turn_breakdown_df[~(turn_breakdown_df.implicit == 1) & ~(turn_breakdown_df.qr == 1) & (turn_breakdown_df.human == 1)].shape[0]) # 001
print(turn_breakdown_df[(turn_breakdown_df.implicit == 1) & ~(turn_breakdown_df.qr == 1) & (turn_breakdown_df.human == 1)].shape[0]) # 101
print(turn_breakdown_df[~(turn_breakdown_df.implicit == 1) & (turn_breakdown_df.qr == 1) & (turn_breakdown_df.human == 1)].shape[0]) # 011
print(turn_breakdown_df[(turn_breakdown_df.implicit == 1) & (turn_breakdown_df.qr == 1) & (turn_breakdown_df.human == 1)].shape[0])  # 111

185
0
1
0
10
0
10
2


# Error Plot

In [10]:
from collections import defaultdict
import numpy as np

union_keys = list(turn_breakdown_df['turn'])
implicit_scores = {row[0]: {metric: row[1]} for row in turn_breakdown_df.values}
gen_scores = {row[0]: {metric: row[2]} for row in turn_breakdown_df.values}
human_scores = {row[0]: {metric: row[3]} for row in turn_breakdown_df.values}

vals = defaultdict(lambda: defaultdict(int))
thresholds = [0.0001] + list(np.arange(0.02, 1.02, 0.02))
thresholds = [round(t, 6) for t in thresholds]
for k in union_keys:
    for t in thresholds:
        gen_val = int(gen_scores[k][metric] >= t)
        hum_val = int(human_scores[k][metric] >= t)
        imp_val = int(implicit_scores[k][metric] >= t)
        vals[f"Original ({imp_val}) | QR ({gen_val}) | Human ({hum_val})"][t] += 1
print(vals)
# 100 missing
print(len(vals))

source = defaultdict(list)
thresholds = [0.0001] + list(np.arange(0.02, 1.02, 0.02))
thresholds = [round(t, 6) for t in thresholds]
thresholds.reverse()
print(thresholds)
for val_key, threshold_map in vals.items():
    for t in thresholds:
        freq = threshold_map[t]
        source["Score Card"].append(str(val_key))
        source["NDCG@3 Threshold"].append(t)
        source["Percentage of Test Set"].append(freq)
retrieval_qa_source = pd.DataFrame(source)

defaultdict(<function <lambda> at 0x11e6dfea0>, {'Original (1) | QR (1) | Human (1)': defaultdict(<class 'int'>, {0.0001: 39, 0.02: 39, 0.04: 39, 0.06: 37, 0.08: 33, 0.1: 31, 0.12: 29, 0.14: 28, 0.16: 25, 0.18: 23, 0.2: 22, 0.22: 22, 0.24: 18, 0.26: 18, 0.28: 16, 0.3: 15, 0.32: 14, 0.34: 14, 0.36: 12, 0.38: 12, 0.4: 12, 0.42: 11, 0.44: 11, 0.46: 11, 0.48: 6, 0.5: 6, 0.52: 5, 0.54: 5, 0.56: 4, 0.58: 4, 0.6: 3, 0.62: 3, 0.64: 3, 0.66: 3, 0.68: 3, 0.7: 3, 0.72: 3, 0.74: 3, 0.76: 3, 0.78: 2, 0.8: 2, 0.82: 2, 0.84: 2, 0.86: 2, 0.88: 2, 0.9: 2, 0.92: 2, 0.94: 2, 0.96: 2, 0.98: 2, 1.0: 2}), 'Original (0) | QR (1) | Human (1)': defaultdict(<class 'int'>, {0.16: 96, 0.18: 91, 0.2: 89, 0.22: 88, 0.24: 88, 0.26: 85, 0.28: 85, 0.3: 83, 0.32: 81, 0.34: 79, 0.36: 77, 0.38: 75, 0.4: 75, 0.42: 74, 0.44: 71, 0.46: 71, 0.48: 65, 0.5: 65, 0.52: 66, 0.54: 62, 0.56: 58, 0.58: 55, 0.6: 55, 0.62: 51, 0.64: 51, 0.66: 50, 0.68: 48, 0.7: 46, 0.72: 44, 0.74: 39, 0.0001: 88, 0.02: 88, 0.04: 88, 0.06: 90, 0.08: 91

In [11]:
import altair as alt
# from altair_saver import save

error_chart = alt.Chart(retrieval_qa_source).mark_area().encode(
    x="NDCG@3 Threshold:Q",
    y=alt.Y("Percentage of Test Set:Q", stack="normalize"),
    color="Score Card:N")

error_chart
# alt.renderers.enable('png')
# save(error_chart, "chart.png")

# QR

In [12]:
# load rewrites
qr_df = pd.read_csv(data_path + 'QuReTeC_QnA.tsv', delimiter='\t')  # OfficialBaseline QuReTeC_QnA
human_df = pd.read_csv(data_path + 'Human.tsv', delimiter='\t')
qr_df['human'] = human_df['query']
qr_df.head()

Unnamed: 0,conversation_id,turn_id,id,query,original,human
0,81,1,81_1,How do you know when your garage door opener i...,How do you know when your garage door opener i...,How do you know when your garage door opener i...
1,81,2,81_2,Now it stopped working. Why? door garage light...,Now it stopped working. Why?,Now my garage door opener stopped working. Why?
2,81,3,81_3,How much does it cost for someone to fix it? d...,How much does it cost for someone to fix it?,How much does it cost for someone to repair a ...
3,81,4,81_4,How about replacing it instead? door garage op...,How about replacing it instead?,How much does it cost to replace a garage door...
4,81,5,81_5,How do I choose a new one? door garage opener,How do I choose a new one?,How do I choose a new garage door opener?


# Sampling

In [22]:
# show samples with QR errors
# qr_errors = turn_breakdown_df[~(turn_breakdown_df.implicit == 1) & ~(turn_breakdown_df.qr == 1) & (turn_breakdown_df.human == 1)]
qr_errors = turn_breakdown_df[~(turn_breakdown_df.implicit > 0) & ~(turn_breakdown_df.qr > 0) & (turn_breakdown_df.human > 0)]
print(qr_errors) # 001

        turn  implicit   qr   human
19    100_3   0.0       0.0  0.9013
47    100_7   0.0       0.0  0.4115
89    101_4   0.0       0.0  0.9218
110   101_7   0.0       0.0  0.8647
117   101_8   0.0       0.0  0.8827
159   102_5   0.0       0.0  0.7039
180   102_8   0.0       0.0  0.4134
187   102_9   0.0       0.0  1.0000
201   103_10  0.0       0.0  0.5089
215   103_3   0.0       0.0  0.7305
222   103_4   0.0       0.0  1.0000
236   103_6   0.0       0.0  0.3212
271   104_12  0.0       0.0  0.3394
299   104_6   0.0       0.0  0.4050
313   104_8   0.0       0.0  0.1173
369   105_7   0.0       0.0  1.0000
411   81_4    0.0       0.0  0.3739
509   82_9    0.0       0.0  0.9413
579   84_2    0.0       0.0  0.2933
586   84_3    0.0       0.0  0.7039
607   84_6    0.0       0.0  0.1564
649   85_6    0.0       0.0  0.6020
656   85_7    0.0       0.0  0.7244
733   87_2    0.0       0.0  0.1564
747   87_4    0.0       0.0  0.6462
789   88_10   0.0       0.0  0.5102
803   88_3    0.0       0.0 

In [20]:
pd.set_option('display.max_colwidth', -1)
error_samples = pd.merge(qr_errors, qr_df, how='left', left_on='turn', right_on='id')
error_samples[['id', 'human_y', 'human_x', 'query', 'qr']]
# error_samples

Unnamed: 0,id,human_y,human_x,query,qr
0,100_3,What are the differences between dental crowns and veneers?,0.9013,"I have a larger chip, how do the options different? teeth tooth chipped chip repairing dental",0.0
1,100_7,What is the coverage of the crown in medicare?,0.4115,I meant medicare,0.0
2,101_4,How old is Barron Trump?,0.9218,How old is he? melania donald trump,0.0
3,101_7,Does the public pay the First Lady of the United States?,0.8647,Do we pay the First Lady? melania trump,0.0
4,101_8,Does the public pay Ivanka Trump?,0.8827,What about Ivanka? melania melanija trump,0.0
5,102_5,How much money is owed to social security?,0.7039,How much is owed? program social security,0.0
6,102_8,Can social security be fixed?,0.4134,Can it be fixed? checks social check security,0.0
7,102_9,How much of a tax increase will keep social security solvent?,1.0,How much of an increase? social security,0.0
8,103_10,Why did the Grateful Dead break up?,0.5089,Why did the band break up? dead,0.0
9,103_3,Did the Grateful Dead have a number one hit?,0.7305,Did they have a #1 hit? dead,0.0


In [21]:
# show samples where QR worked better than Human
# qr_wins = turn_breakdown_df[~(turn_breakdown_df.implicit == 1) & (turn_breakdown_df.qr == 1) & ~(turn_breakdown_df.human == 1)]
qr_wins = turn_breakdown_df[~(turn_breakdown_df.implicit > 0) & (turn_breakdown_df.qr > 0) & ~(turn_breakdown_df.human > 0)]
print(qr_wins) # 010

       turn  implicit      qr  human
124   101_9  0.0       0.2961  0.0  
341   105_3  0.0       0.2015  0.0  
992   91_2   0.0       0.2658  0.0  
1013  91_5   0.0       0.2421  0.0  
1055  92_3   0.0       0.2029  0.0  
1125  93_6   0.0       0.5209  0.0  
1391  98_7   0.0       0.2961  0.0  


In [18]:
wins_samples = pd.merge(qr_wins, qr_df, how='left', left_on='turn', right_on='id')
wins_samples[['id', 'human_y', 'human_x', 'query', 'qr']]

Unnamed: 0,id,human_y,human_x,query,qr
0,101_9,Does the public pay Jared Kushner?,0.0,And Jared? ivana donald trump,0.2961
1,105_3,Why was George Zimmerman acquitted?,0.0,Why was he acquitted? george trayvon martin zimmerman,0.2015
2,91_2,What is different between General Data Protection Regulation (GDPR) and European Union (EU) Data Protection Directive 95/46/EC?,0.0,What is different compared to previous legislation? general data protection gdpr,0.2658
3,91_5,How do big companies in the United States (US) adapt to the General Data Protection Regulation (GDPR)?,0.0,How do big companies adapt to GDPR?,0.2421
4,92_3,What are other exercises besides Pendulum circles for shoulder injuries?,0.0,What are some others? shoulder circles cuff pendulum braces exercises rotator,0.2029
5,93_6,What support does the franchise provide?,0.0,What support does it provide? king franchise agreement burger,0.5209
6,98_7,Can you show me vegetarian recipes with almonds?,0.0,Oh almonds? Can you show me recipes with it? almonds,0.2961


In [23]:
# show samples where QR worked same as Human and implicit did not work
qr_wins = turn_breakdown_df[~(turn_breakdown_df.implicit > 0) & (turn_breakdown_df.qr > 0) & (turn_breakdown_df.human > 0)]
print(qr_wins) # 011

        turn  implicit      qr   human
12    100_2   0.0       0.7295  0.7295
33    100_5   0.0       0.1707  0.2705
54    100_8   0.0       0.3194  0.3194
61    101_1   0.0       0.4693  0.4693
68    101_10  0.0       0.5680  0.6462
...      ...  ...          ...     ...
1384  98_6    0.0       0.4693  0.1760
1405  99_1    0.0       0.2346  0.2346
1426  99_4    0.0       1.0000  0.7654
1440  99_6    0.0       0.4693  1.0000
1454  99_8    0.0       1.0000  1.0000

[88 rows x 4 columns]


In [24]:
wins_samples = pd.merge(qr_wins, qr_df, how='left', left_on='turn', right_on='id')
wins_samples[['id', 'human_y', 'human_x', 'query', 'qr']]

Unnamed: 0,id,human_y,human_x,query,qr
0,100_2,"Oh, what can dentists do to fix a chipped tooth?",0.7295,"Oh, what can they do to fix it? teeth tooth broken chipped chip dentist",0.7295
1,100_5,Which one of dental implants or crowns is more expensive?,0.2705,Which one is more expensive? crown dental implant,0.1707
2,100_8,Show me medicare dentists in Seattle,0.3194,Show me covered dentists in Seattle,0.3194
3,101_1,What is Melania Trump's religion?,0.4693,What is Melania Trump's religion?,0.4693
4,101_10,What does Jared Kushner do at the White House?,0.6462,What does he do at the White House? melania jared trump,0.5680
...,...,...,...,...,...
83,98_6,What are the sources of proteins for vegetarians?,0.1760,What are their sources of proteins?,0.4693
84,99_1,What is high blood carbon dioxide?,0.2346,What is high blood carbon dioxide?,0.2346
85,99_4,What should I eat to improve high cholesterol?,0.7654,What should I eat to improve it? cholesterol high,1.0000
86,99_6,What are the benefits of unsaturated fats?,1.0000,"So, there are two types. Is the other fat good for you? saturated",0.4693
