In [12]:
data_dir = "data"

import pandas as pd
from qasper_data.qasper_evaluator import token_f1_score


In [13]:
df = pd.read_csv(f"{data_dir}/baseline_answers.csv")

# calculate f1 score for each answer
df["f1"], df["precision"], df["recall"] = zip(
    *df.apply(lambda row: token_f1_score(row["answer_string"], row["prediction"]), axis=1)
)

In [14]:
df.head()

Unnamed: 0,question,paper_id,answer_type,answer_string,prediction,f1,precision,recall
0,How big is the ANTISCAM dataset?,1911.10742,extractive,"3,044 sentences in 100 dialogs",Based on the information provided in the con...,0.142857,0.4,0.086957
1,How big is the ANTISCAM dataset?,1911.10742,extractive,220 human-human dialogs,Based on the information provided in the con...,0.076923,0.333333,0.043478
2,How big is the ANTISCAM dataset?,1911.10742,extractive,"220 human-human dialogs. , 3,044 sentences in ...",Based on the information provided in the con...,0.129032,0.25,0.086957
3,How big is the ANTISCAM dataset?,1911.10742,extractive,220 human-human dialogs. The average conversat...,Based on the information provided in the con...,0.102564,0.125,0.086957
4,How big is the ANTISCAM dataset?,1911.10742,extractive,220 human-human dialogs,Based on the information provided in the con...,0.076923,0.333333,0.043478


For each question we only select the answer with the highest score.

In [26]:
df = df.sort_values(by=["question", "f1"], ascending=False).groupby("question").head(1)
# sort by index
df = df.sort_index()

In [27]:
df.head()

Unnamed: 0,question,paper_id,answer_type,answer_string,prediction,f1,precision,recall
0,How big is the ANTISCAM dataset?,1911.10742,extractive,"3,044 sentences in 100 dialogs",Based on the information provided in the con...,0.142857,0.4,0.086957
8,What is the accuracy of this model compared to...,1904.09131,abstractive,The micro and macro f1-scores of this model ar...,"Based on the provided context information, I...",0.123596,0.333333,0.075862
15,What previous methods do they compare against?,1611.06322,abstractive,Liu et al. (2015) and Yang et al. (2012),"Based on the provided context information, t...",0.28,0.777778,0.170732
18,What baselines did they compare with?,1604.02038,extractive,"LDA BIBREF2, Doc-NADE BIBREF24, HTMM BIBREF9, ...","Based on the provided context information, t...",0.432432,1.0,0.275862
23,Which NER dataset do they use?,1911.04474,extractive,"CoNLL2003, OntoNotes 5.0, BIBREF35 released On...","Based on the provided context information, t...",0.415584,0.727273,0.290909


In [16]:
answer_by_types = df.groupby("answer_type").agg(
    {"f1": ["mean", "std"], "precision": ["mean", "std"], "recall": ["mean", "std"]}
)


In [17]:

answer_by_types

Unnamed: 0_level_0,f1,f1,precision,precision,recall,recall
Unnamed: 0_level_1,mean,std,mean,std,mean,std
answer_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
abstractive,0.192305,0.107247,0.521445,0.180442,0.121503,0.074063
boolean,0.032258,,1.0,,0.016393,
extractive,0.381904,0.168132,0.660653,0.263724,0.303432,0.194598


# Quantiative analysis

In [18]:
examples_extractive = df[df['answer_type'] == 'extractive'].sort_values(by='f1').head(3)
examples_abstractive = df[df['answer_type'] == 'abstractive'].sort_values(by='f1').head(3)
examples_boolean = df[df['answer_type'] == 'boolean'].sort_values(by='f1').head(3)
examples_none = df[df['answer_type'] == 'none'].sort_values(by='f1').head(2)  # only two instances

# combine examples for display
combined_examples = pd.concat([examples_extractive, examples_abstractive, examples_boolean, examples_none])

In [19]:
combined_examples

Unnamed: 0,question,paper_id,answer_type,answer_string,prediction,f1,precision,recall
0,How big is the ANTISCAM dataset?,1911.10742,extractive,"3,044 sentences in 100 dialogs",Based on the information provided in the con...,0.142857,0.4,0.086957
23,Which NER dataset do they use?,1911.04474,extractive,"CoNLL2003, OntoNotes 5.0, BIBREF35 released On...","Based on the provided context information, t...",0.415584,0.727273,0.290909
18,What baselines did they compare with?,1604.02038,extractive,"LDA BIBREF2, Doc-NADE BIBREF24, HTMM BIBREF9, ...","Based on the provided context information, t...",0.432432,1.0,0.275862
36,How do data-driven models usually respond to a...,1909.04387,abstractive,"either by refusing politely, or, with flirtati...","Based on the provided context information, I...",0.051282,0.363636,0.027586
8,What is the accuracy of this model compared to...,1904.09131,abstractive,The micro and macro f1-scores of this model ar...,"Based on the provided context information, I...",0.123596,0.333333,0.075862
29,What are the contributions of this paper?,1810.02229,abstractive,(1) Using seq2seq for event detection and clas...,"Based on the provided context information, t...",0.197368,0.576923,0.119048
40,Was the automatic annotation evaluated?,2003.13016,boolean,No,"Based on the provided context information, t...",0.032258,1.0,0.016393


## Extractive Answers
Question: How do data-driven models usually respond to a flirt?
Answer String: politely refuse, politely refuses, flirtatious...
F1 Score: 0.0132, Precision: 0.0069, Recall: 0.1667

Question: How do data-driven models usually respond to a flirt?
Answer String: Data-driven systems rank low in general
F1 Score: 0.0397, Precision: 0.0207, Recall: 0.5000
Question: How big is the ANTISCAM dataset?

Answer String: 220 human-human dialogs
F1 Score: 0.0769, Precision: 0.0435, Recall: 0.3333

## Abstractive Answers
Question: How do data-driven models usually respond to a flirt?
Answer String: flirt; retaliation
F1 Score: 0.0000, Precision: 0.0000, Recall: 0.0000

Question: What is the accuracy of this model compared to others?
Answer String: The accuracy
F1 Score: 0.0137, Precision: 0.0069, Recall: 1.0000

Question: How do data-driven models usually respond to a flirt?
Answer String: either by refusing politely, or, with flirtation...
F1 Score: 0.0513, Precision: 0.0276, Recall: 0.3636

## Boolean Answers
Question: Was the automatic annotation evaluated?
Answer String: Yes
F1 Score: 0.0000, Precision: 0.0000, Recall: 0.0000

Question: Was the automatic annotation evaluated?
Answer String: Yes
F1 Score: 0.0000, Precision: 0.0000, Recall: 0.0000

Question: Was the automatic annotation evaluated?
Answer String: No
F1 Score: 0.0323, Precision: 0.0164, Recall: 1.0000

## None Answers
Question: What is the accuracy of this model compared to others?

Answer String: Unacceptable
F1 Score: 0.0000, Precision: 0.0000, Recall: 0.0000
Question: What is the accuracy of this model compared to others?

Answer String: Unacceptable
F1 Score: 0.0000, Precision: 0.0000, Recall: 0.0000

## Analysis:
Extractive Answers: The predictions seem to miss key tokens from the answers, leading to low precision and F1 scores.
Abstractive Answers: These require paraphrasing, which can be challenging, leading to mismatches and lower scores.
Boolean Answers: The predictions often fail to provide a direct yes/no answer, resulting in very low scores.
None Answers: With no clear answer to match, the F1 scores are understandably zero.

# Analysis the finetuned model

In [20]:
import pandas as pd
from qasper_data.qasper_evaluator import token_f1_score
data_dir = "data"

finetuned_df = pd.read_csv(f"{data_dir}/finetune_finetune_answers.csv")

In [21]:
finetuned_df["f1"], finetuned_df["precision"], finetuned_df["recall"] = zip(
    *finetuned_df.apply(lambda row: token_f1_score(row["answer_string"], row["prediction"]), axis=1)
)

In [28]:
finetuned_df.head()

finetuned_df = finetuned_df.sort_values(by=["question", "f1"], ascending=False).groupby("question").head(1)

finetuned_df = finetuned_df.sort_index()

In [29]:
finetuned_df.head()

Unnamed: 0,question,paper_id,answer_type,answer_string,prediction,f1,precision,recall
3,How big is the ANTISCAM dataset?,1911.10742,extractive,220 human-human dialogs. The average conversat...,"Based on the provided context information, I...",0.571429,0.875,0.424242
10,How is intent annotated?,1911.10742,extractive,we design a hierarchical intent annotation sch...,"Based on the provided context information, I...",0.478134,0.431579,0.535948
16,What are the baselines outperformed by this work?,1911.10742,extractive,TransferTransfo The vanilla TransferTransfo fr...,"Based on the provided context information, t...",0.146341,0.3,0.096774
20,What are the evaluation metrics and criteria u...,1911.10742,extractive,Fluency Fluency is used to explore different m...,"Based on the provided context information, t...",0.375,0.394161,0.357616
24,What is the accuracy of this model compared to...,1904.09131,abstractive,The model improves the state of the art perfor...,"Based on the provided context information, I...",0.139535,0.26087,0.095238


In [30]:
finetuned_answer_by_types = finetuned_df.groupby("answer_type").agg(
    {"f1": ["mean", "std"], "precision": ["mean", "std"], "recall": ["mean", "std"]}
)


In [31]:
finetuned_answer_by_types

Unnamed: 0_level_0,f1,f1,precision,precision,recall,recall
Unnamed: 0_level_1,mean,std,mean,std,mean,std
answer_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
abstractive,0.136164,0.087456,0.505428,0.355627,0.128095,0.159138
boolean,0.0,,0.0,,0.0,
extractive,0.320536,0.170413,0.655751,0.247937,0.243739,0.157843
none,0.0,,0.0,,0.0,


# Inspect the Boolean answers

In [33]:
finetuned_df = pd.read_csv(f"{data_dir}/finetune_finetune_answers.csv")

finetuned_df[finetuned_df['answer_type'] == 'boolean'].head(10)

Unnamed: 0,question,paper_id,answer_type,answer_string,prediction
39,Do they build a dataset of rumors?,1611.06322,boolean,Yes,"Based on the provided context information, i..."
40,Do they build a dataset of rumors?,1611.06322,boolean,Yes,"Based on the provided context information, i..."
41,Do they build a dataset of rumors?,1611.06322,boolean,Yes,"Based on the provided context information, i..."
43,Do they build a dataset of rumors?,1611.06322,boolean,Yes,"Based on the provided context information, i..."
66,How do they incorporate direction and relative...,1911.04474,boolean,Yes,"Based on the given context information, to i..."
69,Do they outperform current NER state-of-the-ar...,1911.04474,boolean,No,"Based on the given context information, the ..."
70,Do they outperform current NER state-of-the-ar...,1911.04474,boolean,Yes,"Based on the given context information, the ..."
71,Do they outperform current NER state-of-the-ar...,1911.04474,boolean,Yes,"Based on the given context information, the ..."
94,Can the model be extended to other languages?,1810.02229,boolean,Yes,"Based on the information provided, the Bi-LS..."
125,Was the automatic annotation evaluated?,2003.13016,boolean,No,"Based on the provided context information, i..."
