In [2]:
from typing import List

from dotenv import load_dotenv
import os

load_dotenv()
from langchain.llms.vllm import VLLMOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema import StrOutputParser
import pandas as pd
import sacrebleu

In [3]:
# Load the reranker-only answer results; later cells attach their metric columns to this frame.
df = pd.read_csv('reranker_only_answer.csv')

In [4]:
from typing import List


def bleu(gt_answer: List[str], pred: str) -> float:
    """
    Sentence-level BLEU of a generated answer against the reference answers.

    `pred` is handed to sacrebleu as the hypothesis and `gt_answer` as the
    list of references; the `score` attribute of the result is returned.
    """
    result = sacrebleu.sentence_bleu(pred, gt_answer)
    return result.score

In [5]:
import re
import string
from collections import Counter


def _normalize_answer(s):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    Lower text and remove punctuation, articles and extra whitespace.
    """

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def _token_f1_score(prediction, ground_truth):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    """
    prediction_tokens = _normalize_answer(prediction).split()
    ground_truth_tokens = _normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

In [6]:
from typing import List


def KF1(gt_passages: List[str], pred: str) -> float:
    """
    Knowledge F1: token-level F1 between a prediction and the gold passages.

    The passages are joined with newlines into one reference string and
    scored against `pred` with `_token_f1_score`.
    """
    knowledge = "\n".join(gt_passages)
    return _token_f1_score(pred, knowledge)

In [129]:
def corpus_cumfreq(corpus: List[str]):
    """
    Count token frequencies and accumulate them from rarest to most frequent.

    Returns `(freq_dist, cumulative_freq_dist)`. `freq_dist` is a Counter over
    `corpus`. `cumulative_freq_dist` maps each token to the running total of
    counts once tokens are ordered by ascending frequency, so its last
    inserted value equals `len(corpus)`.
    """
    freq_dist = Counter(corpus)
    cumulative_freq_dist = {}
    running_total = 0
    # sorted() is stable, so equally frequent tokens keep their first-seen order.
    for token in sorted(freq_dist, key=freq_dist.get):
        running_total += freq_dist[token]
        cumulative_freq_dist[token] = running_total
    return freq_dist, cumulative_freq_dist


def find_corpus(data: pd.DataFrame, doi: str) -> List[str]:
    """
    Return the normalized token list of the paper whose 'id' equals `doi`.

    The first matching row is used. Each entry of its
    `full_text['paragraphs']` is joined with spaces, the entries are joined
    with spaces again, and the result is normalized with `_normalize_answer`
    and split on whitespace. An unknown `doi` raises IndexError from the
    `.values[0]` lookup.
    """
    matching_rows = data.loc[data['id'] == doi]
    sections = matching_rows['full_text'].values[0]['paragraphs']
    section_texts = [" ".join(section) for section in sections]
    return _normalize_answer(" ".join(section_texts)).split()

In [45]:
def lower_half(cumfreqs, sentence):
    """
    Keep only the "rare" tokens of a sentence.

    `cumfreqs` maps tokens to cumulative frequencies (as built by
    `corpus_cumfreq`). A token is kept when its cumulative frequency is at
    most half of the total, where the total is the largest cumulative value.
    Tokens missing from `cumfreqs` count as frequency 0 and are always kept.

    The sentence is normalized with `_normalize_answer` first. The kept
    tokens are returned joined by single spaces, in their original order.
    An empty `cumfreqs` gives a threshold of 0, so every token is kept.
    """
    sentence_tokens = _normalize_answer(sentence).split()
    # The largest cumulative value is the corpus size. Using max() instead of
    # "last inserted value" removes the dependency on dict ordering and the
    # IndexError on an empty mapping.
    total = max(cumfreqs.values(), default=0)
    threshold = total // 2
    return " ".join(
        token for token in sentence_tokens
        if cumfreqs.get(token, 0) <= threshold
    )

In [46]:
def rare_f1(corpus, pred, gt):
    """
    Token F1 between `pred` and `gt` restricted to the corpus' rare tokens.

    Cumulative frequencies are built from `corpus` with `corpus_cumfreq`,
    both strings are filtered with `lower_half`, and the filtered strings are
    scored with `_token_f1_score`.
    """
    # Only the cumulative distribution is needed; the raw counts are ignored.
    cumfreqs = corpus_cumfreq(corpus)[1]
    return _token_f1_score(lower_half(cumfreqs, pred), lower_half(cumfreqs, gt))

In [51]:
# Preview the first rows of the loaded results.
df.head()

Unnamed: 0,question,retrieval_gt,answer_gt,retrieval_gt_contents,answer_pred,passage_ids,passage_contents,passage_scores,Recall,F1_score,Precision,KF1,BLEU
0,What evaluation metric is used?,['1908.06083-8-Table10-1.png'],['F1 and Weighted-F1'],['Table 10: Results of experiments on the mult...,"\n Given two paper passages, what i...",['1908.06083-Baselines: Wikipedia Toxic Commen...,['We compare the two aforementioned models wit...,"[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,1.0,1.0,0.02439,0.0
1,What dataset do they use?,['1910.11471-Proposed Methodology ::: Statisti...,['A parallel corpus where the source is an Eng...,['SMT techniques require a parallel corpus in ...,1. NLP research paper passages:\n M...,['1910.11471-Problem Description ::: NLP of st...,['Mihalcea R. et al. has achieved a variable s...,"[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,1.0,1.0,0.5,0.705569
2,By how much do they outperform existing state-...,['1912.03010-EXPERIMENT ::: TedLium2-1'\n '191...,['10%'],"['As far as we know, our model is the best E2E...",\n Semantic Masking is a technique ...,['1912.03010-EXPERIMENT ::: Librispeech 960h-3...,"['As far as we know, our model is the best E2E...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,1.0,1.0,0.262997,0.0
3,By how much does their method outperform state...,['1905.10247-EXPERIMENTAL SETUP AND EVALUATION...,"['AE-HCN outperforms by 17%, AE-HCN-CNN outper...",['The result is shown in Table TABREF23 . Sinc...,\n [0]\n Question: By how mu...,"['1905.10247-CONCLUSION-0', '1905.10247-EXPERI...",['We proposed a novel OOD detection method tha...,"[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,1.0,1.0,0.337875,0.544894
4,Is the baseline a non-heirarchical model like ...,['1905.06566-Results-0' '1905.06566-7-Table1-1...,['There were hierarchical and non-hierarchical...,['Our main results on the CNNDM dataset are sh...,"1. BERT is a non-heirarchical model, but Hiber...","['1905.06566-Results-0', '1905.06566-Document ...",['Our main results on the CNNDM dataset are sh...,"[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,1.0,1.0,0.164623,0.279467


In [52]:
# Load the results file that also carries the 'gold_answer' column.
# This is the same path that the final df.to_csv(...) cell of this notebook writes.
gold_answer = pd.read_csv('./reranker_only_answer_with_gold_answer.csv')

In [53]:
# Load the results file that carries the 'no_passage_answers' column.
no_passage_answer = pd.read_csv('./reranker_only_no_passage_answer.csv')

In [54]:
# List the columns available in gold_answer.
gold_answer.columns

Index(['question', 'retrieval_gt', 'answer_gt', 'retrieval_gt_contents',
       'answer_pred', 'passage_ids', 'passage_contents', 'passage_scores',
       'Recall', 'F1_score', 'Precision', 'KF1', 'BLEU', 'gt_answer_bleu',
       'gt_answer_kf1', 'gt_passage_bleu', 'gt_passage_kf1', 'gold_answer',
       'gold_answer_bleu', 'gold_answer_kf1'],
      dtype='object')

In [55]:
# List the columns available in no_passage_answer.
no_passage_answer.columns

Index(['question', 'retrieval_gt', 'answer_gt', 'retrieval_gt_contents',
       'answer_pred', 'passage_ids', 'passage_contents', 'passage_scores',
       'Recall', 'F1_score', 'Precision', 'KF1', 'BLEU', 'no_passage_answers'],
      dtype='object')

In [59]:
# Copy the generated-answer columns from the two side files into df.
# NOTE(review): assigning a Series aligns on the index, so this assumes all three CSVs
# hold the same rows in the same order — confirm.
df['gold_answer'] = gold_answer['gold_answer']
df['no_passage_answers'] = no_passage_answer['no_passage_answers']

In [72]:
import ast
# Token F1 of the pipeline answer against the ground-truth answer.
# 'answer_gt' is stored as a stringified list, hence literal_eval; only its first element is scored.
df['token_f1'] = df.apply(
    lambda row: _token_f1_score(
        row['answer_pred'],
        ast.literal_eval(row['answer_gt'])[0],
    ),
    axis=1,
)
df['token_f1'].mean()

0.06046515312253118

In [74]:
# Token F1 of the 'gold_answer' column against the first ground-truth answer.
df['gold_token_f1'] = df.apply(
    lambda row: _token_f1_score(
        row['gold_answer'],
        ast.literal_eval(row['answer_gt'])[0],
    ),
    axis=1,
)
df['gold_token_f1'].mean()

0.08481445375163638

In [75]:
# Token F1 of the 'no_passage_answers' column against the first ground-truth answer.
df['no_passage_token_f1'] = df.apply(
    lambda row: _token_f1_score(
        row['no_passage_answers'],
        ast.literal_eval(row['answer_gt'])[0],
    ),
    axis=1,
)
df['no_passage_token_f1'].mean()

0.043273290721519204

In [76]:
# Token F1 when the space-joined ground-truth passages themselves are used as the "answer",
# scored against the first ground-truth answer.
df['gt_passage_token_f1'] = df.apply(
    lambda row: _token_f1_score(
        " ".join(ast.literal_eval(row['retrieval_gt_contents'])),
        ast.literal_eval(row['answer_gt'])[0],
    ),
    axis=1,
)
df['gt_passage_token_f1'].mean()

0.10310820360555838

In [86]:
from datasets import load_dataset

# Load the train split of NomaDamas/qasper as a DataFrame; calc_rare_f1 passes this
# frame to find_corpus to look up a paper's full text.
data = load_dataset("NomaDamas/qasper")['train'].to_pandas()
# Drop the '__index_level_0__' column (presumably a leftover index column — confirm).
data = data.drop('__index_level_0__', axis=1)

In [99]:
def calc_rare_f1(row, pred, gt):
    """
    Rare-token F1 of `pred` vs `gt`, using the row's source paper as corpus.

    The paper id is the text before the first '-' in the first entry of the
    row's stringified 'retrieval_gt' list. The paper is looked up in the
    notebook-level `data` frame via `find_corpus`, and the score comes from
    `rare_f1`. The corpus is rebuilt on every call.
    """
    first_passage_id = ast.literal_eval(row['retrieval_gt'])[0]
    doi = first_passage_id.partition('-')[0]
    return rare_f1(find_corpus(data, doi), pred, gt)

In [130]:
# Rare-token F1 of the pipeline answer against the first ground-truth answer.
df['rare_f1'] = df.apply(
    lambda row: calc_rare_f1(
        row,
        row['answer_pred'],
        ast.literal_eval(row['answer_gt'])[0],
    ),
    axis=1,
)
df['rare_f1'].mean()

0.03746015562332183

In [132]:
# Rare-token F1 of the 'gold_answer' column against the first ground-truth answer.
df['gold_rare_f1'] = df.apply(
    lambda row: calc_rare_f1(
        row,
        row['gold_answer'],
        ast.literal_eval(row['answer_gt'])[0],
    ),
    axis=1,
)
df['gold_rare_f1'].mean()

0.060582321281304086

In [134]:
# Rare-token F1 of the 'no_passage_answers' column against the first ground-truth answer.
df['no_passages_rare_f1'] = df.apply(
    lambda row: calc_rare_f1(
        row,
        row['no_passage_answers'],
        ast.literal_eval(row['answer_gt'])[0],
    ),
    axis=1,
)
df['no_passages_rare_f1'].mean()

0.017825029267544274

In [135]:
# Rare-token F1 when the space-joined ground-truth passages are used as the "answer",
# scored against the first ground-truth answer.
df['gt_passage_rare_f1'] = df.apply(
    lambda row: calc_rare_f1(
        row,
        " ".join(ast.literal_eval(row['retrieval_gt_contents'])),
        ast.literal_eval(row['answer_gt'])[0],
    ),
    axis=1,
)
df['gt_passage_rare_f1'].mean()

0.08851641740441335

In [18]:
import ast

# Sanity check / upper bound: BLEU of the first ground-truth answer scored against
# the full list of ground-truth answers (which contains it).
df['gt_answer_bleu'] = df.apply(
    lambda row: bleu(
        ast.literal_eval(row['answer_gt']),
        ast.literal_eval(row['answer_gt'])[0],
    ),
    axis=1,
)

In [19]:
# Mean self-BLEU of the ground-truth answers.
df['gt_answer_bleu'].mean()

100.00000000000003

In [20]:
# KF1 of the first ground-truth answer against the ground-truth passages.
df['gt_answer_kf1'] = df.apply(
    lambda row: KF1(
        ast.literal_eval(row['retrieval_gt_contents']),
        ast.literal_eval(row['answer_gt'])[0],
    ),
    axis=1,
)

In [21]:
# Mean KF1 of the ground-truth answers against their ground-truth passages.
df['gt_answer_kf1'].mean()

0.10310820360555838

In [22]:
# BLEU when the newline-joined ground-truth passages are used as the "answer",
# scored against the ground-truth answers.
df['gt_passage_bleu'] = df.apply(
    lambda row: bleu(
        ast.literal_eval(row['answer_gt']),
        "\n".join(ast.literal_eval(row['retrieval_gt_contents'])),
    ),
    axis=1,
)

In [23]:
# Mean BLEU of the ground-truth passages against the ground-truth answers.
df['gt_passage_bleu'].mean()

2.1915072434824023

In [24]:
# Sanity check / upper bound: KF1 of the ground-truth passages scored against themselves.
df['gt_passage_kf1'] = df.apply(
    lambda row: KF1(
        ast.literal_eval(row['retrieval_gt_contents']),
        "\n".join(ast.literal_eval(row['retrieval_gt_contents'])),
    ),
    axis=1,
)

In [25]:
# Mean self-KF1 of the ground-truth passages.
df['gt_passage_kf1'].mean()

1.0

In [28]:
# Prompt for answering a question from supplied paper passages.
# Template variables: {question} and {passages}.
prompt = PromptTemplate.from_template(
    """
    Answer user’s question about NLP paper using given paper passages.

    Question: {question}

    Paper passages:
    {passages}

    Answer:
    """
)
# Chain: prompt -> vLLM-served model (OpenAI-compatible endpoint) -> plain string.
# Model name, endpoint and key are read from the environment so the notebook does not
# have to be edited when the tunnel URL changes. load_dotenv() has already run in the
# import cell, so a .env file works too. When the variables are unset, the values that
# were previously hard-coded here are used.
runnable = prompt | VLLMOpenAI(model_name=os.environ.get("VLLM_MODEL_NAME", "meta-llama/Llama-2-7b-hf"),
                               openai_api_base=os.environ.get("VLLM_API_BASE",
                                                              "https://8185-34-87-172-99.ngrok-free.app/v1"),
                               openai_api_key=os.environ.get("VLLM_API_KEY", "")) | StrOutputParser()

In [30]:
# Generate an answer for every row from the ground-truth passages (newline-joined),
# i.e. the answer the model gives with the intended retrieval context.
# Each row triggers one runnable.invoke call.
df['gold_answer'] = df.apply(lambda row: runnable.invoke({
    "question": row['question'],
    "passages": "\n".join(ast.literal_eval(row['retrieval_gt_contents']))
}), axis=1)

In [31]:
# Preview the frame with the newly generated 'gold_answer' column.
df.head()

Unnamed: 0,question,retrieval_gt,answer_gt,retrieval_gt_contents,answer_pred,passage_ids,passage_contents,passage_scores,Recall,F1_score,Precision,KF1,BLEU,gt_answer_bleu,gt_answer_kf1,gt_passage_bleu,gt_passage_kf1,gold_answer
0,What evaluation metric is used?,['1908.06083-8-Table10-1.png'],['F1 and Weighted-F1'],['Table 10: Results of experiments on the mult...,"\n Given two paper passages, what i...",['1908.06083-Baselines: Wikipedia Toxic Commen...,['We compare the two aforementioned models wit...,"[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,1.0,1.0,0.02439,0.0,100.0,0.040816,0.594312,1.0,\n '''\n if not os.path.isfile('data/answers...
1,What dataset do they use?,['1910.11471-Proposed Methodology ::: Statisti...,['A parallel corpus where the source is an Eng...,['SMT techniques require a parallel corpus in ...,1. NLP research paper passages:\n M...,['1910.11471-Problem Description ::: NLP of st...,['Mihalcea R. et al. has achieved a variable s...,"[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,1.0,1.0,0.5,0.705569,100.0,0.376812,2.159419,1.0,"In their paper, they use a text-code parallel..."
2,By how much do they outperform existing state-...,['1912.03010-EXPERIMENT ::: TedLium2-1'\n '191...,['10%'],"['As far as we know, our model is the best E2E...",\n Semantic Masking is a technique ...,['1912.03010-EXPERIMENT ::: Librispeech 960h-3...,"['As far as we know, our model is the best E2E...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,1.0,1.0,0.262997,0.0,100.0,0.011628,0.187284,1.0,\n Semantic mask is complementary to specag...
3,By how much does their method outperform state...,['1905.10247-EXPERIMENTAL SETUP AND EVALUATION...,"['AE-HCN outperforms by 17%, AE-HCN-CNN outper...",['The result is shown in Table TABREF23 . Sinc...,\n [0]\n Question: By how mu...,"['1905.10247-CONCLUSION-0', '1905.10247-EXPERI...",['We proposed a novel OOD detection method tha...,"[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,1.0,1.0,0.337875,0.544894,100.0,0.044843,0.459571,1.0,"1. Our method is a transfer learning method, i..."
4,Is the baseline a non-heirarchical model like ...,['1905.06566-Results-0' '1905.06566-7-Table1-1...,['There were hierarchical and non-hierarchical...,['Our main results on the CNNDM dataset are sh...,"1. BERT is a non-heirarchical model, but Hiber...","['1905.06566-Results-0', '1905.06566-Document ...",['Our main results on the CNNDM dataset are sh...,"[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,1.0,1.0,0.164623,0.279467,100.0,0.025806,0.076119,1.0,"\n Yes, it's a non-hiearchical model. It's ..."


In [32]:
# BLEU of the gold-passage answer against the ground-truth answers.
df['gold_answer_bleu'] = df.apply(
    lambda row: bleu(
        ast.literal_eval(row['answer_gt']),
        row['gold_answer'],
    ),
    axis=1,
)

In [33]:
# Mean BLEU of the gold-passage answers.
df['gold_answer_bleu'].mean()

1.6198945152542585

In [36]:
# KF1 of the gold-passage answer against the ground-truth passages it was generated from.
df['gold_answer_kf1'] = df.apply(
    lambda row: KF1(
        ast.literal_eval(row['retrieval_gt_contents']),
        row['gold_answer'],
    ),
    axis=1,
)

In [37]:
# Mean KF1 of the gold-passage answers.
df['gold_answer_kf1'].mean()

0.41099695089974964

In [38]:
# Load the no-passage answers into their own frame for the BLEU / KF1 columns below.
# This reads the same CSV path as the `no_passage_answer` frame elsewhere in the notebook.
df_no_passage = pd.read_csv('./reranker_only_no_passage_answer.csv')

In [41]:
# BLEU of the no-passage answer against the ground-truth answers.
df_no_passage['no_passage_bleu'] = df_no_passage.apply(
    lambda row: bleu(
        ast.literal_eval(row['answer_gt']),
        row['no_passage_answers'],
    ),
    axis=1,
)

In [42]:
# Mean BLEU of the no-passage answers.
df_no_passage['no_passage_bleu'].mean()

0.43429918078216445

In [43]:
# KF1 of the no-passage answer against the ground-truth passages.
df_no_passage['no_passage_kf1'] = df_no_passage.apply(
    lambda row: KF1(
        ast.literal_eval(row['retrieval_gt_contents']),
        row['no_passage_answers'],
    ),
    axis=1,
)

In [44]:
# Mean KF1 of the no-passage answers.
df_no_passage['no_passage_kf1'].mean()

0.12197724122866072

In [45]:
# Persist df, including the generated gold answers and their metric columns, without the index.
# The pd.read_csv('./reranker_only_answer_with_gold_answer.csv') cell reads this file back.
df.to_csv('reranker_only_answer_with_gold_answer.csv', index=False)