In [11]:
import string
import re
import json
import sys
sys.path.append("../")
from evaluate_utils import calculate_short_answer_EM, rouge, bleu, select_candidate

# Run tag embedded in the result file names of the tree-gen based reference formats.
version="v0915"
# NOTE(review): not referenced anywhere in the visible cells — confirm whether it is still needed.
multi_docs="top10"
# Granularities embedded in result file names: "{granularity}" for the tree formats and
# "{src_granularity}to{granularity}" for the two-stage (rerank then tree-gen) formats.
src_granularity=256
granularity=128

def eval_short_answer_EM(dataset, chat_model, reference_format, split, search_engine, rewrite_method):
    """Score one generation result file and return its metrics as a dict.

    The result file is a JSONL file under
    ``../html_data/{dataset}/{chat_model}/{search_engine}``; its name depends on
    ``reference_format``. Besides the arguments, the name may embed the
    module-level globals ``rerank_model``, ``version``, ``granularity``,
    ``src_granularity`` and ``context_window`` (only the formats that need a
    given global read it).

    Args:
        dataset: Dataset name. ``"eli5"`` is scored as a long-answer task
            (ROUGE-L / BLEU); every other dataset is scored with
            ``calculate_short_answer_EM``.
        chat_model: Model tag; also the prefix of the per-record key
            ``f"{chat_model}_{reference_format}"`` that holds the generated answer.
        reference_format: Reference/context format tag; selects the file-name layout.
        split: Dataset split name used in the file name.
        search_engine: Search-engine directory name.
        rewrite_method: Query-rewrite tag used in the file name.

    Returns:
        ``{"rougeL": float, "bleu": float}`` for ``eli5``; otherwise
        ``{"hit1": float, "exact_match": float}`` (percentages rounded to two
        decimals). If anything fails while reading or scoring the file, the
        error and traceback are printed and the same keys are returned with 0.0.

    Raises:
        KeyError: for the two-stage formats when ``context_window`` is not one
            of ``2k``/``4k``/``8k``/``16k``.
    """
    output_dir = f"../html_data/{dataset}/{chat_model}/{search_engine}"
    name_head = f"{output_dir}/{chat_model}-{reference_format}-{rewrite_method}"
    name_tail = f"{dataset}-{split}.jsonl"

    if reference_format in ["html-trim", "fill-chunk"]:
        output_file = f"{name_head}-{rerank_model}-{name_tail}"
    elif reference_format == "tree-gen":
        output_file = f"{name_head}-{version}-{granularity}-{name_tail}"
    elif reference_format == "tree-rerank":
        output_file = f"{name_head}-{rerank_model}-{granularity}-{name_tail}"
    elif reference_format in ["chunk-rerank-tree-gen", "tree-rerank-tree-gen"]:
        # Only the two-stage formats embed the coarse window, so the lookup lives
        # here; other formats must not fail on windows missing from these tables.
        if dataset in ["asqa", "nq", "eli5"]:
            # fine trim ratio 2/3
            coarse_context_window = {"2k": "3k", "4k": "6k", "8k": "12k", "16k": "24k"}[context_window]
        else:
            # fine trim ratio 1/2
            coarse_context_window = {"2k": "4k", "4k": "8k", "8k": "16k", "16k": "32k"}[context_window]
        output_file = (
            f"{name_head}-{rerank_model}-{src_granularity}to{granularity}"
            f"-{coarse_context_window}-{version}-{name_tail}"
        )
    else:
        # Every remaining format (bm25, llmlingua, bgelargeen, e5-mistral, ...) uses the plain layout.
        output_file = f"{name_head}-{name_tail}"
    print(f"evaluating file {output_file}")

    try:
        # Context manager closes the handle; blank lines are skipped so a stray
        # trailing newline does not zero out the whole file's score.
        with open(output_file) as result_file:
            data_lines = [json.loads(line) for line in result_file if line.strip()]
        generated_answers = [data_line[f"{chat_model}_{reference_format}"] for data_line in data_lines]

        if dataset == "eli5":
            # Long-answer evaluation. Imported here so the function does not depend
            # on a later notebook cell having imported tqdm first.
            import tqdm

            gold_key = "answer" if "answer" in data_lines[0] else "long_answers"
            gold_answers = [data_line[gold_key] for data_line in data_lines]

            # ROUGE is computed against one candidate per example, chosen by select_candidate.
            selected_gold_answers = []
            for gen, gold in tqdm.tqdm(zip(generated_answers, gold_answers), total=len(generated_answers)):
                selected_gold_answers.append(select_candidate(gen, gold))
            rouge_result = rouge.compute(predictions=generated_answers, references=selected_gold_answers)
            rouge_result = {k: round(v * 100, 2) for k, v in rouge_result.items()}

            # BLEU receives the full gold answers rather than the selected candidate.
            bleu_result = bleu.compute(predictions=generated_answers, references=gold_answers)
            return {"rougeL": rouge_result["rougeL"], "bleu": bleu_result["bleu"] * 100}

        # The gold-answer field name varies between files; first match wins.
        if "answers" in data_lines[0]:
            answers = [data_line["answers"] for data_line in data_lines]
        elif "short_answers" in data_lines[0]:
            answers = [data_line["short_answers"] for data_line in data_lines]
        elif "answer" in data_lines[0]:
            answers = [data_line["answer"] for data_line in data_lines]
        else:
            raise NotImplementedError("answers not found in data_lines")

        per_example = [
            calculate_short_answer_EM(generated_answer, gold_answers)
            for generated_answer, gold_answers in zip(generated_answers, answers)
        ]
        # No trailing comma here: hit1 must be a plain float, not a 1-tuple.
        hit1 = round(sum(score["hit1"] for score in per_example) / len(per_example) * 100, 2)
        exact_match = round(sum(score["exact_match"] for score in per_example) / len(per_example) * 100, 2)
        print(f"chat_model: {chat_model}, reference_format: {reference_format}, dataset: {dataset}, split: {split}, hit1: {hit1}, exact_match: {exact_match}")
        return {
            "hit1": hit1,
            "exact_match": exact_match
        }
    except Exception as e:
        # Best effort: a missing or malformed file yields zero scores instead of
        # aborting the whole report, but the failure is still printed in full.
        print(f"error evaluating file {output_file}, error: {e}")
        import traceback
        traceback.print_exc()
        if dataset == "eli5":
            return {
                "rougeL": .0,
                "bleu": .0,
            }
        return {
            "exact_match": .0,
            "hit1": .0,
        }


In [12]:
import tqdm
#  eval all datasets
#  generate latex table report
# Context-window tag: it is embedded in the chat model name below and, for the
# two-stage formats, picks the coarse window used in result file names.
context_window="4k"
# Reference formats to evaluate, and the display label for each (same order, same length).
reference_formats=["bm25", "bgelargeen", "e5-mistral", "llmlingua", "jinaai-reader", "tree-rerank-tree-gen"]
syn_names=["BM25", "BGE", "E5-Mistral", "LongLLMLingua", "JinaAI Reader", "HtmlRAG"]

# long context settings
# context_window="128k"
# reference_formats=["html", "raw-text", "markdown", "html-simple"]
# syn_names=["Vanilla HTML", "Raw Text", "Markdown", "HtmlRAG w/o Prune"]

# Datasets to evaluate; each contributes one group of columns to the report tables.
datasets=["asqa", "hotpot-qa", "nq", "trivia-qa", "musique", "eli5"]

# Components of the result file path/name (see eval_short_answer_EM).
split="test"
search_engine="bing"
rewrite_method="slimplmqr"
rerank_model="bgelargeen"

import multiprocessing
# Manager-backed list so worker processes can write their results back to this process.
# One slot per (dataset, reference_format) pair, indexed as
# dataset_index * len(reference_formats) + format_index; "" means "no result yet".
res_list=multiprocessing.Manager().list([""]*len(datasets)*len(reference_formats))
# Worker processes of the batch currently running.
processes = []

def append_res2markdown_table(lidx, *args, **kwargs):
    """Evaluate one result file and store its formatted table cell(s) in ``res_list``.

    Intended to run as a ``multiprocessing.Process`` target: the outcome is
    written to the shared ``res_list`` at index ``lidx`` rather than returned.

    Args:
        lidx: Slot in ``res_list`` to write to.
        *args: Positional arguments forwarded to ``eval_short_answer_EM``;
            ``args[0]`` is the dataset name.
        **kwargs: Accepted for call compatibility; not forwarded.

    The stored string holds the metrics joined by ``" & "``:
    * ``hotpot-qa`` / ``musique``: only ``hit1``;
    * other short-answer datasets: ``hit1 & exact_match``;
    * results without a ``hit1`` key (the long-answer case): every value in
      the result dict, in order.
    """
    res = eval_short_answer_EM(*args)
    if "hit1" in res:
        hit1, exact_match = res["hit1"], res["exact_match"]
        # Tolerate metrics that arrive wrapped in a 1-tuple.
        if isinstance(hit1, tuple):
            hit1 = hit1[0]
        if isinstance(exact_match, tuple):
            exact_match = exact_match[0]
        # Two decimals everywhere, consistent with the long-answer branch below.
        if args[0] in ["hotpot-qa", "musique"]:
            cell = f" {hit1:.2f} "
        else:
            cell = f" {hit1:.2f} & {exact_match:.2f} "
    else:
        # Must be an else-branch: in the short-answer case the cell is already a
        # string, and calling .items()/.values() on it would crash the worker.
        cell = " & ".join(f"{v:.2f}" for v in res.values())
    res_list[lidx] = cell

# Progress bar: one tick per launched evaluation job.
pbar = tqdm.tqdm(total=len(datasets) * len(reference_formats))

chat_model = f"llama70b{context_window}"
# chat_model=f"llama8b{context_window}"

# Walk the flattened (dataset, reference_format) grid; lidx is also the slot
# in res_list that the worker fills in.
n_formats = len(reference_formats)
for lidx in range(len(datasets) * n_formats):
    i, j = divmod(lidx, n_formats)
    dataset, reference_format = datasets[i], reference_formats[j]

    worker = multiprocessing.Process(
        target=append_res2markdown_table,
        args=(lidx, dataset, chat_model, reference_format, split, search_engine, rewrite_method),
    )
    processes.append(worker)
    worker.start()
    pbar.update(1)

    if len(processes) < 4:
        continue
    # Batch of four is running: wait for all of them before launching more.
    for running in processes:
        running.join()
    processes = []

# Wait for whatever is left of the last, partially filled batch.
for running in processes:
    running.join()

pbar.close()

  0%|          | 0/36 [00:00<?, ?it/s]

evaluating file ../html_data/asqa/llama70b4k/bing/llama70b4k-bgelargeen-slimplmqr-asqa-test.jsonlevaluating file ../html_data/asqa/llama70b4k/bing/llama70b4k-e5-mistral-slimplmqr-asqa-test.jsonlevaluating file ../html_data/asqa/llama70b4k/bing/llama70b4k-bm25-slimplmqr-asqa-test.jsonl


evaluating file ../html_data/asqa/llama70b4k/bing/llama70b4k-llmlingua-slimplmqr-asqa-test.jsonl
chat_model: llama70b4k, reference_format: llmlingua, dataset: asqa, split: test, hit1: (62.5,), exact_match: 27.74
chat_model: llama70b4k, reference_format: bm25, dataset: asqa, split: test, hit1: (49.5,), exact_match: 21.95
chat_model: llama70b4k, reference_format: bgelargeen, dataset: asqa, split: test, hit1: (68.0,), exact_match: 30.57
chat_model: llama70b4k, reference_format: e5-mistral, dataset: asqa, split: test, hit1: (63.0,), exact_match: 28.75


 14%|█▍        | 5/36 [00:02<00:17,  1.81it/s]

evaluating file ../html_data/asqa/llama70b4k/bing/llama70b4k-jinaai-reader-slimplmqr-asqa-test.jsonl
evaluating file ../html_data/asqa/llama70b4k/bing/llama70b4k-tree-rerank-tree-gen-slimplmqr-bgelargeen-256to128-6k-v0915-asqa-test.jsonl
evaluating file ../html_data/hotpot-qa/llama70b4k/bing/llama70b4k-bm25-slimplmqr-hotpot-qa-test.jsonl
evaluating file ../html_data/hotpot-qa/llama70b4k/bing/llama70b4k-bgelargeen-slimplmqr-hotpot-qa-test.jsonl
chat_model: llama70b4k, reference_format: tree-rerank-tree-gen, dataset: asqa, split: test, hit1: (68.5,), exact_match: 30.53
chat_model: llama70b4k, reference_format: jinaai-reader, dataset: asqa, split: test, hit1: (55.25,), exact_match: 23.73
chat_model: llama70b4k, reference_format: bm25, dataset: hotpot-qa, split: test, hit1: (38.25,), exact_match: 38.25
chat_model: llama70b4k, reference_format: bgelargeen, dataset: hotpot-qa, split: test, hit1: (41.75,), exact_match: 41.75


 25%|██▌       | 9/36 [00:05<00:17,  1.53it/s]

evaluating file ../html_data/hotpot-qa/llama70b4k/bing/llama70b4k-e5-mistral-slimplmqr-hotpot-qa-test.jsonl
evaluating file ../html_data/hotpot-qa/llama70b4k/bing/llama70b4k-llmlingua-slimplmqr-hotpot-qa-test.jsonl
evaluating file ../html_data/hotpot-qa/llama70b4k/bing/llama70b4k-jinaai-reader-slimplmqr-hotpot-qa-test.jsonl
evaluating file ../html_data/hotpot-qa/llama70b4k/bing/llama70b4k-tree-rerank-tree-gen-slimplmqr-bgelargeen-256to128-8k-v0915-hotpot-qa-test.jsonl
chat_model: llama70b4k, reference_format: tree-rerank-tree-gen, dataset: hotpot-qa, split: test, hit1: (45.25,), exact_match: 45.25
chat_model: llama70b4k, reference_format: jinaai-reader, dataset: hotpot-qa, split: test, hit1: (34.25,), exact_match: 34.25
chat_model: llama70b4k, reference_format: llmlingua, dataset: hotpot-qa, split: test, hit1: (45.0,), exact_match: 45.0
chat_model: llama70b4k, reference_format: e5-mistral, dataset: hotpot-qa, split: test, hit1: (36.75,), exact_match: 36.75


 36%|███▌      | 13/36 [00:08<00:15,  1.46it/s]

evaluating file ../html_data/nq/llama70b4k/bing/llama70b4k-bm25-slimplmqr-nq-test.jsonl
evaluating file ../html_data/nq/llama70b4k/bing/llama70b4k-bgelargeen-slimplmqr-nq-test.jsonl
evaluating file ../html_data/nq/llama70b4k/bing/llama70b4k-e5-mistral-slimplmqr-nq-test.jsonl
evaluating file ../html_data/nq/llama70b4k/bing/llama70b4k-llmlingua-slimplmqr-nq-test.jsonl
chat_model: llama70b4k, reference_format: llmlingua, dataset: nq, split: test, hit1: (56.75,), exact_match: 42.89
chat_model: llama70b4k, reference_format: bm25, dataset: nq, split: test, hit1: (47.0,), exact_match: 35.56
chat_model: llama70b4k, reference_format: e5-mistral, dataset: nq, split: test, hit1: (59.5,), exact_match: 44.07
chat_model: llama70b4k, reference_format: bgelargeen, dataset: nq, split: test, hit1: (59.5,), exact_match: 45.05


 47%|████▋     | 17/36 [00:11<00:12,  1.50it/s]

evaluating file ../html_data/nq/llama70b4k/bing/llama70b4k-jinaai-reader-slimplmqr-nq-test.jsonl
evaluating file ../html_data/nq/llama70b4k/bing/llama70b4k-tree-rerank-tree-gen-slimplmqr-bgelargeen-256to128-6k-v0915-nq-test.jsonl
evaluating file ../html_data/trivia-qa/llama70b4k/bing/llama70b4k-bm25-slimplmqr-trivia-qa-test.jsonl
evaluating file ../html_data/trivia-qa/llama70b4k/bing/llama70b4k-bgelargeen-slimplmqr-trivia-qa-test.jsonl
chat_model: llama70b4k, reference_format: tree-rerank-tree-gen, dataset: nq, split: test, hit1: (60.75,), exact_match: 45.51
chat_model: llama70b4k, reference_format: bm25, dataset: trivia-qa, split: test, hit1: (88.0,), exact_match: 25.63
chat_model: llama70b4k, reference_format: jinaai-reader, dataset: nq, split: test, hit1: (48.25,), exact_match: 35.4
chat_model: llama70b4k, reference_format: bgelargeen, dataset: trivia-qa, split: test, hit1: (93.0,), exact_match: 27.04


 58%|█████▊    | 21/36 [00:13<00:09,  1.60it/s]

evaluating file ../html_data/trivia-qa/llama70b4k/bing/llama70b4k-e5-mistral-slimplmqr-trivia-qa-test.jsonl
evaluating file ../html_data/trivia-qa/llama70b4k/bing/llama70b4k-llmlingua-slimplmqr-trivia-qa-test.jsonl
evaluating file ../html_data/trivia-qa/llama70b4k/bing/llama70b4k-jinaai-reader-slimplmqr-trivia-qa-test.jsonl
evaluating file ../html_data/trivia-qa/llama70b4k/bing/llama70b4k-tree-rerank-tree-gen-slimplmqr-bgelargeen-256to128-8k-v0915-trivia-qa-test.jsonl
chat_model: llama70b4k, reference_format: llmlingua, dataset: trivia-qa, split: test, hit1: (92.5,), exact_match: 27.23
chat_model: llama70b4k, reference_format: jinaai-reader, dataset: trivia-qa, split: test, hit1: (90.0,), exact_match: 25.35
chat_model: llama70b4k, reference_format: tree-rerank-tree-gen, dataset: trivia-qa, split: test, hit1: (93.75,), exact_match: 27.07
chat_model: llama70b4k, reference_format: e5-mistral, dataset: trivia-qa, split: test, hit1: (90.75,), exact_match: 26.27


 69%|██████▉   | 25/36 [00:15<00:06,  1.66it/s]

evaluating file ../html_data/musique/llama70b4k/bing/llama70b4k-bm25-slimplmqr-musique-test.jsonl
evaluating file ../html_data/musique/llama70b4k/bing/llama70b4k-bgelargeen-slimplmqr-musique-test.jsonl
evaluating file ../html_data/musique/llama70b4k/bing/llama70b4k-e5-mistral-slimplmqr-musique-test.jsonl
evaluating file ../html_data/musique/llama70b4k/bing/llama70b4k-llmlingua-slimplmqr-musique-test.jsonl
chat_model: llama70b4k, reference_format: llmlingua, dataset: musique, split: test, hit1: (10.25,), exact_match: 10.25
chat_model: llama70b4k, reference_format: bm25, dataset: musique, split: test, hit1: (9.5,), exact_match: 9.5
chat_model: llama70b4k, reference_format: bgelargeen, dataset: musique, split: test, hit1: (12.5,), exact_match: 12.5
chat_model: llama70b4k, reference_format: e5-mistral, dataset: musique, split: test, hit1: (11.0,), exact_match: 11.0


 81%|████████  | 29/36 [00:19<00:04,  1.42it/s]

evaluating file ../html_data/musique/llama70b4k/bing/llama70b4k-jinaai-reader-slimplmqr-musique-test.jsonl
evaluating file ../html_data/musique/llama70b4k/bing/llama70b4k-tree-rerank-tree-gen-slimplmqr-bgelargeen-256to128-8k-v0915-musique-test.jsonl
evaluating file ../html_data/eli5/llama70b4k/bing/llama70b4k-bm25-slimplmqr-eli5-test.jsonl
evaluating file ../html_data/eli5/llama70b4k/bing/llama70b4k-bgelargeen-slimplmqr-eli5-test.jsonl
chat_model: llama70b4k, reference_format: tree-rerank-tree-gen, dataset: musique, split: test, hit1: (11.75,), exact_match: 11.75
chat_model: llama70b4k, reference_format: jinaai-reader, dataset: musique, split: test, hit1: (9.25,), exact_match: 9.25



  0%|          | 0/400 [00:00<?, ?it/s][A
  0%|          | 1/400 [00:01<08:15,  1.24s/it][A
  0%|          | 2/400 [00:01<05:48,  1.14it/s][A
  0%|          | 0/400 [00:00<?, ?it/s][A
  1%|          | 3/400 [00:02<04:59,  1.33it/s][A
  1%|          | 4/400 [00:03<04:56,  1.34it/s][A
  1%|▏         | 5/400 [00:04<05:53,  1.12it/s][A
  0%|          | 1/400 [00:03<25:15,  3.80s/it][A
  0%|          | 2/400 [00:04<12:55,  1.95s/it][A
  1%|          | 3/400 [00:05<08:46,  1.33s/it][A
  1%|          | 4/400 [00:05<07:14,  1.10s/it][A
  1%|▏         | 5/400 [00:06<07:23,  1.12s/it][A
  2%|▏         | 6/400 [00:08<08:24,  1.28s/it][A
  2%|▏         | 7/400 [00:09<07:12,  1.10s/it][A
  2%|▏         | 8/400 [00:09<06:13,  1.05it/s][A
  2%|▏         | 9/400 [00:10<04:58,  1.31it/s][A
  2%|▎         | 10/400 [00:12<07:11,  1.11s/it][A
  2%|▏         | 6/400 [00:14<26:02,  3.97s/it][A
  3%|▎         | 11/400 [00:13<06:55,  1.07s/it][A
  2%|▏         | 7/400 [00:15<20:49,  3.18s/

evaluating file ../html_data/eli5/llama70b4k/bing/llama70b4k-e5-mistral-slimplmqr-eli5-test.jsonl
evaluating file ../html_data/eli5/llama70b4k/bing/llama70b4k-llmlingua-slimplmqr-eli5-test.jsonl
evaluating file ../html_data/eli5/llama70b4k/bing/llama70b4k-jinaai-reader-slimplmqr-eli5-test.jsonl
evaluating file ../html_data/eli5/llama70b4k/bing/llama70b4k-tree-rerank-tree-gen-slimplmqr-bgelargeen-256to128-6k-v0915-eli5-test.jsonl



  0%|          | 0/400 [00:00<?, ?it/s][A
  0%|          | 0/400 [00:00<?, ?it/s][A
  0%|          | 0/400 [00:00<?, ?it/s][A
  0%|          | 1/400 [00:01<09:23,  1.41s/it][A
  0%|          | 2/400 [00:02<06:18,  1.05it/s][A
  0%|          | 0/400 [00:00<?, ?it/s][A
  0%|          | 1/400 [00:02<14:34,  2.19s/it][A
  0%|          | 1/400 [00:03<23:17,  3.50s/it][A
  0%|          | 2/400 [00:03<11:20,  1.71s/it][A
  0%|          | 1/400 [00:02<18:46,  2.82s/it][A
  0%|          | 2/400 [00:03<11:29,  1.73s/it][A
  1%|          | 3/400 [00:07<19:48,  2.99s/it][A
  1%|          | 4/400 [00:08<13:43,  2.08s/it][A
  1%|          | 3/400 [00:07<17:07,  2.59s/it][A
  1%|▏         | 5/400 [00:09<11:27,  1.74s/it][A
  0%|          | 2/400 [00:08<28:13,  4.25s/it][A
  1%|          | 3/400 [00:06<14:29,  2.19s/it][A
  2%|▏         | 6/400 [00:10<10:57,  1.67s/it][A
  2%|▏         | 7/400 [00:11<09:06,  1.39s/it][A
  1%|          | 4/400 [00:11<21:33,  3.27s/it][A
  1%|▏     

In [13]:
# create a latex table
import re
# Row 0 is the header; rows 1..N hold one method each, label first.
latex_table = ["Dataset & EM & Hit@1 & EM & EM & Hit@1 & EM & Hit@1 & EM & ROUGE-L & BLEU"]

# Pad every method label to a common width (longest label plus two spaces).
longest_syn_name = max(map(len, syn_names)) + 2
for k in range(len(reference_formats)):
    latex_table.append(syn_names[k].ljust(longest_syn_name) + "&")

# res_list is laid out dataset-major, so slot lidx belongs to method row lidx % n_formats.
n_formats = len(reference_formats)
# Matches a number with exactly one decimal digit, e.g. 5.5, so it can be padded to 5.50.
pad_decimals = re.compile(r"(\d+\.\d)(?!\d)")
for lidx in range(len(datasets) * n_formats):
    row = lidx % n_formats + 1
    latex_table[row] = pad_decimals.sub(r"\g<1>0", latex_table[row] + f"{res_list[lidx]} &")

# Drop the dangling column separator and terminate each row with a LaTeX line break.
for row_text in latex_table:
    body = row_text[:-1] if row_text.endswith("&") else row_text
    print(body + "\\\\")

Dataset & EM & Hit@1 & EM & EM & Hit@1 & EM & Hit@1 & EM & ROUGE-L & BLEU\\
BM25           & 49.50 & 21.95  & 38.25  & 47.00 & 35.56  & 88.00 & 25.63  & 9.50  &28.13 & 4.90 & 16.15 & 17.97 & 0.07 \\
BGE            & 68.00 & 30.57  & 41.75  & 59.50 & 45.05  & 93.00 & 27.04  & 12.50  &28.71 & 5.10 & 16.20 & 18.52 & 0.07 \\
E5-Mistral     & 63.00 & 28.75  & 36.75  & 59.50 & 44.07  & 90.75 & 26.27  & 11.00  &28.03 & 5.08 & 16.17 & 17.78 & 0.07 \\
LongLLMLingua  & 62.50 & 27.74  & 45.00  & 56.75 & 42.89  & 92.50 & 27.23  & 10.25  &28.31 & 4.96 & 15.84 & 18.27 & 0.06 \\
JinaAI Reader  & 55.25 & 23.73  & 34.25  & 48.25 & 35.40  & 90.00 & 25.35  & 9.25  &27.89 & 5.02 & 16.05 & 17.83 & 0.06 \\
HTML4RAG       & 68.50 & 30.53  & 45.25  & 60.75 & 45.51  & 93.75 & 27.07  & 11.75  &27.98 & 4.97 & 16.32 & 18.02 & 0.07 \\


In [14]:
#. create a markdown table
# Rows 0 and 1 are the header and its separator; rows 2..N+1 hold one method each.
markdown_table = [
    "| Dataset | EM | Hit@1 | EM | EM | Hit@1 | EM | Hit@1 | EM | ROUGE-L | BLEU |",
    "| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |",
]

# Pad every method label to a common width (longest label plus two spaces).
longest_syn_name = max(map(len, syn_names)) + 2
for k in range(len(reference_formats)):
    markdown_table.append("| " + syn_names[k].ljust(longest_syn_name) + "|")

# res_list is laid out dataset-major, so slot lidx belongs to method row lidx % n_formats.
n_formats = len(reference_formats)
# Matches a number with exactly one decimal digit, e.g. 5.5, so it can be padded to 5.50.
pad_decimals = re.compile(r"(\d+\.\d)(?!\d)")
for lidx in range(len(datasets) * n_formats):
    row = lidx % n_formats + 2
    markdown_table[row] = pad_decimals.sub(r"\g<1>0", markdown_table[row] + f"{res_list[lidx]} |")

# The stored cells use the LaTeX separator; turn it into the Markdown one.
markdown_table = [row_text.replace("&", "|") for row_text in markdown_table]

print(*markdown_table, sep="\n")
    

| Dataset | EM | Hit@1 | EM | EM | Hit@1 | EM | Hit@1 | EM | ROUGE-L | BLEU |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| BM25           | 49.50 | 21.95  | 38.25  | 47.00 | 35.56  | 88.00 | 25.63  | 9.50  |28.13 | 4.90 | 16.15 | 17.97 | 0.07 |
| BGE            | 68.00 | 30.57  | 41.75  | 59.50 | 45.05  | 93.00 | 27.04  | 12.50  |28.71 | 5.10 | 16.20 | 18.52 | 0.07 |
| E5-Mistral     | 63.00 | 28.75  | 36.75  | 59.50 | 44.07  | 90.75 | 26.27  | 11.00  |28.03 | 5.08 | 16.17 | 17.78 | 0.07 |
| LongLLMLingua  | 62.50 | 27.74  | 45.00  | 56.75 | 42.89  | 92.50 | 27.23  | 10.25  |28.31 | 4.96 | 15.84 | 18.27 | 0.06 |
| JinaAI Reader  | 55.25 | 23.73  | 34.25  | 48.25 | 35.40  | 90.00 | 25.35  | 9.25  |27.89 | 5.02 | 16.05 | 17.83 | 0.06 |
| HTML4RAG       | 68.50 | 30.53  | 45.25  | 60.75 | 45.51  | 93.75 | 27.07  | 11.75  |27.98 | 4.97 | 16.32 | 18.02 | 0.07 |
