## Calculate Longbench Scores

In [3]:
import glob
import json

import pandas as pd
from IPython.display import display

# Answer-directory name of the model to summarise; change the trailing index to pick another model.
model = ["llama_3.1_70b_answer", "mistral7b_answer", "command_r_plus_answer", "gpt_4o_answer", "gpt_4o_mini_answer"][1]
# LongBench task names grouped by category (one table column per category).
task_types = {
    "singledoc_qa": ["multifieldqa_en", "multifieldqa_zh", "narrativeqa", "qasper"],
    "multidoc_qa": ["hotpotqa", "2wikimqa", "musique", "dureader"],
    "summarization": ["gov_report", "qmsum", "multi_news", "vcsum"],
    "fewshot": ["triviaqa", "samsum", "trec", "lsht"],
    "synthetic": ["passage_count", "passage_retrieval_en", "passage_retrieval_zh"],
    "code": ["lcc", "repobench-p"],
}
# Tasks reported in the "zh" row; every other task is averaged into the "en" row.
zh_tasks = ["dureader", "multifieldqa_zh", "vcsum", "lsht", "passage_retrieval_zh"]
for metrics_path in glob.glob("../results/longbench/**/metrics_*.json", recursive=True):
    # Skip other models' results and anything under a path containing "old".
    if model not in metrics_path or "old" in metrics_path:
        continue
    print(f'-- Metrics for: {" - ".join(metrics_path.split("/")[3:])}')
    # Use a context manager so the metrics file is closed deterministically.
    with open(metrics_path, "r") as metrics_fh:
        res: dict = json.load(metrics_fh)
    df = pd.DataFrame(columns=task_types.keys(), index=["en", "zh"])
    for task_type, tasks in task_types.items():
        # Leave the whole category empty when none of its tasks were evaluated.
        if not any(task in res for task in tasks):
            continue
        t = [task for task in tasks if task not in zh_tasks]
        task_avg = sum(res[task]["score"] for task in t) / len(t)
        df.loc["en", task_type] = task_avg
        t_zh = [task for task in tasks if task in zh_tasks]
        if t_zh:
            task_avg_zh = sum(res[task]["score"] for task in t_zh) / len(t_zh)
            df.loc["zh", task_type] = task_avg_zh
    # Row-wise mean over the category columns.
    df["avg"] = df.mean(axis=1)
    display(df)

-- Metrics for: zero_shot - mistral7b_answer - metrics_longbench_test_formatted.json


Unnamed: 0,singledoc_qa,multidoc_qa,summarization,fewshot,synthetic,code,avg
en,8.896667,21.913333,6.243333,39.283333,1.895,44.05,20.380278
zh,18.38,9.96,0.0,6.75,4.67,,7.952


-- Metrics for: llmlingua2 - mistral7b_answer - metrics_target3000_longbench_test_formatted.json


Unnamed: 0,singledoc_qa,multidoc_qa,summarization,fewshot,synthetic,code,avg
en,27.07,28.08,21.38,48.71,5.335,50.855,30.238333
zh,31.66,9.68,2.6,13.0,12.0,,13.788


-- Metrics for: llmlingua2 - mistral7b_answer - metrics_target2000_longbench_test_formatted.json


Unnamed: 0,singledoc_qa,multidoc_qa,summarization,fewshot,synthetic,code,avg
en,25.316667,29.28,21.206667,44.286667,4.335,45.63,28.3425
zh,26.01,10.32,2.31,9.0,9.5,,11.428


-- Metrics for: origin - mistral7b_answer - metrics_longbench_test_formatted.json


Unnamed: 0,singledoc_qa,multidoc_qa,summarization,fewshot,synthetic,code,avg
en,24.13,27.943333,19.843333,62.836667,8.605,67.1,35.076389
zh,31.72,10.45,7.92,23.42,10.22,,16.746


-- Metrics for: llmlingua2_small - mistral7b_answer - metrics_target3000_longbench_test_formatted.json


Unnamed: 0,singledoc_qa,multidoc_qa,summarization,fewshot,synthetic,code,avg
en,25.006667,30.07,20.846667,46.056667,10.91,51.99,30.813333
zh,25.67,9.83,7.11,6.25,12.75,,12.322


-- Metrics for: llmlingua2_small - mistral7b_answer - metrics_target2000_longbench_test_formatted.json


Unnamed: 0,singledoc_qa,multidoc_qa,summarization,fewshot,synthetic,code,avg
en,22.823333,29.583333,20.69,41.743333,8.585,47.335,28.46
zh,28.28,10.27,1.81,6.25,12.75,,11.872


In [None]:
## Fix longbench code category scores (after installation of python-Levenshtein)
# Calculate per-programming-language scores

import glob
import json
import os
from collections import defaultdict

from metrics import code_sim_score


def scorer(dataset, predictions, answers, langs, all_classes):
    """Score code-completion predictions overall and per programming language.

    Args:
        dataset: Task name. Unused; kept so the signature stays compatible with callers.
        predictions: Predicted strings, one per sample.
        answers: For each sample, an iterable of acceptable ground-truth strings.
        langs: Programming-language label of each sample (same order as ``predictions``).
        all_classes: Forwarded unchanged to ``code_sim_score``.

    Returns:
        ``{"total": <mean best score * 100>, "langs": {lang: <mean best score * 100>}}``,
        all values rounded to two decimals. Empty ``predictions`` yields
        ``{"total": 0.0, "langs": {}}`` instead of raising ``ZeroDivisionError``.
    """
    if not predictions:
        return {"total": 0.0, "langs": {}}

    total_score, langs_scores = 0.0, defaultdict(list)
    for prediction, ground_truths, lang in zip(predictions, answers, langs):
        # A sample's score is its best match against any of its ground truths (floor of 0.0).
        score = 0.0
        for ground_truth in ground_truths:
            score = max(score, code_sim_score(prediction, ground_truth, all_classes=all_classes))
        total_score += score
        langs_scores[lang].append(score)
    return {
        "total": round(100 * total_score / len(predictions), 2),
        "langs": {lang: round(100 * sum(scores) / len(scores), 2) for lang, scores in langs_scores.items()},
    }


def eval(load_path):
    """Re-score the code-completion tasks contained in one LongBench answer file.

    Loads the answer JSON at ``load_path``, keeps only the ``lcc`` and ``repobench-p``
    entries and scores each task with ``scorer``. The programming language of a sample is
    looked up in the module-level ``lb_data`` through the entry's ``idx``, so ``lb_data``
    must be loaded before this function is called.

    NOTE: the name shadows the builtin ``eval``; it is kept because the rest of the
    notebook calls the function by this name.

    Args:
        load_path: Path to an answer JSON file mapping ids to result entries.

    Returns:
        Dict mapping each scored task to ``{"score", "num", "per_lang"}`` plus an ``"avg"``
        entry holding the mean of the task scores.

    Raises:
        ZeroDivisionError: If the file contains no ``lcc``/``repobench-p`` entries.
    """
    # Context manager so the answer file is closed as soon as it has been parsed.
    with open(load_path) as answer_fh:
        results: dict[str, dict] = json.load(answer_fh)
    predictions, answers, langs = defaultdict(list), defaultdict(list), defaultdict(list)
    all_classes = {}
    for data in results.values():
        task = data["task"]
        if task not in ["repobench-p", "lcc"]:
            continue
        predictions[task].append(data["pred"])
        answers[task].append(data["answers"])
        langs[task].append(lb_data[data["idx"]]["language"])
        # The last entry seen for a task determines the classes passed to the scorer.
        all_classes[task] = data["all_classes"]
    scores = {}
    for task, pred_list in predictions.items():
        score_dict = scorer(task, pred_list, answers[task], langs[task], all_classes[task])
        print(f'Task: {task} - {score_dict["total"]}')
        for lang, lang_score in score_dict["langs"].items():
            print(f"  {lang}: {lang_score}")
        scores[task] = {"score": score_dict["total"], "num": len(pred_list), "per_lang": score_dict["langs"]}
    score_list = [s["score"] for s in scores.values()]
    scores["avg"] = sum(score_list) / len(score_list)
    return scores


# Original LongBench samples; `eval` reads each sample's "language" from here.
with open("../results/longbench/origin/longbench_test_formatted.json") as lb_fh:
    lb_data = json.load(lb_fh)
for answer_file in glob.glob("../results/longbench/**/answer_*.json", recursive=True):
    metrics_file = answer_file.replace("answer_", "metrics_")
    # Only patch runs that already have a metrics file and are not under an "old" path.
    if "old" in answer_file or not os.path.exists(metrics_file):
        continue
    print(f'-- Metrics for: {" - ".join(answer_file.split("/")[3:])}')
    res = eval(answer_file)
    with open(metrics_file) as metrics_fh:
        metrics = json.load(metrics_fh)
    # Overwrite only the two code-task entries; all other metrics stay as they were.
    metrics["lcc"] = res["lcc"]
    metrics["repobench-p"] = res["repobench-p"]
    # Explicitly close (and thereby flush) the rewritten metrics file.
    with open(metrics_file, "w") as metrics_fh:
        json.dump(metrics, metrics_fh, indent=4)

## Calculate average prompt lengths (MeetingBank, LongBench, ZeroScrolls)

In [1]:
# Prompt template for each LongBench task, keyed by task name. `build_prompt_longbench`
# fills the `{context}` and `{input}` placeholders via `str.format`.
# NOTE(review): the "asconcisely" spelling in the narrativeqa template is kept verbatim;
# presumably it mirrors the upstream benchmark prompt — confirm before "fixing" it, since
# any change to these strings alters the measured prompt lengths.
dataset2prompt = {
    "narrativeqa": "You are given a story, which can be either a novel or a movie script, and a question. Answer the question asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:",
    "qasper": 'You are given a scientific article and a question. Answer the question as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". If the question is a yes/no question, answer "yes", "no", or "unanswerable". Do not provide any explanation.\n\nArticle: {context}\n\n Answer the question based on the above article as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". If the question is a yes/no question, answer "yes", "no", or "unanswerable". Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
    "multifieldqa_en": "Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
    "multifieldqa_zh": "阅读以下文字并用中文简短回答：\n\n{context}\n\n现在请基于上面的文章回答下面的问题，只告诉我答案，不要输出任何其他字词。\n\n问题：{input}\n回答：",
    "hotpotqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
    "2wikimqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
    "musique": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
    "dureader": "请基于给定的文章回答下述问题。\n\n文章：{context}\n\n请基于上述文章回答下面的问题。\n\n问题：{input}\n回答：",
    "gov_report": "You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:",
    "qmsum": "You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:",
    "multi_news": "You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:",
    "vcsum": "下面有一段会议记录，请你阅读后，写一段总结，总结会议的内容。\n会议记录：\n{context}\n\n会议总结：",
    "trec": "Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}",
    "triviaqa": "Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}",
    "samsum": "Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}",
    "lsht": "请判断给定新闻的类别，下面是一些例子。\n\n{context}\n{input}",
    "passage_count": "There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ",
    "passage_retrieval_en": 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
    "passage_retrieval_zh": '以下是若干段落文字，以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1"，"段落2"等格式\n\n答案是：',
    "lcc": "Please complete the code given below. \n{context}Next line of code:\n",
    "repobench-p": "Please complete the code given below. \n{context}{input}Next line of code:\n",
}


def build_prompt_longbench(sample: dict, load_key: "str | None") -> str:
    """Render the LongBench prompt for one sample.

    Args:
        sample: Sample mapping; must provide ``"question"`` and ``"task"`` and, when
            ``load_key`` is given, the entry named by ``load_key``.
        load_key: Key of ``sample`` holding the context text, or ``None`` to render the
            prompt with an empty context.

    Returns:
        The task's template from ``dataset2prompt`` with ``{context}`` and ``{input}``
        filled in.

    Raises:
        KeyError: If a required key is missing from ``sample`` or the task has no template.
    """
    question = sample["question"]
    context = sample[load_key] if load_key is not None else ""
    template = dataset2prompt[sample["task"]]
    return template.format(context=context, input=question)

In [2]:
import glob
import json
import os
from collections import defaultdict

import tiktoken

# from reproduction.eval_longbench import build_prompt

tokenizer = tiktoken.encoding_for_model("gpt-3.5")
benchmarks = [
    "longbench",
    "meetingbank_short",
    "zero_scrolls",
]
# LongBench tasks split by language; defined once because they do not change between scenarios.
en_tasks = [
    "multifieldqa_en",
    "narrativeqa",
    "qasper",
    "hotpotqa",
    "2wikimqa",
    "musique",
    "gov_report",
    "qmsum",
    "multi_news",
    "triviaqa",
    "samsum",
    "trec",
    "passage_count",
    "passage_retrieval_en",
    "lcc",
    "repobench-p",
]
zh_tasks = ["dureader", "multifieldqa_zh", "vcsum", "lsht", "passage_retrieval_zh"]
# benchmark -> scenario file path -> {task: summed prompt length in tokens}
all_dict = defaultdict(dict)
# benchmark -> scenario file path -> {task: number of samples}
all_num_samples_dict = defaultdict(dict)
for benchmark in benchmarks:
    print(f'\n====== {benchmark.replace("_", " ").title()} ======')
    for scenario in glob.glob(f"../results/{benchmark}/*/*.json"):
        # Directory name identifies the scenario; the second "_"-separated part of the file name is the target.
        model, target = os.path.dirname(scenario).split("/")[-1], os.path.basename(scenario).split("_")[1]
        compression = model in ["llmlingua2", "llmlingua2_small"]
        # Context manager so each scenario file is closed right after parsing.
        with open(scenario, "r") as scenario_fh:
            data = json.load(scenario_fh)
        if isinstance(data, dict):
            data = list(data.values())
        # Which field of a sample holds the text to measure (None: no context at all).
        load_key = (
            None
            if model == "zero_shot"
            else "compressed_prompt" if compression else "context" if benchmark == "longbench" else "prompt"
        )
        if benchmark == "longbench":
            lens, lens_zh = defaultdict(int), defaultdict(int)
            samples_per_task, samples_per_task_zh = defaultdict(int), defaultdict(int)
            data_zh = [d for d in data if d["task"] in zh_tasks]
            data = [d for d in data if d["task"] in en_tasks]
            for d in data:
                lens[d["task"]] += len(tokenizer.encode(build_prompt_longbench(d, load_key)))
                samples_per_task[d["task"]] += 1
            for d in data_zh:
                lens_zh[d["task"]] += len(tokenizer.encode(build_prompt_longbench(d, load_key)))
                samples_per_task_zh[d["task"]] += 1
            avg_len = sum(lens.values()) / len(data)
            avg_len_zh = sum(lens_zh.values()) / len(data_zh)
            print(f'\n# {model} - {target + " - " if compression else ""}{avg_len} - ZH: {avg_len_zh}')
            print("EN tasks:")
            for task, length in lens.items():
                print(f'{task}: {length / samples_per_task[task]}')
            print("\nZH tasks:")
            for task, length in lens_zh.items():
                print(f'{task}: {length / samples_per_task_zh[task]}')
            all_dict[benchmark][scenario] = lens | lens_zh
            all_num_samples_dict[benchmark][scenario] = samples_per_task | samples_per_task_zh
        elif benchmark == "zero_scrolls":
            lens, samples_per_task = defaultdict(int), defaultdict(int)
            for d in data:
                lens[d["task"]] += len(tokenizer.encode(d[load_key]))
                samples_per_task[d["task"]] += 1
            avg_len = sum(lens.values()) / len(data)
            print(f'\n# {model} - {target + " - " if compression else ""}{avg_len}')
            for task, length in lens.items():
                print(f'{task}: {length / samples_per_task[task]}')
            all_dict[benchmark][scenario] = lens
            all_num_samples_dict[benchmark][scenario] = samples_per_task
        else:
            # Benchmarks without a per-task breakdown: report only the overall average.
            avg_len = sum(len(tokenizer.encode(d[load_key])) for d in data) / len(data)
            print(f'# {model} - {target + " - " if compression else ""}{avg_len}')



# zero_shot - 238.6088 - ZH: 285.263
EN tasks:
narrativeqa: 93.01
qasper: 167.45
multifieldqa_en: 50.32666666666667
hotpotqa: 73.53
2wikimqa: 70.85
musique: 72.585
gov_report: 34.0
qmsum: 61.665
multi_news: 33.0
trec: 28.0
triviaqa: 743.6
samsum: 167.32
passage_count: 90.0
passage_retrieval_en: 238.885
lcc: 13.0
repobench-p: 1011.91

ZH tasks:
multifieldqa_zh: 80.49
dureader: 48.735
vcsum: 38.0
lsht: 995.145
passage_retrieval_zh: 263.945

# llmlingua2 - target3000 - 3138.2674666666667 - ZH: 2881.051
EN tasks:
narrativeqa: 3194.36
qasper: 3148.355
multifieldqa_en: 2966.6466666666665
hotpotqa: 3320.05
2wikimqa: 3115.59
musique: 3392.595
gov_report: 3127.035
qmsum: 2887.355
multi_news: 2051.835
trec: 2710.135
triviaqa: 3807.375
samsum: 2966.815
passage_count: 3399.055
passage_retrieval_en: 3514.295
lcc: 2278.276
repobench-p: 4114.796

ZH tasks:
multifieldqa_zh: 2801.23
dureader: 2639.245
vcsum: 2522.745
lsht: 3411.495
passage_retrieval_zh: 3030.54

# llmlingua2 - target2000 - 2277.80666

In [None]:
import re

import pandas as pd

# LongBench task names grouped by category; the flattened order defines the table row order.
lb_categories = {
    "Single-doc QA": ["multifieldqa_en", "multifieldqa_zh", "narrativeqa", "qasper"],
    "Multi-doc QA": ["hotpotqa", "2wikimqa", "musique", "dureader"],
    "Summarization": ["gov_report", "qmsum", "multi_news", "vcsum"],
    "Few-Shot": ["triviaqa", "samsum", "trec", "lsht"],
    "Synthetic": ["passage_count", "passage_retrieval_en", "passage_retrieval_zh"],
    "Code": ["lcc", "repobench-p"],
}
# Display names used as row labels in the LongBench table.
lb_tasknames = {
    "narrativeqa": "NarrativeQA",
    "qasper": "Qasper",
    "multifieldqa_en": "MultiFieldQA (EN)",
    "multifieldqa_zh": "MultiFieldQA (ZH)",
    "hotpotqa": "HotpotQA",
    "2wikimqa": "2WikiMultihopQA",
    "musique": "MuSiQue",
    "dureader": "DuReader (ZH)",
    "gov_report": "GovReport",
    "qmsum": "QMSum",
    "multi_news": "MultiNews",
    "vcsum": "VCSUM (ZH)",
    "trec": "TREC",
    "triviaqa": "TriviaQA",
    "samsum": "SAMSum",
    "lsht": "LSHT (ZH)",
    "passage_retrieval_en": "PassageRetrieval (EN)",
    "passage_count": "PassageCount",
    "passage_retrieval_zh": "PassageRetrieval (ZH)",
    "lcc": "LCC",
    "repobench-p": "RepoBench-P",
    "avg": "Average",
}
# Display names used as row labels in the ZeroScrolls table.
zero_scrolls_tasknames = {
    "gov_report": "GovReport",
    "summ_screen_fd": "SummScreenFD",
    "qmsum": "QMSum",
    "qasper": "Qasper",
    "narrative_qa": "NarrativeQA",
    "quality": "QuALITY",
    "musique": "MuSiQue",
    "squality": "SQuALITY",
    "space_digest": "SpaceDigest",
    "book_sum_sort": "BookSumSort",
    "avg": "Average",
}
zero_scrolls_categories = {
    "Summarization": ["gov_report", "qmsum", "summ_screen_fd", "squality"],
    "QA": ["qasper", "narrative_qa", "musique", "quality"],
    "Data Processing": ["space_digest", "book_sum_sort"],
}
# Path fragments identifying the scenario files, in the order the table columns are filled.
scen = [
    "origin/",
    "llmlingua2/compression_target3000",
    "llmlingua2_small/compression_target3000",
    "llmlingua2/compression_target2000",
    "llmlingua2_small/compression_target2000",
]
# Per-benchmark table settings. (A list previously assigned to `benchmarks` here was dead
# code: it was overwritten by this dict before ever being read.)
benchmarks = {
    "longbench": {
        "tasks": [t for ts in lb_categories.values() for t in ts],
        "tasknames": lb_tasknames,
        "key": "score",
    },
    "zero_scrolls": {
        "tasks": [t for ts in zero_scrolls_categories.values() for t in ts],
        "tasknames": zero_scrolls_tasknames,
        "key": "zero_scrolls_score",
    },
}

# Build one prompt-length table per benchmark, write it as LaTeX and display it.
# NOTE(review): this cell uses `os`, `display`, `all_dict` and `all_num_samples_dict` without
# importing/creating them — it relies on earlier cells having been executed first.
for benchmark in benchmarks:
    outdir = f"../../ma-thesis/tables/{benchmark}"
    os.makedirs(outdir, exist_ok=True)

    tasks = benchmarks[benchmark]["tasks"]
    tasknames = benchmarks[benchmark]["tasknames"]
    # Per-task sample counts are taken from the first recorded scenario and reused for every column.
    num_samples = list(all_num_samples_dict[benchmark].values())[0]

    # Column layout: one "Original" column, then length + ratio columns for each target and model size.
    columns = [("Original", "")]
    for target in ["Target 3000", "Target 2000"]:
        columns.append((target, "LLMLingua-2"))
        columns.append((target, "Ratio"))
        columns.append((target, "LLMLingua-2-small"))
        columns.append((target, "Ratio-small"))

    df = pd.DataFrame(index=tasks, columns=pd.MultiIndex.from_tuples(columns))

    # `scen` entries and the column tuples below are paired positionally.
    for s, col in zip(
        scen,
        [
            ("Original", ""),
            ("Target 3000", "LLMLingua-2"),
            ("Target 3000", "LLMLingua-2-small"),
            ("Target 2000", "LLMLingua-2"),
            ("Target 2000", "LLMLingua-2-small"),
        ],
    ):
        # First scenario whose file path contains the fragment `s`.
        lens = [sc for k, sc in all_dict[benchmark].items() if s in k][0]
        for task in tasks:
            # Stored lengths are per-task sums, so divide by the sample count to get the mean.
            df.loc[task, col] = lens[task] / num_samples[task]
        df.loc["avg", col] = sum(lens.values()) / sum(num_samples.values())
        if col[0] != "Original":
            # Ratio = original length / compressed length, written to the matching ratio column.
            df.loc[:, (col[0], "Ratio" if "-small" not in col[1] else "Ratio-small")] = (
                df.loc[:, ("Original", "")] / df.loc[:, col]
            )

    # Replace task keys (including "avg") by their display names.
    df.index = df.index.map(lambda x: tasknames[x])
    for col in df.columns:
        if col in [
            ("Target 3000", "Ratio"),
            ("Target 3000", "Ratio-small"),
            ("Target 2000", "Ratio"),
            ("Target 2000", "Ratio-small"),
        ]:
            # Ratios are rendered like "4.7x".
            df[col] = df[col].map(lambda x: f"{x:.1f}x" if isinstance(x, float) else x)
        else:
            # Lengths are rendered as whole numbers with thousands separators.
            df[col] = df[col].map(lambda x: f"{x:,.0f}" if isinstance(x, float) else x)
    # Rename "Ratio-small" to "Ratio" so both ratio columns carry the same header text.
    df.columns = pd.MultiIndex.from_tuples(
        [
            (
                ("Target 3000", "Ratio")
                if col == ("Target 3000", "Ratio-small")
                else ("Target 2000", "Ratio") if col == ("Target 2000", "Ratio-small") else col
            )
            for col in df.columns
        ]
    )
    tex_table = df.to_latex(float_format="%.0f", multicolumn_format="c|", column_format="@{}l|c|cccc|cccc@{}")

    # Textual post-processing of the generated LaTeX: drop the trailing "|" before the
    # "Target 2..." header, insert a \midrule before the Average row, and bold the headers.
    tex_table = tex_table.replace("|}{Target 2", "}{Target 2")
    tex_table = tex_table.replace("Average", "\\midrule Average")
    for match in ["Original", "Target 3000", "Target 2000"]:
        tex_table = tex_table.replace(match, f"\\textbf{{{match}}}")
    # Wrap the label and each of the nine cells of the Average row in \textbf{...}.
    pattern = r"\\midrule Average & (.+) & (.+) & (.+) & (.+) & (.+) & (.+) & (.+) & (.+) & (.+) \\\\"
    repl = r"\\midrule \\textbf{Average} & \\textbf{\1} & \\textbf{\2} & \\textbf{\3} & \\textbf{\4} & \\textbf{\5} & \\textbf{\6} & \\textbf{\7} & \\textbf{\8} & \\textbf{\9} \\\\"
    tex_table = re.sub(pattern, repl, tex_table)

    with open(f"{outdir}/lengths.tex", "w") as f:
        f.write(tex_table)

    display(df)

In [None]:
# LongBench SingleDoc only (mistral 7b)
import json
import tiktoken
import glob
import os

tokenizer = tiktoken.encoding_for_model("gpt-3.5")
# Single-document QA tasks, split by language; constant across scenarios.
single_doc_tasks = ["multifieldqa_en", "narrativeqa", "qasper"]
single_doc_tasks_zh = ["multifieldqa_zh"]
print("\n====== LongBench SingleDoc Only (Mistral 7B) ======")
for scenario in glob.glob("../results/longbench/*/*.json"):
    # Directory name identifies the scenario; the second "_"-separated part of the file name is the target.
    model, target = os.path.dirname(scenario).split("/")[-1], os.path.basename(scenario).split("_")[1]
    compression = model in ["llmlingua2", "llmlingua2_small"]
    # Context manager so each scenario file is closed right after parsing.
    with open(scenario, "r") as scenario_fh:
        data = json.load(scenario_fh)
    if isinstance(data, dict):
        data = list(data.values())
    # Which field of a sample holds the context (None: prompt without any context).
    load_key = None if model == "zero_shot" else "compressed_prompt" if compression else "context"
    data_en = [d for d in data if d["task"] in single_doc_tasks]
    data_zh = [d for d in data if d["task"] in single_doc_tasks_zh]
    avg_len = sum(len(tokenizer.encode(build_prompt_longbench(d, load_key))) for d in data_en) / len(data_en)
    avg_len_zh = sum(len(tokenizer.encode(build_prompt_longbench(d, load_key))) for d in data_zh) / len(data_zh)
    print(f'{model} - {target + " - " if compression else ""}{avg_len} - ZH: {avg_len_zh}')


origin - 14619.258181818182 - ZH: 7355.45
zero_shot - 108.43818181818182 - ZH: 80.49
llmlingua2 - target2000 - 2223.6945454545453 - ZH: 1853.15
llmlingua2 - target3000 - 3115.5272727272727 - ZH: 2801.23
llmlingua2_small - target2000 - 2455.8654545454547 - ZH: 3270.4
llmlingua2_small - target3000 - 3252.6381818181817 - ZH: 4845.72


## Calculate average prompt lengths (GSM8K)

In [None]:
import glob
import json
import os

import tiktoken

tokenizer = tiktoken.encoding_for_model("gpt-3.5")


def get_prompt_len(dem_list):
    """Return the token count of the demonstrations joined into one few-shot prompt.

    Each demonstration is prefixed with ``"Question: "`` and the demonstrations are
    separated by a blank line before being tokenized with the module-level ``tokenizer``.
    """
    return len(tokenizer.encode("\n\n".join(["Question: " + dem for dem in dem_list])))


print("\n====== GSM8K ======")
for scenario in glob.glob("../results/gsm8k/*/*gsm8k_cot_example*.json"):
    # Directory name identifies the scenario; the target is the file name without its first and last "_" parts.
    model, target = os.path.dirname(scenario).split("/")[-1], "_".join(os.path.basename(scenario).split("_")[1:-1])
    compression = model in ["llmlingua2", "llmlingua2_small"]
    # Context manager so each scenario file is closed right after parsing.
    with open(scenario, "r") as scenario_fh:
        data = json.load(scenario_fh)
    if isinstance(data, dict):
        data = list(data.values())
    # Only the first entry's demonstration list is measured.
    data = data[0]["compressed_prompt_list" if compression else "prompt_list"]
    print(f'{model} - {target + " - " if compression else ""}{get_prompt_len(data)}')


origin - 2366
llmlingua2 - target160_gsm8k_cot - 157
llmlingua2 - target400_gsm8k_cot - 440
llmlingua2_small - target160_gsm8k_cot - 159
llmlingua2_small - target400_gsm8k_cot - 455


## Calculate average prompt lengths (BBH)

In [1]:
import glob
import json
import os

import tiktoken

# Tokenizer used below to measure the length of each BBH prompt in tokens.
tokenizer = tiktoken.encoding_for_model("gpt-3.5")


def build_prompt_bbh(task, key):
    """Assemble the BBH prompt text for one task entry.

    The result is the task's ``instruction``, optionally followed by a blank line and the
    entry stored under ``key`` (skipped entirely when ``key`` is ``None``), and always
    terminated by a blank line.
    """
    if key is None:
        examples = ""
    else:
        examples = f"\n\n{task[key]}"
    instruction = task["instruction"]
    return f"{instruction}{examples}\n\n"


print("\n====== BBH ======")
# scenario file path -> {task name: prompt length in tokens}
bbh_dict = {}
for scenario in glob.glob("../results/bbh/*/*bbh_cot_examples*.json"):
    # Only the original prompts and the 200/300 targets are considered.
    if "200" not in scenario and "300" not in scenario and "origin" not in scenario:
        continue
    # Directory name identifies the scenario; the target is the file name after its first "_" part.
    model, target = os.path.dirname(scenario).split("/")[-1], "_".join(os.path.basename(scenario).split("_")[1:])
    compression = model in ["llmlingua2", "llmlingua2_small"]
    # Which field of a task entry holds the examples (None: instruction only).
    load_key = None if model == "zero_shot" else "compressed_prompt" if compression else "prompt"
    # Context manager so each scenario file is closed right after parsing.
    with open(scenario, "r") as scenario_fh:
        data = json.load(scenario_fh)
    if isinstance(data, dict):
        data = list(data.values())
    prompt_lens = {}
    for task in data:
        prompt_lens[task["task"]] = len(tokenizer.encode(build_prompt_bbh(task, load_key)))
    print(f'\n## {model} - {target + " - " if compression else ""}{sum(prompt_lens.values()) / len(prompt_lens)}')
    for task, p_len in prompt_lens.items():
        print(f"{task}: {p_len}")
    bbh_dict[scenario] = prompt_lens



## llmlingua2 - target200_bbh_cot_examples.json - 175.62962962962962
temporal_sequences: 183
disambiguation_qa: 179
date_understanding: 192
tracking_shuffled_objects_three_objects: 163
penguins_in_a_table: 172
geometric_shapes: 129
snarks: 251
ruin_names: 183
tracking_shuffled_objects_seven_objects: 163
tracking_shuffled_objects_five_objects: 163
logical_deduction_three_objects: 167
hyperbaton: 156
logical_deduction_five_objects: 167
logical_deduction_seven_objects: 167
movie_recommendation: 180
salient_translation_error_detection: 186
reasoning_about_colored_objects: 168
multistep_arithmetic_two: 180
navigate: 171
dyck_languages: 187
word_sorting: 173
sports_understanding: 165
boolean_expressions: 170
object_counting: 167
formal_fallacies: 205
causal_judgement: 176
web_of_lies: 179

## llmlingua2 - target300_bbh_cot_examples.json - 253.07407407407408
temporal_sequences: 277
disambiguation_qa: 264
date_understanding: 289
tracking_shuffled_objects_three_objects: 221
penguins_in_a_tabl

In [None]:
import re

import pandas as pd

# Output directory for the BBH LaTeX table; created on demand.
outdir = "../../ma-thesis/tables/bbh"
os.makedirs(outdir, exist_ok=True)

# Build the BBH prompt-length table, write it as LaTeX and display it.
# NOTE(review): uses `os`, `display` and `bbh_dict` without importing/creating them here —
# it relies on earlier cells having been executed first.

# Path fragments identifying the scenario files, in the order the table columns are filled.
scen = [
    "origin/",
    "llmlingua2/compression_target300",
    "llmlingua2_small/compression_target300",
    "llmlingua2/compression_target200",
    "llmlingua2_small/compression_target200",
]
# Display names used as row labels; the key order defines the table row order.
bbh_tasknames = {
    "multistep_arithmetic_two": "Multi-Step Arithmetic",
    "boolean_expressions": "Boolean Expressions",
    "logical_deduction_three_objects": "Logical Deduction (3 Obj.)",
    "logical_deduction_five_objects": "Logical Deduction (5 Obj.)",
    "logical_deduction_seven_objects": "Logical Deduction (7 Obj.)",
    "geometric_shapes": "Geometric Shapes",
    "dyck_languages": "Dyck Languages",
    "navigate": "Navigate",
    "temporal_sequences": "Temporal Sequences",
    "formal_fallacies": "Formal Fallacies",
    "object_counting": "Object Counting",
    "penguins_in_a_table": "Penguins in a Table",
    "tracking_shuffled_objects_three_objects": "Track. 3 Shuffled Obj.",
    "tracking_shuffled_objects_five_objects": "Track. 5 Shuffled Obj.",
    "tracking_shuffled_objects_seven_objects": "Track. 7 Shuffled Obj.",
    "reasoning_about_colored_objects": "Reasoning about Col. Obj.",
    "web_of_lies": "Web of Lies",
    "word_sorting": "Word Sorting",
    "disambiguation_qa": "Disambiguation QA",
    "hyperbaton": "Hyperbaton",
    "salient_translation_error_detection": "Salient Transl. Err. Detection",
    "snarks": "Snarks",
    "sports_understanding": "Sports Understanding",
    "movie_recommendation": "Movie Recommendation",
    "date_understanding": "Date Understanding",
    "causal_judgement": "Causal Judgement",
    "ruin_names": "Ruin Names",
}

# Column layout: one "Original" column, then length + ratio columns for each target and model size.
columns = [("Original", "")]
for target in ["Target 300", "Target 200"]:
    columns.append((target, "LLMLingua-2"))
    columns.append((target, "Ratio"))
    columns.append((target, "LLMLingua-2-small"))
    columns.append((target, "Ratio-small"))

df = pd.DataFrame(index=bbh_tasknames.keys(), columns=pd.MultiIndex.from_tuples(columns))

# `scen` entries and the column tuples below are paired positionally.
for s, col in zip(
    scen,
    [
        ("Original", ""),
        ("Target 300", "LLMLingua-2"),
        ("Target 300", "LLMLingua-2-small"),
        ("Target 200", "LLMLingua-2"),
        ("Target 200", "LLMLingua-2-small"),
    ],
):
    # First scenario whose file path contains the fragment `s`.
    lens = [sc for k, sc in bbh_dict.items() if s in k][0]
    for task in bbh_tasknames:
        df.loc[task, col] = lens[task]
    # Unweighted mean over all tasks recorded for this scenario.
    df.loc["avg", col] = sum(lens.values()) / len(lens)
    if col[0] != "Original":
        # Ratio = original length / compressed length, written to the matching ratio column.
        df.loc[:, (col[0], "Ratio" if "-small" not in col[1] else "Ratio-small")] = (
            df.loc[:, ("Original", "")] / df.loc[:, col]
        )

df.index = df.index.map(lambda x: bbh_tasknames[x] if x != "avg" else "Average")
for col in df.columns:
    if col in [
        ("Target 300", "Ratio"),
        ("Target 300", "Ratio-small"),
        ("Target 200", "Ratio"),
        ("Target 200", "Ratio-small"),
    ]:
        # Ratios are rendered like "4.7x".
        df[col] = df[col].map(lambda x: f"{x:.1f}x" if isinstance(x, float) else x)
    else:
        # Only ints are pre-formatted here; presumably float cells (the average row) are left
        # for `float_format` in `to_latex` below — TODO confirm.
        df[col] = df[col].map(lambda x: f"{x:,.0f}" if isinstance(x, int) else x)
# Rename "Ratio-small" to "Ratio" so both ratio columns carry the same header text.
df.columns = pd.MultiIndex.from_tuples(
    [
        (
            ("Target 300", "Ratio")
            if col == ("Target 300", "Ratio-small")
            else ("Target 200", "Ratio") if col == ("Target 200", "Ratio-small") else col
        )
        for col in df.columns
    ]
)
tex_table = df.to_latex(float_format="%.0f", multicolumn_format="c|", column_format="@{}l|c|cccc|cccc@{}")

# Textual post-processing of the generated LaTeX: drop the trailing "|" before the
# "Target 2..." header, insert a \midrule before the Average row, and bold the headers.
tex_table = tex_table.replace("|}{Target 2", "}{Target 2")
tex_table = tex_table.replace("Average", "\\midrule Average")
for match in ["Original", "Target 300", "Target 200"]:
    tex_table = tex_table.replace(match, f"\\textbf{{{match}}}")
# Wrap the label and each of the nine cells of the Average row in \textbf{...}.
pattern = r"\\midrule Average & (.+) & (.+) & (.+) & (.+) & (.+) & (.+) & (.+) & (.+) & (.+) \\\\"
repl = r"\\midrule \\textbf{Average} & \\textbf{\1} & \\textbf{\2} & \\textbf{\3} & \\textbf{\4} & \\textbf{\5} & \\textbf{\6} & \\textbf{\7} & \\textbf{\8} & \\textbf{\9} \\\\"
tex_table = re.sub(pattern, repl, tex_table)

with open(f"{outdir}/lengths.tex", "w") as f:
    f.write(tex_table)

display(df)

## Calculate average ZeroScrolls summarization scores

In [None]:
import glob
import json

import pandas as pd
from IPython.display import display

# Answer-directory name of the model to summarise; change the trailing index to pick another model.
model = ["llama_3.1_70b_answer", "mistral7b_answer", "command_r_plus_answer"][0]
# Tasks averaged into the summarization score.
sum_tasks = ["gov_report", "summ_screen_fd", "qmsum"]
for metrics_path in glob.glob("../results/zero_scrolls/**/metrics_*.json", recursive=True):
    if model not in metrics_path:
        continue
    # Context manager so the metrics file is closed right after parsing.
    with open(metrics_path, "r") as metrics_fh:
        res: dict = json.load(metrics_fh)
    avg_score = sum(res[task]["zero_scrolls_score"] for task in sum_tasks) / len(sum_tasks)
    # Start the label at the scenario directory (index 3), as the LongBench cells do; starting
    # at index 4 drops it and makes llmlingua2 and llmlingua2_small print identical labels.
    print(f'-- Avg summarization score for: {" - ".join(metrics_path.split("/")[3:])}')
    print(avg_score)

-- Avg summarization score for: origin - llama_3.1_70b_answer - metrics_zero_scrolls_validation.json
21.2975
-- Avg summarization score for: llmlingua2 - llama_3.1_70b_answer - metrics_target2000_zero_scrolls_validation.json
17.619933333333336
-- Avg summarization score for: llmlingua2 - llama_3.1_70b_answer - metrics_target3000_zero_scrolls_validation.json
19.628833333333336
-- Avg summarization score for: llmlingua2_small - llama_3.1_70b_answer - metrics_target2000_zero_scrolls_validation.json
17.7843
-- Avg summarization score for: llmlingua2_small - llama_3.1_70b_answer - metrics_target3000_zero_scrolls_validation.json
19.175033333333335


## OpenAI pricing calculation function

In [26]:
import tiktoken

MODEL = "gpt-4o"
# Normal API pricing, for batch 50% off
# "p_in" / "p_out" are the input / output prices per one million tokens (see `get_cost`).
pricing = {
    "gpt-3.5-turbo-0613": {"p_in": 1.5, "p_out": 2.0},
    "gpt-3.5-turbo": {"p_in": 0.5, "p_out": 1.5},
    "gpt-4": {"p_in": 30, "p_out": 60},
    "gpt-4-turbo": {"p_in": 10, "p_out": 30},
    "gpt-4o": {"p_in": 2.5, "p_out": 10},
    "gpt-4o-mini": {"p_in": 0.15, "p_out": 0.6},
}


def get_pricing(model):
    """Return the ``{"p_in", "p_out"}`` price entry for ``model``.

    Raises:
        KeyError: If ``model`` has no entry in ``pricing``.
    """
    return pricing[model]


def get_cost(t_in, t_out):
    """Return the cost of ``t_in`` input and ``t_out`` output tokens for ``MODEL``.

    Token counts are divided by 1000 twice, i.e. the prices in ``pricing`` are applied
    per one million tokens. The result is in the currency of the price table
    (presumably USD — confirm against the provider's price list).
    """
    rates = get_pricing(MODEL)
    return t_in / 1000 * rates["p_in"] / 1000 + t_out / 1000 * rates["p_out"] / 1000

# Tokenizer matching MODEL; the cost-calculation cells use it to count prompt and answer tokens.
tokenizer = tiktoken.encoding_for_model(MODEL)

## Calculate MeetingBank repro cost

In [3]:
from collections import defaultdict
import json

# QA prompt template; {transcript} and {question} are filled in once per question by calc().
prompt = "Write a high-quality answer for the given question using the provided meeting transcript (which may be compressed).\n{transcript}\nQuestion:{question}\nAnswer:"

# Running input/output token totals, keyed by the scenario name passed to calc().
in_tokens_total_mb, out_tokens_total_mb = defaultdict(int), defaultdict(int)

def calc(name, d_qa, d_sum):
    """Count, record and print the MeetingBank token usage of one scenario.

    Args:
        name: Scenario label; used as key in ``in_tokens_total_mb`` /
            ``out_tokens_total_mb`` and as prefix of the printed lines.
        d_qa: QA entries, each with "transcript", "questions" and "model_answers".
        d_sum: Summary entries, each with "transcript" and "model_summary".
    """

    def n_tokens(text):
        return len(tokenizer.encode(text))

    # QA: every question is sent as its own prompt containing the transcript.
    qa_in = sum(
        n_tokens(prompt.format(transcript=entry["transcript"], question=question))
        for entry in d_qa
        for question in entry["questions"]
    )
    qa_out = sum(n_tokens(answer) for entry in d_qa for answer in entry["model_answers"])
    in_tokens_total_mb[name] += qa_in
    out_tokens_total_mb[name] += qa_out
    print(f"{name} QA - in: {qa_in:,}, out: {qa_out:,}")

    # Summarization: only the bare transcript is counted on the input side.
    summary_in = sum(n_tokens(entry["transcript"]) for entry in d_sum)
    summary_out = sum(n_tokens(entry["model_summary"]) for entry in d_sum)
    in_tokens_total_mb[name] += summary_in
    out_tokens_total_mb[name] += summary_out
    print(f"{name} Sum - in: {summary_in:,}, out: {summary_out:,}")

def _load_values(path):
    """Return the values of the JSON object stored at ``path`` as a list.

    The file is opened in a ``with`` block so the handle is closed again.
    """
    with open(path, "r") as answer_file:
        return list(json.load(answer_file).values())


d_orig_qa = _load_values("../results/meetingbank_short/origin/llama_3.1_70b_answer/answer_meetingbank_QA.json")
d_orig_sum = _load_values("../results/meetingbank_short/origin/llama_3.1_70b_answer/answer_meetingbank_summary.json")
d_comp_qa = _load_values("../results/meetingbank_short/llmlingua2/llama_3.1_70b_answer/answer_ratio33_meetingbank_QA.json")
d_comp_sum = _load_values("../results/meetingbank_short/llmlingua2/llama_3.1_70b_answer/answer_ratio33_meetingbank_summary.json")
d_comp_small_qa = _load_values("../results/meetingbank_short/llmlingua2_small/llama_3.1_70b_answer/answer_ratio33_meetingbank_QA.json")
d_comp_small_sum = _load_values("../results/meetingbank_short/llmlingua2_small/llama_3.1_70b_answer/answer_ratio33_meetingbank_summary.json")

# Count tokens for the uncompressed prompts and for both compressor variants.
calc("Original", d_orig_qa, d_orig_sum)
calc("Compressed", d_comp_qa, d_comp_sum)
calc("Compressed Small", d_comp_small_qa, d_comp_small_sum)

Original QA - in: 7,050,072, out: 45,918
Original Sum - in: 2,312,304, out: 181,927
Compressed QA - in: 2,605,806, out: 48,353
Compressed Sum - in: 838,737, out: 171,834
Compressed Small QA - in: 2,390,988, out: 49,656
Compressed Small Sum - in: 767,158, out: 168,885


In [4]:
def calc_meetingbank_cost(print_res=False):
    """Price the recorded MeetingBank token totals with ``get_cost``.

    Args:
        print_res: When true, print the target model, the cost of every
            scenario and the savings relative to the original prompts.

    Returns:
        Cost of the "Compressed" scenario plus cost of the "Original" scenario.
    """
    original, compressed, compressed_small = (
        get_cost(in_tokens_total_mb[name], out_tokens_total_mb[name])
        for name in ("Original", "Compressed", "Compressed Small")
    )
    repro_total = compressed + original

    if print_res:
        report = [
            f"\nTotal original cost: ${original:.2f}",
            f"Total comp cost: ${compressed:.2f} - savings: ${original - compressed:.2f}",
            f"Total comp cost (small): ${compressed_small:.2f} - savings: ${original - compressed_small:.2f}",
            f"\nTotal repro cost: ${repro_total:.2f}",
        ]
        print("Target model:", MODEL)
        for line in report:
            print(line)

    return repro_total


calc_meetingbank_cost(True)

Target model: gpt-4o

Total original cost: $25.68
Total comp cost: $10.81 - savings: $14.87
Total comp cost (small): $10.08 - savings: $15.60

Total repro cost: $36.50


36.497617500000004

## Calculate LongBench repro cost (at `MODEL` pricing)

In [170]:
# Depends on build_prompt_longbench (see above)
import json
import os
from collections import defaultdict

import pandas as pd


# Sub-directory (next to each data file) that holds the answers to be priced.
target_model = "gpt_4o_answer"
# Scenario name -> prompt data file. Keys must be unique: a repeated key silently
# replaces the earlier entry, which previously dropped the small target2000 file.
data_files = {
    "origin": "../results/longbench/origin/longbench_test_formatted.json",
    "llmlingua2_2000": "../results/longbench/llmlingua2/compression_target2000_longbench_test_formatted.json",
    "llmlingua2_3000": "../results/longbench/llmlingua2/compression_target3000_longbench_test_formatted.json",
    "llmlingua2_small_2000": "../results/longbench/llmlingua2_small/compression_target2000_longbench_test_formatted.json",
    "llmlingua2_small_3000": "../results/longbench/llmlingua2_small/compression_target3000_longbench_test_formatted.json",
    "zero_shot": "../results/longbench/zero_shot/longbench_test_formatted.json",
}
# Display name of each task category -> task ids belonging to it.
categories = {
    "Single-doc QA": ["multifieldqa_en", "multifieldqa_zh", "narrativeqa", "qasper"],
    "Multi-doc QA": ["hotpotqa", "2wikimqa", "musique", "dureader"],
    "Summarization": ["gov_report", "qmsum", "multi_news", "vcsum"],
    "Few-Shot": ["triviaqa", "samsum", "trec", "lsht"],
    "Synthetic": ["passage_count", "passage_retrieval_en", "passage_retrieval_zh"],
    "Code": ["lcc", "repobench-p"],
}
# Task id -> category name, plus the pseudo task "total" used for the overall row.
reverse_categories = {v: k for k, values in categories.items() for v in values}
reverse_categories["total"] = "Total"
# Task id -> name shown in the rendered tables.
tasknames = {
    "narrativeqa": "NarrativeQA",
    "qasper": "Qasper",
    "multifieldqa_en": "MultiFieldQA (EN)",
    "multifieldqa_zh": "MultiFieldQA (ZH)",
    "hotpotqa": "HotpotQA",
    "2wikimqa": "2WikiMultihopQA",
    "musique": "MuSiQue",
    "dureader": "DuReader (ZH)",
    "gov_report": "GovReport",
    "qmsum": "QMSum",
    "multi_news": "MultiNews",
    "vcsum": "VCSUM (ZH)",
    "trec": "TREC",
    "triviaqa": "TriviaQA",
    "samsum": "SAMSum",
    "lsht": "LSHT (ZH)",
    "passage_retrieval_en": "PassageRetrieval (EN)",
    "passage_count": "PassageCount",
    "passage_retrieval_zh": "PassageRetrieval (ZH)",
    "lcc": "LCC",
    "repobench-p": "RepoBench-P",
    "Average": "Average",
    "total": "Total",
}
# Column order of the per-category and per-task tables.
col_order = [
    "Avg. In Origin",
    "Avg. In",
    "Ratio",
    "Saving In",
    "Avg. Out Origin",
    "Avg. Out",
    "Saving Out",
    "Cost Orig",
    "Cost",
    "Saving",
]

# Input/output token totals accumulated per compressor directory name.
in_tokens_total_lb, out_tokens_total_lb = defaultdict(int), defaultdict(int)
def get_split_name(path):
    """Split ``path`` on "/" and remove the text ".json" from every part."""
    return [part.replace(".json", "") for part in path.split("/")]


# Per scenario: {"in_tokens": ..., "out_tokens": ...}, each a dict keyed by task id plus "total".
scenario_data = {scenario: {} for scenario in data_files.keys()}
for scenario, data_file in data_files.items():
    filename_split = get_split_name(data_file)
    # `model` is the directory holding the data file, `target` the file stem without "compression_".
    model, target = filename_split[-2], filename_split[-1].replace("compression_", "")
    file_dir = os.path.dirname(data_file)
    answer_file = f"{file_dir}/{target_model}/answer_{target}.json"
    # Scenarios without prompt data or without answers are skipped silently.
    if not os.path.exists(data_file) or not os.path.exists(answer_file):
        continue
    print(f"\n====== {model}: {target} ======")
    with open(data_file, "r") as f:
        longbench_data = json.load(f)
    if isinstance(longbench_data, dict):
        longbench_data = list(longbench_data.values())
    compression = model in ["llmlingua2", "llmlingua2_small"]
    load_key = None if model == "zero_shot" else "compressed_prompt" if compression else "context"

    # Input tokens per task, counted on the prompt built for every sample.
    in_tokens = defaultdict(int)
    for sample in longbench_data:
        prompt = build_prompt_longbench(sample, load_key)
        in_tokens[sample["task"]] += len(tokenizer.encode(prompt))
    in_tokens_total = sum(in_tokens.values())
    in_tokens["total"] = in_tokens_total
    scenario_data[scenario]["in_tokens"] = in_tokens
    in_tokens_total_lb[model] += in_tokens_total

    # Output tokens and sample counts per task, taken from the stored predictions.
    with open(answer_file, "r") as f:
        res_data = list(json.load(f).values())
    out_tokens = defaultdict(int)
    num_samples = defaultdict(int)
    num_samples["total"] = len(res_data)
    for d in res_data:
        out_tokens[d["task"]] += len(tokenizer.encode(d["pred"]))
        num_samples[d["task"]] += 1
    out_tokens_total = sum(out_tokens.values())
    out_tokens["total"] = out_tokens_total
    scenario_data[scenario]["out_tokens"] = out_tokens
    out_tokens_total_lb[model] += out_tokens_total

    if scenario == "origin":
        # The origin run is the baseline that later scenarios are compared against.
        total_cost_origin = get_cost(in_tokens_total, out_tokens_total)
        print(
            f"Tokens in: {in_tokens_total:,} (avg: {in_tokens_total / len(longbench_data):,.0f})\n"
            f"Tokens out: {out_tokens_total:,}  (avg: {out_tokens_total / len(longbench_data):,.0f})\n"
            f"Cost: ${total_cost_origin:.2f}"
        )
    else:
        # NOTE(review): this branch needs the "origin" scenario to have been processed
        # earlier in the same run (scenario_data["origin"], total_cost_origin).
        total_cost = get_cost(in_tokens_total, out_tokens_total)
        total_tokens = f"Tokens in: {in_tokens_total:,} (avg: {in_tokens_total / len(longbench_data):,.0f}) - saving: {(1 - in_tokens_total / scenario_data['origin']['in_tokens']['total']) * 100:.2f}%\n"
        total_tokens += f"Tokens out: {out_tokens_total:,}  (avg: {out_tokens_total / len(longbench_data):,.0f}) - saving: {(1 - out_tokens_total / scenario_data['origin']['out_tokens']['total']) * 100:.2f}%\n"
        total_tokens += f"Cost: ${total_cost:.2f} - saving: {(1 - total_cost / total_cost_origin) * 100:.2f}%"

        print("\nPer category averages:")
        per_cat = defaultdict(list)
        for category, tasks in categories.items():
            # Sample count of the category; also used for the origin averages.
            n_cat = sum(num_samples[task] for task in tasks)
            in_t = sum(in_tokens[task] for task in tasks)
            out_t = sum(out_tokens[task] for task in tasks)
            avg_in = in_t / n_cat
            avg_out = out_t / n_cat
            cost = get_cost(in_t, out_t)
            per_cat["Category"].append(category)
            per_cat["Avg. In"].append(round(avg_in, 1))
            per_cat["Avg. Out"].append(round(avg_out, 1))
            in_t_origin = sum(scenario_data["origin"]["in_tokens"][task] for task in tasks)
            out_t_origin = sum(scenario_data["origin"]["out_tokens"][task] for task in tasks)
            cost_orig = get_cost(in_t_origin, out_t_origin)
            avg_in_origin = in_t_origin / n_cat
            per_cat["Avg. In Origin"].append(round(avg_in_origin, 1))
            per_cat["Ratio"].append(f"{avg_in_origin / avg_in:.1f}x")
            per_cat["Avg. Out Origin"].append(round(out_t_origin / n_cat, 1))
            per_cat["Saving In"].append(f"{(1 - in_t / in_t_origin) * 100:.1f}%")
            per_cat["Saving Out"].append(f"{(1 - out_t / out_t_origin) * 100:.1f}%")
            per_cat["Cost Orig"].append(f"${cost_orig:.2f}")
            per_cat["Cost"].append(f"${cost:.2f}")
            per_cat["Saving"].append(f"{(1 - cost / cost_orig) * 100:.1f}%")
        per_cat = pd.DataFrame(per_cat)
        per_cat.set_index("Category", inplace=True)
        per_cat = per_cat[col_order]
        display(per_cat)

        print(total_tokens, "\n\nPer task averages:")
        per_task = defaultdict(list)
        # in_tokens also contains the "total" entry, which becomes the "Total" row.
        for task, in_t in in_tokens.items():
            avg_in = in_t / num_samples[task]
            avg_out = out_tokens[task] / num_samples[task]
            cost = get_cost(in_t, out_tokens[task])
            per_task["Category"].append(reverse_categories[task])
            per_task["Task"].append(tasknames[task])
            per_task["Avg. In"].append(round(avg_in, 1))
            per_task["Avg. Out"].append(round(avg_out, 1))
            in_t_origin, out_t_origin = (
                scenario_data["origin"]["in_tokens"][task],
                scenario_data["origin"]["out_tokens"][task],
            )
            cost_orig = get_cost(in_t_origin, out_t_origin)
            avg_in_origin = in_t_origin / num_samples[task]
            per_task["Avg. In Origin"].append(round(avg_in_origin, 1))
            per_task["Ratio"].append(f"{avg_in_origin / avg_in:.1f}x")
            per_task["Avg. Out Origin"].append(round(out_t_origin / num_samples[task], 1))
            per_task["Saving In"].append(f"{(1 - in_t / in_t_origin) * 100:.1f}%")
            per_task["Saving Out"].append(f"{(1 - out_tokens[task] / out_t_origin) * 100:.1f}%")
            per_task["Cost Orig"].append(f"${cost_orig:.2f}")
            per_task["Cost"].append(f"${cost:.2f}")
            per_task["Saving"].append(f"{(1 - cost / cost_orig) * 100:.1f}%")
        per_task = pd.DataFrame(per_task)
        per_task.set_index("Category", inplace=True)
        per_task = per_task.set_index("Task", append=True)
        per_task = per_task[col_order]
        # Append the category rows as an "Average" task, then order tasks as listed in
        # `categories` (with "Average" last) and categories as listed (with "Total" last).
        per_cat["Task"] = "Average"
        per_cat = per_cat.set_index("Task", append=True)
        per_task = (
            pd.concat([per_task, per_cat])
            .sort_index(
                level=1,
                key=lambda col: col.map(
                    {
                        val: i
                        for i, val in enumerate([*[tasknames[t] for c in categories.values() for t in c], "Average"])
                    }
                ),
            )
            .sort_index(
                level=[0, 1],
                key=lambda col: col.map({val: i for i, val in enumerate([*list(categories.keys()), "Total"])}),
            )
        )
        display(per_task)


Tokens in: 48,191,169 (avg: 10,146)
Tokens out: 335,466  (avg: 71)
Cost: $123.83


Per category averages:


Unnamed: 0_level_0,Avg. In Origin,Avg. In,Ratio,Saving In,Avg. Out Origin,Avg. Out,Saving Out,Cost Orig,Cost,Saving
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Single-doc QA,11941.5,2784.5,4.3x,76.7%,12.1,10.6,12.5%,$22.48,$5.30,76.4%
Multi-doc QA,11790.2,2869.9,4.1x,75.7%,32.0,25.1,21.4%,$23.84,$5.94,75.1%
Summarization,9240.5,2436.4,3.8x,73.6%,291.7,272.5,6.6%,$20.81,$7.05,66.1%
Few-Shot,11004.6,2881.9,3.8x,73.8%,12.4,20.3,-63.8%,$22.11,$5.93,73.2%
Synthetic,10831.0,2961.0,3.7x,72.7%,2.3,2.7,-14.0%,$16.26,$4.46,72.6%
Code,7108.3,3250.9,2.2x,54.3%,56.2,55.7,0.8%,$18.33,$8.68,52.6%


Tokens in: 13,666,330 (avg: 2,877) - saving: 71.64%
Tokens out: 319,548  (avg: 67) - saving: 4.75%
Cost: $37.36 - saving: 69.83% 

Per task averages:


Unnamed: 0_level_0,Unnamed: 1_level_0,Avg. In Origin,Avg. In,Ratio,Saving In,Avg. Out Origin,Avg. Out,Saving Out,Cost Orig,Cost,Saving
Category,Task,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Single-doc QA,MultiFieldQA (EN),6877.1,2914.9,2.4x,57.6%,21.2,19.0,10.2%,$2.61,$1.12,57.0%
Single-doc QA,MultiFieldQA (ZH),5010.1,1968.6,2.5x,60.7%,15.6,13.6,12.9%,$2.54,$1.01,60.1%
Single-doc QA,NarrativeQA,29541.4,3149.5,9.4x,89.3%,4.5,3.7,17.4%,$14.78,$1.58,89.3%
Single-doc QA,Qasper,5071.2,3137.4,1.6x,38.1%,9.3,8.0,13.3%,$2.55,$1.58,38.0%
Single-doc QA,Average,11941.5,2784.5,4.3x,76.7%,12.1,10.6,12.5%,$22.48,$5.30,76.4%
Multi-doc QA,HotpotQA,12690.7,3256.9,3.9x,74.3%,4.5,3.7,17.9%,$6.35,$1.64,74.3%
Multi-doc QA,2WikiMultihopQA,7033.4,3029.8,2.3x,56.9%,4.8,4.4,9.1%,$3.53,$1.52,56.8%
Multi-doc QA,MuSiQue,15430.1,3327.3,4.6x,78.4%,6.0,4.6,24.1%,$7.73,$1.67,78.4%
Multi-doc QA,DuReader (ZH),12006.5,1865.7,6.4x,84.5%,112.5,87.8,21.9%,$6.23,$1.11,82.2%
Multi-doc QA,Average,11790.2,2869.9,4.1x,75.7%,32.0,25.1,21.4%,$23.84,$5.94,75.1%


In [172]:
import re

out_dir = "../../ma-thesis/tables"
os.makedirs(out_dir, exist_ok=True)

# Two-level LaTeX header assigned to the ten table columns, in their existing order.
cols = [
    ["\\textbf{Input}", "$\\varnothing$ Size"],
    ["\\textbf{Input}", "$\\varnothing$ Comp."],
    ["\\textbf{Input}", "Ratio"],
    ["\\textbf{Input}", "Saving"],
    ["\\textbf{Output}", "$\\varnothing$ Size"],
    ["\\textbf{Output}", "$\\varnothing$ Comp."],
    ["\\textbf{Output}", "Saving"],
    ["\\textbf{Cost}", "Original"],
    ["\\textbf{Cost}", "Comp."],
    ["\\textbf{Cost}", "Saving"],
]
for df, filename in zip([per_task, per_cat], ["longbench_savings_per_task", "longbench_savings_per_category"]):
    # Note: this renames the columns of per_task / per_cat themselves.
    df.columns = pd.MultiIndex.from_tuples(cols)
    # Escape "%" and "$" in string cells; numeric cells get thousands separators.
    # Raw strings are used because "\$" is an invalid escape sequence in a normal literal.
    df = df.map(lambda x: x.replace("%", r"\%").replace("$", r"\$") if isinstance(x, str) else f"{x:,}")
    # Bold every cell of the "Average" and "Total" rows (second index level).
    df = df.apply(
        lambda row: (row if row.name[1] not in ["Average", "Total"] else row.apply(lambda x: f"\\textbf{{{x}}}")),
        axis=1,
    )
    tex_table = df.to_latex(
        float_format="%.1f",
        multicolumn_format="c|",
        # Raw string: "\hskip" would otherwise be an invalid escape sequence.
        column_format=r"@{}l@{\hskip 2pt}|rrrr|rrr|rrr@{}",
        index_names=False,
    )
    # Turn the first six \cline separators into \midrule and drop the remaining ones.
    tex_table = (
        tex_table.replace("\\cline{1-12}", "\\midrule", 6)
        .replace("\\cline{1-12}", "")
        .replace("{c|}{\\textbf{C", "{c}{\\textbf{C")
        # .replace("-doc", "-d.")
        # .replace("Summarization", "Summariz.")
        # .replace("Total &", "\\multicolumn{2}{@{}l}{\\textbf{Total}}")
    )
    for match in ["Average", "Total"]:
        tex_table = tex_table.replace(match, f"\\textbf{{{match}}}")
    # tex_table = re.sub(
    #     r"\\multirow\[t\]{(\d+)}{.*}{(.*)}",
    #     r"\\parbox[t]{2mm}{\\multirow{\1}{*}{\\rotatebox[origin=c]{90}{\\textbf{\2}}}}",
    #     tex_table,
    # )
    # Replace the n-th remaining "Average" label with the n-th category name.
    for category in categories:
        tex_table = tex_table.replace("Average", category, 1)
    tex_table = re.sub(r"^.*?&", "", tex_table, flags=re.M)  # Remove category column
    with open(f"{out_dir}/{filename}.tex", "w") as f:
        f.write(tex_table)

In [8]:
def calc_longbench_cost(print_res=False):
    """Price the recorded LongBench token totals with ``get_cost``.

    Args:
        print_res: When true, print the target model, the cost of every
            scenario and the savings relative to the original prompts.

    Returns:
        Cost of the "llmlingua2" scenario plus cost of the "origin" scenario.
    """
    cost_origin, cost_comp, cost_comp_small, cost_zero_shot = (
        get_cost(in_tokens_total_lb[name], out_tokens_total_lb[name])
        for name in ("origin", "llmlingua2", "llmlingua2_small", "zero_shot")
    )
    repro_total = cost_comp + cost_origin
    # Hard-coded token counts (presumably from a target3000 run; verify before reuse).
    cost_comp_3k = get_cost(14649554, 318541)

    def with_savings(label, cost):
        return f"{label}: ${cost:.2f} - savings: ${cost_origin - cost:.2f}"

    if print_res:
        print("Target model:", MODEL)
        print(f"\nTotal original cost: ${cost_origin:.2f}")
        print(with_savings("Total comp cost", cost_comp))
        print(with_savings("Total comp cost (small)", cost_comp_small))
        print(with_savings("Total zero-shot cost", cost_zero_shot))
        print(f"\nTotal repro cost: ${repro_total:.2f}")
        print(with_savings("Total comp cost (3k)", cost_comp_3k))

    return repro_total

In [9]:
# Print the LongBench cost breakdown; the returned total is the cell result.
calc_longbench_cost(True)

Target model: gpt-4o

Total original cost: $123.78
Total comp cost: $37.08 - savings: $86.70
Total comp cost (small): $72.18 - savings: $51.60
Total zero-shot cost: $3.79 - savings: $119.99

Total repro cost: $160.86
Total comp cost (3k): $39.81 - savings: $83.97


160.85512749999998

## Calculate ZeroScrolls repro cost (at `MODEL` pricing)

In [10]:
import json
import os
from collections import defaultdict


# Sub-directory (next to each data file) that holds the answers to be priced.
target_model = "llama_3.1_70b_answer"
# Prompt files to count; commented-out entries are excluded from the totals.
data_files = [
    "../results/zero_scrolls/origin/zero_scrolls_validation.json",
    # "../results/zero_scrolls/llmlingua2/compression_target2000_zero_scrolls_validation.json",
    "../results/zero_scrolls/llmlingua2/compression_target3000_zero_scrolls_validation.json",
    "../results/zero_scrolls/llmlingua2_small/compression_target2000_zero_scrolls_validation.json",
    "../results/zero_scrolls/llmlingua2_small/compression_target3000_zero_scrolls_validation.json",
    # "../results/zero_scrolls/zero_shot/zero_scrolls_validation.json",
]


def get_split_name(path):
    """Split ``path`` on "/" and remove the text ".json" from every part."""
    return [part.replace(".json", "") for part in path.split("/")]


# Token totals per compressor directory; files of the same directory are summed.
in_tokens_total_zs, out_tokens_total_zs = defaultdict(int), defaultdict(int)
for data_file in data_files:
    filename_split = get_split_name(data_file)
    model, target = filename_split[-2], filename_split[-1].replace("compression_", "")
    print(f"\n====== {model}: {target} ======")
    with open(data_file, "r") as f:
        zero_scrolls_data = json.load(f)
    if isinstance(zero_scrolls_data, dict):
        zero_scrolls_data = list(zero_scrolls_data.values())
    compression = model in ["llmlingua2", "llmlingua2_small"]
    load_key = "compressed_prompt" if compression else "prompt"
    in_tokens = 0
    for sample in zero_scrolls_data:
        prompt = sample[load_key]
        in_tokens += len(tokenizer.encode(prompt))
    in_tokens_total_zs[model] += in_tokens
    print(f"Tokens in: {in_tokens:,}")
    file_dir = os.path.dirname(data_file)
    answer_file = f"{file_dir}/{target_model}/answer_{target}.json"
    with open(answer_file, "r") as f:
        res_data = list(json.load(f).values())
    out_tokens = sum(len(tokenizer.encode(d["pred"])) for d in res_data)
    out_tokens_total_zs[model] += out_tokens
    print(f"Tokens out: {out_tokens:,}")


Tokens in: 2,593,614
Tokens out: 34,106

Tokens in: 797,605
Tokens out: 30,781

Tokens in: 562,490
Tokens out: 28,921

Tokens in: 802,676
Tokens out: 31,674


In [11]:
def calc_zero_scrolls_cost(print_res=False):
    """Price the recorded ZeroScrolls token totals with ``get_cost``.

    The zero-shot scenario is not part of this calculation.

    Args:
        print_res: When true, print the target model, the cost of every
            scenario and the savings relative to the original prompts.

    Returns:
        Cost of the "llmlingua2" scenario plus cost of the "origin" scenario.
    """
    cost_origin, cost_comp, cost_comp_small = (
        get_cost(in_tokens_total_zs[name], out_tokens_total_zs[name])
        for name in ("origin", "llmlingua2", "llmlingua2_small")
    )
    repro_total = cost_comp + cost_origin
    # Hard-coded token counts (presumably from a target3000 run; verify before reuse).
    cost_comp_3k = get_cost(808268, 30872)

    def with_savings(label, cost):
        return f"{label}: ${cost:.2f} - savings: ${cost_origin - cost:.2f}"

    if print_res:
        print("Target model:", MODEL)
        print(f"\nTotal original cost: ${cost_origin:.2f}")
        print(with_savings("Total comp cost", cost_comp))
        print(with_savings("Total comp cost (small)", cost_comp_small))
        print(f"\nTotal repro cost: ${repro_total:.2f}")
        print(with_savings("Total comp cost (3k)", cost_comp_3k))

    return repro_total

In [12]:
# Print the ZeroScrolls cost breakdown; the returned total is the cell result.
calc_zero_scrolls_cost(True)

Target model: gpt-4o

Total original cost: $6.83
Total comp cost: $2.30 - savings: $4.52
Total comp cost (small): $4.02 - savings: $2.81

Total repro cost: $9.13
Total comp cost (3k): $2.33 - savings: $4.50


9.1269175

## Calculate GSM8K repro cost (at `MODEL` pricing)

In [13]:
import json
import os
from collections import defaultdict

# Sub-directory (next to each listed file) that holds the answers to be priced.
target_model = "llama_3.1_70b_answer"
# Demonstration files, one per scenario; the parent directory name selects the scenario
# and the file stem (without "compression_") selects the answer file.
data_files_gsm8k = [
    "../results/gsm8k/origin/gsm8k_cot_example.json",
    # "../results/gsm8k/llmlingua2/compression_target160_gsm8k_cot_example.json",
    "../results/gsm8k/llmlingua2/compression_target400_gsm8k_cot_example.json",
    "../results/gsm8k/llmlingua2_small/compression_target160_gsm8k_cot_example.json",
    "../results/gsm8k/llmlingua2_small/compression_target400_gsm8k_cot_example.json",
    # Placeholder: never opened (zero_shot uses no demonstrations); only its
    # directory and stem are used to locate the answer file.
    "../results/gsm8k/zero_shot/zero_shot.json",  # This file does not exist
]


def build_prompt_gsm8k(q, demonstrations):
    """Build the GSM8K prompt for question ``q``.

    Args:
        q: The math question text.
        demonstrations: Example block placed before the question; any falsy
            value (e.g. "" or None) yields the bare question prompt.

    Returns:
        The question followed by the step-by-step cue, prefixed with the
        instruction line and the demonstrations when those are given.
    """
    question_part = f"Question: {q}\nLet's think step by step."
    if demonstrations:
        intro = "Please reference the following examples to answer the math question. \n "
        return f"{intro}{demonstrations}\n\n{question_part}"
    return question_part


def get_split_name(path):
    """Split ``path`` on "/" and remove the text ".json" from every part."""
    return [part.replace(".json", "") for part in path.split("/")]


# Token totals per scenario directory; files of the same directory are summed.
in_tokens_total_gsm8k, out_tokens_total_gsm8k = defaultdict(int), defaultdict(int)
for scenario in data_files_gsm8k:
    filename_split = get_split_name(scenario)
    model, target = filename_split[-2], filename_split[-1].replace("compression_", "")
    print(f"\n====== {model}: {target} ======")
    compression = model in ["llmlingua2", "llmlingua2_small"]
    # Zero-shot prompts carry no demonstrations, so its file is never opened.
    demonstrations = ""
    if model != "zero_shot":
        with open(scenario, "r") as f:
            dem_list = json.load(f)
        if isinstance(dem_list, dict):
            dem_list = list(dem_list.values())
        # Only the first entry's prompt list is used as the shared demonstration block.
        dem_list = dem_list[0]["compressed_prompt_list" if compression else "prompt_list"]
        demonstrations = "\n\n".join(["Question: " + dem for dem in dem_list])

    file_dir = os.path.dirname(scenario)
    answer_file = f"{file_dir}/{target_model}/answer_{target}.json"
    with open(answer_file, "r") as f:
        res_data = list(json.load(f).values())
    in_tokens = 0
    for sample in res_data:
        prompt = build_prompt_gsm8k(sample["question"], demonstrations)
        in_tokens += len(tokenizer.encode(prompt))
    in_tokens_total_gsm8k[model] += in_tokens
    print(f"Tokens in: {in_tokens:,}")

    out_tokens = sum(len(tokenizer.encode(d["model_answer"])) for d in res_data)
    out_tokens_total_gsm8k[model] += out_tokens
    print(f"Tokens out: {out_tokens:,}")


Tokens in: 3,190,968
Tokens out: 216,268

Tokens in: 682,230
Tokens out: 248,501

Tokens in: 318,186
Tokens out: 245,343

Tokens in: 703,334
Tokens out: 248,785

Tokens in: 87,361
Tokens out: 281,328


In [14]:
def calc_gsm8k_cost(print_res=False):
    """Price the recorded GSM8K token totals with ``get_cost``.

    Args:
        print_res: When true, print the target model, the cost of every
            scenario and the savings relative to the original prompts.

    Returns:
        Sum of the "llmlingua2", "origin" and "zero_shot" scenario costs.
    """
    cost_origin, cost_comp, cost_comp_small, cost_zero_shot = (
        get_cost(in_tokens_total_gsm8k[name], out_tokens_total_gsm8k[name])
        for name in ("origin", "llmlingua2", "llmlingua2_small", "zero_shot")
    )
    repro_total = cost_comp + cost_origin + cost_zero_shot
    # Hard-coded token counts (presumably from a 1-shot run; verify before reuse).
    cost_comp_1_shot = get_cost(686804, 250652)

    def with_savings(label, cost):
        return f"{label}: ${cost:.2f} - savings: ${cost_origin - cost:.2f}"

    if print_res:
        print("Target model:", MODEL)
        print(f"\nTotal original cost: ${cost_origin:.2f}")
        print(with_savings("Total comp cost", cost_comp))
        print(with_savings("Total comp cost (small)", cost_comp_small))
        print(with_savings("Total zero-shot cost", cost_zero_shot))
        print(f"\nTotal repro cost: ${repro_total:.2f}")
        print(with_savings("Total comp cost (1-shot)", cost_comp_1_shot))

    return repro_total

In [15]:
# Print the GSM8K cost breakdown; the returned total is the cell result.
calc_gsm8k_cost(True)

Target model: gpt-4o

Total original cost: $10.14
Total comp cost: $4.19 - savings: $5.95
Total comp cost (small): $7.50 - savings: $2.65
Total zero-shot cost: $3.03 - savings: $7.11

Total repro cost: $17.36
Total comp cost (1-shot): $4.22 - savings: $5.92


17.3623675

## Calculate BBH repro cost (at `MODEL` pricing)

In [180]:
import json
import os
from collections import defaultdict

# Sub-directory (next to each listed file) that holds the answers to be priced.
target_model = "llama_3.1_70b_answer"
# Scenario label -> demonstration file. The label only keys the per-scenario token
# counters; the scenario type is derived from the file's parent directory name.
data_files_bbh = {
    "origin": "../results/bbh/origin/bbh_cot_examples.json",
    "llmlingua2_200": "../results/bbh/llmlingua2/compression_target200_bbh_cot_examples.json",
    "llmlingua2_300": "../results/bbh/llmlingua2/compression_target300_bbh_cot_examples.json",
    "llmlingua2_small_200": "../results/bbh/llmlingua2_small/compression_target200_bbh_cot_examples.json",
    "llmlingua2_small_300": "../results/bbh/llmlingua2_small/compression_target300_bbh_cot_examples.json",
    "zero-shot": "../results/bbh/zero_shot/bbh_cot_examples.json",
}


def build_prompt_bbh(sample, task, prompts, instructions):
    """Build the BBH prompt for one sample.

    Args:
        sample: Mapping with the question text under "question".
        task: Task id used to look up the demonstrations and the instruction.
        prompts: Task id -> chain-of-thought demonstrations (may be empty).
        instructions: Task id -> instruction text.

    Returns:
        Instruction, demonstrations (separated by a blank line unless they are
        empty or already start with a newline), the question and the
        step-by-step answer cue.
    """
    demos = prompts[task]
    header = instructions[task]

    # Non-empty demonstrations without a leading newline get a blank-line separator.
    if demos and not demos.startswith("\n"):
        demos = f"\n\n{demos}"
    question = sample["question"]
    return f"{header}{demos}\n\nQ: {question}\nA: Let's think step by step.\n"


def get_split_name(path):
    """Split ``path`` on "/" and remove the text ".json" from every part."""
    return [part.replace(".json", "") for part in path.split("/")]


with open("../results/bbh/origin/bbh.json", "r") as f:
    bbh_dataset = json.load(f)
# Totals per compressor directory (summed over its files) and per scenario label.
in_tokens_total_bbh, out_tokens_total_bbh = defaultdict(int), defaultdict(int)
in_tokens, out_tokens = defaultdict(int), defaultdict(int)
# Baseline cost; stays None until the "origin" scenario has been processed.
cost_origin = None
for scenario, filepath in data_files_bbh.items():
    filename_split = get_split_name(filepath)
    model, target = filename_split[-2], filename_split[-1].replace("compression_", "")
    file_dir = os.path.dirname(filepath)
    answer_file = f"{file_dir}/{target_model}/answer_{target}.json"
    # Scenarios without demonstrations or without answers are skipped silently.
    if not os.path.exists(filepath) or not os.path.exists(answer_file):
        continue
    print(f"\n====== {model}: {target} ======")
    compression = model in ["llmlingua2", "llmlingua2_small"]
    load_key = None if model == "zero_shot" else "compressed_prompt" if compression else "prompt"

    with open(filepath) as f:
        demonstrations = json.load(f)
    if isinstance(demonstrations, dict):
        demonstrations = list(demonstrations.values())
    # Per task: the demonstration prompt (empty for zero-shot) and the instruction.
    prompts, instructions = {}, {}
    for demon in demonstrations:
        task = demon["task"]
        prompt = demon[load_key] if load_key is not None else ""
        instructions[task] = demon["instruction"]
        prompts[task] = prompt

    with open(answer_file, "r") as f:
        res_data = list(json.load(f).values())
    for i, sample in enumerate(res_data):
        # NOTE(review): assumes answers are stored in the same order as bbh_dataset.
        task = bbh_dataset[i]["task"]
        prompt = build_prompt_bbh(sample, task, prompts, instructions)
        in_tokens[scenario] += len(tokenizer.encode(prompt))
    in_tokens_total_bbh[model] += in_tokens[scenario]
    out_tokens[scenario] = sum(len(tokenizer.encode(d["model_answer"])) for d in res_data)
    out_tokens_total_bbh[model] += out_tokens[scenario]
    cost_scenario = get_cost(in_tokens[scenario], out_tokens[scenario])
    # Average over the answers actually counted (previously a hard-coded 6511).
    num_samples = max(len(res_data), 1)
    print_str = f"Tokens in: {in_tokens[scenario]:,} (avg: {in_tokens[scenario] / num_samples:,}), Tokens out: {out_tokens[scenario]:,} (avg: {out_tokens[scenario] / num_samples:,}) - Cost: ${cost_scenario:.2f}"
    if scenario == "origin":
        cost_origin = cost_scenario
    elif cost_origin is not None:
        print_str += f"\nRatio: {in_tokens['origin'] / in_tokens[scenario]:.2f}"
        print_str += f'\nSaving tokens: {(1 - in_tokens[scenario] / in_tokens["origin"]) * 100:.2f}% / {(1 - out_tokens[scenario] / out_tokens["origin"]) * 100:.2f}%'
        print_str += f'\nSaving cost: {(1 - cost_scenario / cost_origin) * 100:.2f}%'
    print(print_str)


Tokens in: 5,674,570 (avg: 871.5358623867302), Tokens out: 1,345,994 (avg: 206.726155736446) - Cost: $27.65

Tokens in: 1,812,222 (avg: 278.33236062048843), Tokens out: 1,422,850 (avg: 218.53017969589925) - Cost: $18.76
Ratio: 3.13
Saving tokens: 68.06% / -5.71%
Saving cost: 32.15%

Tokens in: 2,311,556 (avg: 355.0231915220396), Tokens out: 1,414,965 (avg: 217.31915220396252) - Cost: $19.93
Ratio: 2.45
Saving tokens: 59.26% / -5.12%
Saving cost: 27.92%

Tokens in: 1,894,841 (avg: 291.02150207341424), Tokens out: 1,433,199 (avg: 220.11964367992627) - Cost: $19.07
Ratio: 2.99
Saving tokens: 66.61% / -6.48%
Saving cost: 31.02%

Tokens in: 2,458,865 (avg: 377.64782675472276), Tokens out: 1,471,605 (avg: 226.01827676240208) - Cost: $20.86
Ratio: 2.31
Saving tokens: 56.67% / -9.33%
Saving cost: 24.54%

Tokens in: 780,219 (avg: 119.830901551221), Tokens out: 1,535,248 (avg: 235.79296575026876) - Cost: $17.30
Ratio: 7.27
Saving tokens: 86.25% / -14.06%
Saving cost: 37.41%


In [17]:
def calc_bbh_cost(print_res=False):
    """Price the recorded BBH token totals with ``get_cost``.

    Args:
        print_res: When true, print the target model, the cost of every
            scenario and the savings relative to the original prompts.

    Returns:
        Sum of the "llmlingua2", "origin" and "zero_shot" scenario costs.
    """
    cost_origin, cost_comp, cost_comp_small, cost_zero_shot = (
        get_cost(in_tokens_total_bbh[name], out_tokens_total_bbh[name])
        for name in ("origin", "llmlingua2", "llmlingua2_small", "zero_shot")
    )
    repro_total = cost_comp + cost_origin + cost_zero_shot
    # Hard-coded token counts (presumably from a 1-shot run; verify before reuse).
    cost_comp_1_shot = get_cost(2339622, 1438517)

    def with_savings(label, cost):
        return f"{label}: ${cost:.2f} - savings: ${cost_origin - cost:.2f}"

    if print_res:
        print("Target model:", MODEL)
        print(f"\nTotal original cost: ${cost_origin:.2f}")
        print(with_savings("Total comp cost", cost_comp))
        print(with_savings("Total comp cost (small)", cost_comp_small))
        print(with_savings("Total zero-shot cost", cost_zero_shot))
        print(f"\nTotal repro cost: ${repro_total:.2f}")
        print(with_savings("Total comp cost (1-shot)", cost_comp_1_shot))
    return repro_total

In [18]:
# Print the BBH cost breakdown; the returned total is the cell result.
calc_bbh_cost(True)

Target model: gpt-4o

Total original cost: $27.65
Total comp cost: $19.93 - savings: $7.72
Total comp cost (small): $39.93 - savings: $-12.29
Total zero-shot cost: $17.30 - savings: $10.34

Total repro cost: $64.88
Total comp cost (1-shot): $20.23 - savings: $7.41


64.8779325

## Total repro cost

In [21]:
# Summarize the reproduction cost of all benchmarks at regular and batch (half) price.
def _print_benchmark_cost(prefix, cost, gap):
    """Print one benchmark line: the regular cost, then the halved batch cost."""
    print(f"{prefix}${cost:.2f}{gap}(Batch: ${cost / 2:.2f})")


print("Model:", MODEL, "\n")
cost_meetingbench = calc_meetingbank_cost()
_print_benchmark_cost("MeetingBench: \t", cost_meetingbench, " \t\t ")
cost_longbench = calc_longbench_cost()
_print_benchmark_cost("LongBench: \t", cost_longbench, " \t ")
cost_zero_scrolls = calc_zero_scrolls_cost()
_print_benchmark_cost("Zero Scrolls: \t", cost_zero_scrolls, " \t\t ")
cost_gsm8k = calc_gsm8k_cost()
_print_benchmark_cost("GSM8K: \t\t", cost_gsm8k, " \t\t ")
cost_bbh = calc_bbh_cost()
_print_benchmark_cost("BBH: \t\t", cost_bbh, " \t\t ")

# Added left to right, one benchmark at a time.
total_repro_cost = cost_meetingbench + cost_longbench + cost_zero_scrolls + cost_gsm8k + cost_bbh
print(f"\nTotal: \t\t${total_repro_cost:.2f}\nBatch: \t\t${total_repro_cost / 2:.2f}")

Model: gpt-4o 

MeetingBench: 	$36.50 		 (Batch: $18.25)
LongBench: 	$160.86 	 (Batch: $80.43)
Zero Scrolls: 	$9.13 		 (Batch: $4.56)
GSM8K: 		$17.36 		 (Batch: $8.68)
BBH: 		$64.88 		 (Batch: $32.44)

Total: 		$288.72
Batch: 		$144.36


## GPT-3.5 e2e Benchmark Cost

In [7]:
import itertools


# Estimate the API cost of the end-to-end latency benchmark: every combination of
# output size, input size and compression rate is requested `iterations` times.

# Prices in USD per one million tokens as (input, output); the token totals below
# are divided by 1_000_000 before being multiplied by these.
PRICES_PER_MTOK = {
    "gpt-3.5-turbo-0125": (0.5, 1.5),
    "gpt-4o-mini": (0.15, 0.6),
}
# Select the model to price explicitly instead of overwriting the constants twice.
COST_IN, COST_OUT = PRICES_PER_MTOK["gpt-4o-mini"]

input_sizes = [100, 250, 500, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 12000, 16000]  # Input token sizes
rates = [1, 2 / 3, 1 / 2, 1 / 3, 1 / 5]  # 1x, 1.5x, 2x, 3x, 5x compression rates
output_sizes = [1, 10, 25, 50, 100]
iterations = 12

in_tokens, out_tokens = [], []
requests = 0

for output_size, input_size, rate in itertools.product(output_sizes, input_sizes, rates):
    # The compressed prompt has input_size * rate tokens; each configuration is repeated.
    in_tokens.append(input_size * rate * iterations)
    out_tokens.append(output_size * iterations)
    # One request per repetition; tied to `iterations` so the count cannot drift from it.
    requests += iterations

total_in = sum(in_tokens)
total_out = sum(out_tokens)

print(f"Input tokens: {total_in:,}")
print(f"Output tokens: {total_out:,}")
print(f"Requests: {requests}")
print(f"Cost: ${total_in / 1_000_000 * COST_IN + total_out / 1_000_000 * COST_OUT:.2f}")

Input tokens: 10,505,700.0
Output tokens: 145,080
Requests: 3900
Cost: $1.66


## Result Tables Rendering

In [1]:
import os
import json

import numpy as np
import pandas as pd

# Render one LaTeX table per model for the MeetingBank results: one row per
# compression scenario, columns for the QA metric, the summary metrics and the
# prompt size / compression ratio.
models = ["mistral7b_answer", "llama_3.1_70b_answer", "command_r_plus_answer", "gpt_4o_answer", "gpt_4o_mini_answer"]
scenarios = ["llmlingua2_small", "llmlingua2", "origin"]

# Meetingbank
outdir = "../../ma-thesis/tables/meetingbank/"
os.makedirs(outdir, exist_ok=True)
columns = pd.MultiIndex.from_tuples(
    [
        ("QA", "EM"),
        ("Summary", "BLEU"),
        ("Summary", "ROUGE-1"),
        ("Summary", "ROUGE-2"),
        ("Summary", "ROUGE-L"),
        ("Summary", "BERTScore"),
        ("Tokens", ""),
        ("Ratio", ""),
    ]
)
# Row labels, in the same order as `scenarios` and `size_and_ratio`.
index = ["LLMLingua-2-small", "LLMLingua-2", "Original"]
df = pd.DataFrame(columns=columns, index=index)
# Hard-coded [token count, compression ratio] per scenario.
size_and_ratio = [[895, 3.4], [982, 3.1], [3004, 1.0]]

for model in models:
    df_m = df.copy()
    rows = []
    for scenario, tokens_and_ratio in zip(scenarios, size_and_ratio):
        # Compressed scenarios carry a "ratio33" infix in their metric file names.
        qa_file = "metrics_meetingbank_QA" if scenario == "origin" else "metrics_ratio33_meetingbank_QA"
        sum_file = "metrics_meetingbank_summary" if scenario == "origin" else "metrics_ratio33_meetingbank_summary"
        qa_path = f"../results/meetingbank_short/{scenario}/{model}/{qa_file}.json"
        sum_path = f"../results/meetingbank_short/{scenario}/{model}/{sum_file}.json"
        if not os.path.exists(qa_path) or not os.path.exists(sum_path):
            # Missing results become an all-empty row, which is dropped below.
            rows.append([None] * len(columns))
            continue
        # Context managers close the result files instead of leaking the handles.
        with open(qa_path, "r") as f:
            qa = list(json.load(f).values())[0]
        with open(sum_path, "r") as f:
            summary = np.array([*json.load(f).values()]) * 100
        # Keep the first four summary values plus the last one.
        # NOTE(review): presumably BLEU, ROUGE-1/2/L and BERTScore in file order — confirm.
        summary = [*summary[:4], summary[-1]]
        rows.append([qa, *summary, *tokens_and_ratio])
    df_m.loc[:, :] = rows
    df_m = df_m.dropna(how='all')
    df_m[("Tokens", "")] = df_m[("Tokens", "")].map(lambda x: f"{x:,}")
    df_m[("Ratio", "")] = df_m[("Ratio", "")].map(lambda x: f"{x:.1f}x" if isinstance(x, float) else x)
    tex_table = df_m.to_latex(float_format="%.2f", multicolumn_format="c|", column_format="@{}l|c|ccccc|cc@{}", index_names=False)
    for match in ["QA", "Summary", "Tokens", "Ratio"]:
        tex_table = tex_table.replace(match, f"\\textbf{{{match}}}")
    # Insert a rule before the first row whose label starts with "Orig".
    tex_table = tex_table.replace("\\\\\nOrig", "\\\\\\midrule\nOrig", 1)
    with open(f'{outdir}/{model.split("_answer")[0]}.tex', "w") as f:
        f.write(tex_table)
    # print(f"\nModel: {model}")
    # display(df_m)

In [29]:
# GSM8K

import json
import os

import numpy as np
import pandas as pd

# Render one single-row LaTeX table per model with the GSM8K score for every
# prompt scenario (original, two compression targets x two compressors, zero-shot).
models = ["mistral7b_answer", "llama_3.1_70b_answer", "command_r_plus_answer", "gpt_4o_answer", "gpt_4o_mini_answer"]
# Result file templates, in the same order as `columns` below.
scenarios = [
    "../results/gsm8k/origin/{model}/metrics_gsm8k_cot_example.json",
    "../results/gsm8k/llmlingua2/{model}/metrics_target400_gsm8k_cot_example.json",
    "../results/gsm8k/llmlingua2_small/{model}/metrics_target400_gsm8k_cot_example.json",
    "../results/gsm8k/llmlingua2/{model}/metrics_target160_gsm8k_cot_example.json",
    "../results/gsm8k/llmlingua2_small/{model}/metrics_target160_gsm8k_cot_example.json",
    "../results/gsm8k/zero_shot/{model}/metrics_zero_shot.json",
]

outdir = "../../ma-thesis/tables/gsm8k/"
os.makedirs(outdir, exist_ok=True)

columns = pd.MultiIndex.from_tuples(
    [
        ("Original", ""),
        ("Target 400", "LLMLingua-2"),
        ("Target 400", "LLMLingua-2-small"),
        ("Target 160", "LLMLingua-2"),
        ("Target 160", "LLMLingua-2-small"),
        ("Zero-Shot", ""),
    ]
)
index = [
    "Score",
    # "Tokens",
    # "Ratio",
]
df = pd.DataFrame(columns=columns, index=index)

for model in models:
    df_m = df.copy()
    rows = [
        [],
        # [2366, 440, 455, 157, 159, "---"],
        # [1.0, 5.4, 5.2, 15.1, 14.9, "---"],
    ]
    for scenario in scenarios:
        res_file = scenario.format(model=model)
        if not os.path.exists(res_file):
            # Keep the column position; the empty column is dropped below.
            rows[0].append(None)
            continue
        # Context manager closes the result file instead of leaking the handle.
        with open(res_file, "r") as f:
            score = json.load(f)["score"] * 100
        rows[0].append(score)
    df_m.loc[:, :] = rows
    # Drop every scenario for which no result file was found.
    df_m = df_m.dropna(how="any", axis=1)
    # df_m.loc["Ratio"] = df_m.loc["Ratio"].map(lambda x: f"{x:.1f}x" if isinstance(x, float) else x)
    # print(f"\nModel: {model}")
    # display(df_m)

    tex_table = df_m.to_latex(
        float_format="%.2f",
        multicolumn_format="c|",
        # Wide layout when more than three scenario columns survive, narrow one otherwise.
        column_format="@{}l|c|cc|cc|c@{}" if len(df_m.columns) > 3 else "@{}l|c|c|c@{}",
        index_names=False,
    )
    for match in ["Original", "Target 400", "Target 160", "Zero-Shot"]:
        tex_table = tex_table.replace(match, f"\\textbf{{{match}}}")
    # Only has an effect when the commented-out "Tokens" row above is enabled.
    tex_table = tex_table.replace("\\\\\nTokens", "\\\\\\midrule\nTokens", 1)
    with open(f'{outdir}/{model.split("_answer")[0]}.tex', "w") as f:
        f.write(tex_table)

In [8]:
# LongBench / ZeroScrolls

import json
import os

import numpy as np
import pandas as pd

# Configuration for the LongBench / ZeroScrolls / BBH result tables: which models
# to render, how tasks are grouped and ordered, how task ids map to display names,
# and where each benchmark's metric files live.
models = ["mistral7b_answer", "llama_3.1_70b_answer", "command_r_plus_answer", "gpt_4o_answer", "gpt_4o_mini_answer"]

# LongBench task ids grouped by category; the flattened order is the table row order.
lb_categories = {
    "Single-doc QA": ["multifieldqa_en", "multifieldqa_zh", "narrativeqa", "qasper"],
    "Multi-doc QA": ["hotpotqa", "2wikimqa", "musique", "dureader"],
    "Summarization": ["gov_report", "qmsum", "multi_news", "vcsum"],
    "Few-Shot": ["triviaqa", "samsum", "trec", "lsht"],
    "Synthetic": ["passage_count", "passage_retrieval_en", "passage_retrieval_zh"],
    "Code": ["lcc", "repobench-p"],
}
# LongBench task id -> display name used as the row label.
lb_tasknames = {
    "narrativeqa": "NarrativeQA",
    "qasper": "Qasper",
    "multifieldqa_en": "MultiFieldQA (EN)",
    "multifieldqa_zh": "MultiFieldQA (ZH)",
    "hotpotqa": "HotpotQA",
    "2wikimqa": "2WikiMultihopQA",
    "musique": "MuSiQue",
    "dureader": "DuReader (ZH)",
    "gov_report": "GovReport",
    "qmsum": "QMSum",
    "multi_news": "MultiNews",
    "vcsum": "VCSUM (ZH)",
    "trec": "TREC",
    "triviaqa": "TriviaQA",
    "samsum": "SAMSum",
    "lsht": "LSHT (ZH)",
    "passage_retrieval_en": "PassageRetrieval (EN)",
    "passage_count": "PassageCount",
    "passage_retrieval_zh": "PassageRetrieval (ZH)",
    "lcc": "LCC",
    "repobench-p": "RepoBench-P",
    "avg": "Average",
}
# ZeroScrolls task id -> display name used as the row label.
zero_scrolls_tasknames = {
    "gov_report": "GovReport",
    "summ_screen_fd": "SummScreenFD",
    "qmsum": "QMSum",
    "qasper": "Qasper",
    "narrative_qa": "NarrativeQA",
    "quality": "QuALITY",
    "musique": "MuSiQue",
    "squality": "SQuALITY",
    "space_digest": "SpaceDigest",
    "book_sum_sort": "BookSumSort",
    "avg": "Average",
}
# ZeroScrolls task ids grouped by category; the flattened order is the table row order.
zero_scrolls_categories = {
    "Summarization": ["gov_report", "qmsum", "summ_screen_fd", "squality"],
    "QA": ["qasper", "narrative_qa", "musique", "quality"],
    "Data Processing": ["space_digest", "book_sum_sort"],
}
# BBH task id -> display name; the key order is also the table row order.
bbh_tasknames = {
    "multistep_arithmetic_two": "Multi-Step Arithmetic",
    "boolean_expressions": "Boolean Expressions",
    "logical_deduction_three_objects": "Logical Deduction (3 Obj.)",
    "logical_deduction_five_objects": "Logical Deduction (5 Obj.)",
    "logical_deduction_seven_objects": "Logical Deduction (7 Obj.)",
    "geometric_shapes": "Geometric Shapes",
    "dyck_languages": "Dyck Languages",
    "navigate": "Navigate",
    "temporal_sequences": "Temporal Sequences",
    "formal_fallacies": "Formal Fallacies",
    "object_counting": "Object Counting",
    "penguins_in_a_table": "Penguins in a Table",
    "tracking_shuffled_objects_three_objects": "Track. 3 Shuffled Obj.",
    "tracking_shuffled_objects_five_objects": "Track. 5 Shuffled Obj.",
    "tracking_shuffled_objects_seven_objects": "Track. 7 Shuffled Obj.",
    "reasoning_about_colored_objects": "Reasoning about Col. Obj.",
    "web_of_lies": "Web of Lies",
    "word_sorting": "Word Sorting",
    "disambiguation_qa": "Disambiguation QA",
    "hyperbaton": "Hyperbaton",
    "salient_translation_error_detection": "Salient Transl. Err. Detection",
    "snarks": "Snarks",
    "sports_understanding": "Sports Understanding",
    "movie_recommendation": "Movie Recommendation",
    "date_understanding": "Date Understanding",
    "causal_judgement": "Causal Judgement",
    "ruin_names": "Ruin Names",
    "avg": "Average",
}
# Per-benchmark rendering configuration:
#   scen      - metric file templates, one per table column, in column order
#   tasks     - task ids in row order, ending with the "avg" row
#   tasknames - task id -> row label
#   key       - field holding the score inside each task's metrics entry
#   targets   - headers of the two compression-target column groups
#   zero_shot - (optional) list of task ids for which the zero-shot column is filled;
#               its presence also adds the "Zero-Shot" column
benchmarks = {
    "longbench": {
        "scen": [
            "../results/longbench/origin/{model}/metrics_longbench_test_formatted.json",
            "../results/longbench/llmlingua2/{model}/metrics_target3000_longbench_test_formatted.json",
            "../results/longbench/llmlingua2_small/{model}/metrics_target3000_longbench_test_formatted.json",
            "../results/longbench/llmlingua2/{model}/metrics_target2000_longbench_test_formatted.json",
            "../results/longbench/llmlingua2_small/{model}/metrics_target2000_longbench_test_formatted.json",
            "../results/longbench/zero_shot/{model}/metrics_longbench_test_formatted.json",
        ],
        "tasks": [t for ts in lb_categories.values() for t in ts] + ["avg"],
        "tasknames": lb_tasknames,
        "key": "score",
        "targets": ["Target 3000", "Target 2000"],
        "zero_shot": ["triviaqa", "samsum", "trec", "lsht", "repobench-p"],
    },
    "zero_scrolls": {
        "scen": [
            "../results/zero_scrolls/origin/{model}/metrics_zero_scrolls_validation.json",
            "../results/zero_scrolls/llmlingua2/{model}/metrics_target3000_zero_scrolls_validation.json",
            "../results/zero_scrolls/llmlingua2_small/{model}/metrics_target3000_zero_scrolls_validation.json",
            "../results/zero_scrolls/llmlingua2/{model}/metrics_target2000_zero_scrolls_validation.json",
            "../results/zero_scrolls/llmlingua2_small/{model}/metrics_target2000_zero_scrolls_validation.json",
        ],
        "tasks": [t for ts in zero_scrolls_categories.values() for t in ts] + ["avg"],
        "tasknames": zero_scrolls_tasknames,
        "key": "zero_scrolls_score",
        "targets": ["Target 3000", "Target 2000"],
    },
    "bbh": {
        "scen": [
            "../results/bbh/origin/{model}/metrics_bbh_cot_examples.json",
            "../results/bbh/llmlingua2/{model}/metrics_target300_bbh_cot_examples.json",
            "../results/bbh/llmlingua2_small/{model}/metrics_target300_bbh_cot_examples.json",
            "../results/bbh/llmlingua2/{model}/metrics_target200_bbh_cot_examples.json",
            "../results/bbh/llmlingua2_small/{model}/metrics_target200_bbh_cot_examples.json",
            "../results/bbh/zero_shot/{model}/metrics_bbh_cot_examples.json",
        ],
        "tasks": list(bbh_tasknames.keys()),
        "tasknames": bbh_tasknames,
        "key": "acc",
        "targets": ["Target 300", "Target 200"],
        # Every BBH task (including "avg") has a zero-shot score. This must be a list
        # like the longbench entry: consumers test `task not in ...["zero_shot"]`,
        # which raises TypeError on a bare `True`.
        "zero_shot": list(bbh_tasknames),
    },
}

# Render one LaTeX table per (benchmark, model): rows are tasks plus a bold
# "Average" row, columns are the prompt scenarios listed in the benchmark's "scen".
for benchmark, cfg in benchmarks.items():
    outdir = f"../../ma-thesis/tables/{benchmark}"
    os.makedirs(outdir, exist_ok=True)

    scen = cfg["scen"]
    tasks = cfg["tasks"]
    tasknames = cfg["tasknames"]
    key = cfg["key"]
    targets = cfg["targets"]

    # "zero_shot" is either a list of task ids that have a zero-shot score or the
    # flag True meaning "all tasks". Normalise to a list so the membership test
    # below cannot raise TypeError on a bool.
    has_zero_shot = "zero_shot" in cfg
    zero_shot_tasks = cfg.get("zero_shot", [])
    if zero_shot_tasks is True:
        zero_shot_tasks = tasks

    # Scores stored under "acc" are scaled by 100; other keys are used as-is.
    scale = 100 if key == "acc" else 1

    # Column layout is the same for every model, so build it once per benchmark.
    columns = [("Original", "")]
    for target in targets:
        columns.append((target, "LLMLingua-2"))
        columns.append((target, "LLMLingua-2-small"))
    if has_zero_shot:
        columns.append(("Zero-Shot", ""))

    for model in models:
        df = pd.DataFrame(index=tasks, columns=pd.MultiIndex.from_tuples(columns))

        # Scenario templates and columns are listed in matching order.
        for s, col in zip(scen, df.columns):
            scorepath = s.format(model=model)
            if not os.path.exists(scorepath):
                continue
            # Context manager closes the metrics file instead of leaking the handle.
            with open(scorepath) as f:
                scores = json.load(f)
            for task in tasks:
                # Zero-shot scenarios are recognised by their path; tasks without a
                # zero-shot score get an empty cell.
                if "zero_shot" in s and task not in zero_shot_tasks:
                    df.loc[task, col] = ""
                else:
                    # The "avg" entry is a plain number, task entries are dicts.
                    raw = scores["avg"] if task == "avg" else scores[task][key]
                    df.loc[task, col] = raw * scale

        # Drop scenarios for which no metrics file was found.
        df = df.dropna(how="all", axis=1)
        df.index = df.index.map(lambda x: tasknames[x])
        # Pre-format the "Average" row in bold; all other rows are left untouched.
        df = df.apply(
            lambda row: (
                row
                if row.name != "Average"
                else row.apply(lambda x: f"\\textbf{{{x:.2f}}}" if isinstance(x, float) else "")
            ),
            axis=1,
        )
        column_format = "l|c|cc|cc" if not model.startswith("gpt_4o") else "l|c|c"
        tex_table = df.to_latex(
            float_format="%.2f",
            multicolumn_format="c|",
            column_format=f'@{{}}{column_format + ("|c" if "Zero-Shot" in df.columns else "")}@{{}}',
        )

        if not has_zero_shot:
            # Without a zero-shot column, remove the "|" from the multicolumn header
            # that precedes the "Target 2..." group label.
            tex_table = tex_table.replace("|}{Target 2", "}{Target 2")
        tex_table = tex_table.replace("Average", "\\midrule Average")
        for match in ["Average", "Original", *targets, "Zero-Shot"]:
            tex_table = tex_table.replace(match, f"\\textbf{{{match}}}")

        with open(f'{outdir}/{model.split("_answer")[0]}.tex', "w") as f:
            f.write(tex_table)

        # print("\nModel:", model)
        # display(df)