In [None]:
from datasets import load_dataset
from service import Service
from utils import show_diff
import json
import time
import os

# Supported RAG strategies:
#   "no_rag"            - no RAG strategy, only the question is used
#   "docs"              - ground-truth docs are provided in the context
#   "serpapi"           - search engine results are provided in the context
#   "serpapi_embedding" - search engine results are obtained and embedded for doing RAG

# Run configuration: everything a reader might want to tune lives here.
main_folder = "code_rag_bench"
dataset_name = "humaneval"
model = "gpt-4o"
rag_strategy = "docs"

# Output layout is <main_folder>/<dataset>/<model>/<strategy>; '-' and '/' in
# the model name are mapped to '_' so the name is usable as one path segment.
eval_folder = "/".join(
    [main_folder, dataset_name, model.replace("-", "_").replace("/", "_"), rag_strategy]
)
failed_folder = eval_folder + "/failed"
for _folder in (eval_folder, failed_folder):
    os.makedirs(_folder, exist_ok=True)

# Project-local service wrapper; the later cells call `s.process(...)` on it.
s = Service(seed=42, get_usage=True, get_diff=True)

# Only the "train" split is used, converted to a polars frame for row iteration.
dataset = load_dataset("code-rag-bench/" + dataset_name)["train"]
df = dataset.to_polars()

In [None]:
def compute_price(model, in_tk, out_tk):
    """Return the total price of `in_tk` input tokens plus `out_tk` output tokens.

    Rates are looked up by exact model name and are expressed per one million
    tokens, with separate input and output rates.

    Raises:
        ValueError: if `model` has no entry in the pricing table.
    """
    # model name -> (input rate, output rate), both per 1M tokens
    rate_table = {
        "gpt-4o": (2.5, 10.0),
        "gpt-4o-mini": (0.15, 0.6),
        "meta-llama/Llama-3.2-3B-Instruct-Turbo": (0.06, 0.06),
        "meta-llama/Llama-3-8b-chat-hf": (0.2, 0.2),
        "meta-llama/Llama-3-70b-chat-hf": (0.88, 0.88),
        "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF": (0.88, 0.88),
        "Qwen/Qwen2.5-Coder-32B-Instruct": (0.8, 0.8),
        "codellama/CodeLlama-34b-Instruct-hf": (0.78, 0.78),
        "upstage/SOLAR-10.7B-Instruct-v1.0": (0.3, 0.3),
        "google/gemma-2-27b-it": (0.8, 0.8),
        "google/gemma-2-9b-it": (0.3, 0.3),
        "deepseek-ai/deepseek-llm-67b-chat": (0.9, 0.9),
    }

    rates = rate_table.get(model)
    if rates is None:
        raise ValueError(f"Model {model} not found in the pricing list")

    in_rate, out_rate = rates
    # Scale token counts to millions first, then apply the per-1M rate.
    return (in_tk / 1e6) * in_rate + (out_tk / 1e6) * out_rate

## Get the Responses

In [3]:
RESPONSES_FILE = f"./{eval_folder}/benchmark.json"

# Default (empty) results record; replaced below when a saved run exists.
r = {
    "data": [],
    "checkpoint": 0,
    "input_tokens": 0,
    "output_tokens": 0,
    "total_price": 0,
    "price_per_req": 0,
}
if not os.path.exists(RESPONSES_FILE):
    # First run for this configuration: persist the empty record right away.
    with open(RESPONSES_FILE, "w") as f:
        json.dump(r, f, indent=4)
else:
    # Resume: pick up the responses and counters saved by a previous run.
    with open(RESPONSES_FILE, "r") as f:
        r = json.load(f)

In [4]:
# The checkpoint counter must equal the number of stored responses; otherwise
# resuming would skip or repeat tasks.
n_done = r['checkpoint']
assert n_done == len(r['data']), "Checkpoint does not match with executed requests"
n_total = len(df)
print(f"Done {n_done}/{n_total} ({100*n_done/n_total:.2f}%)")

Done 0/164 (0.00%)


## HumanEval

In [None]:
def _save_progress(results):
    """Refresh the cost figures in `results` and write it to RESPONSES_FILE.

    Updates `total_price` and `price_per_req` in place from the accumulated
    token counts, then dumps the whole record as JSON. Returns the unrounded
    total price.
    """
    price = compute_price(model, results['input_tokens'], results['output_tokens'])
    done = results['checkpoint']
    results['total_price'] = round(price, 4)
    # No completed request yet (e.g. empty dataset): avoid dividing by zero.
    results['price_per_req'] = round(price / done, 8) if done else 0
    with open(RESPONSES_FILE, "w") as f:
        json.dump(results, f, indent=4)
    return price


# Resume after the last checkpointed task; already-answered rows are skipped.
for d in df[r['checkpoint']:].iter_rows(named=True):
    print(f"Task ID: {d['task_id']}")
    code = d['prompt']
    # Only the "docs" strategy passes the first ground-truth document as context.
    gt_docs = d['docs'][0]['text'] if rag_strategy == 'docs' else None
    response = s.process(None, code, gt_docs, model)
    #show_diff(response['diff'])

    # Index of the last "insert" entry in the diff, or -1 when there is none.
    # A missing or malformed diff is treated the same as "no insert" (best
    # effort), but interrupts and other unrelated errors are no longer swallowed.
    try:
        idx_diff = max(
            (i for i, item in enumerate(response['diff']) if item[0] == "insert"),
            default=-1,
        )
    except (KeyError, TypeError, IndexError):
        idx_diff = -1

    r["data"].append({
        "solution": response['updatedCode'],
        # Without an insert entry, fall back to the whole updated code.
        "only_solution": response['diff'][idx_diff][-1] if idx_diff >= 0 else response['updatedCode'],
    })

    r["input_tokens"] += response["usage"]["input"]
    r["output_tokens"] += response["usage"]["output"]
    r["checkpoint"] += 1

    # Persist every 10 requests so an interrupted run loses little work.
    if r['checkpoint'] % 10 == 0:
        price = _save_progress(r)
        print(f"SAVED ({r['checkpoint']})")

# Final save: covers the tail that did not land on a multiple of 10.
price = _save_progress(r)

Task ID: HumanEval/139
SAVED (140)
Task ID: HumanEval/140
Task ID: HumanEval/141
Task ID: HumanEval/142
Task ID: HumanEval/143
Task ID: HumanEval/144
Task ID: HumanEval/145
Task ID: HumanEval/146
Task ID: HumanEval/147
Task ID: HumanEval/148
Task ID: HumanEval/149
SAVED (150)
Task ID: HumanEval/150
Task ID: HumanEval/151
Task ID: HumanEval/152
Task ID: HumanEval/153
Task ID: HumanEval/154
Task ID: HumanEval/155
Task ID: HumanEval/156
Task ID: HumanEval/157
Task ID: HumanEval/158
Task ID: HumanEval/159
SAVED (160)
Task ID: HumanEval/160
Task ID: HumanEval/161
Task ID: HumanEval/162
Task ID: HumanEval/163


## Evaluate the Responses

In [None]:
def _run_task(canonical_src, solution_src, test_src, entry_point):
    """Run one task's `check` function against the proposed solution.

    All code is executed in a fresh namespace, so generated code cannot
    overwrite the notebook's own variables and nothing leaks between tasks.
    Any exception raised by the sources or by `check` propagates to the caller.

    NOTE(security): this exec's model-generated code in-process with no
    sandbox or timeout; only run it on outputs you are willing to execute.
    """
    # "__main__" matches the module name the code saw when run in notebook globals.
    namespace = {"__name__": "__main__"}
    # The canonical code goes first so helper functions from the prompt are
    # available to the tests even when the solution omits them.
    exec(canonical_src, namespace)
    # Drop the reference implementation of the entry point: if the solution
    # fails to define it, the task must fail instead of testing the reference.
    namespace.pop(entry_point, None)
    exec(solution_src, namespace)
    exec(test_src, namespace)
    namespace["check"](namespace[entry_point])


failed_tasks = []
log = ""

for row, answer in zip(df[:r['checkpoint']].iter_rows(named=True), r['data']):
    # Merge the stored response ("solution", "only_solution") into the task row.
    row.update(answer)

    # A response that does not start with code is treated as a function body
    # and gets the prompt (signature + docstring) prepended.
    solution_src = row['solution']
    if not solution_src.startswith(('def', 'class', 'import', 'from')):
        solution_src = row['prompt'] + '\n' + solution_src
    canonical_src = row['prompt'] + row['canonical_solution']

    # Written to disk so a failing task's files can be moved aside for debugging.
    with open(f"./{main_folder}/solution.py", "w", encoding='utf-8') as f:
        f.write(solution_src)
    with open(f"./{main_folder}/canonical_solution.py", "w", encoding='utf-8') as f:
        f.write(canonical_src)

    try:
        # The sources are executed from memory: no file handles are left open,
        # no encoding mismatch on read-back, and no `import *` from a cached
        # canonical module that could rebind the entry point.
        _run_task(canonical_src, solution_src, row['test'], row['entry_point'])
    except Exception as e:
        task_id = row['task_id']
        task_num = task_id.split('/')[1]
        log += f"""Error ({task_id}): [{str(e.__class__).replace("<class '", "").replace("'>", "")}] {e}\n"""
        # Rename the solution and evaluate files for the failed task so they can
        # be debugged; files already saved by an earlier run are left untouched.
        if not os.path.exists(f"./{failed_folder}/c{task_num}_solution.py"):
            os.rename(f"./{main_folder}/solution.py", f"./{failed_folder}/c{task_num}_solution.py")
        if not os.path.exists(f"./{failed_folder}/c{task_num}_canonical_solution.py"):
            os.rename(f"./{main_folder}/canonical_solution.py", f"./{failed_folder}/c{task_num}_canonical_solution.py")
        if not os.path.exists(f"./{failed_folder}/c{task_num}_evaluate.py"):
            with open(f"./{failed_folder}/c{task_num}_evaluate.py", "w", encoding='utf-8') as f:
                f.write(f"from c{task_num}_canonical_solution import *\n")
                f.write(f"from c{task_num}_solution import {row['entry_point']} as proposed\n")
                f.write(row['test'])
                f.write("\ncheck(proposed)")
        failed_tasks.append(task_id)

total = r['checkpoint']
success = total - len(failed_tasks)
# Nothing evaluated yet: report 0% instead of dividing by zero.
accuracy = success / total if total else 0.0
log += f"Accuracy: {success}/{total} ({accuracy:.2%})\n"
log += f"Failed tasks: {failed_tasks}"

with open(f"{eval_folder}/benchmark.log", "w") as f:
    f.write(log)

# Remove the temporary files (evaluate.py is included to clean up leftovers
# from earlier runs that still wrote it).
for tmp_name in ("solution.py", "evaluate.py", "canonical_solution.py"):
    tmp_path = f"./{main_folder}/{tmp_name}"
    if os.path.exists(tmp_path):
        os.remove(tmp_path)