In [13]:
# Fetch the coding-tasks repository; the HumanEval JSONL file read further down lives inside it.
# NOTE(review): the clone target here is `data/coding-tasks`, while the later cell reads from
# `../data/coding-tasks` — confirm the notebook's working directory makes these two paths agree.
!git clone https://github.com/maoquan-ms/coding-tasks.git data/coding-tasks

Cloning into 'coding-tasks'...
remote: Enumerating objects: 30, done.
remote: Counting objects: 100% (12/12), done.
remote: Compressing objects: 100% (8/8), done.
remote: Total 30 (delta 6), reused 7 (delta 4), pack-reused 18 (from 1)
Unpacking objects: 100% (30/30), 47.54 KiB | 4.75 MiB/s, done.


In [2]:
import warnings

import requests
from human_eval.data import write_jsonl, read_problems
from langchain.prompts import PromptTemplate
from tqdm.notebook import tqdm

We are not using complex prompts or agent functionality, so let's call the vLLM service directly.

In [3]:
# Endpoint of the generation service that `generate_single` posts to.
# NOTE(review): 0.0.0.0 is normally a bind address; as a client target it only reaches
# the local machine on some platforms — confirm, or consider 127.0.0.1.
url = 'http://0.0.0.0:8000/generate'

# Read the prompt template with an explicit encoding so the loaded text does not
# depend on the platform's default locale.
with open('../data/prompt_coder.txt', 'r', encoding='utf-8') as template_file:
    template_content = template_file.read()

# The template has a single placeholder, `problem_text`, which is filled in per task.
prompt_template = PromptTemplate(
    input_variables=["problem_text"],
    template=template_content)

def process_output(response: str, task_description: str) -> str:
    """
    Cut the prompt out of a generation result so that only the generated part is left.
    HumanEval only needs the generated part for its evaluation.

    Every occurrence of ``task_description`` inside ``response`` is removed, then the
    remainder is trimmed of leading and trailing whitespace (this also drops any
    indentation in front of the first remaining line).

    :param response: the generation result of an LLM
    :param task_description: the prompt and function head string to remove
    :return: ``response`` without the prompt text and without surrounding whitespace
    """
    without_prompt = response.replace(task_description, "")
    completion = without_prompt.strip()
    return completion


def generate_single(problem_text: str, temp=.2, timeout=300):
    """
    The main inference function: send one generation request to the service at ``url``.

    :param problem_text: the code problem without any preprocessing
    :param temp: generation temperature
    :param timeout: seconds to wait for the service before ``requests`` raises a
        timeout error; pass ``None`` to wait indefinitely (the previous behaviour)
    :return: list of generated texts with the assembled prompt removed, or ``['pass']``
        when the service answers with a non-200 status code
    """
    assembled_prompt = prompt_template.format(problem_text=problem_text)

    data = {
        "prompt": assembled_prompt,
        "max_tokens": 512,
        "temperature": temp,
        # Stop at a blank line or at the start of a new top-level definition.
        "stop": ["\n\n", "\ndef", "\nclass "],
        "top_k": 40,
        "top_p": 0.85
    }

    # Bounded wait: without a timeout a stalled server would block the whole sweep forever.
    response = requests.post(url, json=data, timeout=timeout)

    if response.status_code != 200:
        # Keep the best-effort placeholder so the run continues, but make the failure
        # visible — otherwise 'pass' completions silently skew the evaluation.
        warnings.warn(
            f"generation request failed with HTTP {response.status_code}; "
            "using 'pass' as the completion",
            RuntimeWarning,
        )
        return ['pass']

    # The response's "text" field is iterated as a sequence of generated strings.
    generated_code = response.json()["text"]
    return [process_output(v, assembled_prompt) for v in generated_code]

def generate_multiple(problem_text: str, n=5, temp=.2):
    """
    Call ``generate_single`` ``n`` times and pool everything it returns.

    :param problem_text: the code problem without any preprocessing
    :param n: how many times ``generate_single`` is called
    :param temp: generation temperature forwarded to every call
    :return: one flat list holding the results of all calls, in call order
    """
    return [
        completion
        for _ in range(n)
        for completion in generate_single(problem_text, temp)
    ]


Let's run inference with different generation temperatures.

In [5]:
# Load the problems from the HumanEval JSONL file.
problems = read_problems('../data/coding-tasks/datasets/humaneval/HumanEval.jsonl')
temps = [.2, .4, .6, .8]  # generation temperatures to sweep over
ns = 10  # number of samples generated per task at each temperature
for t in temps:
    samples = []
    for task_id in tqdm(problems):
        prompt = problems[task_id]["prompt"]
        # One request per sample; only the first text of each response is kept.
        samples.extend(
            {"task_id": task_id, "completion": generate_single(prompt, t)[0]}
            for _ in range(ns)
        )

    # One output file per temperature, named after the temperature and sample count.
    write_jsonl(f"samples_{t}_{ns}.jsonl", samples)



  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]