In [None]:
!pip install datasets
!pip install transformers

In [None]:
import os
os.environ["HF_ALLOW_CODE_EVAL"] = "1"

# import argparse
import json
import nltk

from datasets import load_dataset, load_metric
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# def get_parser():
#     parser = argparse.ArgumentParser(
#         description="HumanEval based on generated samples",
#         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
#     )
#     parser.add_argument(
#         "--gen-file",
#         type=str,
#         help="Generated .jsonl file",
#     )
#     parser.add_argument(
#         "--num-samples",
#         type=int,
#         default=1,
#         help="Number of samples generated per prompt",
#     )
#     return parser

In [None]:
def generate_well(
    model: AutoModelForCausalLM,
    tok: AutoTokenizer,
    prompt: str,
    num_samples: int = 1,
    max_out_len: int = 200,
):
    """
    Generate ``num_samples`` completions of ``prompt`` and return them decoded.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tok: Matching tokenizer. Its ``decode`` must accept
            ``truncate_before_pattern`` (the CodeGen tokenizer does).
        prompt: Source text to complete.
        num_samples: Number of completions to return for the prompt.
        max_out_len: Forwarded to ``generate`` as ``max_length``; in
            transformers this bounds prompt tokens plus generated tokens.

    Returns:
        A list of ``num_samples`` decoded strings. Each one is also printed.

    Decoding strategy:
        * ``num_samples == 1``: greedy decoding (what the previous code
          effectively did, because ``do_sample`` was never enabled and the
          ``temperature`` / ``top_p`` arguments were therefore ignored).
        * ``num_samples > 1``: nucleus sampling with ``temperature=0.2`` and
          ``top_p=0.95``. Greedy search cannot return more than one sequence,
          so sampling has to be switched on explicitly.
    """
    # Calling the tokenizer (instead of ``encode``) also yields the attention
    # mask, which ``generate`` needs for reliable results.
    encoded = tok(prompt, return_tensors='pt')

    # ``Tensor.to`` is NOT in-place: the result must be kept. Follow the model's
    # own device so inputs and weights always live in the same place.
    device = model.device
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    pad_token_id = tok.pad_token_id
    if pad_token_id is None:
        pad_token_id = tok.eos_token_id

    generate_kwargs = {
        "attention_mask": attention_mask,
        "max_length": max_out_len,
        "num_return_sequences": num_samples,
        "pad_token_id": pad_token_id,
    }
    if num_samples > 1:
        # Sampling parameters only take effect when do_sample=True.
        generate_kwargs.update(do_sample=True, temperature=0.2, top_p=0.95)

    output_ids = model.generate(input_ids, **generate_kwargs)

    # Cut each completion at the first pattern that marks the end of the
    # generated function body.
    stop_patterns = [r"\n\n^#", r"^'''", r"\n\n\n"]
    results = [
        tok.decode(sequence, truncate_before_pattern=stop_patterns)
        for sequence in output_ids
    ]
    for result in results:
        print(result)
    return results


In [None]:
def generate(max_tasks=1):
    """
    Generate CodeGen completions for HumanEval prompts and save them as JSONL.

    Loads the ``Salesforce/codegen-350M-mono`` model and tokenizer, runs
    ``generate_well`` on the first ``max_tasks`` prompts of the HumanEval
    ``test`` split, and writes one JSON line per task to ``gen_file``. Each
    line is the list of ``num_samples`` completions for that task.

    Args:
        max_tasks: Upper bound on the number of tasks to process. The default
            of ``1`` keeps the quick debug run that used to be hard-coded;
            pass ``None`` to process the whole split.

    Reads the module-level names ``gen_file`` and ``num_samples``.
    """
    # parser = get_parser()
    # args = parser.parse_args()

    torch.cuda.empty_cache()

    print("build the LLM")
    SIZE = '350M'  # 350M 2B 6B
    MODEL_NAME = f"Salesforce/codegen-{SIZE}-mono"
    tok = AutoTokenizer.from_pretrained(MODEL_NAME, errors='ignore')
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    if torch.cuda.is_available():
        model.to("cuda")
    # The tokenizer has no pad token assigned here, so reuse EOS for padding.
    tok.pad_token = tok.eos_token

    # Load the evaluation dataset and pick the prompts to complete.
    human_eval = load_dataset("openai_humaneval")
    test_split = human_eval["test"]
    n_tasks = len(test_split)
    if max_tasks is not None:
        n_tasks = max(0, min(n_tasks, max_tasks))

    prompts = [test_split[task]["prompt"] for task in range(n_tasks)]

    print("build generations")
    completed_codes = list()
    # Wrapping the list (not the enumerate generator) lets tqdm show a total.
    for idx, prompt in enumerate(tqdm(prompts)):
        # Same normalisation that case_check() applies before stripping the
        # prompt back out of the generation.
        prompt = prompt.lstrip().replace("\n\n\n", "\n\n")
        print('---' * 19)
        print(f'+++{idx}', prompt)
        completed_code = generate_well(model, tok, prompt, num_samples=num_samples)
        completed_codes.append(completed_code)
        print(f'###{idx}', completed_code)

    print("save generations")
    with open(gen_file, 'w') as file:
        for completed_code in completed_codes:
            file.write(json.dumps(completed_code) + '\n')



In [None]:
def evaluate(completed_codes):
    """
    Score generated completions against the HumanEval unit tests.

    Args:
        completed_codes: One entry per task, in dataset order; each entry is
            the list of candidate programs generated for that task (the
            structure ``generate()`` writes, one JSON line per task). Fewer
            entries than tasks is fine: only the first ``len(completed_codes)``
            tasks are scored.

    Returns:
        The per-task execution logs returned by the ``code_eval`` metric.
    """
    human_eval = load_dataset("openai_humaneval")
    test_split = human_eval["test"]

    # Only tasks that actually have generations are evaluated, so references
    # and predictions always line up one-to-one.
    n_tasks = min(len(test_split), len(completed_codes))
    generations = [completed_codes[task] for task in range(n_tasks)]

    references = []
    for task in tqdm(range(n_tasks)):
        record = test_split[task]
        test_func = record["test"]
        # The dataset's test code defines check(candidate); call it on the
        # task's entry point.
        entry_point = f"check({record['entry_point']})"
        references.append("\n" + test_func + "\n" + entry_point)

    # Evaluate completions with "code_eval" metric
    # NOTE(review): load_metric is deprecated in newer `datasets` releases in
    # favour of the separate `evaluate` package - confirm the installed version.
    code_eval_metric = load_metric("code_eval")
    print(references, "xxx", generations)
    pass_at_k, logs = code_eval_metric.compute(predictions=generations, references=references)
    print(f"Pass@1: {pass_at_k}")
    print(f"Logs: {logs}")
    return logs



In [None]:
def case_check():
    """
    Compare saved generations with the canonical solutions and write a report.

    Reads the generations from ``gen_file`` (one JSON list of candidates per
    task), strips the prompt from the first candidate of every task, measures
    its edit distance to the canonical solution, runs ``evaluate`` to find out
    which tasks pass, and writes one JSON report line per task to
    ``check_file``, ordered by increasing edit distance.

    Reads the module-level names ``gen_file`` and ``check_file``.
    """
    human_eval = load_dataset("openai_humaneval")
    test_split = human_eval["test"]

    print("load generations")
    completed_codes = []
    with open(gen_file) as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline added by hand).
                continue
            completed_codes.append(json.loads(line))

    # Entry ``task`` of the file belongs to dataset task ``task``; pair each
    # one with its own prompt so the prompt can be stripped reliably.
    n_tasks = min(len(test_split), len(completed_codes))
    prompts = []
    generations = []
    solutions = []
    for task in tqdm(range(n_tasks)):
        record = test_split[task]
        # Must match the normalisation applied in generate().
        prompt = record["prompt"].lstrip().replace("\n\n\n", "\n\n")
        prompts.append(prompt)

        # Only the first candidate is inspected, matching the pass/fail
        # lookup below which also reads the first result.
        generation = completed_codes[task][0].replace(prompt, '')
        generations.append(generation.rstrip())

        solutions.append(record["canonical_solution"].rstrip())

    case_dists = [
        [idx, nltk.edit_distance(generation, solution)]
        for idx, (generation, solution) in enumerate(zip(generations, solutions))
    ]
    sorted_case_dists = sorted(case_dists, key=lambda x: x[-1])

    codegen_log = evaluate(completed_codes)

    # all cases: 0-163
    # correct cases: [0, 18, 23, 22, 28, 29, 30, 34, 35, 42, 45, 48, 55, 53, 52, 58, 60, 66]
    passed_cases = list()
    for key, value in codegen_log.items():
        # value[0] is the first candidate's result; its last item is the
        # detail dict carrying the 'passed' flag.
        detail = value[0][-1]
        if detail['passed']:
            passed_cases.append(int(key))
    print(passed_cases)
    passed_lookup = set(passed_cases)

    print("save generations")
    with open(check_file, 'w') as file:
        for idx, edit_dist in sorted_case_dists:
            report = {
                'idx': idx,
                'status': idx in passed_lookup,
                'edit_dist': edit_dist,
                'prompt': prompts[idx],
                'generation': generations[idx],
                'solution': solutions[idx],
            }
            file.write(json.dumps(report) + '\n')


In [None]:
def run():
    """Run the pipeline end to end: generate completions, then check them."""
    generate()
    case_check()


# Run

In [None]:
if __name__ == '__main__':
    # Settings read as module-level globals by the functions defined above;
    # they stand in for the commented-out argparse options.
    num_samples = 1  # args.num_samples
    check_file = 'codegen_check.jsonl'  # args.check_file
    gen_file = 'codegen_gen.jsonl'  # args.gen_file

    run()

build the LLM




  0%|          | 0/1 [00:00<?, ?it/s]

build generations


0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


---------------------------------------------------------
+++0 from typing import List

def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """



1it [00:23, 23.46s/it]

from typing import List

def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """
    for i in range(len(numbers)):
        for j in range(i+1, len(numbers)):
            if abs(numbers[i] - numbers[j]) < threshold:
                return True
    return False


###0 ['from typing import List\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    """ Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    """\n    for i in range(len(numbers)):\n        for j in range(i+1, len(numbers)):\n            if abs(numbers[i]




  0%|          | 0/1 [00:00<?, ?it/s]

load generations


100%|██████████| 164/164 [00:00<00:00, 2706.13it/s]


  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 164/164 [00:00<00:00, 3485.90it/s]


["\n\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(candidate):\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n    assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n\ncheck(has_close_elements)"] xxx [['from typing import List\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    """ Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    """\n    for i in range(len(numbers)):\n        for j in ra