In [1]:
from rosemary import jpt_parse_args, jpt_setup; jpt_setup()

import platform
import sys
sys.path.append('/dccstor/mit_fm/wpq/github/mitibm2023/external/open-instruct/'
                if platform.uname().processor == 'x86_64' 
                else '/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/open-instruct/')

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

  warn(f'Install `torch` for functionalities dependent on torch')


In [2]:
import argparse
import os
import json
import random
import torch
from eval.utils import generate_completions, load_hf_lm_and_tokenizer, query_openai_chat_model
from eval.codex_humaneval.data import write_jsonl, read_problems
from eval.codex_humaneval.evaluation import evaluate_functional_correctness
from transformers import GPT2LMHeadModel


[2023-09-23 19:20:37,837] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [8]:

parser = argparse.ArgumentParser()
parser.add_argument("--data_file", type=str, default="data/codex_eval/HumanEval.jsonl.gz", help="Path to the HumanEval data file.")
parser.add_argument("--max_num_examples",  type=int, default=None, help="Maximum number of examples to evaluate.")
parser.add_argument("--model_name_or_path", type=str, default=None, help="If specified, we will load the model to generate the predictions.")
parser.add_argument("--tokenizer_name_or_path", type=str, default=None, help="If specified, we will load the tokenizer from here.")
parser.add_argument("--use_slow_tokenizer", action="store_true", help="If given, we will use the slow tokenizer.")
parser.add_argument("--openai_engine", type=str, default=None, help="If specified, we will use the OpenAI API to generate the predictions.")
parser.add_argument("--save_dir", type=str, default="results/codex_eval", help="Directory to save the results.")
parser.add_argument("--eval_batch_size", type=int, default=1, help="Batch size for evaluation.")
parser.add_argument("--eval_pass_at_ks", nargs="+", type=int, default=[1], help="Multiple k's that we will report pass@k.")
parser.add_argument("--unbiased_sampling_size_n", type=int, default=20,help="Codex HumanEval requires `n` sampled generations per prompt, to estimate the unbiased pass@k. ")
parser.add_argument("--temperature", type=float, default=0.1,help="Temperature for sampling. This is should be low for evaluating smaller pass@k, and high for larger pass@k.")
parser.add_argument("--load_in_8bit", action="store_true", help="Load model in 8bit mode, which will reduce memory and speed up inference.")
parser.add_argument("--gptq", action="store_true", help="If given, we're evaluating a 4-bit quantized GPTQ model.")
parser.add_argument("--use_chat_format", action="store_true", help="If given, the prompt will be encoded as a chat format with the roles in prompt.")
parser.add_argument("--chat_formatting_function", type=str, default="eval.templates.create_prompt_with_tulu_chat_format", help="The function to use to create the chat format. This function will be dynamically imported. Please see examples in `eval/templates.py`.")
parser.add_argument("--max_new_tokens", type=int, default=256)

model_name_or_path = '../results/baselines/huggyllama/llama-7b'
# model_name_or_path = '../results/baselines/gpt2-medium'

#     --eval_batch_size 10
cmd = f"""
    --data_file data/eval/codex_humaneval/HumanEval.jsonl.gz \
    --eval_pass_at_ks 1 5 \
    --unbiased_sampling_size_n 5 \
    --temperature 0.1 \
    --save_dir {model_name_or_path}/eval/humaneval/ \
    --model_name_or_path {model_name_or_path} \
    --eval_batch_size 50 \
    --use_chat_format
"""

args = jpt_parse_args(parser, cmd)


# model_name_or_path and openai_engine cannot be both None or both not None.
assert (args.model_name_or_path is None) != (args.openai_engine is None), "Either model_name_or_path or openai_engine should be specified."
assert args.unbiased_sampling_size_n >= max(args.eval_pass_at_ks), "n should be larger than the largest k in eval_pass_at_ks."
args

Namespace(data_file='data/eval/codex_humaneval/HumanEval.jsonl.gz', max_num_examples=None, model_name_or_path='../results/baselines/huggyllama/llama-7b', tokenizer_name_or_path=None, use_slow_tokenizer=False, openai_engine=None, save_dir='../results/baselines/huggyllama/llama-7b/eval/humaneval/', eval_batch_size=50, eval_pass_at_ks=[1, 5], unbiased_sampling_size_n=5, temperature=0.1, load_in_8bit=False, gptq=False, use_chat_format=True, chat_formatting_function='eval.templates.create_prompt_with_tulu_chat_format', max_new_tokens=256)

In [9]:
print("Loading model and tokenizer...")
model, tokenizer = load_hf_lm_and_tokenizer(
    model_name_or_path=args.model_name_or_path, 
    tokenizer_name_or_path=args.tokenizer_name_or_path, 
    load_in_8bit=args.load_in_8bit, 
    # device map is determined by the number of gpus available.
    device_map="balanced_low_0" if torch.cuda.device_count() > 1 else "auto",
    gptq_model=args.gptq,
    use_fast_tokenizer=True,
)
model.device, model.dtype

Loading model and tokenizer...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


(device(type='cuda', index=0), torch.float16)

In [10]:
if not os.path.exists(args.save_dir):
    os.makedirs(args.save_dir, exist_ok=True)
    
# wpq: for gpt-2 model, need to enforce `max_length` constraints to avoid `position_id` index errors.
if isinstance(model, GPT2LMHeadModel):
    max_input_seq_len = model.config.max_position_embeddings - args.max_new_tokens
else:
    max_input_seq_len = 2048 - args.max_new_tokens


test_data = list(read_problems(args.data_file).values())
if args.max_num_examples is not None and len(test_data) > args.max_num_examples:
    test_data = random.sample(test_data, args.max_num_examples)
print("Number of examples:", len(test_data))


# def format_prompt(x):
#     if args.use_chat_format:
#         return "<|user|>\n" + "Complete the following python function.\n\n\n" + \
#                 x + \
#                 "\n<|assistant|>\n" + "Here is the completed function:\n\n\n"
#     else:
#         return x


# prompts = []

# for example in test_data:
#     prompt = format_prompt(example['prompt'])
#     tokenized_prompt_len = len(tokenizer(prompt, add_special_tokens=False)['input_ids'])
#     if tokenized_prompt_len >= max_input_seq_len:
#         print('here', tokenized_prompt_len)
#     prompts.append(prompt)
    
# print(prompts[0])

Number of examples: 164


In [11]:
if args.use_chat_format:
    prompts = [
        "<|user|>\n" + "Complete the following python function.\n\n\n" 
        + example["prompt"] + "\n<|assistant|>\n" + "Here is the completed function:\n\n\n"
        for example in test_data
    ]
else:
    prompts = [example["prompt"] for example in test_data]

print(tokenizer(prompts, return_tensors='pt', padding=True).input_ids.shape)

torch.Size([164, 464])


In [16]:
# print(prompts[0])
prompts = [
            "<|user|>\n" + "Complete the following python function.\n\n\n" 
            + example["prompt"] + "\n<|assistant|>\n" + "Here is the completed function:\n\n\n" + example["prompt"]
            for example in test_data
        ]

print(tokenizer(prompts, return_tensors='pt', padding=True).input_ids.shape)

torch.Size([164, 895])


In [18]:
prompts_subset = prompts[:5]  # wpq: do very small batches.
prompts_subset = prompts

# these stop sequences are those mentioned in the codex paper.
stop_sequences = ["\nclass", "\ndef", "\n#", "\nif", "\nprint"]
# Because many tokenizers will treat the word after space differently from the original word alone, 
# to be consistent, we add a space before tokenization and remove it after tokenization.
stop_sequences = [tokenizer.encode(" " + x, add_special_tokens=False)[1:] for x in stop_sequences]

from transformers import GPT2LMHeadModel
if isinstance(model, GPT2LMHeadModel):
    generation_kwargs = {'max_length': model.config.max_position_embeddings} # 1024
else:
    generation_kwargs = {'max_new_tokens': 512}


outputs_per_sampling_iter = []
for sampling_iter in range(args.unbiased_sampling_size_n):
    print(f"Sampling iter: {sampling_iter} / {args.unbiased_sampling_size_n}")
    samping_outputs = generate_completions(
        model=model,
        tokenizer=tokenizer,
        prompts=prompts_subset,
        batch_size=args.eval_batch_size,
        stop_id_sequences=stop_sequences,
        num_return_sequences=1,  # we don't use the hf num_return_sequences, because otherwise the real batch size will be multiplied by it and often cause oom.
        do_sample=True,  # if only pass@1 is evaluated, we do greedy decoding.
        top_p=0.95,
        temperature=args.temperature,
        **generation_kwargs,
    )
    outputs_per_sampling_iter.append(samping_outputs)
# regroup the outputs to match the number of test data.
outputs = []
for i in range(len(prompts_subset)):
    for j in range(args.unbiased_sampling_size_n):
        outputs.append(outputs_per_sampling_iter[j][i])

Sampling iter: 0 / 5




Generating Completions:   0%|          | 0/164 [00:00<?, ?it/s][A[ASetting `pad_token_id` to `eos_token_id`:50256 for open-end generation.

KeyboardInterrupt



In [20]:
tokenizer(prompts, return_tensors='pt', padding=True).input_ids.shape

torch.Size([164, 628])

In [27]:

outputs = []
for i in range(len(prompts_subset)):
    for j in range(args.unbiased_sampling_size_n):
        outputs.append(outputs_per_sampling_iter[j][i])

In [26]:
print(prompts[0])
for i in range(len(outputs_per_sampling_iter[0])):
    print(i)
    print(outputs_per_sampling_iter[0][i])

from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """

0
    for i in range(len(numbers)):
        for j in range(i + 1, len(numbers)):
            if abs(numbers[i] - numbers[j]) < threshold:
                return True
    return False


1
    paren_groups = []
    for char in paren_string:
        if char == '(':
            paren_groups.append(paren_groups[-1])
        elif char == ')':
            paren_groups[-1] += paren_groups[-1]
            paren_groups.pop()
    return paren_groups


2
    return number - int(number)


3
    for operation in operations:
        if operation < 0:
            return True
    return False


4
    return sum(abs(x - mean(numbers)) for x in numbers) / len(nu

In [33]:
len(test_data)

164

In [39]:

# duplicates test data to match the number of outputs.
duplicate_test_data = [
    example for example in test_data[:5] for _ in range(args.unbiased_sampling_size_n)
]
assert len(duplicate_test_data) == len(outputs)
predictions = [{"task_id": example["task_id"], "prompt": example["prompt"], "completion": output} 
               for example, output in zip(duplicate_test_data, outputs)]
prediction_save_path = os.path.join(args.save_dir, "codex_eval_predictions.jsonl")
write_jsonl(prediction_save_path, predictions)

pass_at_k_results = evaluate_functional_correctness(
    sample_file=prediction_save_path,
    k=args.eval_pass_at_ks,
    problems={example["task_id"]: example for example in test_data[:5]},
)

print(pass_at_k_results)

# with open(os.path.join(args.save_dir, "metrics.json"), "w") as fout:
#     json.dump(pass_at_k_results, fout)


Reading samples...



25it [00:00, 6089.29it/s]


Running test suites...



  0%|          | 0/25 [00:00<?, ?it/s][A

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


  4%|▍         | 1/25 [00:00<00:03,  6.10it/s][A

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


 20%|██        | 5/25 [00:00<00:01, 16.82it/s][A

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 36%|███▌      | 9/25 [00:00<00:00, 21.04it/s][A

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av




	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 52%|█████▏    | 13/25 [00:00<00:00, 25.43it/s][A

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 64%|██████▍   | 16/25 [00:00<00:00, 24.20it/s][A

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


 76%|███████▌  | 19/25 [00:00<00:00, 23.10it/s][A

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


100%|██████████| 25/25 [00:01<00:00, 24.11it/s][A


Writing results to ../results/baselines/huggyllama/llama-7b/eval/humaneval/codex_eval_predictions.jsonl_results.jsonl...



100%|██████████| 25/25 [00:00<00:00, 22755.56it/s]

{'pass@1': 0.4, 'pass@5': 0.4}





'../results/baselines/huggyllama/llama-7b/eval/humaneval/codex_eval_predictions.jsonl'