In [1]:
from vllm import LLM, SamplingParams
from datasets import load_dataset
from transformers import AutoTokenizer
from evalplus.data import get_human_eval_plus
import os
import json

  from .autonotebook import tqdm as notebook_tqdm
2024-06-26 01:06:01,376	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


# Formatting Prompts + vLLM Generation

In [2]:
# --- Run configuration -------------------------------------------------------
# Hugging Face identifiers for the model under test and the plan dataset.
MODEL = "microsoft/Phi-3-mini-4k-instruct"
DATASET = "verifiers-for-code/humaneval_plan_generation"

# Last path component of the model id; used to name the output directory.
MODEL_NAME = MODEL.rsplit("/", 1)[-1]
OUTPUT_DIR = f"{MODEL_NAME}-output"

# Dataset column name referenced by the (currently commented-out) push-to-hub cells.
COLUMN_NAME = "generated_phi3_baseline"

# NUM_GPUS is passed to the engine as tensor_parallel_size; only GPU 0 is made
# visible to this process.
NUM_GPUS = 1
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
# Sentinel placed inside the canned assistant reply. create_none_prompts renders
# the chat template and keeps only the text before this marker, so the string
# must not occur anywhere else in a rendered prompt.
__MAGIC_SPLITTER__ = "-[[]]-this-is-really-our-highest-priority-[[]]-"
# Canned assistant turn: a lead-in sentence followed by an opening ```python
# fence, with the sentinel sitting where the model's code should start.
# The trailing spaces and newlines inside the literal are part of the prompt text.
response = f"""
Below is a self-contained Python script that solves the problem: 
```python 
{__MAGIC_SPLITTER__}
```
""" 

In [4]:
# Load the "test" split of the plan dataset; a later cell reads its
# 'cleaned_sonnet-3.5_gold_plans' column to build the prompts.
dataset = load_dataset(DATASET, split="test")

In [5]:
# HumanEval+ problems from evalplus.
# NOTE(review): in the visible cells `eplus` is only referenced from
# commented-out code — confirm whether this load is still needed.
eplus = get_human_eval_plus()

In [6]:
# Build the vLLM engine for MODEL. The numeric limits below are the values
# this run was executed with; adjust them together if the hardware changes.
llm = LLM(
    model=MODEL,
    tensor_parallel_size=NUM_GPUS,
    enable_prefix_caching=False,
    gpu_memory_utilization=0.95,
    max_model_len=2048,
    trust_remote_code=True,
    max_num_seqs=16,
)

# Tokenizer owned by the engine; used to render the chat template for prompts.
tokenizer = llm.get_tokenizer()

# temperature=0 requests greedy decoding; at most 512 tokens are generated.
# NOTE(review): top_p presumably has no effect when temperature is 0 —
# confirm against the vLLM SamplingParams documentation.
sampling_params = SamplingParams(
    temperature=0,
    top_p=0.95,
    max_tokens=512,
)

INFO 06-26 01:06:07 llm_engine.py:100] Initializing an LLM engine (v0.4.2) with config: model='microsoft/Phi-3-mini-4k-instruct', speculative_config=None, tokenizer='microsoft/Phi-3-mini-4k-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=microsoft/Phi-3-mini-4k-instruct)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 06-26 01:06:07 utils.py:660] Found nccl from library /home/vaibhav/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 06-26 01:06:08 selector.py:27] Using FlashAttention-2 backend.
INFO 06-26 01:06:09 weight_utils.py:199] Using model weights format ['*.safetensors']
INFO 06-26 01:06:11 model_runner.py:175] Loading model weights took 7.1183 GB
INFO 06-26 01:06:12 gpu_executor.py:114] # GPU blocks: 2565, # CPU blocks: 682
INFO 06-26 01:06:16 model_runner.py:937] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-26 01:06:16 model_runner.py:941] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-26 01:06:17 model_runner.py:1017] Graph capturing f

In [7]:
def create_none_prompts(prompt):
    """Render a chat prompt that stops where the model should start writing code.

    The problem text is stripped and embedded in a fixed instruction, then run
    through the tokenizer's chat template as a user turn followed by the canned
    assistant ``response``. Everything from ``__MAGIC_SPLITTER__`` onward is
    discarded, so the result ends with the start of the assistant turn that
    precedes the splitter inside ``response``.

    Args:
        prompt: Problem text to embed (surrounding whitespace is stripped).

    Returns:
        The rendered prompt string, truncated before the splitter.
    """
    instruction = f"Please provide a self-contained Python script that solves the following problem in a markdown code block. Follow the given plan.\n```\n{prompt.strip()}\n```\n"
    conversation = [
        {"role": "user", "content": instruction},
        {"role": "assistant", "content": response},
    ]
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
    # Keep only the part of the rendered conversation that precedes the splitter.
    prefix, _, _ = rendered.partition(__MAGIC_SPLITTER__)
    return prefix

In [13]:
# none_prompts = []

# for heval_task in eplus.keys():
#     print(eplus[heval_task].keys())
#     pr = eplus[heval_task]['cleaned_sonnet-3.5_gold_plans']
#     pr = create_none_prompts(pr)           
#     none_prompts.append(pr)

# One generation prompt per dataset row, built from its gold-plan column.
none_prompts = [
    create_none_prompts(row['cleaned_sonnet-3.5_gold_plans'])
    for row in dataset
]

In [16]:
# Spot-check: show the last rendered prompt exactly as it will be sent to the model.
print(none_prompts[-1])

<s><|user|>
Please provide a self-contained Python script that solves the following problem in a markdown code block. Follow the given plan.
```
def generate_integers(a, b):
    """
    Given two positive integers a and b, return the even digits between a
    and b, in ascending order.

    For example:
    generate_integers(2, 8) => [2, 4, 6, 8]
    generate_integers(8, 2) => [2, 4, 6, 8]
    generate_integers(10, 14) => []

    Action Plan:
    1. Determine the lower and upper bounds:
       - Consider that a and b can be in any order
       - The lower bound should be the smaller of a and b, but not less than 2
       - The upper bound should be the larger of a and b, but not more than 8
       - Hint: Use min() and max() functions to handle this logic

    2. Generate a list of even integers:
       - Create a list comprehension that:
         a. Iterates through the range from the lower bound to the upper bound (inclusive)
         b. Includes only even numbers in the result
     

In [17]:
def get_vllm_code(px):
    """Generate one completion per prompt and return the completion texts.

    Args:
        px: Prompts passed straight to ``llm.generate`` together with the
            notebook-level ``sampling_params``.

    Returns:
        A list with the text of the first output of each generation result,
        in the order the results are returned.
    """
    request_outputs = llm.generate(px, sampling_params)
    completions = []
    for request_output in request_outputs:
        # Only the first candidate of each result is kept.
        completions.append(request_output.outputs[0].text)
    return completions

In [18]:
# Generate completions for every prompt with a single get_vllm_code call
# (the recorded run processed 164 prompts in about 1m23s).
none_prompts_sol = get_vllm_code(none_prompts)

Processed prompts: 100%|██████████| 164/164 [01:23<00:00,  1.96it/s]


In [14]:
# # push to hf
# def update_dataset_with_solutions(dataset, new_column_name, solutions):
    
#     dataset = dataset.add_column(new_column_name, solutions)
    
#     return dataset

In [15]:
# dataset = update_dataset_with_solutions(dataset, COLUMN_NAME, none_prompts_sol)

ValueError: The table can't have duplicated columns but columns ['generated_phi3_baseline'] are duplicated.

In [None]:
# dataset.push_to_hub(DATASET)

# Extracting Code from vLLM Generations

In [19]:
def extract_clean_code(text):
    """Trim a raw model completion down to the code it contains.

    Two heuristics are applied in order:

    1. Anything from the first triple-backtick fence onward is dropped.
    2. Of what remains, everything after the last line containing the
       substring "return" is dropped.

    If no line contains "return", the text from step 1 is returned unchanged.

    NOTE: step 2 is a plain substring match, so a docstring or comment line
    that merely mentions "return" (or "returns") also counts as a match.

    Args:
        text: Raw completion text produced by the model.

    Returns:
        The trimmed code as a string.
    """
    # Step 1: keep only the part before the first fence (all of it if none).
    code, _, _ = text.partition("```")

    # Step 2: scan from the bottom for the last line mentioning "return".
    lines = code.splitlines()
    for position in range(len(lines) - 1, -1, -1):
        if "return" in lines[position]:
            return "\n".join(lines[: position + 1])

    # No "return" anywhere: fall back to the fence-trimmed text.
    return code

In [20]:
# def get_code_to_eval(dataset_name, code_column_name):
#     return dataset_name[code_column_name]

In [27]:
# Apply the extraction heuristics to every raw completion, preserving order.
none_solutions = [extract_clean_code(code) for code in none_prompts_sol]

In [28]:
# Spot-check: show the last extracted solution.
print(none_solutions[-1])

def generate_integers(a, b):
    """
    Given two positive integers a and b, return the even digits between a
    and b, in ascending order.

    For example:
    generate_integers(2, 8) => [2, 4, 6, 8]
    generate_integers(8, 2) => [2, 4, 6, 8]
    generate_integers(10, 14) => []

    Action Plan:
    1. Determine the lower and upper bounds:
       - Consider that a and b can be in any order
       - The lower bound should be the smaller of a and b, but not less than 2
       - The upper bound should be the larger of a and b, but not more than 8
       - Hint: Use min() and max() functions to handle this logic

    2. Generate a list of even integers:
       - Create a list comprehension that:
         a. Iterates through the range from the lower bound to the upper bound (inclusive)
         b. Includes only even numbers in the result
       - Hint: Use the modulo operator (%) to check for even numbers

    3. Return the resulting list

    Remember to handle edge cases where there 

In [29]:
# Write each extracted solution to <OUTPUT_DIR>/gold_plans/HumanEval_<i>/0.py.
# The list position is used as the task number, so this relies on
# none_solutions being in task order — presumably the dataset order; verify.
gold_plans_dir = f"{OUTPUT_DIR}/gold_plans"
for index, solution in enumerate(none_solutions):
    name = f"HumanEval_{index}"
    task_dir = os.path.join(gold_plans_dir, name)
    os.makedirs(task_dir, exist_ok=True)
    with open(os.path.join(task_dir, '0.py'), 'w', encoding='utf-8') as f:
        f.write(solution)

# Eval Check

In [30]:
def convert_to_jsonl(solutions, output_file):
    """Write the solutions to ``output_file`` as JSON Lines.

    Each line is a JSON object of the form
    ``{"task_id": "HumanEval/<i>", "completion": <solution>}``, where ``<i>``
    is the solution's position in ``solutions``.

    Args:
        solutions: Iterable of solution strings; position ``i`` is written
            under task id ``HumanEval/<i>``.
        output_file: Destination path. A missing parent directory is created,
            so the function does not depend on an earlier cell having made it.
    """
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding, matching how the per-task .py files are written.
    with open(output_file, 'w', encoding='utf-8') as f:
        for i, solution in enumerate(solutions):
            # NOTE(review): the extracted solutions look like full programs;
            # confirm whether the consumer expects them under "completion"
            # or under a different key before relying on this file.
            json_line = json.dumps({"task_id": f"HumanEval/{i}", "completion": solution})
            f.write(json_line + '\n')

In [31]:
# Also export the solutions as a JSONL file inside the gold_plans directory.
convert_to_jsonl(none_solutions, f"{OUTPUT_DIR}/gold_plans/solutions.jsonl")

In [33]:
# Score the samples under <OUTPUT_DIR>/gold_plans with the evalplus CLI;
# $OUTPUT_DIR is filled in from the notebook variable of the same name.
!evalplus.evaluate --dataset humaneval --samples $OUTPUT_DIR/gold_plans

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Load from ground-truth from /home/vaibhav/.cache/evalplus/84f4b93a1270b492e4c54d5212da7a5b.pkl
Reading samples...
164it [00:00, 450.00it/s]
100%|█████████████████████████████████████████| 164/164 [00:08<00:00, 19.52it/s]
[31mhumaneval (base tests)[0m
[31mpass@1:	0.841[0m
[32mhumaneval+ (base + extra tests)[0m
[32mpass@1:	0.774[0m
