In [12]:
from vllm import LLM, SamplingParams
from datasets import load_dataset
from transformers import AutoTokenizer
from evalplus.data import get_human_eval_plus
from copy import deepcopy
import re
import os

  from .autonotebook import tqdm as notebook_tqdm
2024-06-25 04:17:11,041	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


# Formatting Prompts + VLLM Generation

In [13]:
# --- Run configuration: everything a reader might want to tune ---------------
# Hugging Face model id used for generation.
MODEL = "microsoft/Phi-3-mini-4k-instruct"
# Hub dataset that is loaded and later extended with a generations column.
DATASET = "verifiers-for-code/humaneval_plan"
# Name of the dataset column that receives the raw generations.
COLUMN_NAME = "generated_phi3_baseline"
# Forwarded to vLLM as `tensor_parallel_size`.
NUM_GPUS = 1

# Last path component of the model id; it names the local output directory.
MODEL_NAME = MODEL.rsplit('/', 1)[-1]
OUTPUT_DIR = f"{MODEL_NAME}-output"

In [14]:
# Sentinel placed inside the assistant turn; the rendered chat prompt is cut at
# this marker so that generation continues from inside an open code fence.
__MAGIC_SPLITTER__ = "-[[]]-this-is-really-our-highest-priority-[[]]-"

# Assistant-turn template.  It is spelled out piece by piece because the
# trailing spaces after "problem:" and "```python" are part of the prompt text.
response = (
    "\n"
    "Below is a self-contained Python script that solves the problem: \n"
    "```python \n"
    f"{__MAGIC_SPLITTER__}\n"
    "```\n"
)

In [15]:
# Load the "test" split of DATASET; a column of generations is attached to it
# further down in the notebook.
dataset = load_dataset(DATASET, split="test")

In [16]:
# HumanEval+ problems as a mapping; each value's 'prompt' entry is what the
# prompt-building loop below feeds to the model.
eplus = get_human_eval_plus()

In [17]:
# Build the vLLM engine.  One keyword per line so each knob is easy to locate.
llm = LLM(
    model=MODEL,
    trust_remote_code=True,
    tensor_parallel_size=NUM_GPUS,
    gpu_memory_utilization=0.95,
    max_model_len=2048,
    max_num_seqs=16,
    enable_prefix_caching=False,
)

# Tokenizer owned by the engine; used below to render chat-template prompts.
tokenizer = llm.get_tokenizer()

# Decoding settings shared by every generation request in this notebook.
# temperature=0 requests greedy decoding in vLLM.
sampling_params = SamplingParams(
    max_tokens=512,
    temperature=0,
    top_p=0.95,
)

INFO 06-25 04:17:13 llm_engine.py:100] Initializing an LLM engine (v0.4.2) with config: model='microsoft/Phi-3-mini-4k-instruct', speculative_config=None, tokenizer='microsoft/Phi-3-mini-4k-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=microsoft/Phi-3-mini-4k-instruct)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 06-25 04:17:13 utils.py:660] Found nccl from library /home/vaibhav/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 06-25 04:17:15 selector.py:27] Using FlashAttention-2 backend.
INFO 06-25 04:17:15 weight_utils.py:199] Using model weights format ['*.safetensors']
INFO 06-25 04:17:17 model_runner.py:175] Loading model weights took 7.1183 GB
INFO 06-25 04:17:18 gpu_executor.py:114] # GPU blocks: 2565, # CPU blocks: 682
INFO 06-25 04:17:22 model_runner.py:937] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-25 04:17:22 model_runner.py:941] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-25 04:17:23 model_runner.py:1017] Graph capturing f

In [18]:
def create_none_prompts(prompt):
    """Render the chat prompt for one problem, ending inside an open code fence.

    The problem text is wrapped in an instruction and paired with the
    module-level ``response`` template as the assistant turn.  The rendered
    conversation is then cut at ``__MAGIC_SPLITTER__`` so that the model
    continues writing right after the assistant's opening code fence.

    Args:
        prompt: Problem statement; surrounding whitespace is stripped.

    Returns:
        The chat-template string up to (not including) the splitter marker.

    NOTE(review): the instruction says "Follow the given plan." although no
    plan is included in this prompt -- confirm this is intended.
    """
    user_message = f"Please provide a self-contained Python script that solves the following problem in a markdown code block. Follow the given plan.\n```\n{prompt.strip()}\n```\n"
    conversation = [
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": response},
    ]
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
    # Keep only what precedes the sentinel inside the assistant turn.
    prefix, _, _ = rendered.partition(__MAGIC_SPLITTER__)
    return prefix

In [19]:
# One rendered chat prompt per HumanEval+ problem, in the mapping's own order.
none_prompts = [
    create_none_prompts(problem['prompt'])
    for problem in eplus.values()
]

In [20]:
# Spot-check: show the fully rendered prompt for the first problem.
print(none_prompts[0])

<s><|user|>
Please provide a self-contained Python script that solves the following problem in a markdown code block. Follow the given plan.
```
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """
```
<|end|>
<|assistant|>

Below is a self-contained Python script that solves the problem: 
```python 



In [21]:
def get_vllm_code(px):
    """Generate one completion per prompt with the notebook's vLLM engine.

    Args:
        px: Prompts to pass to ``llm.generate``.

    Returns:
        A list holding the text of the first output of every request, in the
        order ``llm.generate`` returned them.
    """
    request_outputs = llm.generate(px, sampling_params)
    return [request.outputs[0].text for request in request_outputs]

In [22]:
# Raw model completions, one per entry of `none_prompts`.
none_prompts_sol = get_vllm_code(none_prompts)

Processed prompts: 100%|██████████| 164/164 [01:14<00:00,  2.21it/s]


In [35]:
# Attach the generations to the dataset (the push to the Hugging Face Hub happens in a later cell)
def update_dataset_with_solutions(dataset, new_column_name, solutions):
    """Return ``dataset`` with ``solutions`` stored under ``new_column_name``.

    ``add_column`` refuses to add a column whose name already exists.  That
    made this step fail whenever the cell was re-run, or when the loaded
    dataset already carried the column from an earlier push.  An existing
    column with the same name is therefore dropped first, so the call replaces
    it instead of raising.

    Args:
        dataset: Object exposing ``column_names``, ``remove_columns`` and
            ``add_column`` (a Hugging Face ``datasets.Dataset`` in this
            notebook).
        new_column_name: Name of the column to create or replace.
        solutions: Column values, one per dataset row.

    Returns:
        A new dataset object containing the column; the input is not modified.
    """
    if new_column_name in dataset.column_names:
        # Replace a stale column rather than failing on a duplicate name.
        dataset = dataset.remove_columns(new_column_name)

    return dataset.add_column(new_column_name, solutions)

In [36]:
# Store the raw generations on the dataset under COLUMN_NAME.
# NOTE(review): values are matched to rows by position, so this assumes the
# dataset rows are in the same order as the HumanEval+ problems -- confirm.
dataset = update_dataset_with_solutions(dataset, COLUMN_NAME, none_prompts_sol)

In [None]:
# Push the updated dataset back to the same Hub repository it was loaded from.
dataset.push_to_hub(DATASET)

# Extracting Code from the vLLM Generations

In [24]:
def extract_clean_code(text):
    """Trim a raw model completion down to the code it contains.

    Two heuristics are applied in order:

    1. Everything from the first triple-backtick fence onwards is dropped.
    2. If a remaining line contains the ``return`` keyword, everything after
       the last such line is dropped.

    The keyword is matched as a whole word.  A plain substring test also
    matched words such as ``returns`` or ``return_value`` in trailing comments
    or prose, which moved the cut past the real last ``return`` statement.

    Args:
        text: Completion text produced by the model.

    Returns:
        The trimmed code.  When no ``return`` keyword is present, the
        fence-trimmed text is returned as is.
    """
    # Cut at the first code fence, if there is one.
    fence_index = text.find("```")
    if fence_index != -1:
        text = text[:fence_index]

    lines = text.splitlines()

    # Walk backwards to find the last line that uses the `return` keyword.
    for i, line in enumerate(reversed(lines)):
        if re.search(r"\breturn\b", line):
            last_return_index = len(lines) - i - 1
            # Keep everything up to and including that line.
            return '\n'.join(lines[:last_return_index + 1])

    # No `return` keyword: hand back the (possibly fence-trimmed) text.
    return text

In [25]:
# def get_code_to_eval(dataset_name, code_column_name):
#     return dataset_name[code_column_name]

In [26]:
# Reduce every raw completion to the code it contains.
none_solutions = [extract_clean_code(code) for code in none_prompts_sol]

In [27]:
# Spot-check: show the cleaned solution for the second problem.
print(none_solutions[1])

from typing import List

def separate_paren_groups(paren_string: str) -> List[str]:
    """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
    separate those group into separate strings and return the list of those.
    Separate groups are balanced (each open brace is properly closed) and not nested within each other
    Ignore any spaces in the input string.
    >>> separate_paren_groups('( ) (( )) (( )( ))')
    ['()', '(())', '(()())']
    """
    # Remove spaces from the input string
    paren_string = paren_string.replace(" ", "")

    # Initialize variables
    result = []
    current_group = ""
    open_braces = 0

    for char in paren_string:
        if char == "(":
            open_braces += 1
            current_group += char
        elif char == ")":
            open_braces -= 1
            if open_braces == 0:
                result.append(current_group)
                current_group = ""
        else:
            conti

In [30]:
# Write each cleaned solution to <OUTPUT_DIR>/none/HumanEval_<n>/0.py, where
# <n> is the solution's position in `none_solutions`.
for task_number, solution in enumerate(none_solutions):
    task_dir = os.path.join(f"{OUTPUT_DIR}/none", f"HumanEval_{task_number}")
    os.makedirs(task_dir, exist_ok=True)
    solution_path = os.path.join(task_dir, '0.py')
    with open(solution_path, 'w', encoding='utf-8') as solution_file:
        solution_file.write(solution)

# Eval Check

In [31]:
import json

def convert_to_jsonl(solutions, output_file):
    """Write ``solutions`` to ``output_file`` as JSON Lines.

    Each line is a JSON object with a ``task_id`` of the form
    ``HumanEval/<position>`` and the solution text under ``completion``.

    Args:
        solutions: Iterable of solution strings; position determines task id.
        output_file: Path of the file to create or overwrite.
    """
    records = (
        json.dumps({"task_id": f"HumanEval/{position}", "completion": code})
        for position, code in enumerate(solutions)
    )
    with open(output_file, 'w') as out:
        out.writelines(record + '\n' for record in records)

In [32]:
# Also export the cleaned solutions as a single JSON Lines file.
# NOTE(review): the evaluation cell below is given the directory, not this
# file -- check which of the two sample formats is actually consumed.
convert_to_jsonl(none_solutions, f"{OUTPUT_DIR}/none/solutions.jsonl")

In [33]:
# Run the EvalPlus CLI on the samples under <OUTPUT_DIR>/none; IPython expands
# $OUTPUT_DIR from the Python variable of that name.
!evalplus.evaluate --dataset humaneval --samples $OUTPUT_DIR/none 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Computing expected output...
Expected outputs computed in 18.49s
Reading samples...
164it [00:00, 403.92it/s]
100%|█████████████████████████████████████████| 164/164 [00:07<00:00, 20.90it/s]
[31mhumaneval (base tests)[0m
[31mpass@1:	0.628[0m
[32mhumaneval+ (base + extra tests)[0m
[32mpass@1:	0.591[0m
