In [15]:
from human_eval.data import read_problems, write_jsonl

# Load the HumanEval problems once; later cells index this mapping by task id.
problems = read_problems()
print("Number of HumanEval problems:", len(problems))

# Look at a single task to see which fields a problem record carries
# and what the prompt handed to the model looks like.
example_task = "HumanEval/1"
print("Example task keys:", problems.get(example_task, problems[example_task]).keys())
print("\nPrompt:\n", problems[example_task].get("prompt", problems[example_task]["prompt"]))


Number of HumanEval problems: 164
Example task keys: dict_keys(['task_id', 'prompt', 'entry_point', 'canonical_solution', 'test'])

Prompt:
 from typing import List


def separate_paren_groups(paren_string: str) -> List[str]:
    """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
    separate those group into separate strings and return the list of those.
    Separate groups are balanced (each open brace is properly closed) and not nested within each other
    Ignore any spaces in the input string.
    >>> separate_paren_groups('( ) (( )) (( )( ))')
    ['()', '(())', '(()())']
    """



## HELPERS

In [16]:
import re
import math
import time
import tempfile
import subprocess
import requests
from pathlib import Path
from typing import List, Dict
import os
import json
from tqdm import tqdm



def get_failed_results(file_path):
    """
    Reads a JSONL results file and extracts all entries where 'passed' is falsy.

    An entry also counts as failed when it has no 'passed' key at all.
    Blank or whitespace-only lines are skipped instead of being handed to the
    JSON decoder, so a stray empty line does not abort the whole read.

    Args:
        file_path: Path of the UTF-8 JSONL file to read.

    Returns:
        list: The decoded failing entries, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    failed_results = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # json.loads("") raises; an empty line carries no record.
                continue
            data = json.loads(line)
            if not data.get("passed", False):
                failed_results.append(data)
    return failed_results


def _trim_blank_edges(text: str) -> str:
    """Drop leading/trailing blank lines and trailing whitespace, keeping the first line's indent."""
    lines = text.splitlines()
    while lines and not lines[0].strip():
        lines.pop(0)
    while lines and not lines[-1].strip():
        lines.pop()
    return "\n".join(lines).rstrip()


def extract_clean_code(generated_text: str) -> str:
    """
    Extracts and cleans Python code from an AI-generated response.

    Strategy, in order:
      1. If the text contains fenced code blocks (``` or ```python), return the
         longest one.
      2. Otherwise return everything from the first line that starts (after
         optional indentation) with 'def ', 'class ', 'import ' or 'from '.
      3. Otherwise return the text itself.

    In cases 1 and 3 the indentation of the first code line is preserved. A
    body-only answer such as "    return x" must stay indented, because
    stripping it would make the first statement look like top-level code.

    Args:
        generated_text: Raw model output, possibly with prose and code fences.

    Returns:
        str: The extracted code without surrounding blank lines.
    """
    # Consume only the optional language tag and the rest of the fence line;
    # a greedy \s* here would also eat the indentation of the first code line.
    code_blocks = re.findall(r"```(?:python)?[ \t]*\r?\n?(.*?)```", generated_text, re.DOTALL)
    if code_blocks:
        return max((_trim_blank_edges(block) for block in code_blocks), key=len)

    lines = generated_text.splitlines()
    code_started = False
    code_lines = []
    for line in lines:
        if not code_started and line.lstrip().startswith(('def ', 'class ', 'import ', 'from ')):
            code_started = True
        if code_started:
            code_lines.append(line)
    if code_lines:
        return "\n".join(code_lines).strip()

    return _trim_blank_edges(generated_text)


def strip_function_def(prompt: str, generated_code: str) -> str:
    """
    Removes a repeated function signature and any unindented lines so that only
    the indented function body remains.

    The function to complete is taken to be the LAST ``def`` in the prompt
    (prompts may define helper functions before the one being completed).
    Its signature line is matched as a whole identifier, so a nested helper
    whose name merely starts with the same characters (e.g. ``sort_key``
    inside ``sort``) is kept as part of the body.

    Args:
        prompt: The task prompt containing the signature to complete.
        generated_code: Code produced by the model, already extracted from
            any surrounding prose.

    Returns:
        str: The indented lines of ``generated_code`` joined by newlines, with
        the repeated signature line removed. If no indented line is found,
        ``generated_code`` is returned unchanged.
    """
    # 1. Identify the function being completed: the last def in the prompt.
    func_names = re.findall(r"def\s+(\w+)\s*\(", prompt)
    signature_re = (
        re.compile(rf"\bdef\s+{re.escape(func_names[-1])}\s*\(") if func_names else None
    )

    cleaned_lines = []
    for line in generated_code.splitlines():
        # 2. Drop the repeated signature line itself, even when it is indented.
        if signature_re and signature_re.search(line):
            continue

        # 3. Keep only indented lines; unindented ones (imports, prose, other
        #    top-level statements) cannot be appended after the prompt.
        if line.startswith((" ", "\t")):
            cleaned_lines.append(line)

    # Fallback: nothing looked like a body, so hand back the input untouched.
    if not cleaned_lines:
        return generated_code

    return "\n".join(cleaned_lines)


## BASIC CODE GENERATION

In [24]:
import time
import requests
import tqdm

def generate_one_completion_basic(prompt: str, model: str) -> str:
    """
    Uses local Ollama to generate a HumanEval-compatible completion.

    The instruction text and the task prompt are sent as a single prompt to the
    local /api/generate endpoint. The response is passed through
    extract_clean_code and strip_function_def, and the intermediate stages are
    printed for inspection.

    Args:
        prompt: The HumanEval task prompt (signature plus docstring).
        model: Name of the Ollama model to query.

    Returns:
        str: The result of strip_function_def on the extracted code (intended
        to be the indented function body). On any error the placeholder body
        "    pass" is returned, after printing the error, so that a batch run
        keeps going.
    """

    url = "http://127.0.0.1:11434/api/generate"

    system_msg = (
        "You are a Python coding assistant.\n"
        "Complete the function below.\n"
        "Rules:\n"
        "- Output ONLY Python code\n"
        "- Do NOT repeat the function signature\n"
        "- Do NOT add comments or explanations\n"
    )

    payload = {
        "model": model,
        "prompt": system_msg + "\n\n" + prompt,
        "stream": False,
        # Ollama reads sampling parameters from the "options" object; sent as
        # top-level fields they are not applied to the request.
        "options": {
            "temperature": 0.1,
            "num_predict": 400,
        },
    }

    try:
        resp = requests.post(url, json=payload, timeout=300)
        resp.raise_for_status()
        # Do not strip here: a body-only answer starts with indentation that
        # the downstream cleaning steps need to see.
        raw = resp.json().get("response", "")

        if not raw.strip():
            raise RuntimeError("Empty response from Ollama")

        cleaned = extract_clean_code(raw)
        print ("----------------------------------")
        print (f"Raw generated code:\n{raw}\n")
        print("-------------------------------------")
        print (f"Cleaned generated code:\n{cleaned}\n")
        print ("----------------------------------")

        completion = strip_function_def(prompt, cleaned)
        print (f"Final completion after stripping function def:\n{completion}\n")
        return completion

    except Exception as e:
        # Deliberate best-effort: one failed request must not abort the run.
        print("[Generation error]", e)
        return "    pass"


def generate_pass_k_samples(
    problems,
    model_name: str,
    k: int
):
    """
    Generates k independent samples per task (pass@k).

    Args:
        problems: Mapping from task id to a problem record that has a "prompt"
            entry.
        model_name: Model name forwarded to generate_one_completion_basic.
        k: Number of completions to request for every task.

    Returns:
        list: One {"task_id": ..., "completion": ...} dict per generated
        sample, grouped by task in iteration order of ``problems``.
    """
    # Import the progress-bar callable locally: elsewhere in the notebook the
    # global name `tqdm` is rebound to the module by `import tqdm`, and calling
    # the module raises TypeError.
    from tqdm import tqdm

    samples = []

    for task_id in tqdm(problems, desc=f"Generating pass@{k} samples"):
        prompt = problems[task_id]["prompt"]

        for _ in range(k):
            completion = generate_one_completion_basic(
                prompt,
                model_name
            )

            samples.append({
                "task_id": task_id,
                "completion": completion
            })

    return samples



In [25]:
from tqdm import tqdm

# ------------------------------
# Settings
# ------------------------------

model_name = "codellama:7b"
pass_k = 1
NUM_PROBLEMS = 3   # change this to control how many problems are run

# ------------------------------
# Take the first NUM_PROBLEMS tasks, in dataset order
# ------------------------------

problem_ids = list(problems)[:NUM_PROBLEMS]
limited_problems = dict((task_id, problems[task_id]) for task_id in problem_ids)

# ------------------------------
# Generate samples for the reduced problem set
# ------------------------------

samples = generate_pass_k_samples(limited_problems, model_name, pass_k)

print("\nFinished running limited problems.")


Generating pass@1 samples:  33%|███▎      | 1/3 [00:01<00:03,  1.75s/it]

----------------------------------
Raw generated code:
```
def has_close_elements(numbers: List[float], threshold: float) -> bool:
    for i in range(len(numbers) - 1):
        for j in range(i + 1, len(numbers)):
            if abs(numbers[i] - numbers[j]) < threshold:
                return True
    return False
```

-------------------------------------
Cleaned generated code:
def has_close_elements(numbers: List[float], threshold: float) -> bool:
    for i in range(len(numbers) - 1):
        for j in range(i + 1, len(numbers)):
            if abs(numbers[i] - numbers[j]) < threshold:
                return True
    return False

----------------------------------
Final completion after stripping function def:
    for i in range(len(numbers) - 1):
        for j in range(i + 1, len(numbers)):
            if abs(numbers[i] - numbers[j]) < threshold:
                return True
    return False



Generating pass@1 samples: 100%|██████████| 3/3 [00:04<00:00,  1.47s/it]

----------------------------------
Raw generated code:
```
def separate_paren_groups(paren_string):
    stack = []
    groups = []
    for char in paren_string:
        if char == "(":
            stack.append(char)
        elif char == ")":
            top = stack.pop()
            while top != "(":
                groups[-1] += top
                top = stack.pop()
    for group in stack:
        groups.append(group[1:-1])
    return groups
```

-------------------------------------
Cleaned generated code:
def separate_paren_groups(paren_string):
    stack = []
    groups = []
    for char in paren_string:
        if char == "(":
            stack.append(char)
        elif char == ")":
            top = stack.pop()
            while top != "(":
                groups[-1] += top
                top = stack.pop()
    for group in stack:
        groups.append(group[1:-1])
    return groups

----------------------------------
Final completion after stripping function def:
    stack = []





In [18]:
from tqdm import tqdm

model_name = "codellama:7b"
pass_k = 1   # samples per task; change this to 1, 2, 5, 10, etc.

# Where the generated samples are written; the file name records the pass@k setting.
output_dir = "BASIC CODE RESULTS"
output_file = os.path.join(output_dir, f"samples_BASIC_codellama_7b_pass@{pass_k}.jsonl")
os.makedirs(output_dir, exist_ok=True)

# Query the model for every HumanEval problem, then persist the samples as JSONL.
samples = generate_pass_k_samples(problems, model_name, pass_k)
write_jsonl(output_file, samples)

print(f"Generated samples saved to {output_file}")

Generating pass@1 samples: 100%|██████████| 164/164 [04:23<00:00,  1.61s/it]

Generated samples saved to BASIC CODE RESULTS/samples_BASIC_codellama_7b_pass@1.jsonl





In [19]:
# Shell escape: run the evaluate_functional_correctness command on the saved samples file.
# IPython substitutes {pass_k} with the current value of the notebook variable before the
# command runs; the command writes its results next to the input as "<input>_results.jsonl".
! evaluate_functional_correctness "BASIC CODE RESULTS/samples_BASIC_codellama_7b_pass@{pass_k}.jsonl"

Reading samples...
164it [00:00, 27188.37it/s]
Running test suites...
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 164/164 [00:04<00:00, 37.21it/s]
Writing results to BASIC CODE RESULTS/samples_BASIC_codellama_7b_pass@1.jsonl_results.jsonl...
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 164/164 [00:00<00:00, 41083.79it/s]
{'pass@1': 0.3170731707317073}


In [None]:
def get_passed_results(file_path):
    """
    Reads a JSONL results file and extracts all entries where 'passed' is truthy.

    Entries without a 'passed' key are treated as not passed. Blank or
    whitespace-only lines are skipped instead of being handed to the JSON
    decoder, so a stray empty line does not abort the whole read.

    Args:
        file_path: Path of the UTF-8 JSONL file to read.

    Returns:
        list: The decoded passing entries, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    passed_results = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # json.loads("") raises; an empty line carries no record.
                continue
            data = json.loads(line)
            if data.get("passed", False):
                passed_results.append(data)
    return passed_results


def print_passed_results(file_path, limit=5):
    """
    Prints passed results (default: first 5).

    Args:
        file_path: Path of the JSONL results file, forwarded to
            get_passed_results.
        limit: Maximum number of entries to print; the total count is always
            printed first.
    """
    passed = get_passed_results(file_path)
    print(f"\nTotal Passed Cases: {len(passed)}")

    for item in passed[:limit]:
        print("\n" + "=" * 80)
        print(f"Task ID: {item['task_id']}")
        # Emit the label and the code as one string: passing them as two
        # print() arguments inserts a separator space in front of the first
        # code line, which makes its indentation look different from the rest.
        print(f"Completion:\n{item['completion']}")
        print("=" * 80)


def print_failed_results(file_path, limit=5):
    """
    Prints failed results (default: first 5).

    Args:
        file_path: Path of the JSONL results file, forwarded to
            get_failed_results.
        limit: Maximum number of entries to print; the total count is always
            printed first.
    """
    failed = get_failed_results(file_path)
    print(f"\nTotal Failed Cases: {len(failed)}")

    for item in failed[:limit]:
        print("\n" + "=" * 80)
        print(f"Task ID: {item['task_id']}")
        # Emit the label and the code as one string: passing them as two
        # print() arguments inserts a separator space in front of the first
        # code line, which makes its indentation look different from the rest.
        print(f"Completion:\n{item['completion']}")
        print("=" * 80)


In [None]:
# Per-sample results file for the pass@1 run.
results_file = "BASIC CODE RESULTS/samples_BASIC_codellama_7b_pass@1.jsonl_results.jsonl"

# Show a handful of entries from each outcome: passing first, then failing.
for report in (print_passed_results, print_failed_results):
    report(results_file, limit=5)



Task ID: HumanEval/1
Completion:
     result = []
    stack = []
    for char in paren_string:
        if char == "(":
            stack.append(char)
        elif char == ")":
            top = stack.pop()
            while top != "(":
                result.append(top)
                top = stack.pop()
    return result

Task ID: HumanEval/3
Completion:
      balance = 0
     for operation in operations:
         balance += operation
         if balance < 0:
             return True
     return False

Task ID: HumanEval/4
Completion:
     mean_val = mean(numbers)
    return sum(map(lambda x: abs(x - mean_val), numbers)) / len(numbers)

Task ID: HumanEval/5
Completion:
     return [delimeter if i < len(numbers) - 1 else n for i, n in enumerate(numbers)]

Task ID: HumanEval/6
Completion:
     result = []
    for group in paren_string.split():
        level = 0
        for char in group:
            if char == '(':
                level += 1
            elif char == ')':
               