In [8]:
from human_eval.data import read_problems, write_jsonl

problems = read_problems()
print("Number of HumanEval problems:", len(problems))

# Inspect one problem
example_task = "HumanEval/1"
print("Example task keys:", problems[example_task].keys())
print("\nPrompt:\n", problems[example_task]["prompt"])


Number of HumanEval problems: 164
Example task keys: dict_keys(['task_id', 'prompt', 'entry_point', 'canonical_solution', 'test'])

Prompt:
 from typing import List


def separate_paren_groups(paren_string: str) -> List[str]:
    """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
    separate those group into separate strings and return the list of those.
    Separate groups are balanced (each open brace is properly closed) and not nested within each other
    Ignore any spaces in the input string.
    >>> separate_paren_groups('( ) (( )) (( )( ))')
    ['()', '(())', '(()())']
    """



In [9]:
import re
import math
import time
import tempfile
import subprocess
import requests
from pathlib import Path
from typing import List, Dict
import sys
import json

from tqdm import tqdm


In [10]:
def get_failed_results(file_path):
    """
    Reads a JSONL results file and extracts all entries where 'passed' is False.
    """
    failed_results = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line.strip())
            if not data.get("passed", False):
                failed_results.append(data)
    return failed_results


def extract_clean_code(generated_text: str) -> str:
    """
    Extracts and cleans Python code from an AI-generated response.
    """
    code_blocks = re.findall(r"```(?:python)?\s*(.*?)\s*```", generated_text, re.DOTALL)
    if code_blocks:
        return max(code_blocks, key=len).strip()
    
    lines = generated_text.splitlines()
    code_started = False
    code_lines = []
    for line in lines:
        if not code_started and line.lstrip().startswith(('def ', 'class ', 'import ', 'from ')):
            code_started = True
        if code_started:
            code_lines.append(line)
    if code_lines:
        return "\n".join(code_lines).strip()
    
    return generated_text.strip()



In [None]:
import requests

OLLAMA_URL = "http://127.0.0.1:11434/api/generate"

def ollama_generate(prompt: str, model: str, temperature=0.1, num_predict=500) -> str:
    payload = {
        "model": model,
        "prompt": prompt,
        "temperature": temperature,
        "num_predict": num_predict,
        "stream": False
    }

    resp = requests.post(OLLAMA_URL, json=payload, timeout=None)
    resp.raise_for_status()

    return resp.json().get("response", "").strip()




def generate_SCoT(prompt: str, model: str) -> str:
    """
    One-shot SCoT generation (Figure 3 style)
    """

    system_msg = (
        "You are a Python problem-solving assistant.\n"
        "Generate a rough structured solving process in natural language.\n"
        "Do NOT write code.\n"
    )

    one_shot_example = """
        def first_Repeated_Char(str):
        \"\"\"
        Write a python function to find the first repeated
        character in a given string.
        \"\"\"
        Pass

        Please understand the requirement and write a rough solving
        process. It starts with a input-output structure. You
        should use three basic structures to build the solving
        process, including sequences, branches, and loops.
        The necessary details should be written in natural languages.

        Input: str: a string
        Output: ch: a repeated character in str
        1: for each character ch in str:
        2: if ch appears more than once in str:
        3: return ch
        4: return None
       """

    user_msg = f"""
            {one_shot_example}

            Now solve the following problem in the SAME way.

            {prompt}
            """

    full_prompt = system_msg + "\n" + user_msg

    return ollama_generate(
        prompt=full_prompt,
        model=model,
        temperature=0.1,
        num_predict=500
    )

def generate_one_completion_SCoT(prompt: str, model: str, print_scot=False) -> str:
    """
    One-shot code generation using SCoT (Figure 4 style)
    """

    scot = generate_SCoT(prompt, model)
    # print("=== SCoT ===")
    # print(scot)

    if print_scot:
        print("=== SCoT ===")
        print(scot)
        print()

    system_msg = (
        "You are a Python coding assistant.\n"
        "Write code based on the given solving process.\n"
        "Note that the solving process may contain errors.\n"
        "Output ONLY Python code.\n"
    )

    one_shot_code_example = """
        def first_Repeated_Char(str):
        \"\"\"
        Write a python function to find the first repeated
        character in a given string.
        Input: str: a string
        Output: ch: a repeated character in str
        1: for each character ch in str:
        2: if ch appears more than once in str:
        3: return ch
        4: return None

        Code for the above SCoT:
        \"\"\"
        
        h = {}
        for ch in str:
            if ch in h:
                return ch
            else:
                h[ch] = 0
        return None
        """

    user_msg = f"""
        {one_shot_code_example}

        Now generate code for the following .

        {prompt}

        Structured Solving Process:
        {scot}
        """

    full_prompt = system_msg + "\n" + user_msg

    raw = ollama_generate(
        prompt=full_prompt,
        model=model,
        temperature=0.1,
        num_predict=400
    )

    cleaned = extract_clean_code(raw)
    return cleaned


In [12]:
# ==============================

model_name = "codellama:7b"
pass_k = 1
NUM_PROBLEMS = 5   # ðŸ”¥ Change this to control number of problems

# ==============================
# SELECT LIMITED PROBLEMS
# ==============================

problem_ids = list(problems.keys())[:NUM_PROBLEMS]

# ==============================
# GENERATION LOOP
# ==============================

for task_id in tqdm(problem_ids, desc=f"SCoT pass@{pass_k} ({model_name})"):

    problem_prompt = problems[task_id]["prompt"]

    print("\n" + "=" * 80)
    print(f"TASK ID: {task_id}")

    output = generate_one_completion_SCoT(
        problem_prompt,
        model_name,
        print_scot=True
    )

    print("=" * 80)
    print("=== FINAL RETURNED CODE ===")
    print(output)
    print("=" * 80 + "\n")

SCoT pass@1 (codellama:7b):   0%|          | 0/5 [00:00<?, ?it/s]


TASK ID: HumanEval/0
=== SCoT ===
Input: numbers: a list of floats, threshold: a float
Output: bool

1. Initialize a variable "close" to False
2. Iterate over the list of numbers using a for loop
3. For each number in the list, calculate its distance from its neighbors (i.e., the previous and next numbers in the list) using the absolute difference between them. If any of these distances is less than or equal to the threshold, set "close" to True.
4. Return "close"



SCoT pass@1 (codellama:7b):  20%|â–ˆâ–ˆ        | 1/5 [00:01<00:07,  1.76s/it]

=== FINAL RETURNED CODE ===
def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """Check if in given list of numbers, are any two numbers closer to each other than given threshold."""
    close = False
    for i in range(1, len(numbers)):
        distance = abs(numbers[i] - numbers[i-1])
        if distance <= threshold:
            close = True
            break
    return close


TASK ID: HumanEval/1
=== SCoT ===
Here's a rough structured solving process for the problem:

1. Define the input and output of the function. The input is a string containing multiple groups of nested parentheses, and the output is a list of strings where each element in the list represents a separate group of balanced parentheses.
2. Write a loop that iterates through each character in the input string. For each character, check if it is an open parenthesis. If it is, increment a counter variable to keep track of the number of open parentheses.
3. Continue iterating through the chara

SCoT pass@1 (codellama:7b):  40%|â–ˆâ–ˆâ–ˆâ–ˆ      | 2/5 [00:09<00:15,  5.09s/it]

=== FINAL RETURNED CODE ===
def separate_paren_groups(paren_string):
    repeated_char = first_repeated_char(paren_string)
    if repeated_char is not None:
        return [repeated_char]
    else:
        return []


TASK ID: HumanEval/2
=== SCoT ===
Input: number: a positive floating point number
Output: decimals: the leftover part of the number, always smaller than 1

Solving Process:

1. Determine the integer part of the number by using math.floor(number). This will give us the largest integer smaller than the given number.
2. Subtract the integer part from the original number to get the decimals. For example, if the original number is 3.5 and we subtracted 3 (the integer part), the result would be 0.5 (the decimals).
3. Return the decimals.

Note: This solution assumes that the input number is a positive floating point number. If the input is not a positive number, an error will occur.



SCoT pass@1 (codellama:7b):  60%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ    | 3/5 [00:11<00:07,  3.89s/it]

=== FINAL RETURNED CODE ===
def truncate_number(number: float) -> float:
    """Given a positive floating point number, it can be decomposed into
    and integer part (largest integer smaller than given number) and decimals
    (leftover part always smaller than 1).

    Return the decimal part of the number.
    >>> truncate_number(3.5)
    0.5
    """

    # Determine the integer part of the number
    integer_part = math.floor(number)

    # Subtract the integer part from the original number to get the decimals
    decimals = number - integer_part

    return decimals


TASK ID: HumanEval/3
=== SCoT ===
Input: operations: a list of integers representing deposit and withdrawal operations on a bank account
Output: bool: whether the balance of the account falls below zero at any point during the operations.

1: Initialize a variable balance to 0.
2: for each operation in the list of operations:
3: if the operation is a deposit:
4: increment balance by the amount of the deposit.
5: else

SCoT pass@1 (codellama:7b):  80%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ  | 4/5 [00:13<00:03,  3.12s/it]

=== FINAL RETURNED CODE ===
def below_zero(operations):
    balance = 0
    for operation in operations:
        if operation > 0:
            balance += operation
        else:
            balance -= operation
        if balance < 0:
            return True
    return False


TASK ID: HumanEval/4
=== SCoT ===
The rough structured solving process for the problem of finding the first repeated character in a string can be broken down as follows:

1. Define the input and output structures:
	* Input: str (a string)
	* Output: ch (the first repeated character in str, or None if no repeated characters are found)
2. Begin with a sequence of steps to iterate over each character in the input string:
	* For each character ch in str:
3. Check if the current character appears more than once in the input string:
	* If ch appears more than once in str:
4. Return the first repeated character that is found:
	* Return ch
5. If no repeated characters are found, return None:
	* Return None

Note that thi

SCoT pass@1 (codellama:7b): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5/5 [00:17<00:00,  3.43s/it]

=== FINAL RETURNED CODE ===
from typing import List

def mean_absolute_deviation(numbers: List[float]) -> float:
    """
    For a given list of input numbers, calculate Mean Absolute Deviation
    around the mean of this dataset.
    Mean Absolute Deviation is the average absolute difference between each
    element and a centerpoint (mean in this case):
    MAD = average | x - x_mean |
    >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])
    1.0
    """

    # Calculate the mean of the input list
    mean = sum(numbers) / len(numbers)

    # Initialize a variable to store the sum of the absolute differences
    sum_of_abs_diffs = 0

    # Iterate over the input list
    for number in numbers:
        # Calculate the absolute difference between the current element and the mean
        abs_diff = abs(number - mean)
        # Add the absolute difference to the sum
        sum_of_abs_diffs += abs_diff

    # Divide the sum by the length of the input list to get the average absolute devi




In [15]:
from tqdm import tqdm
from human_eval.data import write_jsonl
import os

pass_k = 1  # ðŸ”¥ change this to 1, 2, 5, 10
model_name = "codellama:7b"

# ==============================
# CREATE SCOT RESULTS FOLDER
# ==============================

output_dir = "SCOT RESULTS"
os.makedirs(output_dir, exist_ok=True)

samples = []

for task_id in tqdm(problems, desc=f"Generating SCoT pass@{pass_k} ({model_name})"):
    prompt = problems[task_id]["prompt"]

    for _ in range(pass_k):
        samples.append({
            "task_id": task_id,
            "completion": generate_one_completion_SCoT(
                prompt,
                model_name
            )
        })

# ==============================
# SAVE FILE INSIDE FOLDER
# ==============================

output_file = os.path.join(
    output_dir,
    f"samples_codellama_SCoT_pass@{pass_k}.jsonl"
)

write_jsonl(output_file, samples)

print(f"Generated samples saved to {output_file}")

Generating SCoT pass@1 (codellama:7b): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 164/164 [21:08<00:00,  7.74s/it]   

Generated samples saved to SCOT RESULTS/samples_codellama_SCoT_pass@1.jsonl





In [None]:
! evaluate_functional_correctness "SCOT RESULTS/samples_codellama_SCoT_pass@1.jsonl"

Reading samples...
164it [00:00, 6518.33it/s]
Running test suites...
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 164/164 [00:01<00:00, 93.42it/s]
Writing results to SCOT RESULTS/samples_codellama_SCoT_pass@1.jsonl_results.jsonl...
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 164/164 [00:00<00:00, 17472.27it/s]
{'pass@1': 0.2804878048780488}


In [None]:
import json

def get_passed_results(file_path):
    """
    Reads a JSONL results file and extracts all entries where 'passed' is True.
    """
    passed_results = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line.strip())
            if data.get("passed", False):
                passed_results.append(data)
    return passed_results


def print_passed_results(file_path, limit=5):
    """
    Prints passed results (default: first 5).
    """
    passed = get_passed_results(file_path)
    print(f"\nTotal Passed Cases: {len(passed)}")

    for i, item in enumerate(passed[:limit]):
        print("\n" + "=" * 80)
        print(f"Task ID: {item['task_id']}")
        print("Completion:\n", item["completion"])
        print("=" * 80)


def print_failed_results(file_path, limit=5):
    """
    Prints failed results (default: first 5).
    """
    failed = get_failed_results(file_path)
    print(f"\nTotal Failed Cases: {len(failed)}")

    for i, item in enumerate(failed[:limit]):
        print("\n" + "=" * 80)
        print(f"Task ID: {item['task_id']}")
        print("Completion:\n", item["completion"])
        print("=" * 80)


In [2]:
results_file = "SCOT RESULTS/samples_codellama_SCoT_pass@1.jsonl"

print_passed_results(results_file, limit=5)
print_failed_results(results_file, limit=5)

NameError: name 'json' is not defined