# LLM Agent for Software Engineering

In [None]:
import locale

# Work around the weird Colab locale error by forcing UTF-8. The replacement
# keeps the optional `do_setlocale` argument of the real function so callers
# that pass it (e.g. `locale.getpreferredencoding(False)`) do not hit a TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"
# if you are still running into issues, please restart the runtime to initialize a new environment

In [None]:
# https://github.com/evalplus/evalplus
# Install EvalPlus, pinned to 0.2.0; later cells run its `evalplus.evaluate` CLI.
!pip install evalplus==0.2.0

In [None]:
# Download the buggy HumanEval tasks (JSON Lines) into the current working directory.
!wget https://raw.githubusercontent.com/uiuc-cs598lmz-s25/hw5/main/buggy_humaneval.jsonl

In [None]:
import json

def grab_buggy_dataset(path: str = "buggy_humaneval.jsonl") -> list[dict]:
    """Load the buggy HumanEval tasks from a JSON Lines file.

    Args:
        path: Location of the ``.jsonl`` file. Defaults to the file name used
            by the download cell.

    Returns:
        One decoded JSON object per non-blank line, in file order.
    """
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads rejects.
        inference_dataset = [json.loads(line) for line in f if line.strip()]
    print("Number of tasks: {}".format(len(inference_dataset)))
    return inference_dataset

# Load the dataset once; later cells index into this list of task dicts.
buggy_humaneval = grab_buggy_dataset()
# feel free to play around the dataset for a bit

In [None]:
### Basic Tool Result Class ###
# Here we define a basic tool result class
import os
from abc import ABC, ABCMeta, abstractmethod
from dataclasses import dataclass
from typing import Any

@dataclass(kw_only=True, frozen=True)
class ToolResult:
    """Represents the result of a tool execution."""
    output: str
    data: dict[str, Any] = None

    def __repr__(self):
        return self.output


class ToolError(ToolResult):
    """Raised when a tool encounters an error."""

    def __repr__(self):
        return f"Error: {self.output}"


### Gemini Model Utilities

In [None]:
from google import genai
from google.genai import types
# Read the API key from the GEMINI_API_KEY environment variable so the secret
# does not have to be saved in the notebook; the placeholder stays as the
# fallback for the original edit-in-place workflow.
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY", "YOUR_API_KEY"))

In [None]:
import signal
import time

# Set the blocking threshold to BLOCK_NONE for each listed harm category
# (google is afraid of pretty much everything i think).
safety_settings = [
    {"category": harm_category, "threshold": "BLOCK_NONE"}
    for harm_category in (
        "HARM_CATEGORY_CIVIC_INTEGRITY",
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]


def create_gemini_config(
    max_tokens: int,
    temperature: float = 1,
    batch_size: int = 1,
    tools: list = None,
):
    """Build the `GenerateContentConfig` used for every Gemini request.

    Args:
        max_tokens: Passed as `max_output_tokens`.
        temperature: Sampling temperature.
        batch_size: Passed as `candidate_count`.
        tools: Optional tool declarations forwarded unchanged.

    Returns:
        The config, which always carries the module-level `safety_settings`.
    """
    return types.GenerateContentConfig(
        tools=tools,
        safety_settings=safety_settings,
        temperature=temperature,
        max_output_tokens=max_tokens,
        candidate_count=batch_size,
    )


def handler(signum, frame):
    # swallow signum and frame
    raise Exception("end of time")


def request_gemini_engine(chat_client, message, config):
    ret = None
    count = 0
    while ret is None:
        try:
            signal.signal(signal.SIGALRM, handler)
            signal.alarm(100)
            ret = chat_client.send_message(message, config=config)
            s = ret.candidates  # check if response can be accessed.
            signal.alarm(0)
        except Exception as e:
            # NOTE this exception handling is needed since sometimes gemini will
            # refuse to answer due to safety reason (even if all blockers are set off)
            # instead we just simply catch this and then retry the response.
            # don't be alarmed if certain inputs take a long time to finish
            # eventually it should return a response.
            print("Error", type(e), e)
            
            ret = None  # reset
            signal.alarm(0)
            time.sleep(20)
    return ret

## Localization

In [None]:

# Function declaration for the single tool exposed to the localization agent:
# the model reports its answer by calling `print_buggy_line(line_number=...)`.
print_buggy_location_tool_definition = dict(
    name="print_buggy_line",
    description="A tool that prints the buggy line in a given code snippet.",
    parameters=dict(
        type="object",
        properties=dict(
            line_number=dict(
                description="The line number of the buggy line in the code snippet (1-indexed).",
                type="integer",
            ),
        ),
        required=["line_number"],
    ),
)



In [None]:
# In this assignment we implement a simple localization agent to locate the buggy line and report the localization accuracy

def run_localization_agent(buggy_code, client) -> int | None:
    """Run the localization agent to find the buggy line.

    Args:
        buggy_code: Source text of the buggy code snippet.
        client: Gemini client used to open a fresh chat session.

    Returns:
        The predicted buggy line as a 0-indexed integer (the first line of the
        code is line 0, consistent with the `buggy_line` field of
        `buggy_humaneval.jsonl`), or None when the response does not contain a
        valid `print_buggy_line` tool call.
    """
    chat = client.chats.create(model="models/gemini-2.0-flash")

    # Prefix every line with its 1-indexed number so the model can refer to it.
    code_lines = buggy_code.splitlines()
    numbered_code = "\n".join(
        f"{number}: {line}" for number, line in enumerate(code_lines, start=1)
    )
    user_message = (
        "The following Python code contains a bug on a single line. "
        "Each line is prefixed with its 1-indexed line number.\n"
        "Identify the buggy line and report it by calling the `print_buggy_line` tool "
        "with that line number.\n\n"
        f"```python\n{numbered_code}\n```"
    )
    config = create_gemini_config(
        max_tokens=512,
        temperature=1.0,
        batch_size=1,
        # we pass the tools through the generation config
        tools=[types.Tool(functionDeclarations=[print_buggy_location_tool_definition])],
    )
    response = request_gemini_engine(chat, user_message, config)

    if not response.function_calls:
        return None
    line_number = None
    for function_call in response.function_calls:
        if function_call.name != print_buggy_location_tool_definition["name"]:
            continue
        func_args = function_call.args or {}
        try:
            reported_line = int(func_args.get("line_number"))
        except (TypeError, ValueError):
            # Missing or non-numeric argument: not a valid tool call.
            continue
        if not 1 <= reported_line <= len(code_lines):
            continue
        # The tool speaks 1-indexed line numbers; the dataset is 0-indexed.
        line_number = reported_line - 1
        break

    return line_number
    


In [None]:
### Test the Localization Agent on a single task ###
# Run the agent on the first task only and compare with the ground truth.
sample_bug = buggy_humaneval[0]
pred_line_number = run_localization_agent(
    client=client,
    buggy_code=sample_bug['buggy_code'],
)
print("pred line number = ", pred_line_number)
print("gt line number = ", sample_bug['buggy_line'])

In [None]:
from tqdm import tqdm

def gemini_fault_localization(client, bug_dataset, workdir) -> tuple[float, float]:
    """Run the localization agent on every task and score its predictions.

    Each prediction is appended as one JSON line to
    `<workdir>/buggy_line_predictions.jsonl`, which is truncated at the start.

    Args:
        client: Gemini client forwarded to `run_localization_agent`.
        bug_dataset: Iterable of task dicts with `task_id`, `buggy_code` and
            `buggy_line` keys.
        workdir: Output directory, created if missing.

    Returns:
        `(accuracy, complete_rate)`: the fraction of tasks whose prediction
        equals the ground truth, and the fraction with any prediction at all.
        Both are 0.0 for an empty dataset.
    """
    total = 0
    completed = 0
    correct = 0

    os.makedirs(workdir, exist_ok=True)
    predictions_path = os.path.join(workdir, "buggy_line_predictions.jsonl")
    with open(predictions_path, "w", encoding="utf-8") as f:
        for bug in tqdm(bug_dataset):
            gt_line_number = bug['buggy_line']
            pred_line_number = run_localization_agent(
                buggy_code=bug['buggy_code'],
                client=client
            )
            f.write(json.dumps({
                "task_id": bug['task_id'],
                "gt_line_number": gt_line_number,
                "pred_line_number": pred_line_number,
            }) + "\n")
            # Flush per task so partial results survive an interrupted run.
            f.flush()

            total += 1
            if pred_line_number is not None:
                completed += 1
                if pred_line_number == gt_line_number:
                    correct += 1

    # Avoid ZeroDivisionError when there is nothing to evaluate.
    if total == 0:
        return 0.0, 0.0
    return correct / total, completed / total

# accuracy of gemini
# Evaluates the whole dataset; per-task predictions are written under "gemini_localization".
gemini_acc_score, gemini_complete_rate = gemini_fault_localization(client, buggy_humaneval, "gemini_localization")
# The `=` specifier prints each variable name together with its value.
print(f"{gemini_acc_score = }, {gemini_complete_rate = }")

## Repair

In [None]:
### Definition of the TextEditorTool

text_editor_tool_definition = {
    "name": "text_editor",
    "description": (
        "A tool that edits the code snippet and returns the full edited code. "
        "Two commands are supported. "
        "`str_replace` replaces `old_str` with `new_str`; `old_str` must match the code "
        "exactly (including whitespace) and must occur exactly once. "
        "`line_replace` replaces the whole line at `line_number` (1-indexed) with `new_str`; "
        "`new_str` must include the leading indentation of the line."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "command": {
                "description": "The command to execute. Allowed options are: `str_replace`, `line_replace`",
                "type": "string",
                # For command we can use enum to restrict the command to a set of valid options
                "enum": ["str_replace", "line_replace"],
            },
            "line_number": {
                "description": "Required for `line_replace`: the 1-indexed number of the line to replace.",
                "type": "integer",
            },
            "old_str": {
                "description": "Required for `str_replace`: the exact text to replace. It must occur exactly once in the code.",
                "type": "string",
            },
            "new_str": {
                "description": "The replacement text: the new substring for `str_replace`, or the full new line for `line_replace`.",
                "type": "string",
            },
        },
        "required": ["command", "new_str"],
    },
}


def run_text_edit_command(code, command: str, line_number: int = None, old_str: str = None, new_str: str = None) -> str:
    """Run the specified command on the code, and return the edited code.

    Args:
        code: The source text to edit.
        command: Either `str_replace` or `line_replace`.
        line_number: 1-indexed line to replace (`line_replace` only).
        old_str: Text to replace; must occur exactly once (`str_replace` only).
        new_str: Replacement text for either command.

    Returns:
        The edited code as a new string; `code` itself is not modified.

    Raises:
        ValueError: On an unknown command, a missing required parameter, an
            `old_str` that is empty, absent or ambiguous, or a `line_number`
            outside the code.
    """
    if command == "str_replace":
        if old_str is None or new_str is None:
            raise ValueError("`str_replace` requires both `old_str` and `new_str`.")
        if old_str == "":
            raise ValueError("`old_str` must not be empty.")
        occurrences = code.count(old_str)
        if occurrences == 0:
            raise ValueError("`old_str` was not found in the code.")
        if occurrences > 1:
            # Refuse ambiguous edits rather than guessing which match was meant.
            raise ValueError(f"`old_str` occurs {occurrences} times in the code; it must occur exactly once.")
        return code.replace(old_str, new_str, 1)
    elif command == "line_replace":
        if line_number is None or new_str is None:
            raise ValueError("`line_replace` requires both `line_number` and `new_str`.")
        try:
            index = int(line_number) - 1
        except (TypeError, ValueError):
            raise ValueError("`line_number` must be an integer.") from None
        # split("\n") keeps a trailing empty element, so a final newline in
        # the input survives the join below.
        lines = code.split("\n")
        if not 0 <= index < len(lines):
            raise ValueError(f"`line_number` must be between 1 and {len(lines)}.")
        lines[index] = new_str
        return "\n".join(lines)
    else:
        raise ValueError("Invalid command. Allowed options are: `str_replace`, `line_replace`.")

def call_text_editor(code, command: str, line_number: int = None, old_str: str = None, new_str: str = None) -> ToolResult:
    """Apply a text-editor command to `code` and wrap the outcome.

    Returns a `ToolResult` whose `data["new_code"]` holds the edited code, or
    a `ToolError` carrying the message of any exception raised along the way.
    """
    try:
        edited = run_text_edit_command(code, command, line_number, old_str, new_str)
        message = f"The code has been successfully edited. Below is the new code:\n```python\n{edited}\n```"
        # expose the edited code to the caller through the data field
        return ToolResult(output=message, data=dict(new_code=edited))
    except Exception as exc:
        return ToolError(output=f"Error: {exc!s}")



In [None]:
def run_repair_agent(buggy_code: str, client: genai.Client) -> str:
    """Run the repair agent to fix the buggy code.

    Sends the code to the model once, then applies every `text_editor` call in
    the response in order, each on top of the previous successful edit.

    Args:
        buggy_code: Source text of the buggy code snippet.
        client: Gemini client used to open a fresh chat session.

    Returns:
        The edited code, or `buggy_code` unchanged when the model makes no
        tool call or every edit fails.
    """
    chat = client.chats.create(model="models/gemini-2.0-flash")

    user_message = (
        "Please help me fix the bug in the following code snippet with the `text_editor` tool.\n\n"
        "* The code does not have line numbers, so prefer the `str_replace` command.\n"
        "* Change as little as possible and keep the function signature unchanged.\n\n"
        f"```python\n{buggy_code}\n```"
    )

    config = create_gemini_config(
        max_tokens=512,
        temperature=1.0,
        batch_size=1,
        tools=[types.Tool(functionDeclarations=[text_editor_tool_definition])],
    )
    response = request_gemini_engine(chat, user_message, config)

    # Return the original buggy code if no function calls are made
    if not response.function_calls:
        return buggy_code

    fixed_code = buggy_code
    for function_call in response.function_calls:
        func_name = function_call.name
        func_args = function_call.args or {}

        if func_name != "text_editor":
            continue
        tool_result = call_text_editor(
            fixed_code,
            command=func_args.get("command"),
            line_number=func_args.get("line_number"),
            old_str=func_args.get("old_str"),
            new_str=func_args.get("new_str"),
        )
        if isinstance(tool_result, ToolError):
            # A failed edit leaves the code as it was.
            continue
        new_code = (tool_result.data or {}).get("new_code")
        # Only accept real text so the caller can always write the result to disk.
        if isinstance(new_code, str):
            fixed_code = new_code

    return fixed_code

In [None]:
#### Test the Repair Agent on a single task ###
import difflib
def get_diff(old_code: str, new_code: str) -> str:
    """Return a unified diff from `old_code` to `new_code` as one string.

    The two sides are labelled "old" and "new"; the result is empty when the
    snippets have the same lines.
    """
    before = old_code.splitlines()
    after = new_code.splitlines()
    diff_lines = difflib.unified_diff(
        before,
        after,
        fromfile="old",
        tofile="new",
        lineterm="",
    )
    return "\n".join(diff_lines)

# Repair the first task only and show both the result and what changed.
first_bug_code = buggy_humaneval[0]['buggy_code']
fixed_code = run_repair_agent(client=client, buggy_code=first_bug_code)
print("========= Fixed Code =========")
print(fixed_code)
print("========= DIFF =========")
print(get_diff(first_bug_code, fixed_code))


In [None]:
from tqdm import tqdm

def gemini_repair(client, bug_dataset, workdir) -> None:
    """Repair every task and save each result as `<workdir>/<task>/0.py`.

    Args:
        client: Gemini client forwarded to `run_repair_agent`.
        bug_dataset: Iterable of task dicts with `task_id` and `buggy_code` keys.
        workdir: Output directory; one sub-directory is created per task, named
            after the task id with "/" replaced by "_".
    """
    for bug in tqdm(bug_dataset):
        # run the repair agent
        fixed_code = run_repair_agent(
            buggy_code=bug['buggy_code'],
            client=client
        )

        name = bug["task_id"].replace("/", "_")
        task_dir = os.path.join(workdir, name)
        os.makedirs(task_dir, exist_ok=True)
        # Explicit UTF-8: model-generated code may contain non-ASCII characters.
        with open(os.path.join(task_dir, '0.py'), 'w', encoding="utf-8") as f:
            f.write(fixed_code)

# Repair the whole dataset; samples are written under ./gemini_repair.
gemini_repair(client, buggy_humaneval, "gemini_repair")


In [None]:
# Score the samples in ./gemini_repair with EvalPlus on HumanEval; `yes Y` answers any prompt with "Y".
!yes Y | evalplus.evaluate --dataset humaneval --samples gemini_repair --i-just-wanna-run

## Repair with test execution feedback

In [None]:
from evalplus.data import get_human_eval_plus
# Load the HumanEval+ tasks; later cells index the result by task id and read
# each task's "prompt" and "canonical_solution" entries.
human_eval_plus = get_human_eval_plus()

In [None]:
### Definition of the RunTestTool
# Function declaration for the `run_test` tool: the model supplies one call
# expression as a string, which is evaluated against the current code.
run_test_tool_definition = {
    "name": "run_test",
    # .strip() removes the trailing newline left by the triple-quoted literal.
    "description": """A tool that executes a specified function with given test inputs and returns the execution result. 
This tool is useful for validating the correctness of a function by running it with specific inputs and observing the output.
""".strip(),
    "parameters": {
        "type": "object",
        "properties": {
            "function_invocation": {
                "description": "A function invocation string. For example: `add(1, 2)`.",
                "type": "string",
            },
        },
        # The invocation string is the tool's only argument and is mandatory.
        "required": ["function_invocation"],
    },
}

def _execute(function_definition_code: str, function_invocation: str):
    """Execute the function with the given code and function invocation.

    Args:
        function_definition_code: Python source defining the function(s).
        function_invocation: A call expression, e.g. `add(1, 2)`.

    Returns:
        The value the invocation evaluates to.

    Raises:
        Exception: Whatever the definitions or the invocation raise.
    """
    # NOTE(security): exec/eval run the given code with the full privileges of
    # this process. Both inputs may be model-generated, so only run this inside
    # a disposable environment.
    #
    # A single namespace is used on purpose. With separate globals and locals
    # dicts, top-level names (imports, helper functions, the function itself)
    # are stored in the locals dict, while function bodies resolve names through
    # the globals dict, so recursion and helper calls fail with NameError.
    namespace = {}
    exec(function_definition_code, namespace)
    return eval(function_invocation, namespace)

def call_run_test(code: str, function_invocation: str, canonical_solution: str = None) -> ToolResult:
    """Evaluate `function_invocation` against `code` and report the outcome.

    When `canonical_solution` is given, the invocation is first run against it:
    a failure there is reported as an invalid test input, otherwise its result
    is appended to the output as the expected value.

    Returns a `ToolResult` with `data["result"]` set to the value produced by
    `code`, or a `ToolError` when either execution raises.
    """
    expected_suffix = ""
    if canonical_solution:
        try:
            reference = _execute(canonical_solution, function_invocation)
            expected_suffix = f"\n\nExpected output:\n```\n{reference}\n```"
        except Exception as exc:
            return ToolError(output=f"Error: The test input is not valid.\n{exc!s}")

    try:
        actual = _execute(code, function_invocation)
        report = f"Execution output:\n```\n{actual}\n```" + expected_suffix
        return ToolResult(output=report, data=dict(result=actual))
    except Exception as exc:
        return ToolError(output=f"Error: {exc!s}")


In [None]:
# Utility function to dump message to JSON
from json import JSONEncoder


class MessageEncoder(JSONEncoder):
    """JSON encoder that serializes arbitrary message objects by their attributes."""

    def default(self, o):
        """Return a JSON-serializable stand-in for `o`.

        Bytes become base64 text, objects with a `__dict__` become that dict,
        and anything else is delegated to the base class, which raises the
        standard `TypeError`.
        """
        if isinstance(o, (bytes, bytearray)):
            # Local import keeps this block self-contained.
            import base64

            return base64.b64encode(bytes(o)).decode("ascii")
        try:
            return vars(o)
        except TypeError:
            # No __dict__ (e.g. slotted objects): let JSONEncoder report it.
            return super().default(o)

In [None]:

def run_repair_with_execution_feedback(client, buggy_code, canonical_solution: str=None) -> tuple[str, list]:
    """Repair `buggy_code` in a multi-turn loop with editing and test tools.

    The model may call `text_editor` to modify the code and `run_test` to
    execute it; each tool result is sent back until the model stops calling
    tools or the iteration cap is reached.

    Args:
        client: Gemini client used to open a fresh chat session.
        buggy_code: Source text of the buggy code snippet.
        canonical_solution: Optional reference implementation forwarded to
            `call_run_test`.

    Returns:
        `(current_code, history)`: the code after the last successful edit and
        the chat history returned by `chat.get_history()`.
    """
    chat = client.chats.create(model="models/gemini-2.0-flash")
    # Below is an example of how to construct the initial user message
    # You can modify it to suit your needs
    prompt_template = """
Please help me fix bugs in the following code snippet with the `text_editor` tool.

After fixing the code, you can come up with your own test inputs and test the code with the `run_test` tool.

* If the code does not have line numbers, please use the `str_replace` command and avoid using the `line_replace` command.
* You can test at most 3 times.

```python
{buggy_code}
```
""".strip()
    user_message = prompt_template.format(buggy_code=buggy_code)

    config = create_gemini_config(
        max_tokens=512,
        temperature=1.0,
        batch_size=1,
        tools=[
            types.Tool(functionDeclarations=[text_editor_tool_definition, run_test_tool_definition]),
        ],
    )
    response = request_gemini_engine(chat, user_message, config)

    current_code = buggy_code

    # Set a maximum number of iterations to prevent infinite loops
    for _ in range(20):
        # If the model does not return any function calls, we can assume that it has finished
        # and we can break the loop
        if not response.function_calls:
            break
        # Now we need to process the function calls
        tool_results = []
        for function_call in response.function_calls:
            func_name = function_call.name
            func_args = function_call.args or {}

            # Always run tools against `current_code`, so edits accumulate and
            # tests exercise the latest version.
            if func_name == "text_editor":
                tool_result = call_text_editor(
                    current_code,
                    command=func_args.get("command"),
                    line_number=func_args.get("line_number"),
                    old_str=func_args.get("old_str"),
                    new_str=func_args.get("new_str"),
                )
                if not isinstance(tool_result, ToolError):
                    new_code = (tool_result.data or {}).get("new_code")
                    # Only accept real text so the result can be written to disk.
                    if isinstance(new_code, str):
                        current_code = new_code
            elif func_name == "run_test":
                function_invocation = func_args.get("function_invocation")
                if not isinstance(function_invocation, str):
                    tool_result = ToolError(output="Error: `function_invocation` is required.")
                else:
                    tool_result = call_run_test(current_code, function_invocation, canonical_solution)
            else:
                tool_result = ToolError(output=f"Error: Unknown tool `{func_name}`.")

            # The response dict uses the "error" key for failures and "output" otherwise.
            payload_key = "error" if isinstance(tool_result, ToolError) else "output"
            tool_results.append({
                "response": {payload_key: tool_result.output},
                "name": func_name,
            })
        # Break the loop if no valid tool results are returned
        if not tool_results:
            break
        # Build the tool responses.
        # We need to construct a FunctionResponse object for each tool result.
        # Check documentation: https://googleapis.github.io/python-genai/genai.html#genai.types.FunctionResponse
        # The response field should be a dictionary, use "output" key to specify function output and "error" key to specify error details (if any).
        tool_responses = [
            types.Part(
                function_response=types.FunctionResponse(
                    name=tool_result["name"], response=tool_result["response"])
            )
            for tool_result in tool_results
        ]
        # Send the response parts back to the model
        response = request_gemini_engine(chat, tool_responses, config)
    return current_code, chat.get_history()

In [None]:
### Example usage of the repair agent with execution feedback ###
# Repair the first task with execution feedback, using the HumanEval+
# prompt plus canonical solution as the reference implementation.
first_bug = buggy_humaneval[0]
task_id = first_bug['task_id']
reference_task = human_eval_plus[task_id]
canonical_solution = reference_task["prompt"] + reference_task["canonical_solution"]
buggy_code = first_bug['buggy_code']

fixed_code, messages = run_repair_with_execution_feedback(
    client=client,
    buggy_code=buggy_code,
    canonical_solution=canonical_solution,
)
print("========= Fixed Code =========")
print(fixed_code)
print("========= DIFF =========")
print(get_diff(first_bug['buggy_code'], fixed_code))


In [None]:
from tqdm import tqdm

def gemini_repair(client, bug_dataset, workdir, restart=True) -> None:
    """Repair every task with execution feedback and save the results.

    For each task this writes `<workdir>/<task>/0.py` (the repaired code) and
    `<workdir>/<task>/traj.json` (the chat history).

    Args:
        client: Gemini client forwarded to the repair agent.
        bug_dataset: Iterable of task dicts with `task_id` and `buggy_code` keys.
        workdir: Output directory; one sub-directory is created per task, named
            after the task id with "/" replaced by "_".
        restart: When False, tasks that already have a `0.py` are skipped so an
            interrupted run can be resumed.
    """
    for bug in tqdm(bug_dataset):
        task_id = bug['task_id']
        name = task_id.replace("/", "_")
        task_dir = os.path.join(workdir, name)
        if not restart and os.path.exists(os.path.join(task_dir, '0.py')):
            continue
        reference_task = human_eval_plus[task_id]
        canonical_solution = reference_task["prompt"] + reference_task["canonical_solution"]

        # run the repair agent
        fixed_code, messages = run_repair_with_execution_feedback(
            client=client,
            buggy_code=bug['buggy_code'],
            canonical_solution=canonical_solution,
        )
        os.makedirs(task_dir, exist_ok=True)
        # Explicit UTF-8: model-generated text may contain non-ASCII characters.
        with open(os.path.join(task_dir, '0.py'), 'w', encoding="utf-8") as f:
            f.write(fixed_code)
        with open(os.path.join(task_dir, 'traj.json'), 'w', encoding="utf-8") as f:
            json.dump(messages, f, indent=2, cls=MessageEncoder)


# Repair the whole dataset with execution feedback; results go under
# ./gemini_repair_with_execution_feedback. `restart` keeps its default of True.
gemini_repair(client, buggy_humaneval, "gemini_repair_with_execution_feedback")


In [None]:
# Score the samples in ./gemini_repair_with_execution_feedback with EvalPlus on HumanEval.
!yes Y | evalplus.evaluate --dataset humaneval --samples gemini_repair_with_execution_feedback --i-just-wanna-run