In [23]:
# Create docker container for agent environment
import docker
import os 
import dotenv

# Load variables from a local .env file into the process environment.
dotenv.load_dotenv()

# Create a Docker client from the environment
client = docker.from_env()

# Working directory used for the container's main process (passed as working_dir below).
work_dir = "/git-agent"

# Start a long-lived container from the plain `ubuntu` base image.
# The base image is not assumed to ship git; it is installed in the next step.
container = client.containers.run(
    image="ubuntu:latest",              # Base image
    command="sleep infinity",           # Keep container running
    detach=True,
    tty=True,                           # Enables command execution like a shell
    stdin_open=True,
    working_dir=work_dir
)

# Install git inside the container.
# exec_run() is called without demux=True, so `output` is ONE bytes object with
# stdout and stderr interleaved -- it must not be unpacked into a (stdout, stderr) pair.
exit_code, output = container.exec_run("bash -c 'apt-get update && apt-get install -y git'")
if exit_code != 0:
    # Decode before tearing the container down so the real apt error is what gets reported.
    details = (output or b"").decode(errors="replace")
    container.stop()
    container.remove()
    raise Exception(f"Failed to install git: {details}")

print(f"Successfully installed git on container: {container.id}")


Successfully installed git on container: e190cfa29741486b0f76f7c8d1582da790117fca2a36ae5f7594fee8d83673c6


In [24]:
import pexpect
import re

# Open a long-lived interactive bash session in the container, driven through
# pexpect the same way a person would type into a terminal.
def spawn_handle():
    """Start `docker exec -it <id> bash` for the global `container` and return the pexpect handle.

    The container is addressed by the first 12 characters of its id. The call
    blocks until the first `$` or `#` character shows up in the session output,
    which is treated as the shell prompt.
    """
    short_id = container.id[:12]
    shell = pexpect.spawn(f"docker exec -it {short_id} bash", encoding='utf-8')
    shell.expect(r'[$#]')
    return shell

# Noise that an interactive shell mixes into captured command output.
# Entry 0 is removed from the text as a whole; every later entry is matched
# against individual stripped lines, and a matching line is discarded.
patterns = [
    re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])'),  # ANSI escape sequences
    re.compile(r'^[\x07]+$'),                               # lines consisting only of BEL
    re.compile(r'^(root@|[\w.-]+@[\w.-]+:).*'),             # shell prompt lines
    re.compile(r'^0;.*'),                                   # remainder of a terminal-title sequence
    re.compile(r'^\s*$'),                                   # blank / whitespace-only lines
]


def clean_shell_output(raw: str, command: str = "") -> str:
    """Reduce raw terminal capture to the text the command actually printed.

    Args:
        raw: Text captured from the shell session.
        command: The command that was sent. When `raw` begins with it, that
            echoed copy (and the whitespace following it) is cut off.

    Returns:
        The remaining lines, each stripped of surrounding whitespace and joined
        with "\\n". Lines matching any of `patterns[1:]` are left out.
    """
    text = raw
    if command and text.startswith(command):
        # The terminal echoes what was typed; drop that leading copy.
        text = text[len(command):].lstrip()

    # Escape sequences can sit anywhere, so strip them before splitting into lines.
    text = patterns[0].sub('', text)

    line_filters = patterns[1:]
    kept = [
        candidate
        for candidate in (piece.strip() for piece in text.strip().splitlines())
        if not any(noise.match(candidate) for noise in line_filters)
    ]
    return "\n".join(kept)

# Open the shell session once at module level; execute_command() sends its
# commands through this same `handle`.
handle = spawn_handle()

# Sanity check
# handle.sendline("touch sanity-test.txt")
# handle.expect(r'[$#]')

# handle.sendline("ls")
# handle.expect(r'[$#]')
# stdout = clean_shell_output(handle.before.strip(), command="ls")
# print(stdout)

# # git repo
# handle.sendline("mkdir -p sanity-repo2")
# handle.expect(r'[$#]')

# handle.sendline("cd sanity-repo2")
# handle.expect(r'[$#]')

# handle.sendline("git init")
# handle.expect(r'[$#]')
# stdout = clean_shell_output(handle.before.strip(), command="git init")
# print(stdout)

# handle.sendline("cd ../")
# handle.expect(r'[$#]')

# handle.sendline("pwd")
# handle.expect(r'[$#]')
# stdout = clean_shell_output(handle.before.strip(), command="pwd")
# print(stdout)


In [25]:
# Define tools
def _split_shell_commands(cmd: str):
    """Split a shell line into its simple commands, honouring quotes and escapes.

    Unquoted `&&`, `||`, `;`, `|`, `&` and line breaks separate commands.

    Returns:
        A list with the text of each simple command, or None when the line uses
        something that cannot be checked word-by-word: command substitution
        (backticks or `$(`), sub-shell / process-substitution parentheses, or
        an unterminated quote.
    """
    segments = []
    current = []
    quote = ""  # the quote character we are currently inside, if any
    i, n = 0, len(cmd)
    while i < n:
        ch = cmd[i]
        if quote == "'":
            # Inside single quotes every character is literal.
            if ch == "'":
                quote = ""
            current.append(ch)
            i += 1
        elif ch == "\\" and i + 1 < n:
            # Backslash escape: the next character loses any special meaning.
            current.append(cmd[i:i + 2])
            i += 2
        elif ch == "`" or cmd.startswith("$(", i):
            # Command substitution also runs inside double quotes.
            return None
        elif quote == '"':
            if ch == '"':
                quote = ""
            current.append(ch)
            i += 1
        elif ch in "'\"":
            quote = ch
            current.append(ch)
            i += 1
        elif ch in "()":
            return None
        elif ch in ";|&\n\r":
            segments.append("".join(current))
            current = []
            # `&&` and `||` are two-character operators.
            i += 2 if cmd[i:i + 2] in ("&&", "||") else 1
        else:
            current.append(ch)
            i += 1
    if quote:
        return None
    segments.append("".join(current))
    return segments


def parse_command(cmd: str) -> bool:
    """Return True only if every simple command in `cmd` is on the allow-list.

    The line is split on unquoted shell control operators and the first word of
    each resulting command must be an allowed program. The previous version
    returned True as soon as the line started with `git` and otherwise looked
    only at the first word, so anything chained after it (`git status && curl ...`,
    `ls ; curl ...`, `ls | sh`, `ls $(...)`) went unchecked.

    Consequences of the stricter check:
    - Operators inside quotes are plain text, so
      `git commit -m "a && b"` is accepted.
    - An empty command (e.g. a trailing `&&` or `;`) is rejected.
    - `&` is always treated as a separator, so fd-duplication redirects such as
      `2>&1` are rejected.

    NOTE(review): `git` is allowed with any arguments, as before; restricting
    git sub-commands or options is out of scope for this check.

    Args:
        cmd: The raw command line proposed for execution.

    Returns:
        True if the line may be executed, False otherwise.
    """
    allowed_unix = ["ls", "pwd", "cat", "mkdir", "cd", "touch", "rm", "cp", "mv"]
    allowed_git = ["git"]  # Allow git with any subcommand
    allowed_commands = set(allowed_unix + allowed_git)

    segments = _split_shell_commands(cmd)
    if segments is None:
        return False

    for segment in segments:
        words = segment.split()
        if not words or words[0] not in allowed_commands:
            return False
    return True

def execute_command(cmd: str) -> str:
    """Run `cmd` in the shared shell session and return its cleaned output.

    Args:
        cmd: Command line to send to the shell.

    Returns:
        - "Error: Forbidden command: ..." when parse_command() rejects `cmd`.
        - The cleaned output when the command printed something.
        - A fixed success message when there was no output; the wording depends
          on whether `cmd` starts with one of the usually-silent commands.
        - "Error executing command: ..." when talking to the shell raised.
    """
    if not parse_command(cmd):
        return f"Error: Forbidden command: {cmd}"

    # Commands that normally print nothing when they succeed.
    quiet_prefixes = ("mkdir", "cd", "touch", "rm", "cp", "mv")

    try:
        handle.sendline(cmd)
        # Wait (up to 10 s) for the next `$` or `#`, taken to be the prompt.
        handle.expect(r'[$#]', timeout=10)
        cleaned = clean_shell_output(handle.before.strip(), command=cmd)
    except Exception as e:
        return f"Error executing command: {str(e)}"

    if cleaned:
        return cleaned

    # Empty output: tell the caller whether that is the expected outcome.
    if cmd.startswith(quiet_prefixes):
        return "Command executed successfully (no output expected)"
    return "Command executed (no output)"

def call_tool(tool: str, cmd: str) -> str:
    """Dispatch a tool request by name.

    Args:
        tool: Name of the requested tool; only "execute_command" is known.
        cmd: Argument forwarded to the tool.

    Returns:
        The tool's result, or an "Error Invalid tool: ..." message for any
        other tool name.
    """
    if tool != "execute_command":
        return f"Error Invalid tool: {tool}"
    return execute_command(cmd)

# Smoke test of the tool layer: the first call sends an allowed command (`pwd`),
# the second sends a line that parse_command() rejects, so it should print the
# "Forbidden command" error instead of reaching the shell.
print(call_tool("execute_command", "pwd"))
print(call_tool("execute_command", "invalid command"))

# Plain-text description of the available tool. It repeats, for the model's
# benefit, what execute_command() does and which fixed messages it returns;
# the string is interpolated into `system_prompt` in a later cell.
tool_descriptions = """
    execute_command(cmd: str) -> str:
        Executes a command inside the Docker container using bash. Use this tool to interact with a stateful terminal.
        This tool will behave as if a user was interacting with the shell.
    
        IMPORTANT BEHAVIOR:
        - Commands like `mkdir`, `cd`, `touch` return "Command executed successfully (no output expected)" on success
        - Commands like `ls -a` in empty directories return "Command executed (no output)" 
        - The shell state persists between calls - directory changes, file creations, etc. are maintained
        
        Example workflow for creating a directory:
            execute_command("pwd")  # Check current location
            execute_command("mkdir test")  # Create directory
            execute_command("ls -a")  # Verify directory was created
            execute_command("cd test")  # Change to directory
            execute_command("pwd")  # Verify location change
            
        Args:
            cmd (str): The command to execute (e.g., 'ls -a', 'pwd', 'mkdir test', 'git init').

        Returns:
            str: The output of the command, success message, or error message if the command fails.
    """ 

/git-agent
Error: Forbidden command: invalid command


In [21]:
# Define agent
from openai import OpenAI
from pydantic import BaseModel
import instructor


# System prompt sent as the first message of the agent conversation.
# It is an f-string: {tool_descriptions} is interpolated, and the doubled braces
# ({{ ... }}) render as literal braces in the JSON examples.
# NOTE(review): the prompt asks for {"name", "args"} JSON inside <tool> tags, while the
# structured response model used for the API call (ToolCall) names that field `tool`.
# Confirm that the two descriptions are meant to differ.
system_prompt = f"""
You are a git agent. You will be given tasks to complete, like initializing a git repo.

You have access to a persistent shell handle to the Docker container. You will interact with this like a user would with a terminal.

You have access to the following tools to solve the task:
{tool_descriptions}

Tools can be called by writing a JSON command inside <tool> tags with:
- "name": the name of the tool to use
- "args": the arguments for the tool

Example usage:
<tool>
{{"name": "execute_command", "args": "ls"}}
</tool>

CRITICAL WORKFLOW RULES:
1. **ALWAYS start each task by checking your current working directory with `pwd`**
2. **VERIFY each command's success before proceeding** - use `ls -a` to confirm directory creation, file existence, etc.
3. **NEVER repeat a command that just failed** - analyze the error and try a different approach
4. **Track your progress** - remember what you've already accomplished successfully
5. **Use compound commands carefully** - `cd directory && command` only works if directory exists

SYSTEMATIC APPROACH:
For repository creation tasks:
1. Check current directory with `pwd`
2. Check if target directory already exists with `ls -a`
3. Create directory if needed, then verify creation
4. Navigate to directory and verify location
5. Initialize git repository
6. Verify git repository creation with `ls -a` (look for .git folder)

IMPORTANT COMMANDS:
- `pwd` - Check current working directory (use at start of each task)
- `ls -a` - List all files including hidden ones (use to verify actions)
- `cd directory` - Change directory (verify with `pwd` afterward)
- `ls -la` - Detailed listing to troubleshoot issues

ERROR HANDLING:
- If a command fails, analyze the error message
- Use `ls -la` to understand the current state
- Don't repeat failed commands - adapt your approach
- If directory "already exists", verify it's actually there with `ls -a`

You may call one tool call per turn, for up to 15 turns before giving you final answer.

TASK COMPLETION RULES:
- **ONLY provide a final answer when the ENTIRE task is successfully completed and verified**
- **Do NOT provide intermediate answers like "Let's check the directory" - just make the tool call**
- **If you need to perform ANY action, make a tool call instead of providing an answer**
- **Final answers should summarize what was accomplished, not what you're about to do**

DECISION LOGIC:
- Need to perform an action? → Make a tool call
- Task completely finished and verified? → Provide final answer
- Describing what you'll do next? → Make the tool call instead

EXAMPLES:
❌ WRONG - Don't provide answer: "Let's check the current directory"
✅ CORRECT - Make tool call: {{"name": "execute_command", "args": "pwd"}}

❌ WRONG - Don't provide answer: "I need to create the git-test-repo directory"
✅ CORRECT - Make tool call: {{"name": "execute_command", "args": "mkdir git-test-repo"}}

✅ CORRECT - Final answer: "Successfully created and initialized git repository 'git-test-repo'. The repository is located at /git-agent/git-test-repo and contains a .git directory confirming proper initialization."

In each turn you should respond in the following format:

<think>
[your thoughts here - include what you've verified so far and whether the task is complete]
</think>
<tool>
JSON with the following fields (ONLY if you need to perform an action):
- name: The name of the tool to call
- args: The arguments to pass to the tool
</tool>
<answer>
[ONLY provide this when the ENTIRE task is complete - summarize what was accomplished]
</answer>

IMPORTANT: You MUST provide either a tool call OR an answer, but not both. If you need to perform an action, only provide the tool call and leave the answer empty.

After concluding your message with a tool call,
you will then see the tool's output inside <result> tags as a new message. \
You may call tools multiple times if needed. \
The state of tools is persistent between turns.
"""

# Structured form of a single tool request made by the model.
# Documented with comments rather than a docstring so the model definition
# itself stays exactly as it was.
class ToolCall(BaseModel):
    tool: str  # tool name; the agent loop passes it to call_tool(), which only knows "execute_command"
    args: str  # forwarded to call_tool() as the command string

# One model turn, used as `response_model` in the agent loop.
# The loop treats a non-empty `answer` as the end of the task and otherwise
# executes `tool_call` when it is present.
class AgentResponse(BaseModel):
    think: str  # the model's reasoning for this turn; printed by the agent loop
    tool_call: ToolCall | None = None  # optional: absent when the model gives a final answer
    answer: str = ""  # final answer; the empty-string default means "not finished yet"


# OpenAI client created with no explicit arguments.
# NOTE(review): presumably it picks up its API key from the environment
# populated by dotenv.load_dotenv() in the first cell -- confirm.
oai = OpenAI()
# instructor-wrapped client; the agent loop calls it with `response_model=AgentResponse`.
oai_instructor = instructor.from_openai(oai)

In [26]:
# Agent loop
# Drives the conversation: ask the model for its next step, run the requested
# tool, feed the result back, and stop on a final answer or after 15 turns.
repo_name = "git-test-repo"

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"Create a new local repository called {repo_name}, create a new branch called 'test', create a new file called 'test.txt', and commit it with the message `hello from agent`"}
]

final_answer = ""

turns = 0
while turns < 15:
    turns += 1

    # Ask for the next step, allowing up to 5 failed attempts per turn.
    # Every failed attempt counts -- an exception OR a response with neither a
    # tool call nor an answer -- so the loop cannot spin forever on empty replies.
    # `response` stays None when all attempts fail, so a stale result from a
    # previous turn is never acted on.
    response = None
    retries = 0
    while retries < 5:
        try:
            candidate = oai_instructor.chat.completions.create(
                model="gpt-4.1",
                messages=messages, # type: ignore
                response_model=AgentResponse
            )
        except Exception as e:
            print(f"Error: {e}")
            retries += 1
            continue

        wants_tool = candidate.tool_call is not None and len(candidate.tool_call.tool) > 0
        if wants_tool or len(candidate.answer) > 0:
            response = candidate
            break
        retries += 1

    print("=== Turn", turns, "===")
    if response is None:
        print("Error: No tool call or answer found")
        break

    if len(response.think) > 0:
        thinking = response.think.strip()
        print(f"Thinking: {thinking}")

    # Check if agent provided a final answer (task complete)
    if len(response.answer) > 0:
        final_answer = response.answer
        break

    # Record the assistant's own turn before the tool result, so the history
    # shows which command produced which <result>.
    messages.append({"role": "assistant", "content": response.model_dump_json()})

    # No answer, so the retry loop guarantees a tool call is present here.
    tool_call = response.tool_call.tool
    tool_call_args = response.tool_call.args
    tool_result = call_tool(tool_call, tool_call_args)
    print(f"Tool call: {tool_call}")
    print(f"Args: {tool_call_args}")
    print(f"Tool result: {tool_result}")
    messages.append({"role": "user", "content": f"<result>{tool_result}</result>"})

if len(final_answer) > 0:
    print(f"Final answer: {final_answer}")



=== Turn 1 ===
Thinking: Start by checking the current working directory to prepare for repo creation.
Tool call: execute_command
Args: pwd
Tool result: /git-agent
=== Turn 2 ===
Thinking: I've confirmed the current working directory is /git-agent. Next, I need to check if 'git-test-repo' already exists here by listing all files and folders.
Tool call: execute_command
Args: ls -a
Tool result: .  ..  hello
=== Turn 3 ===
Thinking: I've checked my current directory and contents. Next, I need to create the 'git-test-repo' directory and verify its creation before proceeding.
Tool call: execute_command
Args: mkdir git-test-repo
Tool result: Command executed successfully (no output expected)
=== Turn 4 ===
Thinking: I've created the directory 'git-test-repo'. Now, I need to verify the directory was created successfully.
Tool call: execute_command
Args: ls -a
Tool result: .  ..  git-test-repo  hello
=== Turn 5 ===
Thinking: I've created the directory 'git-test-repo'. Next, I need to enter tha

In [111]:
# Verify results
# Return to the container's working directory through its absolute path
# (`work_dir`, set when the container was created). The earlier form went via
# `cd ~/. && cd ../` and a relative `cd git-agent`, which only lands in the
# right place when $HOME sits directly under `/`.
execute_command(f"cd {work_dir}")
print(execute_command("ls"))
print(execute_command(f"ls -a {repo_name}"))

git-test-repo-v2  git-test-repo-v3
.  ..  .git
