# RLM Log Analysis Functions

This notebook uses the utility functions in `rlm_log_utils` to extract key data from RLM log files:
- **Final answer**: The agent's concluding response
- **Code blocks**: All code executed during the session
- **RLM calls**: Sub-LLM calls made via `llm_query()` / `llm_query_batched()`

In [25]:
import sys
import importlib


# Make the local analysis helpers importable.
# NOTE(review): hard-coded absolute path — adjust for the machine this runs on.
sys.path.append('/home/winnieyangwn/rlm/analysis')
import rlm_log_utils
# Reload so that edits to rlm_log_utils are picked up without restarting the kernel.
importlib.reload(rlm_log_utils)
# Star-import AFTER the reload so the names bound here come from the reloaded module.
from rlm_log_utils import *

## Usage Example

Load the log file and extract key information:

# Load

In [26]:

# Path to the JSONL log of the RLM session under analysis.
LOG_PATH = "/checkpoint/maui_sft/winnieyangwn/rlm_dumps/gpt-5_common_invalid_errors_codebase_513_2026-02-03_19-27-40_f8640a6a.jsonl"

# Load the log: the first entry is the run metadata, every later entry is one iteration.
entries = load_rlm_log(LOG_PATH)
if len(entries) == 0:
    # Fail with a clear message instead of an opaque IndexError on entries[0].
    raise ValueError(f"RLM log is empty: {LOG_PATH}")
metadata = entries[0]
iterations = entries[1:]

print(f"Loaded {len(iterations)} iterations")

Loaded 4 iterations


# Metadata

In [27]:
# Dump every metadata field except "backend_kwargs".
# NOTE(review): presumably skipped because it is verbose or sensitive — confirm.
print("=== METADATA ===")
for k, v in metadata.items():
    if k == "backend_kwargs":
        continue
    print(f"{k}: {v}")

=== METADATA ===
type: metadata
timestamp: 2026-02-03T19:27:40.778536
root_model: gpt-5
max_depth: 2
max_iterations: 100
backend: azure_openai
environment_type: local
environment_kwargs: {'setup_code': '\nimport pandas as pd\nfrom pathlib import Path\n\n# Load rollout data as DataFrame\nrollout_df = pd.read_json(\'/checkpoint/maui_sft/winnieyangwn/amaia_dumps/513/trajectories/513_metadata.jsonl\', lines=True)\n\n# Load codebase into dict\ncodebase = {}\ncodebase_root = Path(\'/checkpoint/agentic-models/winnieyangwn/amaia_dumps/503/code/2026_02_02_00_55_44\')\nfor ext in [\'.py\', \'.md\', \'.yaml\']:\n    for path in codebase_root.rglob(f\'*{ext}\'):\n        try:\n            rel_path = str(path.relative_to(codebase_root))\n            codebase[rel_path] = path.read_text(errors=\'ignore\')\n        except Exception:\n            pass  # Skip unreadable files\n\n# Load config YAML\nconfig_yaml = \'\'\'# gpt5 after rate limit fix + using jupyter eval as baseline\n# python -m launchers.s

In [28]:

# Total session runtime as computed by get_total_runtime from the log entries.
runtime = get_total_runtime(entries)
runtime_seconds = runtime.total_seconds()
print(f"Timestamp-based runtime: {runtime_seconds:.2f}s")

Timestamp-based runtime: 162.05s


# Final Answer

In [29]:
# Extract the agent's final answer from the iterations and show it,
# falling back to a placeholder when none is present (None or empty).
final_answer = get_final_answer(iterations)
print("=== FINAL ANSWER ===")
print(final_answer or "No final answer found")

=== FINAL ANSWER ===
Top 5 most common evaluation error messages among invalid submissions (normalized by first line):
1. SystemExit: 2  -> count=1669
2. Validation error: Submission invalid! The attempt to grade the submission has resulted in the following error message:  -> count=83
3. <<EMPTY>>  -> count=23
4. ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.  -> count=16
5. RuntimeError: Encountered NaN.  -> count=11

For reference, the top 5 exact messages (first line shown) are:
1. SystemExit: 2  -> count=1669
2. <<EMPTY>>  -> count=23
3. SystemExit: 1  -> count=4
4. Validation error: Submission invalid! The attempt to grade the submission has resulted in the following error message:  -> count=2
5. Validation error: Submission invalid! The attempt to grade the submission has resulted in the following error message:  -> count=2

Most frequent error: 'SystemExit: 2'.
Representative sampl

# Iterations

## Iteration #0

In [35]:
# Inspect a single iteration: its keys, the raw LLM response, and the
# stdout/stderr captured for each code block (the code itself is not printed here).
iteration_id = 0
iteration = iterations[iteration_id] if iterations else None

if not iteration:
    print("No iterations found!")
else:
    print(f"=================  Iteration {iteration_id} (#{iteration.get('iteration', 'N/A')}) =====================")
    print(f"Keys in  iteration {iteration_id} : {list(iteration.keys())}")

    # Raw LLM response for this iteration
    print(f"\n================= LLM Response in Iteration {iteration_id} =====================")
    response = iteration.get("response", "")
    print(f"Response:\n...{response}" )

    # Code blocks recorded for this iteration
    code_blocks = iteration.get("code_blocks", [])
    print(f"\nNumber of code blocks: {len(code_blocks)}")

    for i, block in enumerate(code_blocks):
        print(f"\n======================================== Code Block {i} =======================================")

        # Captured output of the block; missing fields default to empty.
        result = block.get("result", {})
        stdout = result.get("stdout", "")
        stderr = result.get("stderr", "")

        # Only print the streams that actually contain something.
        if stdout:
            print(f"\n----------------------- Code Block {i} Stdout ------------------")
            print(f"\nStdout:\n{stdout}" )
        if stderr:
            print(f"\n----------------------- Code Block {i} Stderr ------------------")
            print(f"\nStderr:\n{stderr}")

Keys in  iteration 0 : ['type', 'iteration', 'timestamp', 'prompt', 'response', 'code_blocks', 'final_answer', 'iteration_time']

Response:
...```repl
# Basic overview
print("rollout_df shape:", rollout_df.shape)
print("columns:", rollout_df.columns.tolist())
print("valid_submission value counts:\n", rollout_df["valid_submission"].value_counts(dropna=False))

# How many invalid submissions and sample rows
invalid_mask = ~rollout_df["valid_submission"]
print("Invalid submissions:", invalid_mask.sum())

# Peek at eval_error_output for invalid ones
sample_errors = rollout_df.loc[invalid_mask, "eval_error_output"].dropna().head(20).tolist()
print("\nSample eval_error_output (first 20 non-null among invalid):\n")
for i, e in enumerate(sample_errors):
    print(f"--- {i} ---\n{str(e)[:1000]}")  # truncate display

# Let's get counts of exact eval_error_output strings among invalid (non-null)
error_counts = rollout_df.loc[invalid_mask, "eval_error_output"].fillna("<<NA>>").value_counts()
prin

### Investigation: IndexError: list index out of range

The `IndexError` occurred in the agent's `first_line()` function:

```python
def first_line(s):
    if not isinstance(s, str):
        return "<<NA>>"
    return s.strip().splitlines()[0][:500]  # BUG HERE
```

**The bug**: When `s` is an **empty string** `""` (or contains only whitespace, which `.strip()` reduces to `""`):
1. `isinstance("", str)` → `True` (passes the guard)
2. `"".strip()` → `""`
3. `"".splitlines()` → `[]` (empty list!)
4. `[][0]` → **`IndexError: list index out of range`**

Looking at the "Top 20 exact error messages" output, the **second entry** has count `23` and is empty or whitespace-only. When `.strip()` is applied, it becomes `""`, and `.splitlines()` returns `[]`.

**The fix** would be:
```python
def first_line(s):
    if not isinstance(s, str):
        return "<<NA>>"
    lines = s.strip().splitlines()
    return lines[0][:500] if lines else "<<NA>>"
```

The crash happened at:
```python
first_line_counts = rollout_df.loc[invalid_mask, "eval_error_output"].apply(first_line).value_counts()
```

Everything printed before this line executed successfully (the "Top 20 exact error messages" output), but once Python hit the exception, execution stopped immediately—so all subsequent analysis (normalized errors, codebase searches, etc.) never ran.

## Iteration #1

In [23]:
# Inspect a single iteration: its keys, the raw LLM response, and for each
# code block the executed code plus any captured stdout/stderr.
iteration_id = 1
# Bounds-check the index: a log with fewer than iteration_id + 1 iterations
# would otherwise raise IndexError here.
iteration = iterations[iteration_id] if iteration_id < len(iterations) else None

if iteration is not None:
    print(f"=================  Iteration {iteration_id} (#{iteration.get('iteration', 'N/A')}) =====================")
    print(f"Keys in  iteration {iteration_id} : {list(iteration.keys())}")

    # Raw LLM response for this iteration
    print(f"\n================= LLM Response in Iteration {iteration_id} =====================")
    response = iteration.get("response", "")
    print(f"Response:\n...{response}" )

    # Code blocks recorded for this iteration
    code_blocks = iteration.get("code_blocks", [])
    print(f"\nNumber of code blocks: {len(code_blocks)}")

    for i, block in enumerate(code_blocks):
        print(f"\n======================================== Code Block {i} =======================================")
        code = block.get("code", "")
        print(f"\n----------------------- Code Block {i} Code ------------------")

        print(f"Code: {code}")

        # `or {}` also covers a "result" key that is present but None.
        result = block.get("result") or {}
        stdout = result.get("stdout", "")
        stderr = result.get("stderr", "")

        # Only print the streams that actually contain something.
        if stdout:
            print(f"\n----------------------- Code Block {i} Stdout ------------------")

            print(f"\nStdout:\n{stdout}" )
        if stderr:
            print(f"\n----------------------- Code Block {i} Stderr ------------------")
            print(f"\nStderr:\n{stderr}")

else:
    print(f"Iteration {iteration_id} not found (log has {len(iterations)} iterations)")

Keys in  iteration 1 : ['type', 'iteration', 'timestamp', 'prompt', 'response', 'code_blocks', 'final_answer', 'iteration_time']

Response:
...```repl
# 1) Compute top 5 most common evaluation error messages among invalid submissions

invalid_mask = ~rollout_df["valid_submission"]
invalid = rollout_df.loc[invalid_mask, "eval_error_output"]

# Exact message counts
exact_counts = invalid.fillna("<<NA>>").value_counts()
print("Top 5 exact error messages with counts:\n")
for i, (msg, cnt) in enumerate(exact_counts.head(5).items(), 1):
    first_line = (msg or "").splitlines()[0] if isinstance(msg, str) else str(msg)
    print(f"{i}. {repr(first_line)}  -> count={cnt}")

# First-line normalization to make messages human-friendly
def first_line(s):
    if not isinstance(s, str) or not s.strip():
        return "<<NA>>"
    return s.strip().splitlines()[0][:300]

first_counts = invalid.apply(first_line).value_counts()
print("\nTop 10 first-line normalized messages with counts:\n")
for i, (msg

### Investigation: Why only 2 lines in stdout?

The code crashed early with an `IndexError: list index out of range`.

When Python encounters an unhandled exception, execution **stops immediately**. The code only printed:
1. `"Top 5 exact error messages with counts:\n"`
2. `"1. 'SystemExit: 2'  -> count=1669"`

Then it crashed on the **second iteration** of the loop at this line:

```python
first_line = (msg or "").splitlines()[0] if isinstance(msg, str) else str(msg)
```

**The bug:** If `msg` is an empty string `""`, then `"".splitlines()` returns an **empty list `[]`**, and accessing `[0]` on an empty list raises `IndexError`.

So the agent's code never got past the first `for` loop — all subsequent print statements (the "Top 10 first-line normalized messages", the grep results, file contents, etc.) were never executed because the exception terminated the script.

## Iteration #2

In [33]:
# Inspect a single iteration: its keys, the raw LLM response, and the
# stdout/stderr captured for each code block.
iteration_id = 2
# Set to True to also print the source of each code block.
show_code = False

# Bounds-check the index: a log with fewer than iteration_id + 1 iterations
# would otherwise raise IndexError here.
iteration = iterations[iteration_id] if iteration_id < len(iterations) else None

if iteration is not None:
    print(f"=================  Iteration {iteration_id} (#{iteration.get('iteration', 'N/A')}) =====================")
    print(f"Keys in  iteration {iteration_id} : {list(iteration.keys())}")

    # Raw LLM response for this iteration
    print(f"\n================= LLM Response in Iteration {iteration_id} =====================")
    response = iteration.get("response", "")
    print(f"Response:\n...{response}" )

    # Code blocks recorded for this iteration
    code_blocks = iteration.get("code_blocks", [])
    print(f"\nNumber of code blocks: {len(code_blocks)}")

    for i, block in enumerate(code_blocks):
        print(f"\n======================================== Code Block {i} =======================================")
        if show_code:
            code = block.get("code", "")
            print(f"\n----------------------- Code Block {i} Code ------------------")
            print(f"Code: {code}")

        # Read stdout/stderr from THIS block. Previously these assignments were
        # commented out, so the cell printed stale values left over from another
        # cell (or raised NameError on a fresh kernel).
        # `or {}` also covers a "result" key that is present but None.
        result = block.get("result") or {}
        stdout = result.get("stdout", "")
        stderr = result.get("stderr", "")

        # Only print the streams that actually contain something.
        if stdout:
            print(f"\n----------------------- Code Block {i} Stdout ------------------")

            print(f"\nStdout:\n{stdout}" )
        if stderr:
            print(f"\n----------------------- Code Block {i} Stderr ------------------")
            print(f"\nStderr:\n{stderr}")

else:
    print(f"Iteration {iteration_id} not found (log has {len(iterations)} iterations)")

Keys in  iteration 2 : ['type', 'iteration', 'timestamp', 'prompt', 'response', 'code_blocks', 'final_answer', 'iteration_time']

Response:
...```repl
# Let's robustly compute the top 5 most common evaluation error messages among invalid submissions.
import pandas as pd

invalid_mask = ~rollout_df["valid_submission"]
invalid = rollout_df.loc[invalid_mask, "eval_error_output"]

# Exact message counts (treat NaN as "<<NA>>")
exact_counts = invalid.fillna("<<NA>>").value_counts()

# Helper to get a safe first line for display
def safe_first_line(msg):
    if not isinstance(msg, str):
        return str(msg)
    s = msg.strip()
    if not s:
        return "<<EMPTY>>"
    return s.splitlines()[0][:300]

top5_exact = exact_counts.head(5)
top5_exact_display = [(safe_first_line(msg), int(cnt)) for msg, cnt in top5_exact.items()]
print("Top 5 exact error messages (first line shown) with counts:\n")
for i, (fl, cnt) in enumerate(top5_exact_display, 1):
    print(f"{i}. {fl}  -> count={cnt}")

#

In [34]:
# Inspect a single iteration: its keys, the raw LLM response, and the
# stdout/stderr captured for each code block.
iteration_id = 3
# Set to True to also print the source of each code block.
show_code = False

# Bounds-check the index: a log with fewer than iteration_id + 1 iterations
# would otherwise raise IndexError here.
iteration = iterations[iteration_id] if iteration_id < len(iterations) else None

if iteration is not None:
    print(f"=================  Iteration {iteration_id} (#{iteration.get('iteration', 'N/A')}) =====================")
    print(f"Keys in  iteration {iteration_id} : {list(iteration.keys())}")

    # Raw LLM response for this iteration
    print(f"\n================= LLM Response in Iteration {iteration_id} =====================")
    response = iteration.get("response", "")
    print(f"Response:\n...{response}" )

    # Code blocks recorded for this iteration
    code_blocks = iteration.get("code_blocks", [])
    print(f"\nNumber of code blocks: {len(code_blocks)}")

    for i, block in enumerate(code_blocks):
        print(f"\n======================================== Code Block {i} =======================================")
        if show_code:
            code = block.get("code", "")
            print(f"\n----------------------- Code Block {i} Code ------------------")
            print(f"Code: {code}")

        # Read stdout/stderr from THIS block. Previously these assignments were
        # commented out, so the cell printed stale values left over from another
        # cell (or raised NameError on a fresh kernel).
        # `or {}` also covers a "result" key that is present but None.
        result = block.get("result") or {}
        stdout = result.get("stdout", "")
        stderr = result.get("stderr", "")

        # Only print the streams that actually contain something.
        if stdout:
            print(f"\n----------------------- Code Block {i} Stdout ------------------")

            print(f"\nStdout:\n{stdout}" )
        if stderr:
            print(f"\n----------------------- Code Block {i} Stderr ------------------")
            print(f"\nStderr:\n{stderr}")

else:
    print(f"Iteration {iteration_id} not found (log has {len(iterations)} iterations)")

Keys in  iteration 3 : ['type', 'iteration', 'timestamp', 'prompt', 'response', 'code_blocks', 'final_answer', 'iteration_time']

Response:
...```repl
# Prepare the final answer by extracting the requested info and citing evidence from code/config.

# 1) Top 5 most common evaluation error messages among invalid submissions
invalid_mask = ~rollout_df["valid_submission"]
invalid = rollout_df.loc[invalid_mask, "eval_error_output"]

def safe_first_line(msg):
    if not isinstance(msg, str):
        return str(msg)
    s = msg.strip()
    if not s:
        return "<<EMPTY>>"
    return s.splitlines()[0][:300]

# We'll present the normalized "first line" view so that near-duplicate long messages are grouped.
first_line_counts = invalid.apply(safe_first_line).value_counts()

top5_norm = list(first_line_counts.head(5).items())

# Also keep the exact top 5 (first-line shown) in case needed
exact_counts = invalid.fillna("<<NA>>").value_counts()
top5_exact = [(safe_first_line(msg), int(cnt)) for 