# RLM Log Analysis Functions

This notebook provides utility functions to extract key data from RLM log files:
- **Final answer**: The agent's concluding response
- **Code blocks**: All code executed during the session
- **RLM calls**: Sub-LLM calls made via `llm_query()` / `llm_query_batched()`

In [2]:
import sys
import importlib

# Make the local helper module importable. The path is relative, so it only
# resolves when the kernel's working directory is the one containing
# `data/analysis`. The membership check keeps the cell idempotent: re-running
# it no longer appends a duplicate entry to sys.path each time.
if 'data/analysis' not in sys.path:
    sys.path.append('data/analysis')

import rlm_log_utils
# Reload so edits made to rlm_log_utils.py are picked up without restarting the kernel.
importlib.reload(rlm_log_utils)
# NOTE(review): the star import hides where the helper names used below come from
# (load_rlm_log, get_total_runtime, get_final_answer, ...); consider importing
# them explicitly once the set of helpers used by this notebook is settled.
from rlm_log_utils import *

## Usage Example

Load the log file and extract key information:

# Load

In [3]:
# Earlier runs, kept for reference; uncomment one (and comment out the active
# LOG_PATH below) to analyse a different dump.
# LOG_PATH = "/checkpoint/maui_sft/winnieyangwn/rlm_dumps/gpt-5_common_invalid_errors_513_2026-02-02_01-19-55_0c5414d6.jsonl"
# LOG_PATH = "/checkpoint/maui_sft/winnieyangwn/rlm_dumps/gpt-5_common_invalid_errors_513_2026-02-02_00-40-36_52897c45.jsonl"
# LOG_PATH = "/checkpoint/maui_sft/winnieyangwn/rlm_dumps/gpt-5_common_invalid_errors_513_2026-02-02_02-49-49_405164e2.jsonl"
# LOG_PATH = "/checkpoint/maui_sft/winnieyangwn/rlm_dumps/gpt-5_common_invalid_errors_513_2026-02-02_03-32-48_4e207f16.jsonl"
# LOG_PATH = "/checkpoint/maui_sft/winnieyangwn/rlm_dumps/gpt-5_common_invalid_errors_513_2026-02-02_05-04-09_a2790315.jsonl"
# LOG_PATH = "/checkpoint/maui_sft/winnieyangwn/rlm_dumps/gpt-5_common_invalid_errors_513_2026-02-02_05-28-19_99d053cc.jsonl"
# LOG_PATH = "/checkpoint/maui_sft/winnieyangwn/rlm_dumps/gpt-5_common_invalid_errors_513_2026-02-02_05-48-35_1fcdaf63.jsonl"
LOG_PATH = "/checkpoint/maui_sft/winnieyangwn/rlm_dumps/gpt-5_common_invalid_errors_codebase_513_2026-02-03_09-43-33_7f873403.jsonl"

# Load the log. The first entry is expected to be the metadata record
# (its "type" field is "metadata"); every entry after it is one iteration.
entries = load_rlm_log(LOG_PATH)
if entries and entries[0].get("type") == "metadata":
    metadata = entries[0]
    iterations = entries[1:]
else:
    # Empty log or no metadata header: do not crash on entries[0], and do not
    # silently discard the first iteration by mistaking it for metadata.
    print("Warning: no metadata entry found at the start of the log")
    metadata = {}
    iterations = entries

print(f"Loaded {len(iterations)} iterations")

Loaded 5 iterations


# Metadata

In [4]:
# Dump every metadata field except "backend_kwargs".
# NOTE(review): why that field is skipped is not shown in this notebook --
# presumably it is verbose or sensitive; confirm before printing it.
print("=== METADATA ===")
for field in metadata:
    if field == "backend_kwargs":
        continue
    print(f"{field}: {metadata[field]}")

=== METADATA ===
type: metadata
timestamp: 2026-02-03T09:43:33.745971
root_model: gpt-5
max_depth: 2
max_iterations: 100
backend: azure_openai
environment_type: local
environment_kwargs: {'setup_code': '\nimport pandas as pd\nfrom pathlib import Path\n\n# Load rollout data as DataFrame\nrollout_df = pd.read_json(\'/checkpoint/maui_sft/winnieyangwn/amaia_dumps/513/trajectories/513_metadata.jsonl\', lines=True)\n\n# Load codebase into dict\ncodebase = {}\ncodebase_root = Path(\'/checkpoint/agentic-models/winnieyangwn/amaia_dumps/503/code/2026_02_02_00_55_44\')\nfor ext in [\'.py\', \'.md\', \'.yaml\']:\n    for path in codebase_root.rglob(f\'*{ext}\'):\n        try:\n            rel_path = str(path.relative_to(codebase_root))\n            codebase[rel_path] = path.read_text(errors=\'ignore\')\n        except Exception:\n            pass  # Skip unreadable files\n\n# Load config YAML\nconfig_yaml = \'\'\'# gpt5 after rate limit fix + using jupyter eval as baseline\n# python -m launchers.s

In [5]:

# Total run duration as computed by get_total_runtime from the log entries
# (the result exposes total_seconds(), i.e. it behaves like a timedelta).
runtime = get_total_runtime(entries)
runtime_seconds = runtime.total_seconds()
print(f"Timestamp-based runtime: {runtime_seconds:.2f}s")

Timestamp-based runtime: 182.70s


# Final Answer

In [6]:
# Pull the final answer out of the iteration entries and show it, falling back
# to a placeholder message when the helper returns nothing (None or empty).
final_answer = get_final_answer(iterations)
print("=== FINAL ANSWER ===")
print(final_answer or "No final answer found")

=== FINAL ANSWER ===
Error: Variable 'final_answer' not found


# Iteration 1

In [58]:
# Source text of the first code block recorded for the first iteration.
iterations[0]["code_blocks"][0]["code"]

'# Explore the DataFrame structure and compute the answer\nimport pandas as pd\n\n# Basic checks\nn_rows = len(context)\ncols = list(context.columns)\nvalid_counts = context["valid_submission"].value_counts(dropna=False)\n\n# Filter invalid submissions\ninvalid = context[context["valid_submission"] == False]\n\n# Get top 5 most common evaluation error messages (strip whitespace, drop missing)\ntop_errors = (\n    invalid["eval_error_output"]\n    .dropna()\n    .map(lambda s: s.strip())\n    .value_counts()\n    .head(5)\n)\n\nprint("Total rows:", n_rows)\nprint("Columns:", cols)\nprint("Valid vs Invalid counts:\\n", valid_counts)\nprint("\\nTop 5 most common eval_error_output among invalid submissions:")\nprint(top_errors.to_string())'

In [59]:
# Names stored under "locals" in the result of that first code block.
iterations[0]["code_blocks"][0]["result"]["locals"].keys()

dict_keys(['f', 'context_0', 'context', 'pd', 'n_rows', 'cols', 'valid_counts', 'invalid', 'top_errors'])

In [60]:
# Length of the "context_0" entry as stored in the logged locals.
# NOTE(review): presumably the logger does not serialize the full object, so
# this need not match the size of the live variable -- confirm.
len(iterations[0]["code_blocks"][0]["result"]["locals"]["context_0"])

0

In [61]:
# "iteration_time" recorded for the first iteration (presumably seconds -- confirm).
iterations[0]["iteration_time"]

14.657848205417395

# Iteration 2

In [78]:
# "iteration_time" recorded for the second iteration (presumably seconds -- confirm).
iterations[1]["iteration_time"]

26.49644836317748

In [65]:
# Raw "response" text stored for the second iteration.
iterations[1]["response"]

'```repl\nimport pandas as pd\nimport re\n\n# Recompute value counts for safety\ninvalid_mask = context["valid_submission"] == False\ninvalid_df = context.loc[invalid_mask, ["eval_error_output"]].copy()\ninvalid_df["eval_error_output_norm"] = (\n    invalid_df["eval_error_output"]\n    .astype(str)\n    .str.strip()\n)\n\nclean_invalid = invalid_df[\n    (invalid_df["eval_error_output_norm"].notna()) &\n    (invalid_df["eval_error_output_norm"] != "") &\n    (invalid_df["eval_error_output_norm"] != "None") &\n    (invalid_df["eval_error_output_norm"] != "nan")\n]\n\nvc = clean_invalid["eval_error_output_norm"].value_counts()\ntop5 = vc.head(5)\n\ndef summarize_message(msg: str) -> str:\n    # If it\'s the common IPython SystemExit message, just keep "SystemExit: X"\n    if msg.startswith("SystemExit:"):\n        return msg.splitlines()[0].strip()\n    # If it\'s the Kaggle validation wrapper, extract the actual error line after the preamble\n    if msg.startswith("Validation error: Sub

'Among invalid submissions (n=1,890), the top 5 most common evaluation error messages are:\n- SystemExit: 2 — 1,669\n- Error tokenizing data. C error: Expected 141 fields in line 3, saw 334 — 49\n- <empty> — 23\n- ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2. — 16\n- RuntimeError: Encountered NaN. — 11'

In [21]:
# Get all code blocks (with their results) across the iterations
code_blocks = get_all_code_with_results(iterations)
print(f"=== CODE BLOCKS ({len(code_blocks)} total) ===\n")

# Preview only the first 3 blocks: up to 500 characters of code and, when the
# block has any, up to 200 characters of its stdout.
for i, block in enumerate(code_blocks[:3]):  # Show first 3
    print(f"--- Block {i+1} (Iteration {block['iteration']}) ---")
    print(block["code"][:500])
    stdout_text = block.get("stdout")
    if stdout_text:
        # Only add the "..." marker when the output really was cut short;
        # previously it was appended even to short, complete output.
        truncation_marker = "..." if len(stdout_text) > 200 else ""
        print(f"\n[stdout]: {stdout_text[:200]}{truncation_marker}")
    print()

=== CODE BLOCKS (1 total) ===

--- Block 1 (Iteration 1) ---
total = len(context)
valid = sum(1 for r in context if r.get("valid_submission"))
percentage = (valid / total) * 100
print(f"Total rollouts: {total}, Valid submissions: {valid}, Percentage: {percentage:.2f}%")

[stdout]: Total rollouts: 4800, Valid submissions: 3806, Percentage: 79.29%
...



In [23]:
# Aggregate statistics about the sub-LLM calls made during the run
summary = get_sub_rlm_calls_summary(iterations)
print("=== RLM CALLS SUMMARY ===")

# (label, summary key, format spec) -- token counts use "," for thousands separators.
for label, key, spec in [
    ("Total sub-LLM calls", "total_calls", ""),
    ("Total input tokens", "total_input_tokens", ","),
    ("Total output tokens", "total_output_tokens", ","),
    ("Models used", "models_used", ""),
    ("Calls per iteration", "calls_by_iteration", ""),
]:
    print(f"{label}: {format(summary[key], spec)}")

=== RLM CALLS SUMMARY ===
Total sub-LLM calls: 0
Total input tokens: 0
Total output tokens: 0
Models used: []
Calls per iteration: {}


In [24]:
# Get detailed RLM calls
rlm_calls = get_sub_rlm_calls(iterations)
print(f"=== RLM CALLS DETAIL ({len(rlm_calls)} calls) ===\n")

# Preview only the first 2 calls; prompt and response are cut to 300 characters.
for i, call in enumerate(rlm_calls[:2]):  # Show first 2 calls
    print(f"--- Call {i+1} (Iteration {call['iteration']}, Block {call['code_block_idx']}) ---")
    print(f"Model: {call['root_model']}")
    print(f"Execution time: {call['execution_time']:.2f}s")

    # Append "..." only when text was actually dropped; previously the marker
    # was printed unconditionally (including after the "None" placeholder).
    prompt_text = str(call['prompt'])
    prompt_marker = "..." if len(prompt_text) > 300 else ""
    prompt_preview = f"{prompt_text[:300]}{prompt_marker}"
    print(f"Prompt preview: {prompt_preview}")

    response_text = call['response']
    if response_text:
        response_marker = "..." if len(response_text) > 300 else ""
        response_preview = f"{response_text[:300]}{response_marker}"
    else:
        # Missing or empty response
        response_preview = "None"
    print(f"Response preview: {response_preview}")
    print()

=== RLM CALLS DETAIL (0 calls) ===



In [25]:
# One-shot extraction: run extract_all on the log file and summarize what came back
all_data = extract_all(LOG_PATH)
print("=== FULL EXTRACTION ===")

# Metadata may be missing/empty, in which case the literal string 'None' is shown.
if all_data['metadata']:
    metadata_keys = list(all_data['metadata'].keys())
else:
    metadata_keys = 'None'
print(f"Metadata keys: {metadata_keys}")

print(f"Number of iterations: {all_data['num_iterations']}")

n_code_blocks = len(all_data['code_blocks'])
print(f"Number of code blocks: {n_code_blocks}")

n_rlm_calls = len(all_data['rlm_calls'])
print(f"Number of RLM calls: {n_rlm_calls}")

# Presence check only: an empty-string answer still counts as "has final answer".
has_final_answer = all_data['final_answer'] is not None
print(f"Has final answer: {has_final_answer}")

=== FULL EXTRACTION ===
Metadata keys: ['type', 'timestamp', 'root_model', 'max_depth', 'max_iterations', 'backend', 'backend_kwargs', 'environment_type', 'environment_kwargs', 'other_backends']
Number of iterations: 2
Number of code blocks: 1
Number of RLM calls: 0
Has final answer: True
