In [20]:
import re

def extract_success_rate(log_content, *, first_n=50, one_hop_count=200, show_breakdown=False):
    """Print pass-rate statistics parsed from an evaluation log.

    Every line of ``log_content`` containing ``[Result] (PASS) <task>`` or
    ``[Result] (FAIL) <task>`` counts as one task result, in the order the
    lines appear in the log.

    Args:
        log_content: Full text of a log file.
        first_n: Size of the leading group reported in the breakdown.
        one_hop_count: Number of leading results treated as the "1-hop" group;
            all remaining results form the "2-hop" group.
        show_breakdown: When true, also print the first-N, 1-hop and 2-hop
            rates before the overall rate. Off by default, so the default
            output is only the overall line.

    Returns:
        None. The statistics are written to stdout.

    Raises:
        ValueError: If ``first_n`` or ``one_hop_count`` is negative.
    """
    if first_n < 0 or one_hop_count < 0:
        raise ValueError("first_n and one_hop_count must be non-negative")

    # Find all result lines with PASS or FAIL; only the status is needed below.
    results = re.findall(r"\[Result\] \((PASS|FAIL)\) (.+?)$", log_content, re.MULTILINE)
    statuses = [status for status, _ in results]

    def _tally(group):
        # The denominator is the real size of the group, never an assumed one,
        # so short logs cannot yield deflated rates or negative task counts.
        passed = group.count("PASS")
        total = len(group)
        rate = passed / total if total > 0 else 0
        return passed, total, rate

    if show_breakdown:
        passed, total, rate = _tally(statuses[:first_n])
        print(f"Average {first_n} Success Rate: {rate:.2%} (Tasks: {passed}/{total})")
        passed, total, rate = _tally(statuses[:one_hop_count])
        print(f"Average 1-hop Success Rate: {rate:.2%} (Tasks: {passed}/{total})")
        passed, total, rate = _tally(statuses[one_hop_count:])
        print(f"Average 2-hop Success Rate: {rate:.2%} (Tasks: {passed}/{total})")

    passed, total, rate = _tally(statuses)
    print(f"Average Overall Success Rate: {rate:.2%} (Tasks: {passed}/{total})")

In [29]:
# Qwen2.5-VL original
# Each entry pairs the banner printed before a report with the log file it summarises.
benchmark_logs = [
    ("-------------- Wikipedia --------------",
     "/lustre/scratch/users/guangyi.liu/agent/GUI-Agent/log_files/log_20250807_010624.log"),
    ("-------------- Shopping --------------",
     "/lustre/scratch/users/guangyi.liu/agent/GUI-Agent/log_files/log_20250811_042607.log"),
    ("-------------- Shopping (with caption)--------------",
     "/lustre/scratch/users/guangyi.liu/agent/GUI-Agent/log_files/log_20250812_005024.log"),
    ("-------------- Compare --------------",
     "/lustre/scratch/users/guangyi.liu/agent/GUI-Agent/log_files/log_20250812_005442.log"),
]

for banner, log_path in benchmark_logs:
    print(banner)
    with open(log_path, 'r') as file:
        log_content = file.read()
    extract_success_rate(log_content)

-------------- Wikipedia --------------
Average 100 Success Rate: 38.00% (Tasks: 38/100)
Average 1-hop Success Rate: 35.00% (Tasks: 70/200)
Average 2-hop Success Rate: 7.41% (Tasks: 8/108)
Average Overall Success Rate: 25.32% (Tasks: 78/308)
-------------- Shopping --------------
Average 100 Success Rate: 4.00% (Tasks: 4/100)
Average 1-hop Success Rate: 5.50% (Tasks: 11/200)
Average 2-hop Success Rate: 0.00% (Tasks: 0/0)
Average Overall Success Rate: 5.50% (Tasks: 11/200)
-------------- Shopping (with caption)--------------
Average 100 Success Rate: 13.00% (Tasks: 13/100)
Average 1-hop Success Rate: 16.00% (Tasks: 32/200)
Average 2-hop Success Rate: 0.00% (Tasks: 0/0)
Average Overall Success Rate: 16.00% (Tasks: 32/200)
-------------- Compare --------------
Average 100 Success Rate: 0.00% (Tasks: 0/100)
Average 1-hop Success Rate: 0.00% (Tasks: 0/200)
Average 2-hop Success Rate: 0.00% (Tasks: 0/-104)
Average Overall Success Rate: 0.00% (Tasks: 0/96)


In [None]:
# Qwen2.5-VL with self-plan
# Name the log path first so the file being summarised is easy to spot and swap.
self_plan_log = "/lustre/scratch/users/guangyi.liu/agent/GUI-Agent/log_files/log_20250808_005545.log"
with open(self_plan_log, mode='r') as file:
    log_content = file.read()
extract_success_rate(log_content)

Average 1-hop Success Rate: 30.50% (Tasks: 61/200)
Average 2-hop Success Rate: 8.33% (Tasks: 9/108)
Average Overall Success Rate: 22.73% (Tasks: 70/308)


In [4]:
# Qwen2.5-VL with self-plan-subtask
# Name the log path first so the file being summarised is easy to spot and swap.
self_plan_subtask_log = "/lustre/scratch/users/guangyi.liu/agent/GUI-Agent/log_files/log_20250808_073629.log"
with open(self_plan_subtask_log, mode='r') as file:
    log_content = file.read()
extract_success_rate(log_content)

Average 1-hop Success Rate: 32.50% (Tasks: 65/200)
Average 2-hop Success Rate: 7.41% (Tasks: 8/108)
Average Overall Success Rate: 23.70% (Tasks: 73/308)


In [None]:
# Qwen2.5-VL original with additional tools
# Name the log path first so the file being summarised is easy to spot and swap.
additional_tools_log = "/lustre/scratch/users/guangyi.liu/agent/GUI-Agent/log_files/log_20250810_110847.log"
with open(additional_tools_log, mode='r') as file:
    log_content = file.read()
extract_success_rate(log_content)

Average 100 Success Rate: 53.00% (Tasks: 53/100)
Average 1-hop Success Rate: 27.50% (Tasks: 55/200)
Average 2-hop Success Rate: 0.00% (Tasks: 0/-96)
Average Overall Success Rate: 52.88% (Tasks: 55/104)


In [21]:

# Summarise a single run log.
# NOTE(review): the log-scan cell's output shows this file's first line mentioning
# results/shopping_gpt-4o/config.json, so this is presumably the GPT-4o shopping run — confirm.
single_run_log = "/lustre/scratch/users/guangyi.liu/agent/GUI-Agent/log_files/log_20250817_154105.log"
with open(single_run_log, mode='r') as file:
    log_content = file.read()
extract_success_rate(log_content)

Average Overall Success Rate: 26.83% (Tasks: 11/41)


In [20]:
import os 
import json

# Overwrite the 'start_url' field of every JSON file in the multi567 directory.
# WARNING: this rewrites the files in place; re-running it is harmless because
# it always writes the same value.
TASK_DIR = "/lustre/scratch/users/guangyi.liu/agent/GUI-Agent/mmina/multi567"

# sorted() gives a deterministic processing order; os.listdir order is arbitrary.
for file in sorted(os.listdir(TASK_DIR)):
    task_path = os.path.join(TASK_DIR, file)
    # Skip sub-directories (such as a notebook checkpoint folder) and non-JSON
    # entries: opening or parsing them would raise and leave the directory
    # half-updated.
    if not (file.endswith(".json") and os.path.isfile(task_path)):
        continue
    with open(task_path, 'r') as f:
        data = json.load(f)
    data['start_url'] = "https://www.wikipedia.org/"
    with open(task_path, 'w') as f:
        json.dump(data, f, indent=4)

In [22]:
import os

# List every entry in the log directory and, for each log whose first line
# contains 'shopping', print its name, that first line and its parsed results.
log_dir = "/lustre/scratch/users/guangyi.liu/agent/GUI-Agent/log_files"
all_files = os.listdir(log_dir)

for log_name in all_files:
    with open(f"{log_dir}/{log_name}", 'r') as handle:
        log_content = handle.read()
    # Text before the first newline (the whole content if there is none).
    first_line = log_content.partition("\n")[0]
    if 'shopping' not in first_line:
        continue

    print(log_name)
    print(first_line)

    # Same (status, task path) extraction as extract_success_rate uses.
    results = re.findall(r"\[Result\] \((PASS|FAIL)\) (.+?)$", log_content, re.MULTILINE)

    print(results)

log_20250817_210740.log
2025-08-17 21:07:40,629 - INFO - Configuration saved to results/shopping_qwen2.5-vl/config.json
[('FAIL', 'mmina/shopping/1.json'), ('FAIL', 'mmina/shopping/2.json'), ('PASS', 'mmina/shopping/3.json'), ('FAIL', 'mmina/shopping/4.json')]
log_20250817_154105.log
2025-08-17 15:41:05,721 - INFO - Configuration saved to results/shopping_gpt-4o/config.json
[('FAIL', 'mmina/shopping/1.json'), ('FAIL', 'mmina/shopping/2.json'), ('FAIL', 'mmina/shopping/3.json'), ('FAIL', 'mmina/shopping/4.json'), ('PASS', 'mmina/shopping/5.json'), ('FAIL', 'mmina/shopping/6.json'), ('PASS', 'mmina/shopping/7.json'), ('FAIL', 'mmina/shopping/8.json'), ('FAIL', 'mmina/shopping/9.json'), ('FAIL', 'mmina/shopping/10.json'), ('FAIL', 'mmina/shopping/11.json'), ('FAIL', 'mmina/shopping/12.json'), ('FAIL', 'mmina/shopping/13.json'), ('FAIL', 'mmina/shopping/14.json'), ('PASS', 'mmina/shopping/15.json'), ('PASS', 'mmina/shopping/16.json'), ('PASS', 'mmina/shopping/17.json'), ('PASS', 'mmina/sho