In [1]:
import os
# Expose only CUDA device 1 to this process. The variable is set before the
# notebook modules below are imported.
# NOTE(review): those imports presumably load models (the cell output shows
# checkpoint shards loading) — confirm the ordering is required.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import importlib
# NOTE(review): import_ipynb is imported only for its side effect — presumably
# it lets the numbered .ipynb notebooks below be loaded as modules; confirm.
import import_ipynb
# These module names start with a digit, so they cannot appear in a plain
# `import` statement; importlib.import_module takes the name as a string.
generate_key_nodes = importlib.import_module("1_generate_key_nodes")
generate_label_functions = importlib.import_module("2_generate_label_functions")
organize_codes = importlib.import_module("3_organize_codes")



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
from datetime import datetime
import importlib
import help_functions
# Reload help_functions before star-importing it, so the names pulled in below
# come from the freshly reloaded module.
importlib.reload(help_functions)
# NOTE(review): the star import is presumably where get_trajectory and
# get_success_tasks_for_self_training come from — confirm; explicit imports
# would make the origin of those names visible.
from help_functions import *

In [3]:
import json

# Alternative experiment configuration (OS-Atlas); uncomment to switch runs.
# dir = "json_files/OS-Atlas/InnovAll/Iter3"
# result_path_template = "../runs/OS-Atlas-InnovAll-Iter2-Sample/iter{i}"
# file_path_template = "../runs/OS-Atlas-InnovAll-Iter2-Sample/iter{i}/{name}_0.pkl.gz"

# Active experiment configuration (GUI-R1).
# NOTE(review): `dir` shadows the builtin of the same name. It is kept because
# the functions and cells below read this module-level name.
dir = "json_files/GUI-R1/InnovAll/Iter3"
result_path_template = "../runs/GUI-R1-InnovAll-Iter2-Sample/iter{i}"
file_path_template = "../runs/GUI-R1-InnovAll-Iter2-Sample/iter{i}/{name}_0.pkl.gz"

# NOTE(review): presumably this (re)builds sample_tasks_completion.json under
# `dir` from the run directories — confirm against help_functions.
get_success_tasks_for_self_training(dir, result_path_template)

# Read every JSON file with an explicit UTF-8 encoding so the result does not
# depend on the platform's locale default.
with open(f'{dir}/tasks.json', 'r', encoding='utf-8') as f:
    tasks = json.load(f)
with open(f'{dir}/key_nodes.json', 'r', encoding='utf-8') as f:
    key_nodes = json.load(f)
with open(f'{dir}/sample_tasks_completion.json', "r", encoding="utf-8") as f:
    results_raw = json.load(f)

正在计算第1轮
正在计算第2轮
正在计算第3轮
正在计算第4轮
正在计算第5轮


In [4]:
def calculate_performance_metrics(json_file_path, type = "-") -> dict:
    """Score label-function verdicts against the raw task results.

    Loads ``json_file_path`` (a JSON object mapping task name -> list of
    label-function verdicts) and compares each verdict, position by position,
    with the module-level ``results_raw[name]`` list. The resulting record is
    appended to ``f"{dir}/label_function_results.json"`` (``dir`` is the
    module-level results directory) and returned.

    Args:
        json_file_path: Path of the label-function verdict file to score.
        type: Free-form tag stored in the record (e.g. "before"/"after").
            The name shadows the builtin but is kept because callers pass it
            by keyword.

    Returns:
        dict with keys "time" (minute-resolution timestamp), "type" and
        "metrics" (TP/TN/FP/FN counts plus accuracy, precision, recall and f1
        rounded to three decimals; a ratio with a zero denominator is 0).

    Raises:
        KeyError: if a task name in the file is missing from ``results_raw``.

    Side effects:
        Prints ``FP: <name>`` for every false positive and rewrites the
        results log file.
    """
    final_result = {
        "time": datetime.now().strftime("%Y-%m-%dT%H:%M"),
        "type": type,
        "metrics": {"TP": 0, "TN": 0, "FP": 0, "FN": 0},
    }
    metrics = final_result["metrics"]

    # The verdict file is written as UTF-8 with ensure_ascii=False, so read it
    # back with the same encoding instead of the locale default.
    with open(json_file_path, "r", encoding="utf-8") as file:
        result_lf_all = json.load(file)

    for name, lf_results in result_lf_all.items():
        raw_results = results_raw[name]
        # zip stops at the shorter list: verdicts without a raw counterpart
        # (or vice versa) are not counted.
        for lf, raw in zip(lf_results, raw_results):
            if lf and raw:  # True positive
                metrics["TP"] += 1
            elif not lf and not raw:  # True negative
                metrics["TN"] += 1
            elif lf:  # False positive
                print("FP:", name)
                metrics["FP"] += 1
            else:  # False negative
                metrics["FN"] += 1

    TP, TN, FP, FN = (metrics[key] for key in ("TP", "TN", "FP", "FN"))
    total = TP + TN + FP + FN

    accuracy = (TP + TN) / total if total > 0 else 0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    metrics.update({
        "accuracy": round(accuracy, 3),
        "precision": round(precision, 3),
        "recall": round(recall, 3),
        "f1": round(f1, 3)
    })

    # Append this record to the running log (created on first use).
    output_json_path = f"{dir}/label_function_results.json"
    if os.path.exists(output_json_path):
        with open(output_json_path, "r", encoding="utf-8") as file:
            data = json.load(file)
    else:
        data = []
    data.append(final_result)
    with open(output_json_path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)

    return final_result

def get_result_from_graph(name, true_node):
    """Tell whether the satisfied key nodes form a start-to-end path.

    The graph is ``tasks[name]["key_nodes"]`` (module-level ``tasks``): a
    mapping of node id -> dict with "parent_node_id" and "child_node_id"
    lists. A node whose parents include "0" is a start node; a node whose
    children include "-1" is an end node.

    Args:
        name: Task name used to look up the graph in ``tasks``.
        true_node: Iterable of ids of the key nodes whose label function
            passed. Ids are compared as strings, so ints and strings both
            match the graph keys.

    Returns:
        True if some path start -> ... -> end exists that only visits nodes
        listed in ``true_node``; False otherwise (including an empty graph or
        an empty ``true_node``).
    """
    graph = tasks[name]["key_nodes"]

    # If true_node is empty or graph is empty, no valid path is possible
    if not true_node or not graph:
        return False

    # Normalise every id to str: graph keys loaded from JSON are strings,
    # while the caller's node ids may be ints.
    nodes = {str(node_id): info for node_id, info in graph.items()}
    satisfied = {str(node_id) for node_id in true_node}

    # Start the search from every satisfied start node.
    stack = [
        node_id for node_id, info in nodes.items()
        if "0" in [str(parent) for parent in info.get("parent_node_id", [])]
        and node_id in satisfied
    ]

    # Plain reachability search: a node that was already expanded never needs
    # to be expanded again, so no backtracking is required and every node is
    # visited at most once.
    visited = set()
    while stack:
        current = stack.pop()
        if current in visited:
            continue
        visited.add(current)

        children = [str(child) for child in nodes.get(current, {}).get("child_node_id", [])]
        if "-1" in children:  # current is an end node
            return True
        stack.extend(
            child for child in children
            if child in satisfied and child not in visited
        )

    return False

def process_single_file(file_path, name, index, key_nodes):
    """Run a task's label functions on one trajectory file.

    Relies on the module-level ``results_raw`` and ``tasks`` and on the
    helpers ``get_trajectory`` and ``get_result_from_graph``.

    Args:
        file_path: Path handed to ``get_trajectory``.
        name: Task name; key into ``results_raw`` and ``tasks``.
        index: 1-based sample index (int or numeric string) into
            ``results_raw[name]``.
        key_nodes: List of dicts with at least "id" and "label_function"
            (source code that is executed and expected to set ``result``).

    Returns:
        Tuple ``(result_raw, result_lf, is_potential)``:
        the recorded raw result, the label-function verdict from the key-node
        graph, and whether the run is "potential" — the raw result is True
        but the label functions reject it (for graphs with more than one key
        node, at least one key node must still have passed). When the
        trajectory cannot be loaded, ``(result_raw, False, False)``.
    """
    print(name, index)
    result_raw = results_raw[name][int(index)-1]
    print("raw result: ", result_raw)

    trajectory = get_trajectory(file_path)
    if trajectory is None:
        return result_raw, False, False

    # Index the key nodes once; setdefault keeps the first node per id.
    nodes_by_id = {}
    for node in key_nodes:
        nodes_by_id.setdefault(str(node['id']), node)

    results = []
    true_node = []
    key_nodes_idxs = tasks[name]["key_nodes"].keys()

    for key_node_idx in key_nodes_idxs:
        matching_node = nodes_by_id.get(str(key_node_idx))
        if matching_node is None:
            # A graph node without a label function cannot be verified, so it
            # counts as not satisfied instead of crashing the whole run.
            print(f"Warning: no label function found for key node {key_node_idx} of task {name}", flush=True)
            result = False
        else:
            label_function = matching_node['label_function']
            try:
                params = {"trajectory": trajectory}
                # SECURITY: exec runs the stored label-function source with
                # full interpreter privileges; only use key_nodes from a
                # trusted origin.
                exec(label_function, params)
                result = params.get('result', False)
            except Exception as e:
                # Best effort: a failing label function counts as False.
                result = False
                print(f"Error occurred: {str(e)}", flush=True)

        print(result)
        results.append(result)
        if result:
            # Record the graph's own key so the graph walk compares like with
            # like even when the node's stored "id" has a different type.
            true_node.append(key_node_idx)

    result_lf = get_result_from_graph(name, true_node)
    print("lf result: ", result_lf)

    # A run is "potential" when it really succeeded but the label functions
    # rejected it. With more than one key node, at least one must have passed.
    needs_partial_hit = len(key_nodes_idxs) > 1
    is_potential = (
        result_raw is True
        and result_lf is False
        and (not needs_partial_hit or any(results))
    )

    return result_raw, result_lf, is_potential

In [5]:
# Evaluate every sampled run with the current label functions, i.e. before the
# key-node graph is extended.
sample_num = 5
TP = TN = FP = FN = 0
result_lf_all = {}
potential_tasks = []

for name in tasks:
    result_lf_all[name] = []
    for index in range(1, sample_num + 1):
        file_path = file_path_template.format(name=name, i=index)
        result_raw, result_lf, is_potential = process_single_file(file_path, name, index, key_nodes)
        print()

        result_lf_all[name].append(result_lf)
        if is_potential:
            # Stored as "<task name>-<sample index>".
            potential_tasks.append(f"{name}-{index}")

# Persist the label-function verdicts for all tasks.
completion_file = f'{dir}/sample_tasks_completion_lf_before.json'
with open(completion_file, 'w', encoding='utf-8') as f:
    f.write(json.dumps(result_lf_all, ensure_ascii=False, indent=4))

# Persist the runs flagged as potential.
with open(f'{dir}/potential_tasks.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(potential_tasks, ensure_ascii=False, indent=4))

# Original author's note: takes roughly 25 minutes.
calculate_performance_metrics(completion_file, type="before")

CameraTakeVideo 1
raw result:  False
True
False
True
False
lf result:  False

CameraTakeVideo 2
raw result:  False
action无效: {"action_type": "click", "index": 10}
True
True
True
False
lf result:  False

CameraTakeVideo 3
raw result:  False
True
True
True
False
lf result:  False

CameraTakeVideo 4
raw result:  False
action无效: {"action_type": "click", "index": 10}
True
True
True
False
lf result:  False

CameraTakeVideo 5
raw result:  False
True
True
True
False
lf result:  False

ClockStopWatchRunning 1
raw result:  False
False
False
False
lf result:  False

ClockStopWatchRunning 2
raw result:  False
False
False
False
lf result:  False

ClockStopWatchRunning 3
raw result:  True
True
True
True
lf result:  True

ClockStopWatchRunning 4
raw result:  True
True
True
True
lf result:  True

ClockStopWatchRunning 5
raw result:  False
True
True
False
lf result:  False

ClockTimerEntry 1
raw result:  False
False
True
True
True
True
True
True
True
False
lf result:  False

ClockTimerEntry 2
raw resul

{'time': '2025-07-25T10:51',
 'type': 'before',
 'metrics': {'TP': 38,
  'TN': 44,
  'FP': 0,
  'FN': 3,
  'accuracy': 0.965,
  'precision': 1.0,
  'recall': 0.927,
  'f1': 0.962}}

### 扩展开始

In [6]:
import json

# Load the potential runs from potential_tasks.json under `dir`; each entry is
# formatted "<task name>-<sample index>".
with open(f'{dir}/potential_tasks.json', "r", encoding="utf-8") as f:
    potential_tasks = json.load(f)

# Name handed to both the label-function generator and the graph organizer;
# it does not change between tasks, so it is set once.
output_file = "temp"

for task in potential_tasks:
    # Split on the LAST dash only, so a task name that itself contains "-"
    # is still parsed into (name, index).
    name, index = task.rsplit('-', 1)
    print(name, index)

    file_path = file_path_template.format(i=index, name=name)
    trajectory = get_trajectory(file_path)

    if trajectory is None:
        print()
        continue

    # Generate key nodes for this run. The result gets its own name so the
    # module-level `key_nodes` list loaded from key_nodes.json is not
    # overwritten by per-task output.
    new_key_nodes, objective, _template = generate_key_nodes.get_key_node(file_path)
    print(objective, new_key_nodes)

    # Generate label functions for the new key nodes.
    generate_label_functions.get_label_function(name, new_key_nodes, trajectory, output_file)

    # Extend the label-function graph with the new key nodes.
    organize_codes.add_key_node(name, new_key_nodes, output_file, dir)
    print()

ClockTimerEntry 4
Action List: 
Open an app named 'Clock'.
Click on a UI element 'Timer' on the screen.
Click on a UI element '1' on the screen.
Click on a UI element '6' on the screen.
Click on a UI element '3' on the screen.
Click on a UI element '5' on the screen.
Set the task's status as 'complete'.
提取 key_actions:
Open an app named 'Clock'.
Click on a UI element 'Timer' on the screen.
Click on a UI element '1' on the screen.
Click on a UI element '6' on the screen.
Click on a UI element '3' on the screen.
Click on a UI element '5' on the screen.
Set the task's status as 'complete'.
Create a timer with 0 hours, 16 minutes, and 35 seconds. Do not start the timer. ["Open an app named 'Clock'.", "Click on a UI element 'Timer' on the screen.", "Click on a UI element '1' on the screen.", "Click on a UI element '6' on the screen.", "Click on a UI element '3' on the screen.", "Click on a UI element '5' on the screen.", "Set the task's status as 'complete'."]
Open an app named 'Clock'.
Res

In [7]:
import json

# dir = "json_files/OS-Atlas/Innov1/Iter1"
# dir = "json_files/GUI-R1/InnovAll/Iter2"

# Reload the task graph, key nodes and raw results from disk before
# re-evaluating.
# NOTE(review): presumably the expansion step rewrote tasks.json and
# key_nodes.json under `dir` — confirm against organize_codes.add_key_node.
# All three files are read as UTF-8 so the result does not depend on the
# platform's locale default.
with open(f'{dir}/tasks.json', 'r', encoding='utf-8') as f:
    tasks = json.load(f)
with open(f'{dir}/key_nodes.json', 'r', encoding='utf-8') as f:
    key_nodes = json.load(f)
with open(f'{dir}/sample_tasks_completion.json', "r", encoding="utf-8") as f:
    results_raw = json.load(f)

result_lf_all = {}
TP = TN = FP = FN = 0
potential_tasks = []
sample_num = 5

for name in tasks.keys():
    result_lf_all[name] = []
    for index in range(1, sample_num+1):
        # Debug filter for a single run:
        # if name != "SystemWifiTurnOn" or index != 2:
        #     continue
        file_path = file_path_template.format(i=index, name=name)
        result_raw, result_lf, is_potential = process_single_file(file_path, name, index, key_nodes)
        print()
        result_lf_all[name].append(result_lf)

# Persist the label-function verdicts for all tasks.
completion_file = f'{dir}/sample_tasks_completion_lf_after.json'
with open(completion_file, 'w', encoding='utf-8') as f:
    json.dump(result_lf_all, f, ensure_ascii=False, indent=4)

# Original author's note: takes roughly 25 minutes.
calculate_performance_metrics(completion_file, type="after")

CameraTakeVideo 1
raw result:  False


True
False
True
False
lf result:  False

CameraTakeVideo 2
raw result:  False
action无效: {"action_type": "click", "index": 10}
True
True
True
False
lf result:  False

CameraTakeVideo 3
raw result:  False
True
True
True
False
lf result:  False

CameraTakeVideo 4
raw result:  False
action无效: {"action_type": "click", "index": 10}
True
True
True
False
lf result:  False

CameraTakeVideo 5
raw result:  False
True
True
True
False
lf result:  False

ClockStopWatchRunning 1
raw result:  False
False
False
False
lf result:  False

ClockStopWatchRunning 2
raw result:  False
False
False
False
lf result:  False

ClockStopWatchRunning 3
raw result:  True
True
True
True
lf result:  True

ClockStopWatchRunning 4
raw result:  True
True
True
True
lf result:  True

ClockStopWatchRunning 5
raw result:  False
True
True
False
lf result:  False

ClockTimerEntry 1
raw result:  False
False
True
True
True
True
True
True
False
True
False
lf result:  False

ClockTimerEntry 2
raw result:  False
False
True
True
True


{'time': '2025-07-25T10:53',
 'type': 'after',
 'metrics': {'TP': 41,
  'TN': 44,
  'FP': 0,
  'FN': 0,
  'accuracy': 1.0,
  'precision': 1.0,
  'recall': 1.0,
  'f1': 1.0}}