In [None]:
# nohup papermill new_intent.ipynb output.ipynb > output.log 2>&1 &
# jupyter nbconvert --to notebook --execute new_intent.ipynb

In [1]:
import os

# Select the visible GPUs and the Hugging Face mirror *before* importing torch
# or any library that may read these variables while it is being loaded, so
# the settings do not depend on how lazily those libraries initialise.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

import gc
import importlib
import json

import yaml
import torch
from PIL import Image

# Reload the local helper module so that edits to help_functions.py are picked
# up without restarting the kernel, then re-export its names into the notebook.
# NOTE(review): the star import hides which helpers this notebook relies on
# (get_trajectory is used below) — consider importing them explicitly.
import help_functions
importlib.reload(help_functions)
from help_functions import *

In [2]:
# Model setup: load the base vision-language model, attach the LoRA adapter
# on top of it, and build the matching processor.
from peft import PeftModel
from qwen_vl_utils import process_vision_info

max_pixels = 262144     # Keep inference consistent with training (passed to the processor below)
torch_dtype = torch.float16


# Alternative configuration (OS-Atlas on Qwen2-VL), kept commented out for reference.
# from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
# base_model_path = "../../OS-Atlas-main/models/OS-Atlas-Base-7B"
# lora_weights_path = "../../OS-Atlas-main/saves/android_world/OS-Atlas-InnovAll-Iter2"  # use the model produced by the previous iteration

# # Load the base model
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     base_model_path,
#     torch_dtype=torch_dtype,
#     device_map="auto"
# )

from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
base_model_path = "../../GUI-R1/models/GUI-R1-7B"
lora_weights_path = "../../GUI-R1/saves/android_world/GUI-R1-InnovAll-Iter2"  # use the model produced by the previous iteration

# Load the base model
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    base_model_path,
    torch_dtype=torch_dtype,
    device_map="auto"
)

# Load the LoRA weights and apply them to the base model
model = PeftModel.from_pretrained(
    model,
    lora_weights_path,
    torch_dtype=torch_dtype,
    device_map="auto"
)

# The processor is built from the base model path with the same pixel budget as above.
processor = AutoProcessor.from_pretrained(
    base_model_path,
    max_pixels=max_pixels
)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
def get_messages(prompt, text):
    """Build a two-turn chat payload: a system prompt plus one user text turn.

    Args:
        prompt: Text placed in the system message.
        text: Text placed in the user message.

    Returns:
        A list ``[system_message, user_message]``. Each message is a dict with
        ``"role"`` and ``"content"`` keys, where ``content`` is a list holding
        a single ``{"type": "text", "text": ...}`` entry.
    """
    def _text_part(value):
        # One text segment in the multimodal content-list format.
        return {"type": "text", "text": value}

    system_turn = {"role": "system", "content": [_text_part(prompt)]}
    user_turn = {"role": "user", "content": [_text_part(text)]}
    return [system_turn, user_turn]

def get_messages_new(prompt, messages, front_text=None, back_text=None):
    """Prepend a system prompt to ``messages`` and optionally wrap the first user turn.

    The caller's ``messages`` list and the message dicts inside it are never
    modified: when text has to be added, the first user message is copied and
    the copy is edited. Calling this function repeatedly on the same input
    therefore does not accumulate ``front_text`` / ``back_text`` entries.

    Args:
        prompt: Text for the system message placed at the front of the result.
        messages: Sequence of chat message dicts, each with a ``"role"`` key
            and a list-valued ``"content"`` key.
        front_text: Optional text inserted as the first content entry of the
            first user message.
        back_text: Optional text appended as the last content entry of the
            first user message.

    Returns:
        A new list: the system message followed by the (possibly updated)
        messages.
    """
    system_message = {
        "role": "system",
        "content": [
            {"type": "text", "text": prompt},
        ]
    }

    # The system message always goes first.
    messages_with_prompt = [system_message] + list(messages)

    # Only the first user message receives the extra text.
    if front_text or back_text:
        for index, message in enumerate(messages_with_prompt):
            if message['role'] != 'user':
                continue

            # Work on a copy of the content list so the caller's message
            # object stays untouched.
            content = list(message['content'])
            if front_text is not None:
                content.insert(0, {"type": "text", "text": front_text})
            if back_text is not None:
                content.append({"type": "text", "text": back_text})

            messages_with_prompt[index] = {**message, 'content': content}
            break

    return messages_with_prompt

def get_response(messages):
    """Run one chat request through the global ``model`` and return the decoded reply.

    Args:
        messages: Chat messages accepted by ``processor.apply_chat_template``
            and ``process_vision_info``.

    Returns:
        The decoded text of the first (only) generated sequence. Special
        tokens are kept in the string (``skip_special_tokens=False``).
    """
    # Render the conversation to a prompt string that ends with the generation prompt.
    prompt_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    images, videos = process_vision_info(messages)

    # Tokenise text and vision inputs as a batch of one, then move to the model's device.
    model_inputs = processor(
        text=[prompt_text],
        images=images,
        videos=videos,
        padding=True,
        return_tensors="pt",
    )
    model_inputs = model_inputs.to(model.device)

    output_ids = model.generate(**model_inputs, max_new_tokens=512)

    # Keep only the tokens that come after the prompt length in each sequence.
    completions = []
    for prompt_ids, full_ids in zip(model_inputs.input_ids, output_ids):
        completions.append(full_ids[len(prompt_ids):])

    decoded = processor.batch_decode(
        completions,
        skip_special_tokens=False,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0]

In [4]:
# Alternative run configuration (OS-Atlas); swap it with the GUI-R1 block below to use it.
# dir = "json_files/OS-Atlas/InnovAll/Iter3"
# result_path_template = "../runs/OS-Atlas-InnovAll-Iter2-Sample/iter{i}"
# file_path_template = "../runs/OS-Atlas-InnovAll-Iter2-Sample/iter{i}/{name}_0.pkl.gz"

# NOTE(review): `dir` shadows the Python builtin; the name is kept because the
# final cell reads it when writing the results file.
dir = "json_files/GUI-R1/InnovAll/Iter3"
result_path_template = "../runs/GUI-R1-InnovAll-Iter2-Sample/iter{i}"
file_path_template = "../runs/GUI-R1-InnovAll-Iter2-Sample/iter{i}/{name}_0.pkl.gz"

# Read both inputs explicitly as UTF-8 (matching how the results file is
# written) instead of relying on the platform's default encoding.
# Per-task completion flags: task name -> one entry per sampled run.
with open(f'{dir}/sample_tasks_completion_lf_after.json', 'r', encoding='utf-8') as json_file:
    success_tasks = json.load(json_file)
with open('prompt.yaml', 'r', encoding='utf-8') as file:
    prompts = yaml.safe_load(file)

# System prompts for the two stages: intent generation and intent refinement.
prompt_new_intent = prompts['prompt_new_intent']
prompt_refine_intent = prompts['prompt_refine_intent']

In [5]:
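# Convert a single task's trajectory into chat-message format, giving the model the context it needs to extract key information from the expert trajectory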
def convert_trajectories_to_messages_format(file_path):
    """Turn one saved trajectory into a list of per-step user messages.

    Args:
        file_path: Path handed to ``get_trajectory`` (called with
            ``get_screenshot=True``).

    Returns:
        ``None`` when ``get_trajectory`` returns ``None``; otherwise a list
        with one ``{"role": "user", ...}`` message per entry of
        ``trajectory['action_output_raw']``. Each message holds a
        ``"SCREENSHOT:"`` label, the step's screenshot as a PIL image, and a
        text block with the step's observation and action.
    """
    trajectory = get_trajectory(file_path, get_screenshot=True)
    if trajectory is None:
        return None

    step_template = "\nOBSERVATION: {observation}\nACTION: {action}"

    def _build_step_message(step):
        # Observation/action text first, then the screenshot for the same step.
        step_text = step_template.format(
            observation=trajectory['element_lists'][step],
            action=trajectory['action_output_raw'][step],
        )
        screenshot = trajectory['screenshot'][step]
        return {
            "role": "user",
            "content": [
                {"type": "text", "text": "SCREENSHOT:"},
                {"type": "image", "image": Image.fromarray(screenshot)},
                {"type": "text", "text": step_text},
            ],
        }

    step_count = len(trajectory['action_output_raw'])
    return [_build_step_message(step) for step in range(step_count)]

In [6]:
# Generate a new task intent for every failed trajectory, keep only the ones
# the refine step accepts, and write the result to `{dir}/intents_new.json`.
new_intents = {}
for name, task_completion in success_tasks.items():
    new_intents[name] = {}
    for i, if_pass in enumerate(task_completion):
        n = i + 1  # 1-based index used in the file path and as the output key
        # Debug filter: uncomment to process a single task/run only.
        # if name != "ClockTimerEntry" or n != 5:
        #     continue
        if if_pass:  # only failed trajectories are processed
            continue

        print(f"Processing {name}-{n} ...")
        file_path = file_path_template.format(i=n, name=name)

        # Reset per-iteration state so that the cleanup below is always valid,
        # even when an exception is raised before `msgs` has been assigned,
        # and so that no value from a previous iteration can leak through.
        messages = msgs = result = None
        invalid_bool = False
        try:
            messages = convert_trajectories_to_messages_format(file_path)
            if messages is None:
                continue

            # Step 1: generate the new task intent.
            front_text = "Trajectory:\n"
            back_text = "\nNew task intent:"
            msgs = get_messages_new(prompt_new_intent, messages, front_text, back_text)
            candidate_intent = get_response(msgs).replace("<|im_end|>", "").replace("```", "")
            print(f"✅ Got intent: {candidate_intent}")

            # Step 2: quality check.
            if len(candidate_intent) > 1000:
                # Overly long candidates are rejected without calling the model.
                result = "INVALID"
                invalid_bool = True
                print("Exceeds length limit.")
            else:
                # Otherwise ask the model to refine the candidate.
                text = f"{candidate_intent}\n"
                msgs = get_messages(prompt_refine_intent, text)
                result = get_response(msgs).replace("<|im_end|>", "")
                if len(result.split()) <= 2 or "INVALID" in result:
                    invalid_bool = True
                print("Refine intent:", result)

        except Exception as e:
            # Report the error but keep the loop going.
            print(f"  🔴 Error at {name}-{n}: {e}")
            invalid_bool = True

        # Only store intents that passed validation.
        if not invalid_bool:
            new_intents[name][str(n)] = result

        # Release this iteration's messages (including screenshots), collect
        # garbage first, then return cached GPU memory. Rebinding to None
        # instead of `del` cannot raise NameError when the try block failed
        # before `msgs` was assigned.
        messages = msgs = None
        gc.collect()
        torch.cuda.empty_cache()
        print()

# Finally write the results to disk.
with open(f"{dir}/intents_new.json", 'w', encoding='utf-8') as file:
    json.dump(new_intents, file, ensure_ascii=False, indent=4)

Processing CameraTakeVideo-1 ...
✅ Got intent: SCREENSHOT:  
15:34
7:G
4
6
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3
2
1
0
5
4
3

Refine intent: INVALID

Processing CameraTakeVideo-2 ...
✅ Got intent: The trajectory indicates that the user has successfully completed the task of scrolling through the app drawer to a specific app and initiating an action on it. The task intent is to open the app "YouTube" and proceed with the action of checking its information. The observation confirms that the app "YouTube" is currently in the process of checking its info

Token indices sequence length is longer than the specified maximum sequence length for this model (210521 > 131072). Running this sequence through the model will result in indexing errors


  🔴 Error at RecipeDeleteMultipleRecipesWithConstraint-2: CUDA out of memory. Tried to allocate 160.18 GiB. GPU 0 has a total capacity of 47.40 GiB of which 28.18 GiB is free. Including non-PyTorch memory, this process has 19.20 GiB memory in use. Of the allocated memory 18.53 GiB is allocated by PyTorch, and 362.18 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Processing RecipeDeleteSingleWithRecipeWithNoise-4 ...
  🔴 Error at RecipeDeleteSingleWithRecipeWithNoise-4: CUDA out of memory. Tried to allocate 40.05 GiB. GPU 0 has a total capacity of 47.40 GiB of which 36.03 GiB is free. Including non-PyTorch memory, this process has 11.35 GiB memory in use. Of the allocated memory 10.78 GiB is allocated by PyTorch, and 253.72 MiB is reserved by PyTorch