In [None]:
!!pip install dspy
!!pip install --upgrade dspy

In [None]:
result_path = "/content/drive/MyDrive/arbigent/arbigent-results/feedback-result"

In [None]:
import dspy
lm = dspy.LM('openai/gpt-4o-mini')
dspy.configure(lm=lm)

In [None]:
import os
from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

In [None]:
import os
import json
import yaml
from pprint import pprint
import dspy
from dspy import Example
from dspy.teleprompt import COPRO
from dspy.primitives import Prediction

class ActionSchema(dspy.Signature):
    """Schema for generating agent actions"""
    image_description = dspy.OutputField(desc="Detailed description of current screen")
    memo = dspy.OutputField(desc="Notes about the next action to take")
    action = dspy.OutputField(
        desc="Action to execute",
        choices=["ClickWithIndex", "InputText", "BackPress", "KeyPress", "Scroll", "Wait", "GoalAchieved", "Failed"]
    )
    text = dspy.OutputField(desc="Additional text for the action", nullable=True)

class AgentSignature(dspy.Signature):
    """Take a deep breath. You are an agent that achieves the user's goal automatically. Please don't do anything the user doesn't want to do. Please be careful not to repeat the same action. It's better to achieve users' goals with the fewest number of actions."""
    image = dspy.InputField(desc="Base64 encoded image of the current screen", type=dspy.Image)
    text = dspy.InputField(desc="User's input text including goal and UI state")
    output:ActionSchema = dspy.OutputField(type=ActionSchema, desc="Generated action")


def load_training_data(yml_path, jsonls_dir):
    """Load training data from YAML and JSONL files"""
    with open(yml_path, "r") as f:
        data = yaml.safe_load(f)

    step_ids = [fb['stepId'] for fb in data.get("stepFeedbacks", []) if fb.get("type") == "Good"]

    examples = []
    for step_id in step_ids:
        jsonl_path = os.path.join(jsonls_dir, f"{step_id}.jsonl")
        if not os.path.exists(jsonl_path):
            continue

        with open(jsonl_path, "r") as f:
            for line in f:
                try:
                    record = json.loads(line)

                    user_msg = next(m for m in record["requestBody"]["messages"] if m["role"] == "user")

                    # Get image URL (base64)
                    image_content = next(c for c in user_msg["content"] if c["type"] == "image_url")
                    image = dspy.Image.from_url(image_content["image_url"]["url"])

                    text_content = next(c for c in user_msg["content"] if c["type"] == "text")
                    text = text_content["text"]

                    response = json.loads(record["responseBody"]["choices"][0]["message"]["content"])

                    examples.append(Example(
                        image=image,
                        text=text,
                        output=response
                    ).with_inputs("image", "text"))

                except Exception as e:
                    print(f"Error processing {step_id}: {str(e)}")
                    continue

    return examples

def action_validator(example, pred, trace=None):
    # Response comparison
    expected = example.output
    print("raw output", pred.output)
    predicted = pred.output
    # predicted = json.loads(pred.output)
    # print("parsed output", predicted)

    # Action match check
    action_match = (expected["action"].lower() == predicted.action.lower())

    # Text field partial match
    text_match = (str(expected.get("text", "")).lower() == str(predicted.text).lower())
    print("predicted image_description:", predicted.image_description)
    print("expected image_description:", expected.get("image_description", ""))
    if(not(action_match)):
        print("action_match: ", action_match)
        print("expected: ", expected["action"])
        print("predicted: ", predicted.action)
    if(not(text_match)):
        print("text_match: ", text_match)
        print("expected: ", expected.get("text", "").lower() )
        print("predicted: ", predicted.text.lower())

    return action_match and text_match


base_optimizer = dspy.Predict(AgentSignature)

def optimize_prompt(yml_path, jsonls_dir):

    train_examples = load_training_data(yml_path, jsonls_dir)


    optimizer = COPRO(
        metric=action_validator,
        prompt_model=dspy.LM('openai/gpt-4o-mini'),
        breadth=4,
        depth=4,
        init_temperature=1.2,
        max_retries=5,
    )

    # Run optimization
    optimized_module = optimizer.compile(
        base_optimizer,
        trainset=train_examples,
        eval_kwargs={'num_threads': 1, 'display_progress': True}
    )

    return optimized_module

if __name__ == "__main__":
    yml_path = result_path + "/result.yml"
    jsonls_dir = result_path + "/jsonls"

    optimized = optimize_prompt(yml_path, jsonls_dir)

    optimized.save("optimized_prompt.json")


In [28]:
pprint(base_optimizer)
pprint(optimized)

Predict(AgentSignature(image, text -> output
    instructions="Take a deep breath. You are an agent that achieves the user's goal automatically. Please don't do anything the user doesn't want to do. Please be careful not to repeat the same action. It's better to achieve users' goals with the fewest number of actions."
    image = Field(annotation=str required=True json_schema_extra={'desc': 'Base64 encoded image of the current screen', '__dspy_field_type': 'input', 'prefix': 'Image:'})
    text = Field(annotation=str required=True json_schema_extra={'desc': "User's input text including goal and UI state", '__dspy_field_type': 'input', 'prefix': 'Text:'})
    output = Field(annotation=ActionSchema required=True json_schema_extra={'desc': 'Generated action', '__dspy_field_type': 'output', 'prefix': 'Output:'})
))
Predict(StringSignature(image, text -> output
    instructions="As an intuitive agent, your primary mission is to seamlessly empower users toward their goals while being acutely

In [29]:
train_examples = load_training_data(yml_path, jsonls_dir)
base_scores = []
for x in train_examples:
    pred = base_optimizer(**x.inputs())
    score = action_validator(x, pred)
    base_scores.append(score)

base_accuracy = base_scores.count(True) / len(base_scores)
print("Base Accuracy: ", base_accuracy)

copro_scores = []
for x in train_examples:
    pred = optimized(**x.inputs())
    score = action_validator(x, pred)
    copro_scores.append(score)

copro_scores = copro_scores.count(True) / len(copro_scores)
print("CORPO Accuracy: ", copro_scores)

raw output image_description='The Settings menu is displayed with options for Network & internet, Connected devices, Apps, and Notifications. There is a search bar at the top.' memo="Next, I will click on the 'Apps' option to continue navigating towards the 'About emulated device' page." action='ClickWithIndex' text='4'
predicted image_description: The Settings menu is displayed with options for Network & internet, Connected devices, Apps, and Notifications. There is a search bar at the top.
expected image_description: 
text_match:  False
expected:  1
predicted:  4
raw output image_description="The screen displays information about an emulated device, including the device name, Google account, and phone number. There is an option for 'Legal information' at the bottom." memo="Click on the 'Legal information' option to proceed." action='ClickWithIndex' text='3'
predicted image_description: The screen displays information about an emulated device, including the device name, Google account

In [None]:
lm.inspect_history(5)