## Import the data

Create the datapoints: convert each Mind2Web step into a sequence of browser states and actions.

In [None]:
from datasets import load_dataset

# Load the "osunlp/Multimodal-Mind2Web" dataset through the `datasets` library.
ds = load_dataset("osunlp/Multimodal-Mind2Web")

In [26]:
# Pick the "train" split out of the loaded dataset object.
train = ds.get("train")

# One shared iterator over the split; the export loop further down in this
# cell consumes it row by row.
train_iterator = iter(train)

# Show the field names available on a single row.
print(list(train[0].keys()))

import dataclasses
from dataclasses import dataclass
from typing import List, Literal, Tuple
import json

@dataclass
class Coordinate:
    """An (x, y) position.

    In this file it is used for the mouse position stored on
    ``BrowserState.mouse`` (presumably in pixels relative to the visible
    viewport -- confirm against the consumer of the exported data).
    """
    x: int  # horizontal component
    y: int  # vertical component

@dataclass
class ScrollBar:
    """Vertical scroll position of the viewport, relative to the full page.

    ``process_step`` fills both fields with values divided by the full
    screenshot height, so they are fractions of the page height.
    """
    offset: float  # top edge of the viewport / full page height
    height: float  # viewport height / full page height

@dataclass
class BrowserState:
    """What the browser looks like immediately before an action is taken."""
    screenshot: str  # base64 text of the viewport image (see viewport_screenshot)
    height: int  # viewport height
    width: int  # viewport width
    scrollbar: ScrollBar  # where the viewport sits within the full page
    url: str  # page URL; the code visible in this file always passes ''
    mouse: Coordinate  # current mouse position

@dataclass
class BrowserAction:
    """A single action issued to the browser, with the reason for taking it."""
    # Name of the action; restricted to the literals listed below.
    action: Literal[
        "success",
        "failure",
        "key",
        "type",
        "mouse_move",
        "left_click",
        "left_click_drag",
        "right_click",
        "middle_click",
        "double_click",
        "screenshot",
        "cursor_position",
        "scroll_up",
        "scroll_down",
    ]
    # TODO: Do we want to use Coordinate class here, or easier to just construct with tuple
    # Target position for pointer actions; None for actions that take no position.
    coordinate: tuple[int, int] | None
    # Key name (e.g. "PAGE_UP") or text to type; None for actions without text.
    text: str | None
    # Human-readable explanation of why the action is taken.
    reasoning: str
    # Identifier of the action; process_step fills it from generate_tool_id().
    id: str


@dataclass
class BrowserStep:
    """One exported record: the observed state paired with the action taken."""
    state: BrowserState  # browser state captured before the action
    action: BrowserAction  # action performed from that state

import random

def generate_tool_id() -> str:
    """Return a random identifier of the form ``toolu_01`` + 22 characters.

    Each of the 22 trailing characters is drawn independently with
    ``random.choice`` from the ASCII upper-case letters, lower-case letters
    and digits, so the result is reproducible under ``random.seed``.
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
    suffix = "".join(random.choice(alphabet) for _ in range(22))
    return 'toolu_01' + suffix


def is_in_viewport(viewport, point):
    """Tell whether ``point`` lies inside ``viewport``, edges included.

    ``viewport`` is a 4-item ``(x1, y1, x2, y2)`` sequence and ``point`` a
    2-item ``(x, y)`` sequence.
    """
    left, top, right, bottom = viewport
    px, py = point
    within_columns = left <= px <= right
    return within_columns and top <= py <= bottom

def scroll_viewport(direction, viewport, y_max):
    """Return ``viewport`` shifted vertically by 75% of its own height.

    Args:
        direction: ``"up"`` or ``"down"``.
        viewport: ``(x1, y1, x2, y2)`` rectangle to move.
        y_max: largest value allowed for the bottom edge.

    Returns:
        A new ``(x1, y1, x2, y2)`` tuple with the x values untouched. The top
        edge is kept at or above 1 and the bottom edge at or below ``y_max``;
        the bottom-edge check runs last, and the height is preserved.

    Raises:
        ValueError: if ``direction`` is neither ``"up"`` nor ``"down"``.
    """
    left, top, right, bottom = viewport
    span = bottom - top
    step = 0.75 * span

    if direction not in ("up", "down"):
        raise ValueError("Direction must be 'up' or 'down'")

    if direction == "up":
        top = max(1, top - step)
        bottom = top + span
    else:
        bottom = min(y_max, bottom + step)
        top = bottom - span

    # Re-clamp both edges while keeping the height; the bottom clamp runs
    # last, so it wins when the viewport is taller than the page.
    if top < 1:
        top = 1
        bottom = top + span
    if bottom > y_max:
        bottom = y_max
        top = bottom - span

    return (left, top, right, bottom)

def viewport_screenshot(screenshot, viewport):
    """Crop ``screenshot`` to ``viewport`` and return it as base64 JPEG text.

    Args:
        screenshot: image object exposing ``crop`` and, on the cropped
            result, ``mode``, ``convert`` and ``save`` (presumably a
            ``PIL.Image.Image`` -- confirm against the dataset loader).
        viewport: ``(x1, y1, x2, y2)`` rectangle; each value is truncated to
            ``int`` before cropping.

    Returns:
        The JPEG bytes (quality 85) of the cropped region, base64-encoded
        and decoded to a ``str``.
    """
    import base64
    from io import BytesIO

    left, top, right, bottom = map(int, viewport)
    # crop() hands back a new image, so the source needs no defensive copy.
    cropped_image = screenshot.crop((left, top, right, bottom))

    # JPEG cannot store alpha or palette images; convert those modes to RGB
    # instead of letting save() raise.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if cropped_image.mode not in jpeg_modes:
        cropped_image = cropped_image.convert("RGB")

    buffered = BytesIO()
    cropped_image.save(buffered, format="JPEG", quality=85)
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def process_step(step, mouse_coordinates: Coordinate) -> List[BrowserStep]:
    """Expand one dataset row into the low-level browser steps that perform it.

    The row's full-page screenshot is viewed through a 16:10 viewport that
    starts at the top of the page. The function emits, in order:

    1. one ``key`` step (``PAGE_UP`` / ``PAGE_DOWN``) per scroll needed to
       bring the centre of the first positive candidate's bounding box into
       the viewport,
    2. a ``mouse_move`` step to that centre (viewport-relative coordinates),
    3. a ``left_click`` step,
    4. a ``type`` step carrying the operation's value when the operation is
       ``TYPE`` or ``SELECT``.

    Args:
        step: mapping with at least ``"screenshot"`` (image exposing
            ``.size``), ``"pos_candidates"`` (list of JSON strings) and
            ``"operation"`` (JSON string with ``"op"`` and ``"value"``).
        mouse_coordinates: mouse position recorded on every state that
            precedes the mouse move.

    Returns:
        The generated steps. An empty list is returned when the row has no
        positive candidate, or when the target centre lies outside the area
        the viewport can ever cover (such rows previously made the scroll
        loop spin forever).
    """
    if not step["pos_candidates"]:
        return []

    # The viewport starts as the top 16:10 slice of the full-page screenshot.
    screenshot = step["screenshot"]
    width, height = screenshot.size
    viewport_height = width * 10 / 16
    viewport = (0, 0, width, viewport_height)
    y_max = float(height)

    # Centre of the first positive candidate's bounding box, in page coordinates.
    candidate = json.loads(step["pos_candidates"][0])
    attributes = json.loads(candidate["attributes"])
    x, y, box_width, box_height = map(float, attributes["bounding_box_rect"].split(','))
    center_x = x + box_width / 2
    center_y = y + box_height / 2

    # Scrolling only moves the viewport vertically between the page top and
    # bottom, so a target outside that area can never come into view.
    reachable_bottom = max(y_max, viewport_height)
    if not (0 <= center_x <= width and 0 <= center_y <= reachable_bottom):
        return []

    def capture_state(current_viewport, mouse) -> BrowserState:
        # Snapshot of what is visible through `current_viewport`.
        return BrowserState(
            url='',
            screenshot=viewport_screenshot(screenshot, current_viewport),
            height=viewport_height,
            width=width,
            scrollbar=ScrollBar(
                offset=float(current_viewport[1]) / y_max,
                height=float(viewport_height) / y_max,
            ),
            mouse=mouse,
        )

    cerebellum_steps: List[BrowserStep] = []
    scroll_keys = {
        "up": ("PAGE_UP", "Press the Page Up key to scroll up"),
        "down": ("PAGE_DOWN", "Press the Page Down key to scroll down"),
    }

    # Scroll until the target centre is inside the viewport.
    while not is_in_viewport(viewport, (center_x, center_y)):
        direction = "up" if center_y < viewport[1] else "down"
        key_name, why = scroll_keys[direction]

        state = capture_state(viewport, mouse_coordinates)
        scroll_action = BrowserAction(
            action="key",
            coordinate=None,
            text=key_name,
            reasoning=why,
            id=generate_tool_id(),
        )
        cerebellum_steps.append(BrowserStep(state=state, action=scroll_action))

        scrolled = scroll_viewport(direction, viewport, y_max)
        if scrolled == viewport:
            # Defensive: no progress is possible, so give up rather than loop.
            return []
        viewport = scrolled

    # Target position relative to the current viewport, as whole numbers to
    # match the int-typed coordinate fields.
    target_x = round(center_x - viewport[0])
    target_y = round(center_y - viewport[1])

    mouse_move_action = BrowserAction(
        action="mouse_move",  # the name declared in BrowserAction.action
        coordinate=(target_x, target_y),
        text=None,
        reasoning="Move mouse to the center of the element",
        id=generate_tool_id(),
    )
    cerebellum_steps.append(
        BrowserStep(
            state=capture_state(viewport, mouse_coordinates),
            action=mouse_move_action,
        )
    )

    # From here on the mouse is treated as sitting on the target.
    mouse_coordinates = Coordinate(x=target_x, y=target_y)

    left_click_action = BrowserAction(
        action="left_click",
        coordinate=None,
        text=None,
        reasoning="Perform a left click on element",
        id=generate_tool_id(),
    )
    cerebellum_steps.append(
        BrowserStep(
            state=capture_state(viewport, mouse_coordinates),
            action=left_click_action,
        )
    )

    # TYPE and SELECT operations additionally type the operation's value.
    operation = json.loads(step["operation"])
    if operation["op"] in ("TYPE", "SELECT"):
        type_action = BrowserAction(
            action='type',
            coordinate=None,
            text=operation["value"],
            reasoning="Typing text set to desired value",
            id=generate_tool_id(),
        )
        cerebellum_steps.append(
            BrowserStep(
                state=capture_state(viewport, mouse_coordinates),
                action=type_action,
            )
        )

    return cerebellum_steps

# Export every task in the split to its own JSON-lines file: the first line
# holds the goal, each following line one BrowserStep.
import itertools
import os

output_dir = 'mind2web'
# Make sure the target directory exists before the first file is opened.
os.makedirs(output_dir, exist_ok=True)

# Rows of one task are consecutive and share an annotation_id, so grouping on
# that key yields one group per task. Unlike a manual next() loop, this also
# flushes the final task and stops cleanly when the iterator is exhausted.
for task_id, task_rows in itertools.groupby(
    train_iterator, key=lambda row: row["annotation_id"]
):
    steps = list(task_rows)
    goal = steps[0]["confirmed_task"]

    print('Grabbing steps for:', goal)

    cerebellum_steps: List[BrowserStep] = []
    for raw_step in steps:
        # Every raw step starts from the same nominal mouse position.
        cerebellum_steps += process_step(raw_step, Coordinate(x=1, y=1))

    output_file_path = f'{output_dir}/{task_id}.jsonl'

    with open(output_file_path, 'w') as outfile:
        outfile.write(json.dumps({"goal": goal}))
        outfile.write('\n')
        for this_step in cerebellum_steps:
            # One JSON object per line.
            outfile.write(json.dumps(dataclasses.asdict(this_step)))
            outfile.write('\n')

['action_uid', 'raw_html', 'cleaned_html', 'operation', 'pos_candidates', 'neg_candidates', 'website', 'domain', 'subdomain', 'annotation_id', 'confirmed_task', 'screenshot', 'action_reprs', 'target_action_index', 'target_action_reprs']
Grabbing steps for: rent a car in Brooklyn - Central, NY on from April 9 to April 15.
Grabbing steps for: Show computer game reviews sorted by score.
Grabbing steps for: Find the address and store hours for the Armageddon Shop record store in Boston.
Grabbing steps for: Buy a copy of the Gorillaz first studio album.
Grabbing steps for: Buy a pop rock album CD from the United Kingdom that was released in 2016, is between £15 and £20 and in perfect condition.
Grabbing steps for: Find a full-time job in Budget USA in finance in any location, and apply to the latest job.
Grabbing steps for: What is the cheapest luxury car to pickup on the second closest nearby location to New York, United States, 100
Grabbing steps for: Book the cheapest long-term car renta

KeyboardInterrupt: 