In [None]:
import pickle as pkl
import base64
from openai import OpenAI
import imageio.v3 as iio
from PIL import Image
import re
import imageio.v3 as iio
from IPython.display import display

In [None]:
# Initialize the OpenAI API client.
# Paste a key into `api_key` to use it explicitly. When it is left empty, None
# is passed instead so the client falls back to its documented default (the
# OPENAI_API_KEY environment variable) rather than sending an empty key with
# every request.
api_key = ''

client = OpenAI(
    api_key=api_key or None,  # None => the client reads OPENAI_API_KEY itself
)

def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text.

    The file is opened in binary mode; the base64 output is decoded as UTF-8
    so the result is a ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

In [None]:
def get_task_specific(task_name):
    """Return the extra prompt rule that applies to the given task suite.

    Raises:
        ValueError: if ``task_name`` matches none of the known suites.
    """
    # Each entry pairs the suite names with the rule sentence they share.
    suite_rules = (
        (
            ("object", "spatial"),
            "The first word of the instruction should be `PICK', and then it should also include the word `PLACE'.",
        ),
        (
            ("goal",),
            "The first word of the instruction should remain the same.",
        ),
        (
            ("10",),
            "Words like `PUT' and `PICK' should remain the same.",
        ),
    )
    for suite_names, rule in suite_rules:
        if task_name in suite_names:
            return rule
    raise ValueError(f"No task-specific rules defined for {task_name}")

def produce_alternate_prompts(task_name, version_number):
    """
    Produce alternate prompts for a given task_name and version_number.

    For task indices 0-9 this reads the task description from
    ``../data/{task_name}_descriptions.pkl``, saves and displays the first
    frame of the matching rollout video, asks ``gpt-4o-mini`` for 20
    rephrasings of the description, and finally pickles a dict mapping each
    task index to its list of 20 instructions.

    version_number = 1  => Original version of produce_alternate_prompts
    version_number = 2  => Includes task-specific bullet point (like produce_alternate_prompts_2)
    version_number = 3  => Includes task-specific bullet point and "Make changes as minor as possible" bullet point
                           (like produce_alternate_prompts_3)

    Raises:
        ValueError: if version_number is not 1, 2, or 3, or (for versions 2
            and 3) if get_task_specific has no rule for task_name.
        RuntimeError: if fewer than 20 instructions can be parsed from a model
            response. Nothing is written to the output file in that case.
    """
    # Choose the output file path based on version_number
    if version_number == 1:
        output_path = f"../data/{task_name}_alternate_instructions.pkl"
    elif version_number == 2:
        output_path = f"../data/{task_name}_alternate_instructions_2.pkl"
    elif version_number == 3:
        output_path = f"../data/{task_name}_alternate_instructions_3.pkl"
    else:
        raise ValueError("version_number must be 1, 2, or 3.")

    # If version_number requires task-specific instructions, retrieve them
    if version_number in [2, 3]:
        task_specific = get_task_specific(task_name)
        task_specific_line = f"- {task_specific}"
    else:
        task_specific_line = ""

    # If version_number=3, add the additional 'minor changes' bullet
    if version_number == 3:
        minor_changes_line = "- Make the changes as minor as possible, as the robot's language system is not very robust to rephrasing."
    else:
        minor_changes_line = ""

    # Load the existing task descriptions
    with open(f"../data/{task_name}_descriptions.pkl", "rb") as f:
        task_descriptions = pkl.load(f)

    # Number of rephrasings requested in the prompt and kept per task
    required_count = 20

    # Container for all instructions
    all_instructions = {}

    # Loop over tasks
    for task_idx in range(10):
        task_description = task_descriptions[task_idx]
        print("TASK:", task_description, "| IDX:", task_idx)

        # Process description for the video file name
        processed_task_description = (
            task_description.lower()
            .replace(" ", "_")
            .replace("\n", "_")
            .replace(".", "_")[:50]
        )

        video_path = (
            f"/local/zemel/tom/code/vla_uq/openvla/rollouts/"
            f"libero_{task_name}--{task_idx}--episode=1--task={processed_task_description}.mp4"
        )

        # Read only the first frame from the video
        frame = iio.imread(video_path, index=0)
        image = Image.fromarray(frame)
        image_path = f"../data/image_grabs/{task_name}_{task_idx}_first_frame.png"
        image.save(image_path)
        display(image)

        # Build the prompt text
        prompt_text = f"""You are generating alternative phrasings of a robotic task instruction while preserving its exact meaning.

### Task Instruction:
'{task_description}'

### Instructions:
- Generate **20** alternative ways to phrase the task instruction.
- Keep each instruction **concise and unambiguous**.
- Ensure the instructions remain suitable for a **robot, not a human**.
- Only make **semantically meaningless** changes (e.g., word order, synonyms, slight rewording).
- Double-check that the new instructions mean the same exact thing for the robot; do not just substitute synonyms without considering context.
- Do **not** introduce additional steps, remove essential details, or alter the action.
{minor_changes_line}
{task_specific_line}

### Output Format:
Each rephrased instruction should be wrapped in `[instruction]` and `[/instruction]` tags, like this:
[instruction] Rephrased instruction 1 [/instruction] 
[instruction] Rephrased instruction 2 [/instruction]
"""

        print(prompt_text)

        # Convert the saved frame to base64 for sending to the model
        base64_image = encode_image(image_path)

        # The frame was saved as a PNG above, so the data URL must declare
        # image/png rather than image/jpeg.
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt_text},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}
                    ],
                }
            ],
        )

        # Retrieve the model output
        output_text = response.choices[0].message.content
        print(response.choices[0])

        # Extract text between [instruction] and [/instruction]
        instructions = re.findall(r"\[instruction\](.*?)\[/instruction\]", output_text, re.DOTALL)

        # Clean up each extracted instruction
        instructions = [instr.strip() for instr in instructions]

        # Print them out for reference
        for i, instruction in enumerate(instructions, 1):
            print(f"{i}: {instruction}")
        print("-"*20)

        # Fail loudly (even under `python -O`, where asserts are stripped) if
        # the model returned too few parseable instructions.
        if len(instructions) < required_count:
            raise RuntimeError(
                f"Expected at least {required_count} instructions for task {task_idx}, "
                f"but parsed {len(instructions)}."
            )

        # Store exactly the first 20 instructions
        all_instructions[task_idx] = instructions[:required_count]

    # Save the instructions to a file
    with open(output_path, "wb") as f:
        pkl.dump(all_instructions, f)
    print(f"Saved results to {output_path}")


In [None]:
def get_task_specific(task_name):
    """Return the extra prompt rule that applies to the given task suite.

    Args:
        task_name: suite identifier; "object", "spatial", "goal" and "10" are
            recognised.

    Returns:
        The rule sentence to append as a bullet point to the prompt.

    Raises:
        ValueError: if no rule is defined for ``task_name``. The message names
            the offending suite.
    """
    if task_name in ["object", "spatial"]:
        return "The first word of the instruction should be `PICK', and then it should also include the word `PLACE'."

    if task_name == "goal":
        return "The first word of the instruction should remain the same."

    if task_name == "10":
        return "Words like `PUT' and `PICK' should remain the same."

    # Include the suite name so a typo in the caller is easy to spot.
    raise ValueError(f"No task-specific rules defined for {task_name}")
    

def produce_alternate_prompts(task_name):
    """Generate 20 rephrasings per task with the base prompt and pickle them.

    For task indices 0-9 this reads the task description from
    ``../data/{task_name}_descriptions.pkl``, saves and displays the first
    frame of the matching rollout video, and asks ``gpt-4o-mini`` for 20
    alternative phrasings. The result, a dict mapping each task index to a
    list of 20 instructions, is written to
    ``../data/{task_name}_alternate_instructions.pkl``.

    NOTE: this one-argument definition rebinds the name
    ``produce_alternate_prompts``; once it has run, it replaces any earlier
    two-argument ``produce_alternate_prompts(task_name, version_number)``
    defined in the same session.

    Raises:
        RuntimeError: if fewer than 20 instructions can be parsed from a model
            response. Nothing is written to the output file in that case.
    """
    with open(f"../data/{task_name}_descriptions.pkl", "rb") as f:
        task_descriptions = pkl.load(f)

    # Number of rephrasings requested in the prompt and kept per task
    required_count = 20

    all_instructions = dict()

    for task_idx in range(10):

        task_description = task_descriptions[task_idx]
        print("TASK:", task_description, "| IDX:", task_idx)

        # Read the first frame
        processed_task_description = task_description.lower().replace(" ", "_").replace("\n", "_").replace(".", "_")[:50]
        video_path = f"/local/zemel/tom/code/vla_uq/openvla/rollouts/libero_{task_name}--{task_idx}--episode=1--task={processed_task_description}.mp4"
        frame = iio.imread(video_path, index=0)  # Read only the first frame

        # Convert to a PIL image and save as PNG
        image = Image.fromarray(frame)
        image_path = f"../data/image_grabs/{task_name}_{task_idx}_first_frame.png"
        image.save(image_path)
        display(image)

        prompt_text = f"""You are generating alternative phrasings of a robotic task instruction while preserving its exact meaning. 

        ### Task Instruction:
        '{task_description}'

        ### Instructions:
        - Generate **20** alternative ways to phrase the task instruction.
        - Keep each instruction **concise and unambiguous**.
        - Ensure the instructions remain suitable for a **robot, not a human**.
        - Only make **semantically meaningless** changes (e.g., word order, synonyms, slight rewording).
        - Double-check that the new instructions mean the same exact thing for the robot, do not just substitute synonyms without considering context.
        - Do **not** introduce additional steps, remove essential details, or alter the action.

        ### Output Format:
        Each rephrased instruction should be wrapped in `[instruction]` and `[/instruction]` tags, like this:
        [instruction] Rephrased instruction 1 [/instruction] 
        [instruction] Rephrased instruction 2 [/instruction] 
        """

        print(prompt_text)

        # Getting the Base64 string
        base64_image = encode_image(image_path)

        # The frame was saved as a PNG above, so the data URL must declare
        # image/png rather than image/jpeg.
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt_text,
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                        },
                    ],
                }
            ],
        )

        print(response.choices[0])

        output_text = response.choices[0].message.content

        # Extract text between [instruction] and [/instruction] using regex
        instructions = re.findall(r"\[instruction\](.*?)\[/instruction\]", output_text, re.DOTALL)

        # Remove any extra spaces
        instructions = [instr.strip() for instr in instructions]

        # Print the parsed list
        for i, instruction in enumerate(instructions, 1):
            print(f"{i}: {instruction}")

        print("-"*20)

        # Fail loudly (even under `python -O`, where asserts are stripped) if
        # the model returned too few parseable instructions.
        if len(instructions) < required_count:
            raise RuntimeError(
                f"Expected at least {required_count} instructions for task {task_idx}, "
                f"but parsed {len(instructions)}."
            )

        all_instructions[task_idx] = instructions[:required_count]

    with open(f"../data/{task_name}_alternate_instructions.pkl", "wb") as f:
        pkl.dump(all_instructions, f)
        


def produce_alternate_prompts_2(task_name):
    """Generate 20 rephrasings per task with the task-specific rule added.

    Same pipeline as the base prompt, with one extra bullet taken from
    ``get_task_specific(task_name)``. For task indices 0-9 this reads the task
    description from ``../data/{task_name}_descriptions.pkl``, saves and
    displays the first frame of the matching rollout video, and asks
    ``gpt-4o-mini`` for 20 alternative phrasings. The result, a dict mapping
    each task index to a list of 20 instructions, is written to
    ``../data/{task_name}_alternate_instructions_2.pkl``.

    Raises:
        ValueError: propagated from ``get_task_specific`` when it has no rule
            for ``task_name``.
        RuntimeError: if fewer than 20 instructions can be parsed from a model
            response. Nothing is written to the output file in that case.
    """
    with open(f"../data/{task_name}_descriptions.pkl", "rb") as f:
        task_descriptions = pkl.load(f)

    # Number of rephrasings requested in the prompt and kept per task
    required_count = 20

    all_instructions = dict()
    task_specific = get_task_specific(task_name)

    for task_idx in range(10):

        task_description = task_descriptions[task_idx]
        print("TASK:", task_description, "| IDX:", task_idx)

        # Read the first frame
        processed_task_description = task_description.lower().replace(" ", "_").replace("\n", "_").replace(".", "_")[:50]
        video_path = f"/local/zemel/tom/code/vla_uq/openvla/rollouts/libero_{task_name}--{task_idx}--episode=1--task={processed_task_description}.mp4"
        frame = iio.imread(video_path, index=0)  # Read only the first frame

        # Convert to a PIL image and save as PNG
        image = Image.fromarray(frame)
        image_path = f"../data/image_grabs/{task_name}_{task_idx}_first_frame.png"
        image.save(image_path)
        display(image)

        prompt_text = f"""You are generating alternative phrasings of a robotic task instruction while preserving its exact meaning. 

        ### Task Instruction:
        '{task_description}'

        ### Instructions:
        - Generate **20** alternative ways to phrase the task instruction.
        - Keep each instruction **concise and unambiguous**.
        - Ensure the instructions remain suitable for a **robot, not a human**.
        - Only make **semantically meaningless** changes (e.g., word order, synonyms, slight rewording).
        - Double-check that the new instructions mean the same exact thing for the robot, do not just substitute synonyms without considering context.
        - Do **not** introduce additional steps, remove essential details, or alter the action.
        - {task_specific}

        ### Output Format:
        Each rephrased instruction should be wrapped in `[instruction]` and `[/instruction]` tags, like this:
        [instruction] Rephrased instruction 1 [/instruction] 
        [instruction] Rephrased instruction 2 [/instruction] 
        """

        print(prompt_text)

        # Getting the Base64 string
        base64_image = encode_image(image_path)

        # The frame was saved as a PNG above, so the data URL must declare
        # image/png rather than image/jpeg.
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt_text,
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                        },
                    ],
                }
            ],
        )

        print(response.choices[0])

        output_text = response.choices[0].message.content

        # Extract text between [instruction] and [/instruction] using regex
        instructions = re.findall(r"\[instruction\](.*?)\[/instruction\]", output_text, re.DOTALL)

        # Remove any extra spaces
        instructions = [instr.strip() for instr in instructions]

        # Print the parsed list
        for i, instruction in enumerate(instructions, 1):
            print(f"{i}: {instruction}")

        print("-"*20)

        # Fail loudly (even under `python -O`, where asserts are stripped) if
        # the model returned too few parseable instructions.
        if len(instructions) < required_count:
            raise RuntimeError(
                f"Expected at least {required_count} instructions for task {task_idx}, "
                f"but parsed {len(instructions)}."
            )

        all_instructions[task_idx] = instructions[:required_count]

    with open(f"../data/{task_name}_alternate_instructions_2.pkl", "wb") as f:
        pkl.dump(all_instructions, f)


def produce_alternate_prompts_3(task_name):
    """Generate 20 minimally-changed rephrasings per task and pickle them.

    The prompt contains both the "make the changes as minor as possible"
    bullet and the rule from ``get_task_specific(task_name)``. For task
    indices 0-9 this reads the task description from
    ``../data/{task_name}_descriptions.pkl``, saves and displays the first
    frame of the matching rollout video, and asks ``gpt-4o-mini`` for 20
    alternative phrasings. The result, a dict mapping each task index to a
    list of 20 instructions, is written to
    ``../data/{task_name}_alternate_instructions_3.pkl``.

    Raises:
        ValueError: propagated from ``get_task_specific`` when it has no rule
            for ``task_name``.
        RuntimeError: if fewer than 20 instructions can be parsed from a model
            response. Nothing is written to the output file in that case.
    """
    with open(f"../data/{task_name}_descriptions.pkl", "rb") as f:
        task_descriptions = pkl.load(f)

    # Number of rephrasings requested in the prompt and kept per task
    required_count = 20

    all_instructions = dict()
    task_specific = get_task_specific(task_name)

    for task_idx in range(10):

        task_description = task_descriptions[task_idx]
        print("TASK:", task_description, "| IDX:", task_idx)

        # Read the first frame
        processed_task_description = task_description.lower().replace(" ", "_").replace("\n", "_").replace(".", "_")[:50]
        video_path = f"/local/zemel/tom/code/vla_uq/openvla/rollouts/libero_{task_name}--{task_idx}--episode=1--task={processed_task_description}.mp4"
        frame = iio.imread(video_path, index=0)  # Read only the first frame

        # Convert to a PIL image and save as PNG
        image = Image.fromarray(frame)
        image_path = f"../data/image_grabs/{task_name}_{task_idx}_first_frame.png"
        image.save(image_path)
        display(image)

        prompt_text = f"""You are generating alternative phrasings of a robotic task instruction while preserving its exact meaning. 

        ### Task Instruction:
        '{task_description}'

        ### Instructions:
        - Generate **20** alternative ways to phrase the task instruction.
        - Make the changes as minor as possible, as the robot's language system is not very robust to rephrasing.
        - Keep each instruction **concise and unambiguous**.
        - Ensure the instructions remain suitable for a **robot, not a human**.
        - Only make **semantically meaningless** changes (e.g., word order, synonyms, slight rewording).
        - Double-check that the new instructions mean the same exact thing for the robot, do not just substitute synonyms without considering context.
        - Do **not** introduce additional steps, remove essential details, or alter the action.
        - {task_specific}

        ### Output Format:
        Each rephrased instruction should be wrapped in `[instruction]` and `[/instruction]` tags, like this:
        [instruction] Rephrased instruction 1 [/instruction] 
        [instruction] Rephrased instruction 2 [/instruction] 
        """

        print(prompt_text)

        # Getting the Base64 string
        base64_image = encode_image(image_path)

        # The frame was saved as a PNG above, so the data URL must declare
        # image/png rather than image/jpeg.
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt_text,
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                        },
                    ],
                }
            ],
        )

        print(response.choices[0])

        output_text = response.choices[0].message.content

        # Extract text between [instruction] and [/instruction] using regex
        instructions = re.findall(r"\[instruction\](.*?)\[/instruction\]", output_text, re.DOTALL)

        # Remove any extra spaces
        instructions = [instr.strip() for instr in instructions]

        # Print the parsed list
        for i, instruction in enumerate(instructions, 1):
            print(f"{i}: {instruction}")

        print("-"*20)

        # Fail loudly (even under `python -O`, where asserts are stripped) if
        # the model returned too few parseable instructions.
        if len(instructions) < required_count:
            raise RuntimeError(
                f"Expected at least {required_count} instructions for task {task_idx}, "
                f"but parsed {len(instructions)}."
            )

        all_instructions[task_idx] = instructions[:required_count]

    with open(f"../data/{task_name}_alternate_instructions_3.pkl", "wb") as f:
        pkl.dump(all_instructions, f)

In [None]:
# Run all three prompt variants for each suite, one suite at a time, in the
# order base prompt -> task-specific rule -> task-specific rule + minor changes.
for suite in ("spatial", "object", "goal"):
    for _generate in (
        produce_alternate_prompts,
        produce_alternate_prompts_2,
        produce_alternate_prompts_3,
    ):
        _generate(suite)