In [None]:
import os
import json

def generate_vlm_prompts(annotation_dir):
    """
    Reads annotation JSON files and generates prompts for a Vision-Language Model (VLM).

    Args:
        annotation_dir (str): Directory containing annotation JSON files.
        
    Returns:
        dict: A dictionary where keys are filenames and values are generated prompts.
    """
    prompts = {}

    for filename in os.listdir(annotation_dir):
        if filename.endswith('.json'):
            file_path = os.path.join(annotation_dir, filename)

            with open(file_path, 'r') as file:
                data = json.load(file)

                # Extract relevant information
                detections = data.get('detections', [])
                image_path = data.get('image_path', 'unknown image path')
                robot_pose = data.get('robot_pose', {})

                # Handle cases where robot_pose is None
                if robot_pose is None:
                    translation = {"x": 0, "y": 0, "z": 0}
                    orientation = {"x": 0, "y": 0, "z": 0, "w": 0}
                else:
                    translation = robot_pose.get('translation', {"x": 0, "y": 0, "z": 0})
                    orientation = robot_pose.get('orientation', {"x": 0, "y": 0, "z": 0, "w": 0})

                # Generate prompt parts for detections
                prompt_parts = []
                for detection in detections:
                    obj_class = detection.get('class', 'unknown object')
                    confidence = detection.get('confidence', 0)
                    bbox = detection.get('bbox', {})
                    center_x = bbox.get('center_x', 0)
                    center_y = bbox.get('center_y', 0)
                    size_x = bbox.get('size_x', 0)
                    size_y = bbox.get('size_y', 0)
                    median_depth = detection.get('median_depth', 'unknown depth')
                    feature_vector = detection.get('feature_vector', [])
                    feature_vector_str = ', '.join(f"{v:.2f}" for v in feature_vector[:5]) + "..." if feature_vector else "no features"
                    prompt_parts.append(
                        f"a {obj_class} with confidence {confidence:.2f} "
                        f"at center ({center_x:.2f}, {center_y:.2f}) "
                        f"and size ({size_x:.2f}, {size_y:.2f}),"
                        f"median depth {median_depth}, "
                        f"features [{feature_vector_str}]"
                    )

                # Combine all parts into a single prompt
                detection_description = ", ".join(prompt_parts) if prompt_parts else "no objects detected"
                robot_pose_description = (
                    f"Robot is at translation ({translation.get('x', 0):.2f}, "
                    f"{translation.get('y', 0):.2f}, {translation.get('z', 0):.2f}) "
                    f"and orientation ({orientation.get('x', 0):.2f}, {orientation.get('y', 0):.2f}, "
                    f"{orientation.get('z', 0):.2f}, {orientation.get('w', 0):.2f})."
                )
                prompt = (
                    f"In the image at {image_path}, {detection_description}. "
                    f"{robot_pose_description}"
                )
                prompts[filename] = prompt

    return prompts


annotation_directory = "/Users/esort/Downloads/annotations_with_features_and_depth"
vlm_prompts = generate_vlm_prompts(annotation_directory)

for file, prompt in vlm_prompts.items():
    print(f"{file}: {prompt}")

20250416_120444_324.json: In the image at /ros2_ws/data_collection/20250416_114303/images/20250416_120444_324.jpg, a person with confidence 0.60 at center (1896.36, 432.28) and size (46.55, 322.13),median depth 7.0, features [0.16, -0.19, -0.03, -0.26, 0.20...]. Robot is at translation (6.78, -0.05, 0.01) and orientation (0.00, 0.00, 0.30, 0.95).
20250416_115221_557.json: In the image at /ros2_ws/data_collection/20250416_114303/images/20250416_115221_557.jpg, a chair with confidence 0.93 at center (349.60, 487.05) and size (116.68, 156.97),median depth 13.0, features [-0.01, 0.13, 0.24, -0.26, 0.63...], a chair with confidence 0.60 at center (597.80, 494.42) and size (82.91, 149.13),median depth 13.0, features [0.31, -0.13, 0.28, -0.30, 0.55...], a dining table with confidence 0.52 at center (464.47, 495.47) and size (153.09, 151.81),median depth 13.0, features [0.12, 0.12, 0.32, 0.14, 0.01...]. Robot is at translation (2.53, 0.61, 0.01) and orientation (0.00, 0.00, -0.24, 0.97).
20250

In [2]:
def save_prompts_to_json(prompts, output_file):
    """
    Saves the generated prompts to a JSON file.

    Args:
        prompts (dict): A dictionary where keys are filenames and values are generated prompts.
        output_file (str): Path to the output JSON file.
    """
    with open(output_file, 'w') as file:
        json.dump(prompts, file, indent=4)

In [3]:
output_file = "/Users/esort/Documents/s24/DL/generated_prompts.json"
save_prompts_to_json(vlm_prompts, output_file)