In [1]:
# Fine-tune with LoRA: https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md
# Training script: https://github.com/haotian-liu/LLaVA/blob/main/scripts/v1_5/finetune_task_lora.sh

# Generate training data in the following format (one record per image):

# [
#   {
#     "id": "997bb945-628d-4724-b370-b84de974a19f",
#     "image": "part-000001/997bb945-628d-4724-b370-b84de974a19f.jpg",
#     "conversations": [
#       {
#         "from": "human",
#         "value": "<image>\nWrite a prompt for Stable Diffusion to generate this image."
#       },
#       {
#         "from": "gpt",
#         "value": "a beautiful painting of chernobyl by nekro, pascal blanche, john harris, greg rutkowski, sin jong hun, moebius, simon stalenhag. in style of cg art. ray tracing. cel shading. hyper detailed. realistic. ue 5. maya. octane render. "
#       }
#     ]
#   },
#   ...
# ]

In [1]:
import json
from pathlib import Path
from typing import Any, Dict, List, Optional

# Input locations on the mounted data volume, assembled component by component.
annotation_dir = Path("/mnt/vol_c", "ego4d_data", "v1", "annotations")
object_detection_path = Path(
    "/mnt/vol_c", "ego4d_data", "v1", "sta_models", "object_detections.json"
)

def gen_conversation(
    annotation_path: Path,
    image_dir: Path,
    object_detection_path: Optional[Path] = None,
) -> List[Dict[str, Any]]:
    """Build LLaVA-style conversation records from an annotation JSON file.

    The annotation file is expected to contain the top-level keys
    ``info``, ``annotations``, ``noun_categories`` and ``verb_categories``.
    Each entry of ``annotations`` looks like::

        {
            "uid": "cde41c4f-50d1-4910-9f2a-4c7b6987df92_0000468",
            "video_id": "cde41c4f-50d1-4910-9f2a-4c7b6987df92",
            "frame": 468,
            "clip_id": 8,
            "clip_uid": "8d686451-cac9-4526-a022-b6eaf7d467b4",
            "clip_frame": 468,
            "objects": [
                {
                    "box": [938.09, 1350.28, 1138.58, 1441.4099999999999],
                    "verb_category_id": 62,
                    "noun_category_id": 66,
                    "time_to_contact": 0.9666666666666667,
                }
            ],
        }

    Only the first entry of ``objects`` is used as the answer for a frame.
    Annotations with a missing or empty ``objects`` list are skipped, since
    they carry no target to train on.

    Args:
        annotation_path: Path of the annotation JSON file to read.
        image_dir: Prefix interpolated into each record's ``image`` field as
            ``f"{image_dir}/{uid}.jpg"``; a ``str`` or ``Path`` both work.
        object_detection_path: Currently unused; kept so existing callers
            that pass it keep working.

    Returns:
        A list with one dict per usable annotation, holding ``id``, ``image``
        and a two-turn ``conversations`` list (``human`` prompt, ``gpt`` answer).

    Raises:
        KeyError: If a required key is absent, or an object references a
            category id that is not listed in the annotation file.
    """
    # JSON is UTF-8 by specification, so do not depend on the locale default.
    with open(annotation_path, "r", encoding="utf-8") as f:
        annotation = json.load(f)

    # Naive label clean-up: keep only the text before the first "_".
    noun_categories = {
        pair["id"]: pair["name"].split("_")[0]
        for pair in annotation["noun_categories"]
    }
    verb_categories = {
        pair["id"]: pair["name"].split("_")[0]
        for pair in annotation["verb_categories"]
    }

    # The prompt is identical for every frame, so it is built once here.
    # Its wording is part of the training data and is kept verbatim.
    # (An earlier JSON-style format using objName/actType/timeUntilContact
    # was replaced by this compact "n: ..., v: ..., bbox: ..., t: ..." one.)
    human_prompt = (
        "<image>\n"
        "Anticipate the next object the hand will interact with, detailing the object's class, the verb describing the interaction, and the estimated time to contact. "  # task
        "Provide your response in the format 'n: <noun_category>, v: <verb_category>, bbox: [x_min, y_min, x_max, y_max], t: <time_to_contact>',\n"  # format
        f"where <noun_category> should be one of the noun in {list(noun_categories.values())},\n"  # noun
        f"<verb_category> should be one of the verb in {list(verb_categories.values())} and,\n"  # verb
        "<time_to_contact> is a float in second."  # ttc
    )

    # NOTE(review): the original carried the pointer "p73 metric def" here;
    # presumably a page reference for the metric definition - confirm source.
    conversation = []
    for ann in annotation["annotations"]:
        objects = ann.get("objects")
        if not objects:
            # No annotated object means no answer to supervise on.
            continue
        target = objects[0]

        answer = (
            f"n: {noun_categories[target['noun_category_id']]}, "
            f"v: {verb_categories[target['verb_category_id']]}, "
            f"bbox: {target['box']}, "
            f"t: {target['time_to_contact']}"
        )
        conversation.append(
            {
                "id": ann["uid"],
                "image": f"{image_dir}/{ann['uid']}.jpg",
                "conversations": [
                    {"from": "human", "value": human_prompt},
                    {"from": "gpt", "value": answer},
                ],
            }
        )

    return conversation

# "IMAGE_DIR" is passed through verbatim, so every record's "image" field
# reads "IMAGE_DIR/<uid>.jpg".
conversation_json = gen_conversation(annotation_dir / "fho_sta_train.json", image_dir="IMAGE_DIR")
output_json_path = Path("/mnt/vol_c/forecasting/short_term_anticipation/prompts/prompts_v2.json")
# Create the destination folder first so the write does not fail with
# FileNotFoundError when the prompts directory does not exist yet.
output_json_path.parent.mkdir(parents=True, exist_ok=True)
# Dump conversation_json to output_json_path.
with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(conversation_json, f, indent=2)