# egoplan preprocess

In [18]:

import json
egoplan_annotations = json.load(open('./EgoPlan-Bench2.json'))
ego4d_metadata = json.load(open('/mnt/extra/dataset/ego4d/ego4d.json'))['videos']

egoplan_annotations_processed = []
for sample in egoplan_annotations:
    video_id = sample['sample_id'].split('_')[0]
    for video in ego4d_metadata:
        if video_id == video['video_uid']:
            break
    fps = video['video_metadata']['fps']
    sample['fps'] = fps
    sample['video_uid'] = video_id
    egoplan_annotations_processed.append(sample)

json.dump(egoplan_annotations_processed, open('./EgoPlan-Bench2-processed.json', 'w'), indent=4)



In [3]:
import json
egoplan_annotations_processed = json.load(open('./EgoPlan-Bench2-processed.json'))

In [4]:
import random
print(random.random())

0.3632605018306919


# transform

In [5]:

import random
import numpy as np

instructions_mode = [
    {"role": "user", "content": "Can you guide me step by step on how to achieve my goal, such as {}?"},
    {"role": "user", "content": "What are the steps I need to follow to successfully accomplish my goal, like {}?"},
    {"role": "user", "content": "Could you break down the process of achieving my goal, such as {}, into simple steps?"},
    {"role": "user", "content": "What should I do first, second, and so on to accomplish my goal, like {}?"},
    {"role": "user", "content": "Please provide me with a detailed step-by-step guide to achieving my goal, such as {}."},
    {"role": "user", "content": "How can I achieve my goal, such as {}, step by step? Can you explain each stage?"},
    {"role": "user", "content": "Can you walk me through the entire process of achieving my goal, such as {}, one step at a time?"},
    {"role": "user", "content": "What are the sequential actions I need to take to accomplish my goal, like {}?"},
    {"role": "user", "content": "Could you assist me by outlining the step-by-step instructions for achieving my goal, such as {}?"},
    {"role": "user", "content": "Please explain how to achieve my goal {}, starting from the very beginning and detailing each step."},
]

next_instructions = [
    {"role": "user", "content": "What are the next steps to {} after this? Please explain in detail."},
    {"role": "user", "content": "Can you guide me on what to do next to {}, including specific actions?"},
    {"role": "user", "content": "What should I do next to continue doing {}? Please provide clear instructions."},
    {"role": "user", "content": "Could you explain the following steps for {} with all necessary details?"},
    {"role": "user", "content": "What comes after this step in the process of {}? Please describe thoroughly."},
    {"role": "user", "content": "Please tell me the next actions required to {}, step by step."},
    {"role": "user", "content": "What is the next thing I need to do to {}? Include any important details."},
    {"role": "user", "content": "Can you outline the subsequent steps for bread preparation with detailed guidance?"},
    {"role": "user", "content": "What should be done next to complete the process of {}? Please elaborate."},
    {"role": "user", "content": "What are the following steps for {} successfully? Provide specific actions and details."},
]


def sample2planqa(sample, mode_ratio=0.5, new_fps=2):
    
    fps = sample['fps']
    conversation = {
        'video_uid': sample['sample_id'].split('_')[0],
        'conversation': [],
        'duration': sample['current_observation_frame'] / fps - sample['task_start_frame'] / fps
    }
    
    if random.random() < mode_ratio: # instruction mode
        
        time = sample['task_start_frame'] / fps
        wait_time = min(time, 1/new_fps)
        start_time = time - wait_time
        
        conversation['conversation'].append({
            'role': 'user',
            'content': random.choice(instructions_mode)['content'].format(sample['task_goal']),
            'time': start_time,
        }) # user asks for instructions
        
        for action in sample['task_progress_metadata']: # generate assistant's response
            conversation['conversation'].append({
            'role': 'assistant',
            'content': 'You need to {}.'.format(action['narration_text']),
            'time': time,
            })
            time = action['stop_frame'] / fps
        
        conversation['conversation'].append({
            'role': 'assistant',
            'content': 'You need to {}'.format(sample['answer']),
            'time': time,
        })
        
    else:
        # select a random action, then ask for next steps
        action_number = len(sample['task_progress_metadata'])
        insert_index = random.randint(0, action_number-1-1) # insert the question before the last action
        time = sample['task_progress_metadata'][insert_index+1]['start_frame'] / fps
        last_time = sample['task_progress_metadata'][insert_index]['stop_frame'] / fps 
        wait_time = random.random() * (time - last_time)
        start_time = time - wait_time
        
        conversation['conversation'].append({
            'role': 'user',
            'content': random.choice(next_instructions)['content'].format(sample['task_goal']),
            'time': start_time,
        })
        
        # recored the previous actions
        preview_actions = []
        for i, action in enumerate(sample['task_progress_metadata']):
            if i <= insert_index:
                preview_actions.append(action['narration_text'])
        
        conversation['conversation'].append({
            'role': 'assistant',
            'content': 'You have done {}.'.format(', '.join(preview_actions)) + ' Next, you need to {}.'.format(sample['task_progress_metadata'][insert_index+1]['narration_text']),
            'time': time,
        })
        time = sample['task_progress_metadata'][insert_index+1]['stop_frame'] / fps
        
        # recored the following actions
        for action in sample['task_progress_metadata'][insert_index+2:]:
            conversation['conversation'].append({
            'role': 'assistant',
            'content': 'You need to {}.'.format(action['narration_text']),
            'time': time,
            })
            time = action['stop_frame'] / fps
        
        # recored the last action
        conversation['conversation'].append({
            'role': 'assistant',
            'content': 'You need to {}'.format(sample['answer']),
            'time': time,
        })
    
    return conversation

print(json.dumps(sample2planqa(egoplan_annotations_processed[0], 0.5), indent=4))

{
    "video_id": "0d270946-95c5-4e71-ae49-b9e802548147",
    "conversation": [
        {
            "role": "user",
            "content": "What are the sequential actions I need to take to accomplish my goal, like prepare bread?",
            "time": 778.9333333333333
        },
        {
            "role": "assistant",
            "content": "You need to scoop dough.",
            "time": 779.4333333333333
        },
        {
            "role": "assistant",
            "content": "You need to scoop dough.",
            "time": 780.7
        },
        {
            "role": "assistant",
            "content": "You need to knead dough.",
            "time": 781.8
        },
        {
            "role": "assistant",
            "content": "You need to place dough on plate.",
            "time": 785.0333333333333
        },
        {
            "role": "assistant",
            "content": "You need to add oil.",
            "time": 789.6333333333333
        },
        {
           

# goal step

In [2]:
import json, os

def get_narrations(sources):
    annos = []
    for source in sources:
        if source['segments']:
            annos.append({
                'video_uid': source['video_uid'],
                'summary': (source['start_time'], source['end_time'], source['goal_description'].strip()),
                'narrations': [(segment['start_time'], segment['end_time'], segment['step_description'].strip()) for segment in source['segments']],
            })
        for segment in source['segments']:
            if segment['segments']:
                annos.append({
                    'video_uid': source['video_uid'],
                    'summary': (segment['start_time'], segment['end_time'], segment['step_description'].strip()),
                    'narrations': [(seg['start_time'], seg['end_time'], seg['step_description'].strip()) for seg in segment['segments']],
                })
    return annos
EGO4D_ANNO_ROOT = '/mnt/extra/dataset/ego4d/v2/annotations/'
sources = json.load(open(os.path.join(EGO4D_ANNO_ROOT,'goalstep_train.json')))['videos']
annos = get_narrations(sources)
sources = json.load(open(os.path.join(EGO4D_ANNO_ROOT,'goalstep_val.json')))['videos']
annos = annos + get_narrations(sources)
print(len(annos))

5647


In [4]:
goalstep_annotation_processed = []
for anno in annos:
    processed_anno = {
        'sample_id': anno['video_uid'],
        'task_goal': anno['summary'][2],
        'task_start_frame': anno['summary'][0], # it is time
        'current_observation_frame': anno['summary'][1], # it is time
    }
    processed_anno['task_progress_metadata'] = []
    for nar in anno['narrations']:
        processed_anno['task_progress_metadata'].append({
            'start_frame': nar[0],
            'stop_frame': nar[1],
            'narration_text': nar[2],
        })
    goalstep_annotation_processed.append(processed_anno)

json.dump(goalstep_annotation_processed, open('./goalstep_annotation_processed.json', 'w'), indent=4)

In [6]:


import random
import numpy as np

instructions_mode = [
    {"role": "user", "content": "Can you guide me step by step on how to achieve my goal, such as {}?"},
    {"role": "user", "content": "What are the steps I need to follow to successfully accomplish my goal, like {}?"},
    {"role": "user", "content": "Could you break down the process of achieving my goal, such as {}, into simple steps?"},
    {"role": "user", "content": "What should I do first, second, and so on to accomplish my goal, like {}?"},
    {"role": "user", "content": "Please provide me with a detailed step-by-step guide to achieving my goal, such as {}."},
    {"role": "user", "content": "How can I achieve my goal, such as {}, step by step? Can you explain each stage?"},
    {"role": "user", "content": "Can you walk me through the entire process of achieving my goal, such as {}, one step at a time?"},
    {"role": "user", "content": "What are the sequential actions I need to take to accomplish my goal, like {}?"},
    {"role": "user", "content": "Could you assist me by outlining the step-by-step instructions for achieving my goal, such as {}?"},
    {"role": "user", "content": "Please explain how to achieve my goal {}, starting from the very beginning and detailing each step."},
]

next_instructions = [
    {"role": "user", "content": "What are the next steps to {} after this? Please explain in detail."},
    {"role": "user", "content": "Can you guide me on what to do next to {}, including specific actions?"},
    {"role": "user", "content": "What should I do next to continue doing {}? Please provide clear instructions."},
    {"role": "user", "content": "Could you explain the following steps for {} with all necessary details?"},
    {"role": "user", "content": "What comes after this step in the process of {}? Please describe thoroughly."},
    {"role": "user", "content": "Please tell me the next actions required to {}, step by step."},
    {"role": "user", "content": "What is the next thing I need to do to {}? Include any important details."},
    {"role": "user", "content": "Can you outline the subsequent steps for bread preparation with detailed guidance?"},
    {"role": "user", "content": "What should be done next to complete the process of {}? Please elaborate."},
    {"role": "user", "content": "What are the following steps for {} successfully? Provide specific actions and details."},
]


def sample2planqa(sample, mode_ratio=0.5, new_fps=2):
    
    fps = 1
    conversation = {
        'video_uid': sample['sample_id'].split('_')[0],
        'conversation': [],
        'duration': sample['current_observation_frame'] / fps - sample['task_start_frame'] / fps,
        'start_time': sample['task_start_frame'] / fps,
        'end_time': sample['current_observation_frame'] / fps,
    }
    
    if random.random() < mode_ratio: # instruction mode
        
        time = sample['task_start_frame'] / fps
        wait_time = min(time, 1/new_fps)
        start_time = time - wait_time
        
        conversation['conversation'].append({
            'role': 'user',
            'content': random.choice(instructions_mode)['content'].format(sample['task_goal']),
            'time': start_time,
        }) # user asks for instructions
        
        for action in sample['task_progress_metadata']: # generate assistant's response
            conversation['conversation'].append({
            'role': 'assistant',
            'content': 'You need to {}.'.format(action['narration_text']),
            'time': time,
            })
            time = action['stop_frame'] / fps
        
        # conversation['conversation'].append({
        #     'role': 'assistant',
        #     'content': 'You need to {}'.format(sample['answer']),
        #     'time': time,
        # })
        
    else:
        # select a random action, then ask for next steps
        action_number = len(sample['task_progress_metadata'])
        insert_index = random.randint(0, action_number-1-1) # insert the question before the last action
        time = sample['task_progress_metadata'][insert_index+1]['start_frame'] / fps
        last_time = sample['task_progress_metadata'][insert_index]['stop_frame'] / fps 
        wait_time = random.random() * (time - last_time)
        start_time = time - wait_time
        
        conversation['conversation'].append({
            'role': 'user',
            'content': random.choice(next_instructions)['content'].format(sample['task_goal']),
            'time': start_time,
        })
        
        # recored the previous actions
        preview_actions = []
        for i, action in enumerate(sample['task_progress_metadata']):
            if i <= insert_index:
                preview_actions.append(action['narration_text'])
        
        conversation['conversation'].append({
            'role': 'assistant',
            'content': 'You have done {}.'.format(', '.join(preview_actions)) + ' Next, you need to {}.'.format(sample['task_progress_metadata'][insert_index+1]['narration_text']),
            'time': time,
        })
        time = sample['task_progress_metadata'][insert_index+1]['stop_frame'] / fps
        
        # recored the following actions
        for action in sample['task_progress_metadata'][insert_index+2:]:
            conversation['conversation'].append({
            'role': 'assistant',
            'content': 'You need to {}.'.format(action['narration_text']),
            'time': time,
            })
            time = action['stop_frame'] / fps
        
        # recored the last action
        # conversation['conversation'].append({
        #     'role': 'assistant',
        #     'content': 'You need to {}'.format(sample['answer']),
        #     'time': time,
        # })
    
    return conversation

print(json.dumps(sample2planqa(goalstep_annotation_processed[0], 0.5), indent=4))

{
    "video_uid": "39d087b0-afc2-47d8-ba91-b70dd8fab90e",
    "conversation": [
        {
            "role": "user",
            "content": "Could you break down the process of achieving my goal, such as Making rice, dough and vegetable dish, into simple steps?",
            "time": 0.0
        },
        {
            "role": "assistant",
            "content": "You need to preheat pots.",
            "time": 0.0210286458333333
        },
        {
            "role": "assistant",
            "content": "You need to add rice to boiling water.",
            "time": 1298.594
        },
        {
            "role": "assistant",
            "content": "You need to cover pot of rice.",
            "time": 1327.0985
        },
        {
            "role": "assistant",
            "content": "You need to add oil to empty pot.",
            "time": 1383.41099
        },
        {
            "role": "assistant",
            "content": "You need to add wood to fire.",
            "time": 1

## refine

In [7]:
print(len(goalstep_annotation_processed))

5647


In [20]:

import json
from openai import OpenAI

def get_llm_reponse_json(system_prompt, user_prompt):
    client = OpenAI(
            api_key="sk-43a08cfb3ae64b6288ee67db8009c8ca",
            base_url="https://api.deepseek.com",
        )

    messages = [{"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}]

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
        response_format={
            'type': 'json_object'
        }
    )
    return response.choices[0].message.content

system_prompt = open('./refine_system_prompt.txt').read()
goalstep_annotation_processed_explict = []
for i in range(len(goalstep_annotation_processed)):
    if i % 10 == 0:
        json.dump(goalstep_annotation_processed_explict, open('./goalstep_annotation_processed_explict.json', 'w'), indent=4)
        print(i)
    if i > 20:
        break
    if len(goalstep_annotation_processed[i]['task_progress_metadata']) < 2:
        continue
    try:
        user_prompt = sample2planqa(goalstep_annotation_processed[i], 0.5)
        duration = user_prompt['duration']
        if duration < 60 or duration > 3600:
            continue
        refine_conve = json.loads(get_llm_reponse_json(system_prompt, json.dumps(user_prompt, indent=4)))
        for ori_text, refine_text in zip(user_prompt['conversation'], refine_conve['conversation']):
            if ori_text['role'] == refine_text['role']:
                ori_text['content'] = refine_text['content']
        
        goalstep_annotation_processed_explict.append(ori_text)
    except:
        print(i)
        continue
    


json.dump(goalstep_annotation_processed_explict, open('./goalstep_annotation_processed_explict_v2.json', 'w'), indent=4)


0
10
20


In [16]:
fps = 1
for sample in goalstep_annotation_processed_explict:
    for sample2 in goalstep_annotation_processed:
        is_match = False
        if sample['video_uid'] == sample2['sample_id']:
            is_match = True
            break
        
        
    if is_match:
        sample['start_time']= sample2['task_start_frame'] / fps,
        sample['end_time']= sample2['current_observation_frame'] / fps,
    else:
        print(sample['video_uid'])
        raise ValueError('No match found')


json.dump(goalstep_annotation_processed_explict, open('./goalstep_annotation_processed_explict.json', 'w'), indent=4)

grp-e191e0de-e5704925-9cbb-e05fe1132a47


ValueError: No match found

In [4]:
import json
data = json.load(open('/home/zhangyl/videollm-online/data/estp/egoplan/goalstep_annotation_processed_explict.json'))
output_data = []
for i,d in enumerate(data):
    try:
        output_data.append(json.loads(d))
    except:
        print(d)
        continue

json.dump(output_data, open('/home/zhangyl/videollm-online/data/estp/egoplan/goalstep_annotation_processed_explict.json', 'w'), indent=4)

{
    "video_id": "269eea13-c70a-42f6-aba5-41ef622d3112",
    "conversation": [
        {
            "role": "user",
            "content": "Can you walk me through the entire process of achieving my goal, such as frying dough, one step at a time?",
            "time": 1984.18177
        },
        {
            "role": "assistant",
            "content": "Certainly! Let's begin by cutting the dough into manageable pieces.",
            "time": 1984.68177
        },
        {
            "role": "assistant",
            "content": "Next, mold the dough with your hands to prepare it for shaping.",
            "time": 1992.06345
        },
        {
            "role": "assistant",
            "content": "Now, flatten the dough to your desired thickness.",
            "time": 2010.4514
        },
        {
            "role": "assistant",
            "content": "Place the flatbread on a heated skillet to start baking.",
            "time": 2035.79193
        },
        {
            "ro

In [5]:
file = '/home/zhangyl/videollm-online/data/estp/annotations/goalstep_annotation_processed_explict.json'

data = json.load(open(file))
for conv in data:
    conv['video_uid'] = conv['video_id']
json.dump(data, open(file, 'w'), indent=4)