In [1]:
EGO_ROOT = '/home/zhangyl/videollm-online/datasets/ego4d/'
CAPTION_ROOT = '/home/zhangyl/videollm-online/datasets/ego4d_action_caption'
MOVE_CAPTION_ROOT = '/home/zhangyl/videollm-online/datasets/ego4d_move_action_caption'
SCENE_CAPTION_ROOT = '/home/zhangyl/videollm-online/datasets/ego4d_scene_caption'

import json
import os
import tqdm

from video_caption_action_scene import AnnotationLoader, BetaAlphaCalculator

EGO_VERSION_ROOT = os.path.join(EGO_ROOT, 'v2')
json_path = os.path.join(EGO_ROOT, 'ego4d.json')
train_path = f'{EGO_VERSION_ROOT}/annotations/refined_narration_stream_train.json'
val_path = f'{EGO_VERSION_ROOT}/annotations/refined_narration_stream_val.json'
origin_path = f'{EGO_VERSION_ROOT}/annotations/all_narrations_redacted.json'
video_root = f'{EGO_VERSION_ROOT}/full_scale_2fps'
alpha = 4.9
device = 'cuda:3'
caption_dir = '/root/videollm-online/tmp5'

annotation_loader = AnnotationLoader(train_path, val_path, origin_path, json_path)
data = annotation_loader.get_data()
origin_narration = annotation_loader.get_origin_narration()

beta_alpha_calculator = BetaAlphaCalculator(data, alpha)
beta_alpha_calculator.compute_beta()
beta_map = beta_alpha_calculator.get_beta_map()
alpha = beta_alpha_calculator.get_alpha()

train_caption = json.load(open(f'{CAPTION_ROOT}/action_caption_train.json'))
val_caption = json.load(open(f'{CAPTION_ROOT}/action_caption_val.json'))
all_caption = {**train_caption, **val_caption}

move_train_caption = json.load(open(f'{MOVE_CAPTION_ROOT}/action_caption_train.json'))
move_val_caption = json.load(open(f'{MOVE_CAPTION_ROOT}/action_caption_val.json'))
move_all_caption = {**move_train_caption, **move_val_caption}

scene_train_caption = json.load(open(f'{SCENE_CAPTION_ROOT}/action_caption_train.json'))
scene_val_caption = json.load(open(f'{SCENE_CAPTION_ROOT}/action_caption_val.json'))
scene_all_caption = {**scene_train_caption, **scene_val_caption}

video2scene = json.load(open('/home/zhangyl/videollm-online/data/estp/ego4d/metafile/video2scene.json'))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
video2scene = json.load(open('/home/zhangyl/videollm-online/data/estp/ego4d/metafile/video2scene.json'))

# merge caption

In [23]:
def merge_caption_wo_action(captions, video_uid, clip_uid, video2scene, origin_narration, w_pre = False):
    caption = captions[video_uid][clip_uid]
    
    caption_texts = {}
    for action_idx, cap in enumerate(caption):
        
        caption_texts[action_idx] = {
            'caption': cap['caption'],
            'reason': None,
            'is_relational': None,
        }
        
    return caption_texts

def merge_narration(data, video_uid, clip_uid, video2scene, origin_narration, w_pre = False):
    narration = data[video_uid][clip_uid]
    
    caption_texts = {}
    for action_idx, nar in enumerate(narration):
        
        caption_texts[action_idx] = {
            'caption': nar['text'],
            'reason': None,
            'is_relational': None,
        }
        
    return caption_texts

def merge_caption_with_action(captions, narrations, video_uid, clip_uid, video2scene, origin_narration):
    narration = narrations[video_uid][clip_uid]
    caption = captions[video_uid][clip_uid]
    
    caption_texts = ""
    for action_idx, (nar, cap) in enumerate(zip(narration, caption)):
        action_narration = 'Idx is {}, Time is {}. Action narration is \"'.format(action_idx, nar['time']) + nar['text'] + '\".\n'
        caption_text = 'Detailed Description: \"' + cap['text'] + '\" \n'
        caption_text = action_narration + caption_text
        caption_texts += caption_text + '\n'
    
    return caption_texts

def caption_merger(captions, video2scene, origin_narration):
    for video_uid in captions.keys():
        for clip_uid in captions[video_uid].keys():
            caption_texts = merge_caption_wo_action(captions, video_uid, clip_uid, video2scene, origin_narration)
            yield caption_texts, video_uid, clip_uid

def read_qa(video_uid, clip_uid, qas):
    qa = qas[video_uid][clip_uid]
    return qa

def transformqa(qa):
    qa_text = []
    for q in qa:
        if 'visual_cues' in q:
            qa_text.append({
                'question': q['question'],
                # 'answer': q['conversation'][0]['content'],
                'visual_cues': q['visual_cues'],
            })
        else:
            qa_text.append({
                'question': q['question'],
                'answer': q['conversation'][0]['content'],
            })
    return qa_text

def qa_gentor(qas):
    for k,v in qas.items():
        for kk,vv in qas[k].items():
            yield k,kk

In [24]:
import json
from openai import OpenAI

def get_llm_reponse_json(system_prompt, user_prompt):
    client = OpenAI(
            api_key="",
            base_url="https://api.deepseek.com",
        )

    messages = [{"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}]

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
        response_format={
            'type': 'json_object'
        }
    )
    return response.choices[0].message.content

anno_file = 'c_soc_annos_v2'
postfix = 'judge'
mode = 'narration'


qas = json.load(open(f'/home/zhangyl/videollm-online/data/estp/annotations/{anno_file}.json'))
_qa_gentor = qa_gentor(qas)
prompt_version = 3

output_dir = f'/home/zhangyl/videollm-online/dataset/{anno_file}_{postfix}_{mode}/'
os.makedirs(output_dir, exist_ok=True)
system_prompt = open(f'/home/zhangyl/videollm-online/data/estp/ego4d/prompt/judge_relative_system_prompt_v{prompt_version}.txt').read()
user_prompt = open('/home/zhangyl/videollm-online/data/estp/ego4d/prompt/judge_relative_user_prompt.txt').read()
step = 10

for n_sample in range(0,50):
    video_uid, clip_uid = next(_qa_gentor)
    # try:
    if mode == 'action':
        caption_texts = merge_caption_wo_action(all_caption, video_uid, clip_uid, video2scene, origin_narration)
    elif mode == 'scene':
        caption_texts = merge_caption_wo_action(scene_all_caption, video_uid, clip_uid, video2scene, origin_narration)
    elif mode == 'move_action':
        caption_texts = merge_caption_wo_action(move_all_caption, video_uid, clip_uid, video2scene, origin_narration)
    elif mode == 'narration':
        caption_texts = merge_narration(data, video_uid, clip_uid, video2scene, origin_narration)
    # except:
    #     continue
    
    qa_list = transformqa(read_qa(video_uid, clip_uid, qas))

    
    for i,qa in enumerate(qa_list):

        n_caption = len(caption_texts.keys())
        final_answer = {}
        for j in range(0, n_caption, step):
            sample_caption_text = {k:v for k, v in caption_texts.items() if j <= k < j+step}
            question = user_prompt.format(json.dumps(sample_caption_text,indent=4), json.dumps(qa, indent=4))
            answer = get_llm_reponse_json(system_prompt, question)
            final_answer.update(json.loads(answer))
        
        question = user_prompt.format(json.dumps(caption_texts,indent=4), json.dumps(qa, indent=4))   
        with open(os.path.join(output_dir, f'{video_uid}_{clip_uid}_{i}_caption.txt'), 'w') as f:
            f.write(question)
        with open(os.path.join(output_dir, f'{video_uid}_{clip_uid}_{i}_judge.json'), 'w') as f:
            json.dump(final_answer, f, indent=4)
        
        break
    print(n_sample)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23


JSONDecodeError: Expecting property name enclosed in double quotes: line 24 column 1 (char 1038)

In [114]:
output_dir = '/home/zhangyl/videollm-online/dataset/move_action_function_v7_judge_v4/'
caption_merger1 = caption_merger(move_all_caption, video2scene, origin_narration)
caption_texts, video_uid, clip_uid = next(caption_merger1)
caption_texts, video_uid, clip_uid = next(caption_merger1)

In [18]:
# for _ in range(0,1):
print(video_uid, clip_uid)
print(json.dumps(read_qa(video_uid, clip_uid, qas), indent=4))
print(json.dumps(move_all_caption[video_uid][clip_uid],indent=4))
print(merge_caption_with_action(all_caption, data, video_uid, clip_uid, video2scene, origin_narration))

for i,qa in enumerate(transformqa(read_qa(video_uid, clip_uid, qas))):
    
    judge = json.load(open(os.path.join(output_dir, f'{video_uid}_{clip_uid}_{i}_judege.json')))
    print(json.dumps(judge, indent=4))


3d8f5230-0c22-4018-86b0-e6d851b74b11 56f7f97d-4be8-4778-8d0e-ab4811b75add
[
    {
        "Task Type": "Information Function",
        "question": "I need to proceed safely. What should I be aware of?",
        "visual_cues": "The red car with the triangular sign, its position relative to your vehicle, and the slow or stationary movement.",
        "conversation": [
            {
                "role": "assistant",
                "time": 288.2834986550782,
                "start_time": 286.32570171630266,
                "end_time": 290.2412955938537
            }
        ],
        "clip_start_time": 268.94702005507816,
        "clip_end_time": 569.9670286
    },
    {
        "Task Type": "Object Function",
        "question": "I need to check the traffic behind me. Where should I look?",
        "visual_cues": "The side mirror's position and the reflections of other vehicles in the parking lot.",
        "conversation": [
            {
                "role": "assistant",
        

FileNotFoundError: [Errno 2] No such file or directory: '/home/zhangyl/videollm-online/dataset/move_action_function_v12_judge_v3_action/3d8f5230-0c22-4018-86b0-e6d851b74b11_56f7f97d-4be8-4778-8d0e-ab4811b75add_1_judege.json'

In [124]:
system_prompt = open('/home/zhangyl/videollm-online/data/estp/ego4d/prompt/judge_relative_system_prompt_v2.txt').read()
user_prompt = open('/home/zhangyl/videollm-online/data/estp/ego4d/prompt/judge_relative_user_prompt.txt').read()
caption_texts = merge_caption_wo_action(move_all_caption, video_uid, clip_uid, video2scene, origin_narration)
with open('./tmp.txt', 'w') as f:
    f.write(user_prompt.format(json.dumps(caption_texts,indent=4), json.dumps(qa, indent=4)))


In [101]:
system_prompt = open('/home/zhangyl/videollm-online/data/estp/ego4d/prompt/judge_relative_system_prompt_v2.txt').read()
user_prompt = open('/home/zhangyl/videollm-online/data/estp/ego4d/prompt/judge_relative_user_prompt.txt').read()
caption_texts = merge_caption_wo_action(move_all_caption, video_uid, clip_uid, video2scene, origin_narration)
for k,v in caption_texts.items():
    question = user_prompt.format(json.dumps(v,indent=4), json.dumps(qa, indent=4))
    answer = json.loads(get_llm_reponse_json(system_prompt, question))
    caption_texts[k]['reason'] = answer['reason']
    caption_texts[k]['is_relational'] = answer['is_relational']
    
print(json.dumps(caption_texts, indent=4))

0
1
{
    "0": {
        "caption": "The video presents a first-person perspective of navigating through various parts of an indoor environment, likely a small living space or apartment. Here is a detailed description of the objects and their positioning as observed:\n\n1. **Initial Scene (Top-left corner):**\n   - A white ceiling with a window allowing natural light.\n   - A coiled white cable hanging from the ceiling.\n\n2. **Backpack and Cleaning Supplies:**\n   - A black backpack placed on a blue plastic storage bin.\n   - Various cleaning supplies like brooms and mops in a green stand nearby.\n\n3. **Kitchen Area:**\n   - White cabinets with golden handles.\n   - A cluttered countertop with kitchen items including jars, bottles, and food packaging.\n\n4. **Refrigerator:**\n   - Open refrigerator showing packed shelves with milk, juice cartons, bread, and other groceries.\n\n5. **Stove Area:**\n   - Stove top with cooking utensils and containers around it.\n   - A person's hand hol

In [96]:
json.dump(caption_texts, open('./tmp1.json', 'w'), indent=4)

In [80]:
def clipuid2cliptime(origin_narrations, video_uid, clip_uid):
    summs = origin_narrations[video_uid]['summaries']
    for summ in summs:
        if summ['_annotation_uid'] == clip_uid:
            print('match success')
            break
    clip_start_time = summ['start_time']
    clip_end_time = summ['end_time']
    return clip_start_time, clip_end_time

print(clipuid2cliptime(origin_narration, video_uid, clip_uid))

match success
(269.99869786666665, 569.9653645333333)


# refine time

In [159]:
def print_json(json_data):
    print(json.dumps(json_data, indent=4))

In [183]:
qas = json.load(open('/home/zhangyl/videollm-online/data/estp/annotations/move_action_function_v7.json'))

In [184]:
sample =  "016bfe72-74ef-4956-9cc9-7a7ad2f6ab48_a194b487-aa15-4af2-8b0d-d01401aa145c_0"
output_dir_move_action = "/home/zhangyl/videollm-online/dataset/move_action_function_v7_judge_v8"
output_dir_action = "/home/zhangyl/videollm-online/dataset/move_action_function_v7_judge_v8_action"

def refine_qa_time(qas,sample):
    action_judge = json.load(open(os.path.join(output_dir_action, f'{sample}.json')))
    move_action_judge = json.load(open(os.path.join(output_dir_move_action, f'{sample}.json')))
    
    video_uid, clip_uid, qa_uid = sample.split('_')[:3]
    start_time, end_time = clipuid2cliptime(origin_narration, video_uid, clip_uid)

    qa = qas[video_uid][clip_uid][int(qa_uid)]

    relative_caption = []
    for k,v in action_judge.items():
        if v['is_relational']:
            relative_caption.append(all_caption[video_uid][clip_uid][int(k)])

    for k,v in move_action_judge.items():
        if v['is_relational']:
            relative_caption.append(move_all_caption[video_uid][clip_uid][int(k)])

    sorted_relative_caption = sorted(relative_caption, key=lambda x: x['time'])
    print_json(qa['conversation'][0])
    new_convsation = []
    last_time = start_time
    for cap in sorted_relative_caption:
        if 'pre_scene' in cap.keys():
            last_time = cap['end_time']
            continue
        if (cap['stamp_time'] - last_time) > 10:
            new_convsation.append({
                'role': 'assistant',
                'caption': cap['caption'],
                'content': qa['conversation'][0]['content'],
                'time': cap['stamp_time'],
                'start_time': cap['start_time'],
                'end_time': cap['end_time'],
            })
            last_time = cap['time']
    qa['conversation'] = new_convsation
    print(json.dumps(qa, indent=4))
    return

for file in os.listdir(output_dir_move_action):
    if file.endswith('_judege.json'):
        refine_qa_time(qas, file.split('.')[0])


    
    


match success
{
    "role": "assistant",
    "content": "Begin by organizing the countertop, which is cluttered with cleaning supplies, food containers, and utensils. Focus on the sink area, where dishes are piled up, and use the dish rack to the left for drying cleaned items.",
    "time": 935.69098125,
    "start_time": 935.0733048026864,
    "end_time": 936.3086576973137
}
{
    "Task Type": "Object Function",
    "Question": "I need to clean up the kitchen. Where should I start?",
    "conversation": [
        {
            "role": "assistant",
            "caption": "As you look around, the room reveals itself to be a cozy and lived-in space. The walls are painted in a soft, light color that reflects the natural light streaming in from an unseen window. To your right, there's a white refrigerator with various items placed on top of it, including a black appliance and some bottles. White cabinets line the wall above the counter, providing storage for kitchen essentials.\n\nThe coun

FileNotFoundError: [Errno 2] No such file or directory: '/home/zhangyl/videollm-online/dataset/move_action_function_v7_judge_v8_action/004a1802-c546-4dcc-86ba-bf1080077017_f99d9a39-f7ff-44f5-ae2c-6aa7036225f4_0_judege.json'

In [169]:
print_json(move_all_caption[video_uid][clip_uid])

[
    {
        "caption": "The video provides a first-person perspective of someone navigating through various areas within what appears to be a small, cluttered living space. The initial frames show the interior of a room with a ceiling fan and some items on the floor, including a blue storage box and a green chair or stool. As the camera moves, it reveals more details such as a white door with golden handles, a yellow shelf with assorted items like canned goods and cleaning supplies, and a window letting in natural light.\n\nThe focus then shifts to a kitchen area where the camera captures countertops filled with various objects including bottles, containers, and food items. The view transitions to include a stove top with a pot, indicating cooking activity. The surrounding environment is busy with scattered household items, suggesting a lived-in space with limited organization.\n\nSubsequent scenes reveal different parts of the house, including another angle of the kitchen showing 