In [1]:
# Dataset locations used by this cell.
# NOTE: these are absolute, user-specific paths; adjust them when running elsewhere.
# Root of the Ego4D dataset (the 'v2' subfolder and 'ego4d.json' are resolved from it below).
EGO_ROOT = '/home/zhangyl/videollm-online/datasets/ego4d/'
# Each *_CAPTION_ROOT directory is read below for
# 'action_caption_train.json' and 'action_caption_val.json'.
CAPTION_ROOT = '/home/zhangyl/videollm-online/datasets/ego4d_action_caption/train_0_merge'
MOVE_CAPTION_ROOT = '/home/zhangyl/videollm-online/datasets/ego4d_move_action_caption/train_0_merge'
SCENE_CAPTION_ROOT = '/home/zhangyl/videollm-online/datasets/ego4d_scene_caption/train_0_merge'

import json
import os
import tqdm

from video_caption_action_scene import AnnotationLoader, BetaAlphaCalculator

# --- Paths derived from EGO_ROOT -------------------------------------------
EGO_VERSION_ROOT = os.path.join(EGO_ROOT, 'v2')
json_path = os.path.join(EGO_ROOT, 'ego4d.json')
train_path = f'{EGO_VERSION_ROOT}/annotations/refined_narration_stream_train.json'
val_path = f'{EGO_VERSION_ROOT}/annotations/refined_narration_stream_val.json'
origin_path = f'{EGO_VERSION_ROOT}/annotations/all_narrations_redacted.json'
video_root = f'{EGO_VERSION_ROOT}/full_scale_2fps'
# Initial alpha handed to BetaAlphaCalculator; `alpha` is rebound further down
# to whatever the calculator reports via get_alpha().
alpha = 4.9
device = 'cuda:3'
caption_dir = '/root/videollm-online/tmp5'

# --- Narration annotations --------------------------------------------------
# AnnotationLoader / BetaAlphaCalculator come from video_caption_action_scene
# (not shown here); only the calls below are relied upon.
annotation_loader = AnnotationLoader(train_path, val_path, origin_path, json_path)
data = annotation_loader.get_data()
origin_narration = annotation_loader.get_origin_narration()

beta_alpha_calculator = BetaAlphaCalculator(data, alpha)
beta_alpha_calculator.compute_beta()
beta_map = beta_alpha_calculator.get_beta_map()
alpha = beta_alpha_calculator.get_alpha()


def _load_json(path):
    """Read and parse one JSON file, closing the handle when done."""
    with open(path) as f:
        return json.load(f)


# --- Caption sets ------------------------------------------------------------
# For each caption family the train and val dicts are merged; on duplicate
# top-level keys the val entry wins because it is unpacked last.
train_caption = _load_json(f'{CAPTION_ROOT}/action_caption_train.json')
val_caption = _load_json(f'{CAPTION_ROOT}/action_caption_val.json')
all_caption = {**train_caption, **val_caption}

move_train_caption = _load_json(f'{MOVE_CAPTION_ROOT}/action_caption_train.json')
move_val_caption = _load_json(f'{MOVE_CAPTION_ROOT}/action_caption_val.json')
move_all_caption = {**move_train_caption, **move_val_caption}

scene_train_caption = _load_json(f'{SCENE_CAPTION_ROOT}/action_caption_train.json')
scene_val_caption = _load_json(f'{SCENE_CAPTION_ROOT}/action_caption_val.json')
scene_all_caption = {**scene_train_caption, **scene_val_caption}

def print_json(json_data):
    """Write ``json_data`` to stdout as JSON text indented by four spaces."""
    rendered = json.dumps(json_data, indent=4)
    print(rendered)
    
    
def clipuid2cliptime(origin_narrations, video_uid, clip_uid):
    """Return ``(start_time, end_time)`` of the summary whose uid is ``clip_uid``.

    Args:
        origin_narrations: mapping ``video_uid -> {'summaries': [...]}``; every
            summary is a dict carrying ``_annotation_uid``, ``start_time`` and
            ``end_time``.
        video_uid: key into ``origin_narrations``.
        clip_uid: value compared against each summary's ``_annotation_uid``.

    Returns:
        Tuple ``(start_time, end_time)`` taken from the first matching summary.

    Raises:
        KeyError: if ``video_uid`` is unknown or no summary matches ``clip_uid``.
    """
    for summ in origin_narrations[video_uid]['summaries']:
        if summ['_annotation_uid'] == clip_uid:
            return summ['start_time'], summ['end_time']
    # Without this, a missing clip silently yielded the LAST summary's times
    # (or an unbound-variable error when the summaries list was empty).
    raise KeyError(
        f'no summary with _annotation_uid={clip_uid!r} for video {video_uid!r}'
    )

def read_qa(video_uid, clip_uid, qas):
    """Return the entry stored at ``qas[video_uid][clip_uid]``."""
    return qas[video_uid][clip_uid]

def transformqa(qa):
    """Flatten QA records into ``{'question', 'answer'}`` dicts.

    For every record the question is read from its ``'Question'`` key and the
    answer from the ``'content'`` of the first ``'conversation'`` turn.
    """
    return [
        {
            'question': record['Question'],
            'answer': record['conversation'][0]['content'],
        }
        for record in qa
    ]
    

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def clipuid2cliptime(origin_narrations, video_uid, clip_uid):
    """Look up the time window of one annotated clip.

    NOTE: a function with this name is also defined in the first cell; when
    both cells are run, this later definition is the one in effect.

    Args:
        origin_narrations: mapping ``video_uid -> {'summaries': [...]}`` whose
            summaries are dicts with ``_annotation_uid``, ``start_time`` and
            ``end_time``.
        video_uid: key into ``origin_narrations``.
        clip_uid: ``_annotation_uid`` of the wanted summary.

    Returns:
        ``(start_time, end_time)`` of the first summary that matches.

    Raises:
        KeyError: if ``video_uid`` is unknown or no summary matches ``clip_uid``.
    """
    matching = (
        summ
        for summ in origin_narrations[video_uid]['summaries']
        if summ['_annotation_uid'] == clip_uid
    )
    summ = next(matching, None)
    if summ is None:
        # The loop-and-break form used before kept the last summary bound and
        # returned its times even though nothing had matched.
        raise KeyError(
            f'no summary with _annotation_uid={clip_uid!r} for video {video_uid!r}'
        )
    return summ['start_time'], summ['end_time']

def read_qa(video_uid, clip_uid, qas):
    """Fetch the QA entry for ``clip_uid`` of ``video_uid`` from ``qas``."""
    per_video = qas[video_uid]
    return per_video[clip_uid]

def transformqa(qa):
    """Reduce each QA record to its question and first-turn answer.

    Returns a list of ``{'question': ..., 'answer': ...}`` dicts, where the
    question comes from ``'Question'`` and the answer from
    ``['conversation'][0]['content']``.
    """
    def _pair(record):
        return {
            'question': record['Question'],
            'answer': record['conversation'][0]['content'],
        }

    return list(map(_pair, qa))

In [57]:
# Spot-check: print the (start, end) window for one specific video/clip pair.
print(
    clipuid2cliptime(
        origin_narration,
        '26202090-684d-4be8-b3cc-de04da827e91',
        '58f45fb1-165a-425c-8194-cf9f699bf324',
    )
)

(2159.9823042879966, 2460.0033329333332)


In [8]:
# Name of the annotation set; it selects both the QA file and the judge dirs.
annotations = "move_action_function_v12"
# Load the QA annotations with a context manager so the handle is closed.
with open(f'/home/zhangyl/videollm-online/data/estp/annotations/{annotations}.json') as f:
    qas = json.load(f)
# Directories holding the per-sample '<sample>.json' judge files read below.
output_dir_move_action = f"/home/zhangyl/videollm-online/dataset/{annotations}_judge_v2_move"
output_dir_action = f"/home/zhangyl/videollm-online/dataset/{annotations}_judge_v4_action"


def find_times(time_list, interval, key='time'):
    """Keep the first entry plus every entry that follows a large enough gap.

    An entry is kept when its ``key`` value exceeds both the previous entry's
    value and the first entry's value by more than ``interval``.

    Args:
        time_list: sequence of dicts that each carry a numeric ``key`` field.
        interval: minimum gap (exclusive) required to keep an entry.
        key: name of the numeric field to compare; defaults to ``'time'``.

    Returns:
        A new list holding the kept entries (the same dict objects, not copies);
        ``[]`` when ``time_list`` is empty.
    """
    if not time_list:
        return []

    result = [time_list[0]]
    for previous, current in zip(time_list, time_list[1:]):
        gap_to_previous = current[key] - previous[key]
        # The second operand used to subtract the first *dict* itself
        # (time_list[0]) instead of its `key` value, which raised TypeError
        # as soon as a gap exceeded `interval`.
        if gap_to_previous > interval and current[key] - time_list[0][key] > interval:
            result.append(current)

    return result

def get_relative_time_caption(captions, video_uid, clip_uid, qq_uid):
    """Collect the captions that a judge file marks as relevant to one QA.

    The judge file is ``<video_uid>_<clip_uid>_<qq_uid>_judge.json`` inside the
    module-level ``output_dir_action`` directory. It maps caption indices (as
    strings) to verdict dicts with ``is_relational`` and ``reason``.

    Args:
        captions: nested mapping ``video_uid -> clip_uid -> list of caption dicts``.
        video_uid: video identifier.
        clip_uid: clip identifier.
        qq_uid: index of the QA within the clip (used only to build the file name).

    Returns:
        List of shallow copies of the flagged caption dicts, each with an added
        ``'reason'`` key taken from the verdict.
    """
    sample = f'{video_uid}_{clip_uid}_{qq_uid}_judge'
    # Context manager: the previous json.load(open(...)) never closed the file.
    with open(os.path.join(output_dir_action, f'{sample}.json')) as f:
        judge_file = json.load(f)

    clip_captions = captions[video_uid][clip_uid]
    relative_time_caption = []
    for k, v in judge_file.items():
        if v['is_relational']:
            # Copy so that adding 'reason' does not alter the shared caption.
            cap = clip_captions[int(k)].copy()
            cap['reason'] = v['reason']
            relative_time_caption.append(cap)

    return relative_time_caption


def refine_qa_time(qas, sample, output_qas, all_relative_qas):
    """Rebuild one QA's conversation from the captions judged relevant to it.

    Reads ``<sample>.json`` from both ``output_dir_action`` and
    ``output_dir_move_action``, gathers every caption whose verdict has
    ``is_relational`` set, sorts them by ``start_time`` and turns them into
    assistant turns that all repeat the QA's original first answer.

    Args:
        qas: nested mapping ``video_uid -> clip_uid -> list of QA dicts``. The
            selected QA dict is modified in place.
        sample: judge-file stem; its first three ``'_'``-separated parts are
            the video uid, the clip uid and the QA index.
        output_qas: nested dict that receives the rewritten QA.
        all_relative_qas: nested dict that receives the sorted relevant
            captions together with the question, answer and clip window.

    Returns:
        None. Returns early, touching neither output, when no caption was
        flagged as relevant.
    """
    # Context managers: the previous json.load(open(...)) calls leaked handles.
    with open(os.path.join(output_dir_action, f'{sample}.json')) as f:
        action_judge = json.load(f)
    with open(os.path.join(output_dir_move_action, f'{sample}.json')) as f:
        move_action_judge = json.load(f)

    video_uid, clip_uid, qa_uid = sample.split('_')[:3]
    start_time, end_time = clipuid2cliptime(origin_narration, video_uid, clip_uid)
    print(qa_uid)
    qa = qas[video_uid][clip_uid][int(qa_uid)]

    relative_caption = []

    # NOTE(review): verdicts from `output_dir_action` index into
    # `scene_all_caption` here — confirm that pairing is intended.
    for k, v in action_judge.items():
        if v['is_relational']:
            cap = scene_all_caption[video_uid][clip_uid][int(k)].copy()
            cap['reason'] = v['reason']
            relative_caption.append(cap)

    for k, v in move_action_judge.items():
        if v['is_relational']:
            cap = move_all_caption[video_uid][clip_uid][int(k)].copy()
            cap['reason'] = v['reason']
            relative_caption.append(cap)

    sorted_relative_caption = sorted(relative_caption, key=lambda x: x['start_time'])
    if len(sorted_relative_caption) == 0:
        return
    print_json(qa)

    print_json(move_all_caption[video_uid][clip_uid])
    print('----------------')
    print_json(scene_all_caption[video_uid][clip_uid])
    print('----------------')
    print_json(sorted_relative_caption)

    # Capture the answer BEFORE the conversation is replaced. It used to be
    # re-read from qa['conversation'][0] afterwards, which raised IndexError
    # (leaving output_qas updated but all_relative_qas not) whenever the new
    # conversation ended up empty.
    original_answer = qa['conversation'][0]['content']

    new_convsation = []
    last_time = start_time
    for i, cap in enumerate(sorted_relative_caption):
        if 'pre_scene' in cap.keys():
            # Captions carrying 'pre_scene' only move the reference time.
            last_time = cap['end_time']
            continue
        # Emit a turn for the first caption, or when more than 15 (same unit
        # as the caption times) separates this caption's end from last_time.
        if (cap['end_time'] - last_time) > 15 or i == 0:
            new_convsation.append({
                'role': 'assistant',
                'caption': cap['caption'],
                'content': original_answer,
                'reason': cap['reason'],
                'time': cap['stamp_time'],
                'start_time': cap['start_time'],
                'end_time': cap['end_time'],
                'preview_time': cap['end_time'] - last_time
            })
        # NOTE(review): every other read of `cap` uses 'stamp_time' /
        # 'start_time' / 'end_time'; confirm captions really carry a 'time' key.
        last_time = cap['time']
    qa['conversation'] = new_convsation
    qa['clip_start_time'] = start_time
    qa['clip_end_time'] = end_time
    qa['qa_uid'] = qa_uid
    print(video_uid, clip_uid, qa_uid, start_time, end_time)
    print(json.dumps(qa, indent=4))

    output_qas.setdefault(video_uid, {}).setdefault(clip_uid, []).append(qa)

    all_relative_qas.setdefault(video_uid, {}).setdefault(clip_uid, []).append({
        'qa_uid': qa_uid,
        'relative_caption': sorted_relative_caption,
        'question': qa['question'],
        'answer': original_answer,
        'clip_start_time': start_time,
        'clip_end_time': end_time,
    })

    return

# Run refine_qa_time over every judge file and persist both result dicts.
output_qas = {}
all_relative_qas = {}
for file in sorted(os.listdir(output_dir_move_action)):
    # Judge files are opened elsewhere in this notebook as '<sample>.json' with
    # sample ending in '_judge'; the previous filter ('_judege.json') was a
    # typo that matched nothing, so no sample was ever processed.
    if file.endswith('_judge.json'):
        try:
            refine_qa_time(qas, file.split('.')[0], output_qas, all_relative_qas)
        except Exception as exc:
            # Best effort: report the failing file and why, then keep going.
            # (A bare `except:` would also swallow KeyboardInterrupt.)
            print(file, repr(exc))
            continue


with open(f'/home/zhangyl/videollm-online/data/estp/annotations/{annotations}_refine.json', 'w') as f:
    json.dump(output_qas, f, indent=4)

with open(f'/home/zhangyl/videollm-online/data/estp/annotations/{annotations}_all.json', 'w') as f:
    json.dump(all_relative_qas, f, indent=4)



0
{
    "Task Type": "Object Function",
    "question": "I need to organize my tools for a repair project. Where should I start?",
    "visual_cues": "The white cloth, red toolbox, and pegboard wall are clearly visible and well-positioned for tool organization.",
    "conversation": [
        {
            "role": "assistant",
            "content": "Begin by using the white cloth laid out in the center of the workspace to organize your tools. Place the cylindrical metal part and other components on the cloth for easy access. The red toolbox to the right can store additional tools, and the pegboard wall behind the workbench is ideal for hanging frequently used items.",
            "time": 179.62174386666666,
            "start_time": 178.82228922112424,
            "end_time": 180.4211985122091
        }
    ],
    "clip_start_time": 0.0210286,
    "clip_end_time": 299.98769526666666
}
[
    {
        "caption": "The video appears to be shot from a first-person perspective, likely usin

# refine time

In [3]:
import random

def get_relative_time_caption(captions, video_uid, clip_uid, qq_uid, judge_dir):
    """Collect the captions that a judge file marks as relevant to one QA.

    The judge file is ``<video_uid>_<clip_uid>_<qq_uid>_judge.json`` inside
    ``judge_dir``; it maps caption indices (as strings) to verdict dicts.

    Args:
        captions: nested mapping ``video_uid -> clip_uid -> list of caption dicts``.
        video_uid: video identifier.
        clip_uid: clip identifier.
        qq_uid: index of the QA within the clip (used to build the file name).
        judge_dir: directory with the judge files. The text after its last
            ``'_'`` (slashes removed) is the mode; in ``'narration'`` mode each
            caption gets ``start_time``/``end_time`` set to
            ``time -/+ beta / (2 * alpha)`` (``beta`` from the module-level
            ``beta_map``, 0 if the clip is absent) and ``caption`` copied from
            ``text``.

    Returns:
        List of shallow caption copies with ``'reason'`` added, plus
        ``'content'`` when the verdict carries an ``'answer'``. Returns ``[]``
        when the judge file is missing or unreadable; verdicts that cannot be
        applied (bad index, missing key, ...) are skipped.
    """
    sample = f'{video_uid}_{clip_uid}_{qq_uid}_judge'
    try:
        # Context manager: json.load(open(...)) never closed the handle.
        with open(os.path.join(judge_dir, f'{sample}.json')) as f:
            judge_file = json.load(f)
    except (OSError, ValueError):
        # Missing/unreadable file or invalid JSON (JSONDecodeError is a
        # ValueError): treat as "nothing judged relevant".
        return []

    mode = judge_dir.split('_')[-1].replace('/', '')

    relative_time_caption = []
    for k, v in judge_file.items():
        try:
            if v['is_relational']:

                cap = captions[video_uid][clip_uid][int(k)].copy()
                cap['reason'] = v['reason']
                if 'answer' in v.keys():
                    cap['content'] = v['answer']

                if mode == 'narration':
                    t = cap['time']
                    beta = beta_map.get(clip_uid, 0)
                    start_time = t - beta / (2 * alpha)
                    end_time = t + beta / (2 * alpha)

                    cap['start_time'] = start_time
                    cap['end_time'] = end_time
                    cap['caption'] = cap['text']

                relative_time_caption.append(cap)
        except Exception:
            # Deliberate best effort, but no longer a bare `except:` so that
            # KeyboardInterrupt / SystemExit still propagate.
            continue

    return relative_time_caption

from collections import defaultdict
import copy

def optimized_merge(segments):
    """Merge time-overlapping segment dicts into combined segments.

    Segments are sorted by ``(start_time, end_time)``. A segment whose
    ``start_time`` is <= the running end of the current group is folded into
    that group (the group's ``end_time`` becomes the max seen so far);
    otherwise it starts a new group.

    How the other fields are combined when a segment is folded in:
      * keys registered as ``'numeric'`` are added together with ``+=``;
      * list + list values are concatenated;
      * any other pair of values is collected into a ``set`` (so equal values
        collapse to one and values must be hashable);
      * keys absent from the group are deep-copied in.
    Before returning, every ``set`` is turned into a ``list`` (element order is
    whatever the set yields), and a one-element list is unwrapped to the bare
    element.

    NOTE(review): numeric summing applies to every int/float field other than
    start_time/end_time, so timestamp-like fields of overlapping segments are
    added together — confirm that this is intended.

    Args:
        segments: iterable of dicts, each with ``start_time`` and ``end_time``.

    Returns:
        A new list of merged dicts; the group heads are deep copies, so the
        input dicts are not modified. ``[]`` for empty input.
    """
    if not segments:
        return []
    sorted_segments = sorted(segments, key=lambda x: (x['start_time'], x['end_time']))
    
    merged = [copy.deepcopy(sorted_segments[0])]
    current_end = merged[0]['end_time']
    
    # Per-key merge strategy ('numeric' or 'set'); unknown keys read as None.
    # It is seeded from the FIRST sorted segment only and is shared by all
    # groups — it is not re-seeded when a new group is started below.
    type_registry = defaultdict(lambda: None)
    for key in merged[0]:
        if isinstance(merged[0][key], (int, float)):
            type_registry[key] = 'numeric'
        elif key not in ['start_time', 'end_time']:
            type_registry[key] = 'set'

    for seg in sorted_segments[1:]:
        if seg['start_time'] <= current_end:
            # Overlaps (or touches) the current group: extend its end.
            current_end = max(current_end, seg['end_time'])
            merged[-1]['end_time'] = current_end
            
            for key, value in seg.items():
                if key in ['start_time', 'end_time']:
                    continue
                
                if key in merged[-1]:
                    if type_registry[key] == 'numeric':
                        # Numeric fields accumulate by addition.
                        merged[-1][key] += value
                    else:
                        if isinstance(merged[-1][key], set):
                            # Already a set: absorb the new value(s).
                            if isinstance(value, list):
                                merged[-1][key].update(value)
                            else:
                                merged[-1][key].add(value)

                        else:
                            if isinstance(value, list) and isinstance(merged[-1][key], list):
                                # list + list: plain concatenation (duplicates kept).
                                merged[-1][key] = merged[-1][key] + value
                                type_registry[key] = 'set'
                            elif isinstance(value, list) or isinstance(merged[-1][key], list):
                                # Exactly one side is a list: build a set from
                                # the scalar plus the list's elements.
                                merged[-1][key] = {merged[-1][key], *value} if not isinstance(merged[-1][key], list) else {*merged[-1][key], value}
                                type_registry[key] = 'set'
                                
                            else:
                                # Two scalars: a set, so equal values collapse to one.
                                merged[-1][key] = {merged[-1][key], value}
                                type_registry[key] = 'set'
                else:
                    # Key first seen in this group: copy it and register its kind.
                    merged[-1][key] = copy.deepcopy(value)
                    if isinstance(value, (int, float)):
                        type_registry[key] = 'numeric'
                    else:
                        type_registry[key] = 'set'
        else:
            # No overlap: start a new group from a deep copy of this segment.
            merged.append(copy.deepcopy(seg))
            current_end = seg['end_time']
            
    # Normalise sets to lists, unwrapping single-element results.
    for seg in merged:
        for key in seg:
            if isinstance(seg[key], set):
                seg[key] = list(seg[key])
                if len(seg[key]) == 1:
                    seg[key] = seg[key][0]
    
    return merged

def refine_qa_time(qa, video_uid, clip_uid, qa_uid, output_qas, judge_dir):
    """Extend one QA's conversation with turns built from judged captions.

    The mode is the text after the last ``'_'`` in ``judge_dir`` (slashes
    removed) and selects which module-level caption store is consulted:
    ``narration`` -> ``data``, ``action`` -> ``all_caption``,
    ``move`` -> ``move_all_caption``, ``scene`` -> ``scene_all_caption``.

    Args:
        qa: QA dict; its ``'conversation'`` list is replaced in place.
        video_uid: video identifier.
        clip_uid: clip identifier.
        qa_uid: index of the QA within the clip.
        output_qas: nested dict ``video_uid -> clip_uid -> list`` that ``qa``
            is appended to.
        judge_dir: directory containing the judge files.

    Raises:
        ValueError: if the mode derived from ``judge_dir`` is not one of the
            four listed above.
    """
    # 1. Pick the caption store for this mode and fetch the relevant segments.
    mode = judge_dir.split('_')[-1].replace('/', '')
    if mode == 'narration':
        caption_source = data
    elif mode == 'action':
        caption_source = all_caption
    elif mode == 'move':
        caption_source = move_all_caption
    elif mode == 'scene':
        caption_source = scene_all_caption
    else:
        raise ValueError('mode not supported')
    related_segments = get_relative_time_caption(
        caption_source, video_uid, clip_uid, qa_uid, judge_dir
    )

    # 2. Merge overlapping segments and turn each into an assistant turn whose
    #    stamp_time is the midpoint of the merged window.
    turns = [
        {
            'role': 'assistant',
            'caption': seg['caption'],
            'content': seg['content'],
            'reason': seg['reason'],
            'start_time': seg['start_time'],
            'end_time': seg['end_time'],
            'stamp_time': seg['start_time'] + (seg['end_time'] - seg['start_time']) / 2,
        }
        for seg in optimized_merge(related_segments)
    ]

    # 3. Append the QA's existing turns, merge once more, then normalise.
    turns.extend(qa['conversation'])
    turns = optimized_merge(turns)

    for turn in turns:
        if 'time' not in turn:
            turn['time'] = turn['stamp_time']
        # Merging can leave several candidate answers; keep one at random.
        if not isinstance(turn['content'], str):
            turn['content'] = random.choice(turn['content'])
    qa['conversation'] = turns

    output_qas.setdefault(video_uid, {}).setdefault(clip_uid, []).append(qa)
    

# --- Run configuration -------------------------------------------------------
anno_file = "c_soc_annos"
mode = 'narration'
save_dir = '/home/zhangyl/videollm-online/data/estp/annotation_train/0'


# Alternative configuration:
# move_action_function_v12_judge_v4_action
# anno_file = 'move_action_function_valid_1_reason'
# mode = 'scene'
postfix = 'judge_v2'

# refine_qa_time derives the mode from the text after the last '_' here.
output_dir = f'/home/zhangyl/videollm-online/dataset/{anno_file}_{postfix}_{mode}/'
print(output_dir)
# Context manager: json.load(open(...)) never closed the handle.
with open(f'{save_dir}/{anno_file}.json') as f:
    qas = json.load(f)

# --- Refine every QA and write the result ------------------------------------
output_qas = {}
for video_uid in qas.keys():
    for clip_uid in qas[video_uid].keys():
        for qa_uid, qa in enumerate(qas[video_uid][clip_uid]):
            refine_qa_time(qa, video_uid, clip_uid, qa_uid, output_qas, output_dir)


with open(f'{save_dir}/{anno_file}_refine.json', 'w') as f:
    json.dump(output_qas, f, indent=4)

/home/zhangyl/videollm-online/dataset/c_soc_annos_judge_v2_narration/


In [44]:
# Scratch check: a brace literal with bare elements builds a set (not a dict).
a = {1,2}
print(type(a))

<class 'set'>


# o soc

In [59]:
# Dataset locations for this cell.
# NOTE: absolute, user-specific paths; these rebind names that an earlier cell
# also sets, and the two caption roots here carry no 'train_0_merge' suffix.
EGO_ROOT = '/home/zhangyl/videollm-online/datasets/ego4d/'
# NOTE(review): CAPTION_ROOT and MOVE_CAPTION_ROOT are not read again in the
# visible part of this cell.
CAPTION_ROOT = '/home/zhangyl/videollm-online/datasets/ego4d_action_caption'
MOVE_CAPTION_ROOT = '/home/zhangyl/videollm-online/datasets/ego4d_move_action_caption'

import json
import os
import tqdm
from openai import OpenAI

from video_caption_action_scene import AnnotationLoader, BetaAlphaCalculator

# Annotation and video paths resolved under the Ego4D root.
EGO_VERSION_ROOT = os.path.join(EGO_ROOT, 'v2')
json_path = os.path.join(EGO_ROOT, 'ego4d.json')
train_path = f'{EGO_VERSION_ROOT}/annotations/refined_narration_stream_train.json'
val_path = f'{EGO_VERSION_ROOT}/annotations/refined_narration_stream_val.json'
origin_path = f'{EGO_VERSION_ROOT}/annotations/all_narrations_redacted.json'
video_root = f'{EGO_VERSION_ROOT}/full_scale_2fps'

# Initial alpha passed to BetaAlphaCalculator; `alpha` is rebound at the end of
# this block to the value returned by get_alpha().
alpha = 4.9
# NOTE(review): `device`, `caption_dir` and `video_root` are not referenced in
# the visible part of this cell.
device = 'cuda:3'
caption_dir = '/root/videollm-online/tmp5'

# AnnotationLoader / BetaAlphaCalculator are imported from
# video_caption_action_scene (not shown); presumably they load the narration
# annotations and derive per-clip beta values — verify against that module.
annotation_loader = AnnotationLoader(train_path, val_path, origin_path, json_path)
narrations = annotation_loader.get_data()
# Indexed below as origin_narrations[video_uid]['summaries'].
origin_narrations = annotation_loader.get_origin_narration()

beta_alpha_calculator = BetaAlphaCalculator(narrations, alpha)
beta_alpha_calculator.compute_beta()
beta_map = beta_alpha_calculator.get_beta_map()
alpha = beta_alpha_calculator.get_alpha()

def get_llm_response_json(system_prompt, user_prompt):
    """Send one system+user prompt pair to the DeepSeek chat API.

    Despite the name, no JSON mode is requested and nothing is parsed: the
    reply text of the first choice is returned as-is.

    Args:
        system_prompt: content of the ``system`` message.
        user_prompt: content of the ``user`` message.

    Returns:
        ``response.choices[0].message.content`` from the completion.
    """
    client = OpenAI(
        # Credentials come from the environment instead of being written into
        # the notebook; the fallback is the same empty string used before.
        api_key=os.environ.get("DEEPSEEK_API_KEY", ""),
        base_url="https://api.deepseek.com",
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
    )

    return response.choices[0].message.content


def clean_text(src: str):
    """Strip narration markup from ``src`` and return a tidy sentence.

    Removes the ``#C``/``#O``/``@C``/``@O`` tags (either case), a stand-alone
    ``C`` or ``O`` token that is followed by a space, ``#Unsure``/``#unsure``
    and any remaining ``#``. Leading/trailing ``.``, ``,``, newlines and
    spaces are trimmed, a single ``.`` is appended, and runs of whitespace are
    collapsed to one space.

    Args:
        src: raw narration text.

    Returns:
        The cleaned text, always ending in ``'.'`` (``'.'`` for empty input).
    """
    import re  # local import: the cells shown do not import `re` at the top

    # 1. remove the tags
    dst = src
    for tag in ('#C', '#c', '@c', '@C', '#O', '#o', '@o', '@O'):
        dst = dst.replace(tag, '')
    # Drop a stand-alone C/O token followed by a space. The previous
    # .replace('c ', '') / .replace('o ', '') also deleted those letters at
    # the end of ordinary words ("plastic bag" -> "plastibag",
    # "into the" -> "intthe"); \b restricts the match to whole tokens.
    dst = re.sub(r'\b[CcOo] ', '', dst)
    dst = dst.replace('#Unsure', '').replace('#unsure', '')
    dst = dst.replace('#', '')
    # 2. remove start&end extra space and ,.
    dst = dst.strip('.,\n ') + '.'
    # 3. collapse extra whitespace within the sentence
    dst = ' '.join(dst.split())

    return dst

# --- Prompts and input annotations --------------------------------------------
# Context managers close the handles that open(...).read() / json.load(open(...))
# used to leave open.
with open('/home/zhangyl/videollm-online/data/estp/soc/fho_system.txt') as f:
    system_prompt = f.read().format(NUMBER=1)
with open('/home/zhangyl/videollm-online/data/estp/soc/fho_prompt.txt') as f:
    user_prompt_templete = f.read()


with open('/home/zhangyl/videollm-online/dataset/soc_o_v1/soc_select.json') as f:
    soc_annos = json.load(f)
output_dir = '/home/zhangyl/videollm-online/dataset/soc_o_v1/'
o_soc_annos = {}

# --- Generate one QA per selected action --------------------------------------
for k, annos in soc_annos.items():
    for clip_id, clip_annos in annos.items():
        if len(clip_annos) == 0:
            continue

        for i, action in enumerate(clip_annos):
            os.makedirs(os.path.join(output_dir, k, clip_id), exist_ok=True)

            # Find the narration summary this action belongs to. Previously a
            # miss fell through the loop and silently used the LAST summary's
            # start/end times.
            narration_uid = action['narration_annotation_uid']
            summ = next(
                (s for s in origin_narrations[k]['summaries']
                 if s['_annotation_uid'] == narration_uid),
                None,
            )
            if summ is None:
                print(f'no summary matches {narration_uid} in video {k}; skipping action {i}')
                continue
            print('match success')
            clip_start_time = summ['start_time']
            clip_end_time = summ['end_time']

            user_prompt = user_prompt_templete.format(clean_text(action['narration_text']))
            response = get_llm_response_json(system_prompt, user_prompt)

            # Keep the raw prompt and reply on disk, also for replies that
            # cannot be parsed below.
            with open(os.path.join(output_dir, k, clip_id, f'{i}_q.txt'), 'w') as f:
                f.write(user_prompt)

            with open(os.path.join(output_dir, k, clip_id, f'{i}_gen.txt'), 'w') as f:
                f.write(response)

            # Parse the '**Q:**' / '**A:**' lines. Both are reset on every
            # iteration: before, a reply missing either marker reused the
            # previous action's question/answer (or hit an unbound name on the
            # very first one).
            question = None
            answer = None
            responselist = response.split('\n')
            for res in responselist:
                if res.startswith('**Q:**'):
                    question = res.split('**Q:**')[1].strip()
                elif res.startswith('**A:**'):
                    answer = res.split('**A:**')[1].strip()
            if question is None or answer is None:
                print(f'could not parse Q/A for video {k}, clip {clip_id}, action {i}; skipping')
                continue

            qa = {
                'clip_start_time': clip_start_time,
                'clip_end_time': clip_end_time,
                'question': question,
                'conversation': [
                    {
                        # NOTE(review): the answer is stored under role 'user',
                        # while other cells build answer turns with role
                        # 'assistant' — confirm which one consumers expect.
                        'role': 'user',
                        'content': answer,
                        'time': action['narration_timestamp_sec'],
                        'start_time': action['start_sec'],
                        'end_time': action['end_sec'],
                    }
                ]
            }
            o_soc_annos.setdefault(k, {}).setdefault(narration_uid, []).append(qa)

with open(os.path.join(output_dir, 'o_soc_annos.json'), 'w') as f:
    json.dump(o_soc_annos, f, indent=4)

match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match success
match 