# load caption

In [1]:
# Dataset locations. These are absolute paths on the author's machine and must be
# adjusted before the notebook can run anywhere else.
EGO_ROOT = '/home/zhangyl/videollm-online/datasets/ego4d/'
CAPTION_ROOT = '/home/zhangyl/videollm-online/datasets/ego4d_action_caption/train_0_merge'
MOVE_CAPTION_ROOT = '/home/zhangyl/videollm-online/datasets/ego4d_move_action_caption/train_0_merge'

import json
import os
import tqdm

from video_caption_action_scene import AnnotationLoader, BetaAlphaCalculator

# Annotation and video paths derived from EGO_ROOT.
EGO_VERSION_ROOT = os.path.join(EGO_ROOT, 'v2')
json_path = os.path.join(EGO_ROOT, 'ego4d.json')
train_path = f'{EGO_VERSION_ROOT}/annotations/refined_narration_stream_train.json'
val_path = f'{EGO_VERSION_ROOT}/annotations/refined_narration_stream_val.json'
origin_path = f'{EGO_VERSION_ROOT}/annotations/all_narrations_redacted.json'
# NOTE(review): video_root, device and caption_dir are not referenced again in the
# cells visible here -- confirm whether they are still needed.
video_root = f'{EGO_VERSION_ROOT}/full_scale_2fps'
# Initial alpha handed to BetaAlphaCalculator; the name is rebound below to the
# value the calculator reports back.
alpha = 4.9
device = 'cuda:3'
caption_dir = '/root/videollm-online/tmp5'

# `data` is later indexed as data[video_uid][clip_uid] (see caption_merger_waction);
# `origin_narration` is later indexed as origin_narration[video_uid]['summaries']
# (see clipuid2cliptime).
annotation_loader = AnnotationLoader(train_path, val_path, origin_path, json_path)
data = annotation_loader.get_data()
origin_narration = annotation_loader.get_origin_narration()

# `beta_map` is later queried with beta_map.get(clip_uid, 0) in parse_qa_file.
beta_alpha_calculator = BetaAlphaCalculator(data, alpha)
beta_alpha_calculator.compute_beta()
beta_map = beta_alpha_calculator.get_beta_map()
alpha = beta_alpha_calculator.get_alpha()

def _load_json(path):
    """Parse the JSON file at `path`, closing the file handle before returning."""
    with open(path) as fh:
        return json.load(fh)


# Action captions. Train and val are merged with a shallow dict merge, so on a
# duplicate top-level key the val entry replaces the train entry.
train_caption = _load_json(f'{CAPTION_ROOT}/action_caption_train.json')
val_caption = _load_json(f'{CAPTION_ROOT}/action_caption_val.json')
all_caption = {**train_caption, **val_caption}

# "Move action" captions, merged the same way.
move_train_caption = _load_json(f'{MOVE_CAPTION_ROOT}/action_caption_train.json')
move_val_caption = _load_json(f'{MOVE_CAPTION_ROOT}/action_caption_val.json')
move_all_caption = {**move_train_caption, **move_val_caption}
# video2scene[video_uid] is later joined with ' / ' to build the "Video Subject" line.
video2scene = _load_json('/home/zhangyl/videollm-online/data/estp/ego4d/metafile/video2scene.json')

  from .autonotebook import tqdm as notebook_tqdm


# object QA

## action merge

In [67]:
def merge_caption_with_action(captions, narrations, video_uid, clip_uid, video2scene, origin_narration):
    """Render one clip's narrations and detailed captions as a single prompt string.

    Args:
        captions: mapping indexed as captions[video_uid][clip_uid] -> list of
            dicts, each with a 'text' entry.
        narrations: mapping indexed as narrations[video_uid][clip_uid] -> list of
            dicts, each with 'time' and 'text' entries.
        video_uid: key of the video in both mappings.
        clip_uid: key of the clip inside the video.
        video2scene: unused here; kept for signature compatibility.
        origin_narration: unused here; kept for signature compatibility.

    Returns:
        The concatenation, for every (narration, caption) pair, of a
        "Time is ... Action narration is ..." line, a "Detailed Description" line
        and a blank line. Empty string if there are no pairs.
    """
    clip_narrations = narrations[video_uid][clip_uid]
    clip_captions = captions[video_uid][clip_uid]

    sections = []
    # zip() stops at the shorter list, so unmatched trailing entries are dropped.
    for nar, cap in zip(clip_narrations, clip_captions):
        narration_line = 'Time is {}. Action narration is \"'.format(nar['time']) + nar['text'] + '\".\n'
        description_line = 'Detailed Description: \"' + cap['text'] + '\" \n'
        sections.append(narration_line + description_line + '\n')

    return ''.join(sections)

def caption_merger_waction(captions, video2scene, origin_narration, narrations=None):
    """Yield (caption_texts, video_uid, clip_uid) for every clip in `captions`.

    Args:
        captions: mapping indexed as captions[video_uid][clip_uid].
        video2scene: forwarded to merge_caption_with_action.
        origin_narration: forwarded to merge_caption_with_action.
        narrations: narration mapping indexed like `captions`. When omitted, the
            notebook-level `data` variable is used, which is what this generator
            always did before the parameter existed.
    """
    for video_uid in captions.keys():
        for clip_uid in captions[video_uid].keys():
            # Resolved per clip so the fallback keeps reading the current global.
            narration_source = data if narrations is None else narrations
            caption_texts = merge_caption_with_action(
                captions, narration_source, video_uid, clip_uid, video2scene, origin_narration)
            yield caption_texts, video_uid, clip_uid

# Stateful generator over every (video_uid, clip_uid) in all_caption: each next()
# call consumes one clip, and re-running this line restarts from the first clip.
caption_merger2 = caption_merger_waction(all_caption, video2scene, origin_narration)



In [68]:
# Preview the merged text of the next clip. This advances caption_merger2, so any
# later consumer of the generator starts one clip further on.
print(next(caption_merger2)[0])

Time is 270.0. Action narration is "You fold the luggage.".
Detailed Description: "With a sense of purpose, you begin to fold the luggage. The beige fabric of the suitcase is well-worn but still sturdy as it sits on the floor in front of you. You methodically work through the steps: first, you gather the edges and bring them together, then you tuck in the straps, ensuring they are neatly tucked away. As you continue, the once sprawling mass of fabric gradually transforms into a compact, manageable size, ready for transport or storage." 

Time is 301.29231. Action narration is "You look around.".
Detailed Description: "As you look around, the room appears dimly lit with a warm, yellowish hue. The walls are painted in a light beige color, and there's a noticeable shadow cast by an unseen object or person. In front of you, on the floor, lies a large, brown blanket that seems to be spread out, possibly covering something underneath. To your right, part of a white furry pet is visible, its 

In [None]:
import json
from openai import OpenAI
import os

def get_llm_response_json(system_prompt, user_prompt, model="deepseek-chat"):
    """Send one system + user prompt pair to the DeepSeek endpoint and return the reply text.

    Despite the name, no JSON response format is requested and nothing is parsed:
    the raw message content string is returned.

    Args:
        system_prompt: content of the "system" message.
        user_prompt: content of the "user" message.
        model: model name passed to the chat completions call.

    Returns:
        The content of the first choice's message.
    """
    client = OpenAI(
        # Read the key from the environment instead of hard-coding it in the
        # notebook; falls back to the previous empty string when it is not set.
        api_key=os.environ.get("DEEPSEEK_API_KEY", ""),
        base_url="https://api.deepseek.com",
    )

    messages = [{"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
    )

    return response.choices[0].message.content

# Question-stage prompts. Files are read inside `with` blocks so the handles are closed.
with open('/home/zhangyl/videollm-online/data/estp/ego4d/prompt/action_caption2q_system.txt') as f:
    system_prompt = f.read().format(NUMBER=3)
with open('/home/zhangyl/videollm-online/data/estp/ego4d/prompt/action_caption2q_prompt.txt') as f:
    user_prompt_templete = f.read()
output_dir = '/home/zhangyl/videollm-online/dataset/action/'
os.makedirs(output_dir, exist_ok=True)


# Answer-stage prompts.
with open('/home/zhangyl/videollm-online/data/estp/ego4d/prompt/action_caption2a_system.txt') as f:
    system_prompt_a = f.read()
with open('/home/zhangyl/videollm-online/data/estp/ego4d/prompt/action_caption2a_prompt.txt') as f:
    user_prompt_templete_a = f.read()

# Process up to 10 clips. caption_merger2 must already exist (it is created in an
# earlier cell); the NameError recorded under this cell came from running it
# without that cell.
for i in range(0,10):

    merged = next(caption_merger2, None)
    if merged is None:
        # Generator exhausted: stop cleanly instead of raising StopIteration.
        break
    caption_texts, video_uid, clip_uid = merged

    # Stage 1: generate questions from the merged captions.
    user_prompt = user_prompt_templete.format(caption_texts)
    question = get_llm_response_json(system_prompt, user_prompt)

    # Stage 2: generate answers from the captions plus the generated questions.
    user_prompt = user_prompt_templete_a.format(caption_texts, question)
    answer = get_llm_response_json(system_prompt_a, user_prompt)

    # NOTE(review): user_prompt holds the answer-stage prompt at this point, so
    # that is what lands in *_caption.txt -- confirm this is intended.
    # Written as UTF-8 because parse_qa_file reads these outputs back as UTF-8.
    with open(os.path.join(output_dir, f'{video_uid}_{clip_uid}_caption.txt'), 'w', encoding='utf-8') as f:
        f.write(user_prompt)
    with open(os.path.join(output_dir, f'{video_uid}_{clip_uid}_q.txt'), 'w', encoding='utf-8') as f:
        f.write(question)
    with open(os.path.join(output_dir, f'{video_uid}_{clip_uid}_a.txt'), 'w', encoding='utf-8') as f:
        f.write(answer)
        
        
        

NameError: name 'caption_merger2' is not defined

## move action

In [5]:
def merge_caption_wo_action(captions, video_uid, clip_uid, video2scene, origin_narration, w_pre = False):
    """Render one clip's detailed captions (without action narrations) as a prompt string.

    Args:
        captions: mapping indexed as captions[video_uid][clip_uid] -> list of
            dicts. Each dict has 'text' and either 'stamp_time' or the pair
            'start_time' / 'end_time'.
        video_uid: key of the video in `captions`.
        clip_uid: key of the clip inside the video.
        video2scene: unused here; kept for signature compatibility.
        origin_narration: unused here; kept for signature compatibility.
        w_pre: when true, entries without 'stamp_time' are included and labelled
            with their start-end range; when false they are skipped.

    Returns:
        The concatenated "Time is ..." / "Detailed Description" sections, or an
        empty string when no entry qualifies.
    """
    caption = captions[video_uid][clip_uid]

    sections = []
    for cap in caption:
        if 'stamp_time' in cap:
            time_line = 'Time is {}.\n'.format(cap['stamp_time'])
        elif w_pre:
            time_line = 'Time is {} - {}.\n'.format(cap['start_time'], cap['end_time'])
        else:
            # No single timestamp and ranges were not requested: skip the entry.
            continue
        description_line = 'Detailed Description: \"' + cap['text'] + '\".\n'
        sections.append(time_line + description_line + '\n\n')

    return ''.join(sections)

def caption_merger(captions, video2scene, origin_narration, w_pre = False):
    """Yield (caption_texts, video_uid, clip_uid) for every clip in `captions`.

    `w_pre` is forwarded to merge_caption_wo_action. Previously a literal False
    was passed, so the argument had no effect.
    """
    for video_uid in captions.keys():
        for clip_uid in captions[video_uid].keys():
            caption_texts = merge_caption_wo_action(captions, video_uid, clip_uid, video2scene, origin_narration, w_pre = w_pre)
            yield caption_texts, video_uid, clip_uid

# Stateful generator over every (video_uid, clip_uid) in move_all_caption; each
# next() call consumes one clip. w_pre is left at its default (False).
caption_merger1 = caption_merger(move_all_caption, video2scene, origin_narration)

In [None]:
import json
from openai import OpenAI
import os

def get_llm_response_json(system_prompt, user_prompt, model="deepseek-chat"):
    """Send one system + user prompt pair to the DeepSeek endpoint and return the reply text.

    Despite the name, no JSON response format is requested and nothing is parsed:
    the raw message content string is returned.

    Args:
        system_prompt: content of the "system" message.
        user_prompt: content of the "user" message.
        model: model name passed to the chat completions call.

    Returns:
        The content of the first choice's message.
    """
    client = OpenAI(
        # Read the key from the environment instead of hard-coding it in the
        # notebook; falls back to the previous empty string when it is not set.
        api_key=os.environ.get("DEEPSEEK_API_KEY", ""),
        base_url="https://api.deepseek.com",
    )

    messages = [{"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
    )

    return response.choices[0].message.content

# QA-generation prompts. Files are read inside `with` blocks so the handles are closed.
with open('/home/zhangyl/videollm-online/data/estp/ego4d/prompt/caption2qa_system.txt') as f:
    system_prompt = f.read().format(NUMBER=2)
with open('/home/zhangyl/videollm-online/data/estp/ego4d/prompt/caption2qa_prompt.txt') as f:
    user_prompt_templete = f.read()
output_dir = '/home/zhangyl/videollm-online/dataset/move_action_v21/'
os.makedirs(output_dir, exist_ok=True)

# Answer-stage prompts; only referenced by the commented-out second stage below.
with open('/home/zhangyl/videollm-online/data/estp/ego4d/prompt/action_caption2a_system.txt') as f:
    system_prompt_a = f.read()
with open('/home/zhangyl/videollm-online/data/estp/ego4d/prompt/action_caption2a_prompt.txt') as f:
    user_prompt_templete_a = f.read()

# Process up to 10 clips from caption_merger1 (created in an earlier cell).
for i in range(0,10):

    merged = next(caption_merger1, None)
    if merged is None:
        # Generator exhausted: stop cleanly instead of raising StopIteration.
        break
    caption_texts, video_uid, clip_uid = merged

    user_prompt = user_prompt_templete.format(caption_texts)
    answer = get_llm_response_json(system_prompt, user_prompt)

    # Written as UTF-8 because parse_qa_file reads *_qa.txt back as UTF-8.
    with open(os.path.join(output_dir, f'{video_uid}_{clip_uid}_caption.txt'), 'w', encoding='utf-8') as f:
        f.write(user_prompt)
    with open(os.path.join(output_dir, f'{video_uid}_{clip_uid}_qa.txt'), 'w', encoding='utf-8') as f:
        f.write(answer)
    
    # user_prompt = user_prompt_templete_a.format(caption_texts, answer)
    # answer = get_llm_response_json(system_prompt_a, user_prompt)
    
    # with open(os.path.join(output_dir, f'{video_uid}_{clip_uid}_caption2.txt'), 'w') as f:
    #     f.write(user_prompt)
    # with open(os.path.join(output_dir, f'{video_uid}_{clip_uid}_a.txt'), 'w') as f:
    #     f.write(answer)
    
        
        

# function QA

## move action

In [2]:
def merge_caption_wo_action(captions, video_uid, clip_uid, video2scene, origin_narration, w_pre = False):
    """Build the prompt text for one clip, prefixed with the video's subject line.

    The first line is "Video Subject : ..." with the entries of
    video2scene[video_uid] joined by ' / '. It is followed by one
    "Time is ..." / "Detailed Description" section per caption entry. Entries
    without 'stamp_time' are labelled with their start-end range when `w_pre` is
    true and skipped otherwise. `origin_narration` is accepted but not used.
    """
    clip_entries = captions[video_uid][clip_uid]

    chunks = ["Video Subject : {}\n".format(' / '.join(video2scene[video_uid]))]
    for entry in clip_entries:
        has_stamp = 'stamp_time' in entry
        if not has_stamp and not w_pre:
            continue
        if has_stamp:
            when = 'Time is {}.\n'.format(entry['stamp_time'])
        else:
            when = 'Time is {} - {}.\n'.format(entry['start_time'], entry['end_time'])
        described = 'Detailed Description: \"' + entry['text'] + '\".\n'
        chunks.append(when + described + '\n\n')

    return ''.join(chunks)

def caption_merger(captions, video2scene, origin_narration):
    """Yield (caption_texts, video_uid, clip_uid) for each clip of each video in `captions`."""
    for video_uid, clips in captions.items():
        for clip_uid in clips:
            merged_text = merge_caption_wo_action(captions, video_uid, clip_uid, video2scene, origin_narration)
            yield merged_text, video_uid, clip_uid

# Stateful generator over every (video_uid, clip_uid) in move_all_caption; each
# next() call consumes one clip, and re-running this line restarts from the first.
caption_merger1 = caption_merger(move_all_caption, video2scene, origin_narration)

In [None]:
import json
from openai import OpenAI
import os

def get_llm_response_json(system_prompt, user_prompt, model="deepseek-reasoner"):
    """Send one system + user prompt pair to the DeepSeek endpoint and return the reply text.

    Despite the name, no JSON response format is requested and nothing is parsed:
    the raw message content string is returned.

    Args:
        system_prompt: content of the "system" message.
        user_prompt: content of the "user" message.
        model: model name passed to the chat completions call. The original
            inline note listed "deepseek-reasoner" and "deepseek-chat" as the
            two choices.

    Returns:
        The content of the first choice's message.
    """
    client = OpenAI(
        # Read the key from the environment instead of hard-coding it in the
        # notebook; falls back to the previous empty string when it is not set.
        api_key=os.environ.get("DEEPSEEK_API_KEY", ""),
        base_url="https://api.deepseek.com",
    )

    messages = [{"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
    )

    return response.choices[0].message.content

# Function-QA prompts. Files are read inside `with` blocks so the handles are closed.
with open('/home/zhangyl/videollm-online/data/estp/ego4d/prompt/caption2qa_function_system.txt') as f:
    system_prompt = f.read().format(NUMBER=2)
with open('/home/zhangyl/videollm-online/data/estp/ego4d/prompt/caption2qa_function_prompt.txt') as f:
    user_prompt_templete = f.read()
output_dir = '/home/zhangyl/videollm-online/dataset/move_action_function_v1_reason/'
os.makedirs(output_dir, exist_ok=True)

# Process up to 10 clips from caption_merger1 (created in an earlier cell).
for _ in range(0,10):

    merged = next(caption_merger1, None)
    if merged is None:
        # Generator exhausted: stop cleanly instead of raising StopIteration.
        break
    caption_texts, video_uid, clip_uid = merged
    user_prompt = user_prompt_templete.format(caption_texts)
    answer = get_llm_response_json(system_prompt, user_prompt)

    # Written as UTF-8 because parse_qa_file reads *_qa.txt back as UTF-8.
    with open(os.path.join(output_dir, f'{video_uid}_{clip_uid}_caption.txt'), 'w', encoding='utf-8') as f:
        f.write(user_prompt)
    with open(os.path.join(output_dir, f'{video_uid}_{clip_uid}_qa.txt'), 'w', encoding='utf-8') as f:
        f.write(answer)
        
    

## action

In [72]:
def merge_caption_with_action(captions, narrations, video_uid, clip_uid, video2scene, origin_narration):
    """Render one clip's narrations and captions as a prompt string with a subject header.

    Args:
        captions: mapping indexed as captions[video_uid][clip_uid] -> list of
            dicts, each with a 'text' entry.
        narrations: mapping indexed as narrations[video_uid][clip_uid] -> list of
            dicts, each with 'time' and 'text' entries.
        video_uid: key of the video in `captions`, `narrations` and `video2scene`.
        clip_uid: key of the clip inside the video.
        video2scene: mapping video_uid -> sequence of strings, joined with ' / '
            to form the "Video Subject" line.
        origin_narration: unused here; kept for signature compatibility.

    Returns:
        The "Video Subject : ..." line followed, for every (narration, caption)
        pair, by a narration line, a "Detailed Description" line and a blank line.
    """
    clip_narrations = narrations[video_uid][clip_uid]
    clip_captions = captions[video_uid][clip_uid]

    sections = ["Video Subject : {}\n".format(' / '.join(video2scene[video_uid]))]
    # zip() stops at the shorter list, so unmatched trailing entries are dropped.
    for nar, cap in zip(clip_narrations, clip_captions):
        narration_line = 'Time is {}. Action narration is \"'.format(nar['time']) + nar['text'] + '\".\n'
        description_line = 'Detailed Description: \"' + cap['text'] + '\" \n'
        sections.append(narration_line + description_line + '\n')

    return ''.join(sections)

def caption_merger_waction(captions, video2scene, origin_narration, narrations=None):
    """Yield (caption_texts, video_uid, clip_uid) for every clip in `captions`.

    Args:
        captions: mapping indexed as captions[video_uid][clip_uid].
        video2scene: forwarded to merge_caption_with_action.
        origin_narration: forwarded to merge_caption_with_action.
        narrations: narration mapping indexed like `captions`. When omitted, the
            notebook-level `data` variable is used, which is what this generator
            always did before the parameter existed.
    """
    for video_uid in captions.keys():
        for clip_uid in captions[video_uid].keys():
            # Resolved per clip so the fallback keeps reading the current global.
            narration_source = data if narrations is None else narrations
            caption_texts = merge_caption_with_action(
                captions, narration_source, video_uid, clip_uid, video2scene, origin_narration)
            yield caption_texts, video_uid, clip_uid

# Stateful generator over every (video_uid, clip_uid) in all_caption: each next()
# call consumes one clip, and re-running this line restarts from the first clip.
caption_merger2 = caption_merger_waction(all_caption, video2scene, origin_narration)



In [73]:
# Preview the merged text of the next clip. This advances caption_merger2, so any
# later consumer of the generator starts one clip further on.
print(next(caption_merger2)[0])

Video Subject : BBQ'ing/picnics / Walking on street
Time is 270.0. Action narration is "You fold the luggage.".
Detailed Description: "With a sense of purpose, you begin to fold the luggage. The beige fabric of the suitcase is well-worn but still sturdy as it sits on the floor in front of you. You methodically work through the steps: first, you gather the edges and bring them together, then you tuck in the straps, ensuring they are neatly tucked away. As you continue, the once sprawling mass of fabric gradually transforms into a compact, manageable size, ready for transport or storage." 

Time is 301.29231. Action narration is "You look around.".
Detailed Description: "As you look around, the room appears dimly lit with a warm, yellowish hue. The walls are painted in a light beige color, and there's a noticeable shadow cast by an unseen object or person. In front of you, on the floor, lies a large, brown blanket that seems to be spread out, possibly covering something underneath. To yo

# format

In [5]:
import json, os

def is_time(s):
    """Return True when `s` can be converted by float(), False on ValueError."""
    try:
        float(s)
    except ValueError:
        return False
    return True

def process_input(input_list):
    """Normalise a mixed list of sentences and time strings into sentence/time order.

    A time that directly follows a sentence is kept as is. A time that follows
    another time is preceded by a repeat of the most recent sentence. Times seen
    before any sentence are dropped. Sentences are always kept.
    """
    paired = []
    last_sentence = None
    awaiting_time = False

    for entry in input_list:
        if not is_time(entry):
            # A sentence: keep it and remember it for any extra times that follow.
            paired.append(entry)
            last_sentence = entry
            awaiting_time = True
            continue

        if awaiting_time:
            # The previous element was a sentence, so this time completes the pair.
            paired.append(entry)
            awaiting_time = False
        elif last_sentence is not None:
            # Extra time for the same sentence: repeat the sentence before it.
            paired.extend((last_sentence, entry))
        # Otherwise no sentence has been seen yet and the time is ignored.
    return paired


def clipuid2cliptime(origin_narrations, video_uid, clip_uid):
    """Look up the start and end time of a clip from a video's summaries.

    Scans origin_narrations[video_uid]['summaries'] for the first entry whose
    '_annotation_uid' equals `clip_uid`.

    Returns:
        (start_time, end_time) of that entry, or (None, None) if none matches.
    """
    summaries = origin_narrations[video_uid]['summaries']
    matched = next((entry for entry in summaries if entry['_annotation_uid'] == clip_uid), None)
    if matched is None:
        return None, None
    return matched['start_time'], matched['end_time']


def _text_after(line, markers):
    """Return everything in `line` after the first marker from `markers` that it contains."""
    for marker in markers:
        if marker in line:
            # Split on the marker itself, once, so colons inside the value survive.
            return line.split(marker, 1)[1]
    return ""


def _build_conversation(answer_items, clip_uid):
    """Turn collected answer/time strings into a list of assistant turns.

    The items are first normalised with process_input and then read as
    (sentence, time) pairs. Each turn gets a window of beta / alpha centred on
    the time, using the notebook-level `beta_map` and `alpha`. If any pair has a
    missing or non-numeric time, the whole conversation is discarded and an
    empty list is returned.
    """
    paired = process_input(answer_items)
    beta = beta_map.get(clip_uid, 0)
    conversation = []
    for idx in range(0, len(paired), 2):
        try:
            t = float(paired[idx + 1])
        except (IndexError, ValueError):
            # IndexError: trailing sentence without a time.
            # ValueError: two sentences in a row, so the "time" slot is text.
            return []
        half_window = beta / (2 * alpha)
        conversation.append(
            {
                'role': 'assistant',
                'content': paired[idx],
                'time': t,
                'start_time': t - half_window,
                'end_time': t + half_window,
            }
        )
    return conversation


def parse_qa_file(file_path, video_uid, clip_uid):
    """Parse an LLM-generated QA .txt file into a list of QA dicts.

    Only the part of the file from the last line containing 'Step 2' onwards is
    parsed (the whole file if no such line exists). Asterisks are removed from
    every line. A blank line ends the current QA group.

    Relies on the notebook-level `origin_narration`, `beta_map` and `alpha`.

    Args:
        file_path: path of the .txt file.
        video_uid: video the file belongs to; used to look up the clip times.
        clip_uid: clip the file belongs to; used for the clip times and beta.

    Returns:
        A list of dicts. Each has a 'conversation' list plus whichever of
        'Task Type', 'question' and 'visual_cues' were found, and the clip's
        'clip_start_time' / 'clip_end_time'. Groups without a usable
        conversation are dropped. Returns [] when the clip has no matching
        summary in `origin_narration`.
    """
    qa_list = []
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Keep only the text from the last 'Step 2' line onwards.
    step_idx = 0
    for i, line in enumerate(lines):
        if 'Step 2' in line:
            step_idx = i
    lines = lines[step_idx:]

    clip_start_time, clip_end_time = clipuid2cliptime(origin_narration, video_uid, clip_uid)
    if clip_end_time is None:
        return []

    current_qa = {}
    current_a = []
    for line in lines:
        line = line.strip().replace('*', '')
        if "Task Type:" in line:
            # NOTE(review): unlike the other fields this value is not stripped,
            # matching the previous output -- confirm whether it should be.
            current_qa["Task Type"] = _text_after(line, ("Task Type:",))
        elif "Question:" in line or "Q:" in line:
            current_qa["question"] = _text_after(line, ("Question:", "Q:")).strip()
        elif 'Visual cues' in line or 'Visual Cues' in line:
            current_qa["visual_cues"] = line
        elif "Answer:" in line or "A:" in line:
            current_a.append(_text_after(line, ("Answer:", "A:")).strip())
        elif "Time:" in line:
            current_a.append(_text_after(line, ("Time:",)).strip())
        elif line == "":
            if current_qa:
                conversation = _build_conversation(current_a, clip_uid)
                if conversation:
                    current_qa["conversation"] = conversation
                    qa_list.append(current_qa)
                current_qa = {}
                current_a = []

    # Flush the last QA group when the file does not end with a blank line.
    if current_qa:
        conversation = _build_conversation(current_a, clip_uid)
        if conversation:
            current_qa["conversation"] = conversation
            qa_list.append(current_qa)

    for qa in qa_list:
        qa["clip_start_time"] = clip_start_time
        qa["clip_end_time"] = clip_end_time
    return qa_list

# Collect every generated *qa.txt file of one dataset version into a single JSON file.
version = 'move_action_function_train_0_merge_reason'
postfix = 'qa.txt'
# NOTE(review): despite the name, output_dir is the directory being READ here.
output_dir = f'/home/zhangyl/videollm-online/dataset/{version}/'
# os.listdir returns entries in arbitrary order; sorting makes the key order of
# the exported JSON reproducible between runs.
file_list = sorted(os.listdir(output_dir))
# parse_qa_file reads this module-level alpha, so this overrides whatever value
# alpha held before for the answer-window computation.
alpha = 2.5
annos = {}
for file in file_list:
    if not file.endswith(postfix):
        continue
    # File names have the form '<video_uid>_<clip_uid>_qa.txt'.
    video_uid, clip_uid = file.split('_')[:2]
    if video_uid not in annos:
        annos[video_uid] = {}
    annos[video_uid][clip_uid] = parse_qa_file(os.path.join(output_dir, file),
                                               video_uid, clip_uid)

with open(f'/home/zhangyl/videollm-online/data/estp/annotations/{version}.json', 'w') as f:
    json.dump(annos,f, indent=4)

# file = 'ffb6dfc1-d2f9-45b1-8e25-2c7d0f32d635_ebff91cf-95ff-4cb1-95ae-cfbd10b77829_qa.txt'
# video_uid, clip_uid = file.split('_')[:2]
# a = parse_qa_file(os.path.join(output_dir, file),
#                                                    video_uid, clip_uid)

# print(json.dumps(a, indent=4))


In [39]:
# Sanity check: for each clip, print the first QA entry that has no 'question'
# key (at most one line per clip; nothing is printed for complete clips).
for k, v in annos.items():
    for kk, vv in v.items():
        for qa in vv:
            if 'question' in qa:
                continue
            print(k, kk, qa)
            break

In [44]:
def is_time(s):
    """Return True when float(s) succeeds, False when it raises ValueError."""
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True

def process_input(input_list):
    """Re-pair sentences and time strings so the result alternates sentence, time.

    - A sentence is always appended and becomes the "current" sentence.
    - A time right after a sentence is appended directly.
    - A time right after another time gets the current sentence inserted before it.
    - A time that appears before any sentence is ignored.
    """
    output = []
    sentence = None
    after_sentence = False

    for token in input_list:
        if is_time(token):
            if not after_sentence:
                if sentence is None:
                    # No sentence seen yet: nothing to attach this time to.
                    continue
                # Previous element was a time: repeat the sentence for this one.
                output.append(sentence)
            output.append(token)
            after_sentence = False
        else:
            output.append(token)
            sentence = token
            after_sentence = True
    return output

# Example input (constructed from the example supplied by the user). The final
# element is a time with no sentence of its own.
input_example = [
    'The object you lift is a golf club.',  # 00
    '540.3254366666666',                    # 01
    'The object you lift is a golf club.',  # 02
    '541.4301566666667',                    # 03
    'The object you lift is a golf club.',  # 04
    '544.6587366666666',                    # 05
    'The object you lift is a golf club.',  # 06
    '548.5609166666667',                    # 07
    'The object you lift is a golf club.',  # 08
    '551.5269866666666',                    # 09
    'The object you lift is a golf club.',  # 10
    '585.1917866666666',                    # 11
    'The object you lift is a golf club.',  # 12
    '589.7279466666666',                    # 13
    'The object you lift is a golf club.',  # 14
    '609.1992066666667',                    # 15
    'The object you lift is a golf club.',  # 16
    '661.8628866666667',                    # 17
    'The object you lift is a golf club.',  # 18
    '671.2641066666666',                    # 19
    'The object you lift is a golf club.',  # 20
    '680.7280166666667',                    # 21
    'The object you lift is a golf club.',  # 22
    '691.8049766666667',                    # 23
    'The object you lift is a golf club.',  # 24
    '740.0249166666666',                    # 25
    'The object you lift is a golf club.',  # 26
    '756.9704666666667',                    # 27
    '759.5288066666667'                     # 28
]

processed_list = process_input(input_example)

# Print the processed result pair by pair to check it is correct.
for i in range(0, len(processed_list), 2):
    print(f"Sentence: {processed_list[i]}")
    if i+1 < len(processed_list):
        print(f"Time: {processed_list[i+1]}")

# If a list is needed as output, use processed_list directly.

Sentence: The object you lift is a golf club.
Time: 540.3254366666666
Sentence: The object you lift is a golf club.
Time: 541.4301566666667
Sentence: The object you lift is a golf club.
Time: 544.6587366666666
Sentence: The object you lift is a golf club.
Time: 548.5609166666667
Sentence: The object you lift is a golf club.
Time: 551.5269866666666
Sentence: The object you lift is a golf club.
Time: 585.1917866666666
Sentence: The object you lift is a golf club.
Time: 589.7279466666666
Sentence: The object you lift is a golf club.
Time: 609.1992066666667
Sentence: The object you lift is a golf club.
Time: 661.8628866666667
Sentence: The object you lift is a golf club.
Time: 671.2641066666666
Sentence: The object you lift is a golf club.
Time: 680.7280166666667
Sentence: The object you lift is a golf club.
Time: 691.8049766666667
Sentence: The object you lift is a golf club.
Time: 740.0249166666666
Sentence: The object you lift is a golf club.
Time: 756.9704666666667
Sentence: The object