# merge narration and action caption

In [2]:
import json
import os
import tqdm

from video_caption import AnnotationLoader, BetaAlphaCalculator
train_path = '/root/videollm-online/datasets/ego4d/v2/annotations/refined_narration_stream_train.json'
val_path = '/root/videollm-online/datasets/ego4d/v2/annotations/refined_narration_stream_val.json'
origin_path = '/root/videollm-online/datasets/ego4d/v2/annotations/all_narrations_redacted.json'
video_root = '/root/videollm-online/datasets/ego4d/v2/full_scale_2fps'
alpha = 4.9
device = 'cuda:3'
video_uid_list = open('/root/videollm-online/data/preprocess/metafile/major2scene_case.txt').read().split('\n')
caption_dir = '/root/videollm-online/tmp5'

annotation_loader = AnnotationLoader(train_path, val_path, origin_path)
data = annotation_loader.get_data()
origin_narration = annotation_loader.get_origin_narration()

beta_alpha_calculator = BetaAlphaCalculator(data, alpha)
beta_alpha_calculator.compute_beta()
beta_map = beta_alpha_calculator.get_beta_map()
alpha = beta_alpha_calculator.get_alpha()

In [26]:
def merge_caption_from_dir(caption_dir, narrations, output_dir):
    caption_texts = ""
    for action_idx, narration in enumerate(narrations):
        action_narration = 'Time is {}. Action narration is \"'.format(narration['time']) + narration['text'] + '\".\n'
        
        file_path = os.path.join(caption_dir, str(action_idx) + '.txt')
        
        with open(file_path, 'r') as f:
            lines = f.readlines()
            # Find the index of the line starting with "1. Relevant objects:"
            # Extract lines from start_index onwards
            caption_text = ''.join(lines[34:])
            
            caption_text = action_narration + "Detailed Description: " +caption_text
            caption_texts += caption_text + '\n\n'
            
    print(caption_texts)
    with open(os.path.join(output_dir, f'{os.path.basename(caption_dir)}.txt'), 'w') as f:
        f.write(caption_texts)
    return caption_texts
            

caption_dir = '/root/videollm-online/tmp5'

for path in tqdm.tqdm(video_uid_list):
    if path not in data:
            continue
    annotation_uid_narrations = data[path]
    for clip_idx, (annotation_uid, narrations) in enumerate(annotation_uid_narrations.items()):
        caption = merge_caption_from_dir(os.path.join(caption_dir, path, annotation_uid), narrations, os.path.join(caption_dir, path))
        break



  0%|          | 0/121 [00:00<?, ?it/s]

 56%|█████▌    | 68/121 [00:00<00:00, 347.21it/s]


Time is 0.0. Action narration is "You grab the shears.".
Detailed Description: With your right hand, you reach down to pick up a pair of green-handled shears lying on the grass. The metal blades gleam in the sunlight as you grasp them firmly by their handles, preparing for the task at hand.

Time is 3.71883. Action narration is "You trim the tree.".
Detailed Description: With a pair of large yellow-handled pruning shears in hand, you carefully trim the tree situated on the lawn. The green leaves and branches are neatly cut as your hands move with precision. The shears glide smoothly over the foliage, creating clean lines that define the shape of the tree. Sunlight filters through the leaves, casting dappled shadows on the grass around you.


Time is 2.38301. Action narration is "You dip your right hand into the lawnmower engine.".
Detailed Description: With your right hand, you carefully dip into the lawnmower engine. The yellow body of the lawnmower stands out against the backdrop of 

FileNotFoundError: [Errno 2] No such file or directory: '/root/videollm-online/tmp5/0b530687-26d8-4c9d-9771-c758ecd2ecbf/22c2eaad-e05c-4f72-86ad-18f43fd55c06/0.txt'

# merge scene caption

In [5]:
import json
import os
import tqdm
from dataclasses import dataclass
from tqdm import tqdm



class captionMerger:
    def __init__(self, device='cuda:4', prompt_file='/root/videollm-online/data/preprocess/prompt/caption_merge.txt') -> None:
        from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
        self.tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3-8B-Instruct', use_fast=True)
        self.model = AutoModelForCausalLM.from_pretrained('meta-llama/Meta-Llama-3-8B-Instruct', torch_dtype='auto', attn_implementation='sdpa')
        self.model.to(device)
        self.model.eval()
        self.prompt = open(prompt_file, 'r').read()
        
    
    def merge_wlast(self, caption, last_caption):
        conversation = [
            {'role': 'user', 'content': self.prompt.format(caption, last_caption)},
        ]
        print(conversation)

        input_ids = self.tokenizer.apply_chat_template(conversation, return_tensors='pt', add_generation_prompt=True).cuda()
        output_ids = self.model.generate(input_ids, max_length=8192)[:,input_ids.size(1):]
        answer = self.tokenizer.decode(output_ids[0])
        print(answer)
        return answer
    
    def merge(self, caption_list):
        old_caption  = caption_list[0]
        for last_cap in caption_list[1:]:
            old_caption = self.merge_wlast(old_caption, last_cap)
        return old_caption


caption_merger = captionMerger()


Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00,  8.10it/s]


In [6]:
class captionLoader:
    def __init__(self, caption_dir) -> None:
        self.caption_dir = caption_dir
    
    def load(self, vdieo_uid, clip_idx):
        # read txt
        caption_list = []
        for file in sorted(os.listdir(os.path.join(self.caption_dir, vdieo_uid, clip_idx))):
            if file.endswith('.txt'):
                with open(os.path.join(self.caption_dir,vdieo_uid,clip_idx,file), 'r') as f:
                    caption_list.append(f.readlines()[1:])
        return caption_list
    
class pipelineMain:
    def __init__(self, caption_dir, caption_merger, caption_loader) -> None:
        self.caption_merger = caption_merger
        self.caption_loader = caption_loader
        self.caption_dir = caption_dir
    
    def run(self):
        for dir in tqdm(sorted(os.listdir(self.caption_dir))):
            for file in sorted(os.listdir(os.path.join(self.caption_dir, dir))):
                caption_list = self.caption_loader.load(dir, file)
                print(os.path.join(self.caption_dir, dir, file, 'caption_list.json'))
                json.dump(caption_list, open(os.path.join(self.caption_dir, dir, file, 'caption_list.json'), 'w'), indent=4)
                merged_caption = self.caption_merger.merge(caption_list)
                print(merged_caption)
                with open(os.path.join(self.caption_dir, dir, file, 'merged.txt'), 'w') as f:
                    f.write(merged_caption)

caption_dir = '/root/videollm-online/data/preprocess/tmp3'
caption_loader = captionLoader(caption_dir)
p = pipelineMain(caption_dir, caption_merger, caption_loader)
p.caption_loader = captionLoader(caption_dir)
p.run()

  0%|          | 0/62 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 0/62 [00:00<?, ?it/s]


/root/videollm-online/data/preprocess/tmp3/000cd456-ff8d-499b-b0c1-4acead128a8b/0/caption_list.json
[{'role': 'user', 'content': 'You are an expert at merging video captions. Your task is to combine the previous caption summary with the current segment\'s caption into a smooth and coherent description. Ensure the merged text flows naturally and maintains overall consistency.\n\n---\n\nPrevious merged summary:\n[\'The video depicts a sequence of interactions between a person and a cat in an indoor setting. The individual, dressed in a blue sweater, is seated on the floor with various objects around them including a smartphone, remote controls, a bottle, and a small table with items such as a tablet and coasters. A toy consisting of a colorful ribbon attached to a stick is introduced by the person, who uses it to engage the cat.\\n\', \'\\n\', "Throughout the video, the cat, which appears to be a Siamese or similar breed with light fur and darker facial markings, is seen responding to th

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:4 and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

# merge dir json

In [None]:
import os
import json
path = '/2022233235/datasets/ego4d_moveaction_caption/'
version = 'valid_1'
dir = os.path.join(path, version)
action_caption = {}


for v in os.listdir(dir):
    action_caption[v] = {}
    for c in os.listdir(os.path.join(dir, v)):
        if c.endswith('.json'):
            with open(os.path.join(dir, v, c), 'r') as f:
                caption = json.load(f)
                for cap in caption:
                    cap['text'] = cap['caption']
                    cap['time'] = cap['end_time']
                action_caption[v][c.split('.')[0]] = caption

train_ratio = 0.7
import random
train_video_uid = random.sample(list(action_caption.keys()), int(len(action_caption) * train_ratio))
val_video_uid = list(set(action_caption.keys()) - set(train_video_uid)) 
train_action_caption = {}
val_action_caption = {}
for v in train_video_uid:
    train_action_caption[v] = action_caption[v]
for v in val_video_uid:
    val_action_caption[v] = action_caption[v]


with open(os.path.join(path, f'action_caption_train.json'), 'w') as f:
    json.dump(train_action_caption, f, indent=4)

with open(os.path.join(path, f'action_caption_val.json'), 'w') as f:
    json.dump(val_action_caption, f, indent=4)

In [34]:
c = 0
for k ,v in train_action_caption.items():
    c += 1
print(c)

503


In [35]:
import os

path = '/2022233235/datasets/ego4d/full_scale_2fps'
file_list = os.listdir(path)
print(len(file_list))
with open('/2022233235/videollm-online/full_scale_2fps.txt', 'w') as f:
    for file in file_list:
        f.write(file + '\n')

2233
