In [1]:

from transformers import AutoModel, AutoTokenizer
import torch
import json
import os
import tqdm
import decord
from decord import VideoReader
decord.bridge.set_bridge("torch")
import numpy as np
import math
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt
import math

import spacy
nlp = spacy.load("en_core_web_sm")

def sentene2verb(sentence):
    
    doc = nlp(sentence)
    verbs = []
    for token in doc:
        if token.pos_ == "VERB":
            verb_phrase = token.lemma_
            verbs.append(verb_phrase)
    return verbs

def sentene2n(sentence):

    doc = nlp(sentence)
    verbs = []
    for token in doc:
        if token.pos_ == "NOUN":
            verb_phrase = token.lemma_
            verbs.append(verb_phrase)
    return verbs

def ceil_time_by_fps(time: float, fps: int, min_time: float, max_time: float):
    return min(max(math.ceil(time * fps) / fps, min_time), max_time)

def show_image(load_range, frames, output_path=None):
    frames_per_row = 7

    # 计算行数
    rows = math.ceil(len(load_range) / frames_per_row)

    # 创建子图
    fig, axes = plt.subplots(rows, frames_per_row, figsize=(frames_per_row * 4, rows * 4))

    # 将 frames 绘制到子图中
    for i in range(len(load_range)):
        row = i // frames_per_row
        col = i % frames_per_row
        if rows == 1:
            axes[col].imshow(frames[i])
            axes[col].axis('off')
            axes[col].set_title(f"Frame {i}")
        else:
            axes[row, col].imshow(frames[i])
            axes[row, col].axis('off')
            axes[row, col].set_title(f"Frame {i}")

    # 如果最后一行有空的子图格子，关闭它们
    for i in range(len(load_range), rows * frames_per_row):
        fig.delaxes(axes.flatten()[i])

    if output_path is not None:
        plt.savefig(output_path)
    else:
        plt.tight_layout()
        plt.show()


class AnnotationLoader:
    def __init__(self, train_path, val_path, origin_path):
        self.train_data = json.load(open(train_path))
        self.val_data = json.load(open(val_path))
        self.data = {**self.train_data, **self.val_data}
        
        self.origin_narration = json.load(open(origin_path))['videos']
        
    def get_data(self):
        return self.data
    
    def get_origin_narration(self):
        return self.origin_narration

class BetaAlphaCalculator:
    def __init__(self, data, alpha=4.9):
        self.data = data
        self.beta_map = {}
        self.alpha = alpha
    
    def compute_beta(self):
        for video_uid, annotation_uid_narrations in self.data.items():
            for annotation_uid, narrations in annotation_uid_narrations.items():
                if len(narrations) == 0:
                    continue
                total_time = 0
                for i in range(len(narrations) - 1):
                    total_time += narrations[i+1]['time'] - narrations[i]['time']
                self.beta_map[annotation_uid] = total_time / len(narrations)
    
    def get_beta_map(self):
        return self.beta_map
    
    def get_alpha(self):
        return self.alpha

class VideoProcessor:
    def __init__(self, data, origin_narration, beta_map, alpha, video_root, frame_fps=2):
        self.data = data
        self.origin_narration = origin_narration
        self.beta_map = beta_map
        self.alpha = alpha
        self.video_root = video_root
        self.frame_fps = frame_fps
        
        from siglip import visionTextAligner
        self.aliger = visionTextAligner()
    
    
    def load_scene_clipv2(self, path, clip_idx, max_frame=32,):

        annotation_uids = list(self.data[path].keys())
        clip_id = annotation_uids[clip_idx]
        
        # load clip
        summs = self.origin_narration[path]['summaries']
        for summ in summs:
            if summ['_annotation_uid'] == clip_id:
                break
            
        start_time, end_time = summ['start_time'], summ['end_time']
        vr = VideoReader(uri=os.path.join(self.video_root, path) + '.mp4')
        start_frame = int(ceil_time_by_fps(start_time, self.frame_fps, 0, vr._num_frame / self.frame_fps) * self.frame_fps)
        end_frame = int(ceil_time_by_fps(end_time, self.frame_fps, 0, vr._num_frame / self.frame_fps)* self.frame_fps) + 1
        load_range = range(start_frame, end_frame)
        frames = vr.get_batch(load_range)
        
        # vision simi
        simi = self.aliger.vision_simi(frames)
        frames = vr.get_batch(load_range)
        frames = [Image.fromarray(v.astype('uint8')) for v in frames.numpy()]
        if simi > 0.8:
            if len(frames) > max_frame:
                # uniformly sample frames
                step = math.ceil(len(frames) / max_frame)
                frames = frames[::step]
                # save frame info
                
                frames = VideoProcessor.add_frame_info(frames, start_frame, self.frame_fps / step)
                
                load_range = range(0,len(frames))
                yield frames, start_frame, end_frame, load_range, (self.frame_fps / step), simi
        elif simi < 0.6:
            for i in range(0, len(frames), max_frame):
                r_f = frames[i:i+max_frame]
                
                r_f = VideoProcessor.add_frame_info(r_f, start_time + i / self.frame_fps, self.frame_fps)
                yield r_f, start_frame + i, start_frame + (i + len(r_f)), range(0,len(r_f)), self.frame_fps, simi
        else:
            step = 1
            if len(frames) > max_frame:
                # uniformly sample frames
                frames = frames[::self.frame_fps*2]
                step = (self.frame_fps*2)
            for i in range(0, len(frames), max_frame):
                r_f = frames[i:i+max_frame]
                
                r_f = VideoProcessor.add_frame_info(r_f, start_time + i / self.frame_fps, self.frame_fps / step)
                yield r_f, start_frame + i, start_frame + (i + len(r_f)) / self.frame_fps, range(0,len(r_f)), (self.frame_fps / step), simi

    @staticmethod
    def add_frame_info(frames, start_frame, frame_fps):
        """
        在每一帧左上角添加时间或帧编号信息。
        
        参数:
            frames (list): 视频帧列表，每一帧是一个 PIL Image 对象。
            start_frame (int): 起始帧编号。
            frame_fps (int): 视频的帧率。
        
        返回:
            list: 添加了信息的帧列表。
        """
        font = ImageFont.truetype("/root/videollm-online/data/preprocess/font/ARIAL.TTF", size=100) 
        annotated_frames = []
        for idx, frame in enumerate(frames):
            current_frame = start_frame + idx
            time_seconds = current_frame / frame_fps
            time_text = f"{time_seconds:.2f}s"  # 格式化为秒的小数形式
            frame_text = f"Frame {current_frame}"  # 显示帧编号
            

            draw = ImageDraw.Draw(frame)
            draw.text((10, 10), time_text, fill="red", font=font)
            draw.text((10, 200), frame_text, fill="red", font=font)
            
            annotated_frames.append(frame)
        
        return annotated_frames
            

class CaptionGenerator:
    def __init__(self, model_name, tokenizer_name, device='cuda:4', dtype=torch.bfloat16):
        self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True,
                                               attn_implementation='sdpa', torch_dtype=dtype)
        self.model.eval()
        self.model.to(device)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
            
    def get_caption(self, frames, question):
        msgs = [
            {'role': 'user', 'content': frames + [question]}, 
        ]
        
        # Set decode params for video
        params={}
        params["use_image_id"] = False
        params["max_slice_nums"] = 2 # use 1 if cuda OOM and video resolution >  448*448

        answer = self.model.chat(
            image=None,
            msgs=msgs,
            tokenizer=self.tokenizer,
            **params
        )
        return question, answer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import time
    
time1 = time.time()

train_path = '/root/videollm-online/datasets/ego4d/v2/annotations/refined_narration_stream_train.json'
val_path = '/root/videollm-online/datasets/ego4d/v2/annotations/refined_narration_stream_val.json'
origin_path = '/root/videollm-online/datasets/ego4d/v2/annotations/all_narrations_redacted.json'
video_root = '/root/videollm-online/datasets/ego4d/v2/full_scale_2fps'
video2scene = json.load(open('/root/videollm-online/data/preprocess/metafile/video2scene.json'))
video_uid_list = open('/root/videollm-online/data/preprocess/metafile/major2scene_case.txt').read().split('\n')
alpha = 4.9
device = 'cuda:5'

# 初始化各个模块
annotation_loader = AnnotationLoader(train_path, val_path, origin_path)
data = annotation_loader.get_data()
origin_narration = annotation_loader.get_origin_narration()

beta_alpha_calculator = BetaAlphaCalculator(data, alpha)
beta_alpha_calculator.compute_beta()
beta_map = beta_alpha_calculator.get_beta_map()
alpha = beta_alpha_calculator.get_alpha()

video_processor = VideoProcessor(data, origin_narration, beta_map, alpha, video_root)
caption_generator = CaptionGenerator('openbmb/MiniCPM-V-2_6', 'openbmb/MiniCPM-V-2_6', device=device)

print(f'Initialization time: {time.time() - time1:.2f}s, start captioning...')


Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.80it/s]


Initialization time: 188.80s, start captioning...


In [9]:
class PromptGenerator:
    def __init__(self, prompt_file):
        self.prompt = open(prompt_file).read()
    
    
class PromptGeneratorQA(PromptGenerator):
    def __init__(self, prompt_file):
        super().__init__(prompt_file)
    
    def get_prompt(self, question):
        return self.prompt.format(question)

from tqdm import tqdm

output_dir = 'qa_output'
path = '000cd456-ff8d-499b-b0c1-4acead128a8b'
prompt_file = '/root/videollm-online/data/preprocess/prompt/QA_time.txt'
clip_idx = 0
annotation_uid_narrations = data[path]
annotation_uid = list(annotation_uid_narrations.keys())[clip_idx]

subject = ' / '.join(video2scene[path])
clip_gen = video_processor.load_scene_clipv2(path, clip_idx)

origin_question = "What is the distinctive feature of the white cat ?"



In [10]:
origin_question = "Can you identify the time segment in the video where I started to interact with the cats using colorful feathered toy attached to a black stick? (You can refer to the timestamp and frame number in the upper-left corner)"
prompt_genrator = PromptGeneratorQA(prompt_file)

for action_idx, (frames, start_frame, end_frame, load_range, fps, simi) in tqdm(enumerate(clip_gen)):
    # os.makedirs(f'{output_dir}/{path}/{annotation_uid}', exist_ok=True)
    # show_image(load_range, frames, f'{output_dir}/{path}/{annotation_uid}/{action_idx}.png')

    question = prompt_genrator.get_prompt(origin_question)
    print(question)
    question, answer = caption_generator.get_caption(frames, origin_question)

    print(answer)
    # with open(f'{output_dir}/{path}/{annotation_uid}/{action_idx}.txt', 'w') as f:
    #     f.write(question + '\n')
    #     f.write(answer)
    break

0it [00:00, ?it/s]

You will receive a video input where each frame displays a timestamp and the frame number in the top-left corner. I will ask you questions about the video, and you need to answer them. When relevant, identify the first frame number and timestamp where the answer is visually supported in the video.

Examples:

Question: What colour is the coat of the man on my right?
Answer: The color of the coat is blue. The answer appears for the first time in frame 10. 

Question: What is the primary object the person is using to interact with the cats in the video? 
Answer: It is a colorful feathered toy attached to a black stick. The answer appears for the first time in frame 50.

Your Turn:

Question: Can you identify the time segment in the video where I started to interact with the cats using colorful feathered toy attached to a black stick?
Answer: 



0it [05:46, ?it/s]

Yes, the video shows that you started interacting with the cats using a colorful feathered toy attached to a black stick around 3 seconds into the footage.





In [11]:
origin_question = "Can you identify the time segment in the video where I started to interact with the cats using colorful feathered toy attached to a black stick? (You can refer to the timestamp and frame number in the upper-left corner)"
question = prompt_genrator.get_prompt(origin_question)
print(question)
question, answer = caption_generator.get_caption(frames, question)

print(answer)
# with open(f'{output_dir}/{path}/{annotation_uid}/{action_idx}.txt', 'w') as f:
#     f.write(question + '\n')
#     f.write(answer)

You will receive a video input where each frame displays a timestamp and the frame number in the top-left corner. I will ask you questions about the video, and you need to answer them. When relevant, identify the first frame number and timestamp where the answer is visually supported in the video.

Examples:

Question: What colour is the coat of the man on my right?
Answer: The color of the coat is blue. The answer appears for the first time in frame 10. 

Question: What is the primary object the person is using to interact with the cats in the video? 
Answer: It is a colorful feathered toy attached to a black stick. The answer appears for the first time in frame 50.

Your Turn:

Question: Can you identify the time segment in the video where I started to interact with the cats using colorful feathered toy attached to a black stick? (You can refer to the timestamp and frame number in the upper-left corner)
Answer: 

The first frame where the person starts interacting with the cats using