In [2]:
import os
# Names of the DAVIS sequences (sub-directory names) that get exported as clips.
animal_videos = [
    "bear",
    "blackswan",
    "camel",
    "cows",
    "dog",
    "dog-agility",
    "elephant",
    "flamingo",
    "goat",
    "horsejump-high",
    "horsejump-low",
    "mallard-fly",
    "mallard-water",
    "rhino",
]

# Dataset root; the export functions join "JPEGImages" / "Annotations" onto it.
path = "/mnt/fastdata/DAVIS16"
# for animal in animal_videos:
#     print(os.listdir(os.path.join(path, animal)))

In [3]:
from PIL import Image
import os
import imageio.v2 as imageio
import numpy as np
import random
import cv2

import shutil

# Output folders: RGB clips (and later their caption .txt files) and mask clips.
rgb_dir = "./video_rgb_negative"
alpha_dir = "./video_alpha_negative"

# Start from empty output folders on every run. Done with shutil/os instead of
# shelling out to `rm -rf`, so paths are not re-parsed by a shell and a failed
# delete raises here instead of surfacing later as a confusing makedirs error.
for _out_dir in (rgb_dir, alpha_dir):
    if os.path.isdir(_out_dir) and not os.path.islink(_out_dir):
        shutil.rmtree(_out_dir)
    elif os.path.lexists(_out_dir):
        # A plain file or a symlink is sitting at the target path: remove just that entry.
        os.remove(_out_dir)
    os.makedirs(_out_dir)

# for video in os.listdir(path):
def process_video_normal(video):
    """Export one DAVIS sequence as fixed-length RGB clips plus matching mask clips.

    Frames are read from ``<path>/JPEGImages/<video>`` and masks from
    ``<path>/Annotations/480p/<video>`` (``path``, ``rgb_dir`` and ``alpha_dir``
    are module-level globals). The sorted ``.jpg`` list is cut into consecutive
    37-frame segments; leftover frames at the end are dropped. Each segment is
    written at 24 fps and 576x320 as ``<video>_<idx>.mp4`` in both output folders.

    Args:
        video: Sequence (sub-directory) name.

    Returns:
        ``(mask, frame)`` for the last frame written, both as ``int`` arrays, or
        ``None`` when a source directory is missing or the sequence is shorter
        than one segment.
    """
    seg_len = 37
    out_size = (576, 320)  # (width, height) as expected by PIL and cv2 resize

    frame_dir = os.path.join(path, "JPEGImages", video)
    gt_dir = os.path.join(path, "Annotations", "480p", video)
    if not (os.path.isdir(frame_dir) and os.path.isdir(gt_dir)):
        return
    frames = sorted(f for f in os.listdir(frame_dir) if f.lower().endswith(".jpg"))
    if len(frames) < seg_len:
        print(f"Video {video} has less than {seg_len} frames, skipping...")
        return

    for idx in range(len(frames) // seg_len):
        seg_frames = frames[idx * seg_len:(idx + 1) * seg_len]
        rgb_out_path = os.path.join(rgb_dir, f"{video}_{idx:03d}.mp4")
        alpha_out_path = os.path.join(alpha_dir, f"{video}_{idx:03d}.mp4")

        # Context managers close both writers even if a read/resize fails mid-segment.
        with imageio.get_writer(rgb_out_path, fps=24) as writer_rgb:
            with imageio.get_writer(alpha_out_path, fps=24) as writer_alpha:
                for f in seg_frames:
                    # RGB frame: PIL resize with its default resampling filter.
                    img = imageio.imread(os.path.join(frame_dir, f))
                    frame_rgb = np.array(Image.fromarray(img).resize(out_size)).astype(np.uint8)
                    writer_rgb.append_data(frame_rgb)

                    # Mask: nearest-neighbour resize keeps it binary, then map
                    # every non-zero pixel to 255.
                    # assumes the annotation PNG is single-channel — TODO confirm
                    alpha_name = os.path.splitext(f)[0] + '.png'
                    mask = np.array(imageio.imread(os.path.join(gt_dir, alpha_name)))
                    mask_resized = cv2.resize(mask, out_size, interpolation=cv2.INTER_NEAREST)
                    img_clipped = np.where(mask_resized[:, :, None] > 0, 255, 0)
                    writer_alpha.append_data(img_clipped.astype(np.uint8))

    # At least one segment exists here, so both names are bound.
    return img_clipped.astype(int), frame_rgb.astype(int)


def process_video_highlight(video):
    """Export one DAVIS sequence as clips whose masked region is brightened.

    Same layout and segmentation as the plain exporter: frames come from
    ``<path>/JPEGImages/<video>``, masks from ``<path>/Annotations/480p/<video>``,
    and the sorted ``.jpg`` list is cut into consecutive 37-frame segments
    (leftover frames are dropped). For every segment one integer ``factor`` in
    1..3 is drawn; pixels under the mask are multiplied by it and clipped to
    0..255. Output is written at 24 fps and 576x320 as ``hl_<video>_<idx>.mp4``
    in ``rgb_dir`` (highlighted frames) and ``alpha_dir`` (0/255 masks).

    Args:
        video: Sequence (sub-directory) name.

    Returns:
        ``(mask, frame)`` for the last frame written, both as ``int`` arrays, or
        ``None`` when a source directory is missing or the sequence is shorter
        than one segment.
    """
    seg_len = 37
    out_size = (576, 320)  # (width, height) as expected by PIL and cv2 resize

    frame_dir = os.path.join(path, "JPEGImages", video)
    gt_dir = os.path.join(path, "Annotations", "480p", video)
    if not (os.path.isdir(frame_dir) and os.path.isdir(gt_dir)):
        return
    frames = sorted(f for f in os.listdir(frame_dir) if f.lower().endswith(".jpg"))
    if len(frames) < seg_len:
        print(f"Video {video} has less than {seg_len} frames, skipping...")
        return

    for idx in range(len(frames) // seg_len):
        seg_frames = frames[idx * seg_len:(idx + 1) * seg_len]
        rgb_out_path = os.path.join(rgb_dir, f"hl_{video}_{idx:03d}.mp4")
        alpha_out_path = os.path.join(alpha_dir, f"hl_{video}_{idx:03d}.mp4")

        # One brightness multiplier per segment.
        # NOTE(review): randint(1, 3) can return 1, in which case the "hl_" clip
        # is identical to the plain one — confirm that is intended.
        factor = random.randint(1, 3)

        # Context managers close both writers even if a read/resize fails mid-segment.
        with imageio.get_writer(rgb_out_path, fps=24) as writer_rgb:
            with imageio.get_writer(alpha_out_path, fps=24) as writer_alpha:
                for f in seg_frames:
                    img = imageio.imread(os.path.join(frame_dir, f))
                    # Work in int so the multiplication cannot wrap around in uint8.
                    frame = np.array(Image.fromarray(img).resize(out_size)).astype(int)

                    # The mask is read and resized once and reused for both outputs.
                    # assumes the annotation PNG is single-channel — TODO confirm
                    alpha_name = os.path.splitext(f)[0] + '.png'
                    mask = np.array(imageio.imread(os.path.join(gt_dir, alpha_name)))
                    mask_resized = cv2.resize(mask, out_size, interpolation=cv2.INTER_NEAREST)
                    is_foreground = mask_resized[:, :, None] > 0

                    img_resized = np.where(is_foreground, frame * factor, frame).astype(int)
                    img_resized = np.clip(img_resized, 0, 255).astype(np.uint8)
                    writer_rgb.append_data(img_resized)

                    img_clipped = np.where(is_foreground, 255, 0)
                    writer_alpha.append_data(img_clipped.astype(np.uint8))

    # At least one segment exists here, so both names are bound.
    return img_clipped.astype(int), np.array(img_resized).astype(int)


def process_video(video):
    """Run both exporters on ``video``: the highlighted variant first, then the plain one."""
    for export in (process_video_highlight, process_video_normal):
        export(video)

In [4]:
from multiprocessing import Pool
# Export every sequence in parallel, one task per video name.
pool = Pool(processes=64)
try:
    pool.map(process_video, animal_videos)
finally:
    # pool.map re-raises worker exceptions; shutting down in `finally` makes sure
    # the worker processes are not left running in the kernel when that happens.
    pool.close()
    pool.join()
# was thinking only loss on alpha, no it should be the other way

Video dog-agility has less than 37 frames, skipping...
Video dog-agility has less than 37 frames, skipping...


In [6]:
from openai import OpenAI
from PIL import Image
import io
import base64
from pydantic import BaseModel
from typing import List, Optional

# Response schema handed to the chat-completions `parse` call as `response_format`.
# NOTE(review): documented with `#` comments on purpose — a class docstring may be
# copied into the JSON schema that is sent with the request; confirm before adding one.
class Prompt(BaseModel):
    pos_prompt: str  # the generated description text
    
def capture_image(imgs, filename):
    """Ask the chat model for a short video-generation prompt describing ``imgs``.

    Every image is JPEG-encoded in memory and attached to a single user message
    as a base64 data URL, followed by a text instruction that embeds ``filename``.

    Args:
        imgs: Iterable of PIL images (anything with ``save(buffer, format="JPEG")``).
        filename: Clip file name, inserted verbatim into the instruction text.

    Returns:
        ``message.content`` of the first choice, returned as-is. The request uses
        ``Prompt`` as response format, so this is the serialized form (the cell
        output shows ``{"pos_prompt": "..."}``), not the bare description.
    """

    def _as_data_url(picture):
        # JPEG-encode in memory and wrap the bytes as a base64 data URL.
        raw = io.BytesIO()
        picture.save(raw, format="JPEG")
        encoded = base64.b64encode(raw.getvalue()).decode('utf-8')
        return f"data:image/jpeg;base64,{encoded}"

    client = OpenAI()

    image_parts = [
        {"type": "image_url", "image_url": {"url": _as_data_url(picture)}}
        for picture in imgs
    ]

    # Instruction text is sent to the model exactly as written here.
    instruction_head = "Describe in 2-3 setence this image such that the prompt will be used to generate an video. Mention motion. the filename is "
    instruction_tail = ", filename might has other things than the animal. Do not mention filename just the animal name. The object is clearly stand out and maybe highlighted (if the filename has 'hl_' at the begining, if so mention this) and standout into the envirement (mention this always). Mention the animal's motion, but do not mention this makes it easy to spot. Use simple language and vocab besides the name of the animal"
    text_part = {"type": "text", "text": instruction_head + filename + instruction_tail}

    user_message = {"role": "user", "content": image_parts + [text_part]}
    response = client.beta.chat.completions.parse(
        model="gpt-4.1",
        messages=[user_message],
        response_format=Prompt,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

import os
import cv2
# Early listing of the RGB output folder. `videos` is assigned again from the same
# call further down, right before the pool is started, and is not read in between.
videos = os.listdir("./video_rgb_negative")

def get_prompt(video):
    """Caption one clip and store the result next to the RGB video.

    Samples the first, middle and last frame of ``./video_alpha_negative/<video>``,
    sends them to ``capture_image`` together with the file name, and writes the
    returned text to ``./video_rgb_negative/<stem>.txt``.

    NOTE(review): the frames come from the alpha (mask) folder, not the RGB one —
    confirm that is intended.

    Args:
        video: File name of the clip (e.g. ``"bear_000.mp4"``).

    Returns:
        The text returned by ``capture_image``, or ``None`` when no frame could be
        read (missing/unreadable file); in that case no request is made and no
        file is written.
    """
    video_path = os.path.join("./video_alpha_negative", video)

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # First, middle and last frame; the set drops duplicates for very short
        # clips and the filter drops the -1 produced when the count is 0.
        wanted = sorted(i for i in {0, total_frames // 2, total_frames - 1} if i >= 0)
        for i in wanted:
            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
            ret, frame = cap.read()
            if ret:
                frames.append(frame)
    finally:
        # Release the capture even if seeking/reading raised.
        cap.release()

    if not frames:
        # Nothing to describe: do not call the API with an image-less request and
        # do not overwrite an existing caption file.
        print(f"No frames could be read from {video_path}, skipping...")
        return None

    # Convert frames to PIL images (OpenCV delivers BGR).
    imgs = [Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) for frame in frames]
    # Capture image and get prompt
    prompt = capture_image(imgs, video)

    # splitext swaps whatever extension the clip has; a plain
    # .replace(".mp4", ".txt") leaves a non-mp4 name unchanged, so the text
    # would be written over that file.
    txt_name = os.path.splitext(video)[0] + ".txt"
    with open(os.path.join("./video_rgb_negative", txt_name), "w") as f:
        f.write(prompt)
    print(prompt)
    return prompt
# Only the .mp4 clips are captioned. get_prompt() writes its .txt results into
# this same folder, so an unfiltered listing would also submit those files when
# the cell is run again. Sorting makes the processing order reproducible.
videos = sorted(
    name for name in os.listdir("./video_rgb_negative")
    if name.lower().endswith(".mp4")
)
print(len(videos))
# use a pool of 30 processes
from multiprocessing import Pool
if __name__ == "__main__":
    with Pool(30) as p:
        prompts = p.map(get_prompt, videos)
    print(prompts)

# get_prompt("crab_1_000.mp4")

84


{"pos_prompt":"A dog stands out clearly against its environment. The dog is running across a field, its fur moving with the wind as it moves quickly and gracefully."}
{"pos_prompt":"A goat stands out clearly against its environment, making it the central focus of the video. The goat moves slowly, occasionally turning its head and taking a few steps forward."}
{"pos_prompt":"A cow is clearly standing out and highlighted in its environment. The cow is slowly walking across a green field. The background remains still as the cow moves forward."}{"pos_prompt":"A black swan stands out against its surroundings, with its dark feathers creating a strong contrast to the environment. The swan gracefully glides across the water, leaving gentle ripples behind as it moves."}

{"pos_prompt":"A bear stands out clearly against its environment. The bear slowly walks forward, its fur moving gently with each step. The video captures the bear as it moves across the landscape."}
{"pos_prompt":"A bear stands