In [1]:
import torch

In [31]:
import os, io, csv, math, random
import numpy as np
from einops import rearrange
import decord
from decord import VideoReader
import json

import sys
sys.path.append('/home/ubuntu/video-generation/AnimateDiff')
# import torch
import torchvision.transforms as transforms
from torch.utils.data.dataset import Dataset
from animatediff.utils.util import zero_rank_print



class Something_v2(Dataset):
    def __init__(
            self,
            csv_path, video_folder,
            sample_size=256, sample_stride=4, sample_n_frames=16,
            is_image=False,
        ):
        zero_rank_print(f"loading annotations from {csv_path} ...")
        
        self.dataset = []
        with open(csv_path, 'r') as f:
            annotations = json.load(f)

            for item in annotations:
                video_id = item['id']
                template = item['template']
                placeholders = item['placeholders']
                
                prompt = template
                for placeholder in placeholders:
                    prompt = prompt.replace('[something]', placeholder, 1)
                    
                self.dataset.append({"videoid": video_id, "name": prompt, "template": template, "placeholders": placeholders})
            
            
            
        self.length = len(self.dataset)
        zero_rank_print(f"data scale: {self.length}")

        self.video_folder    = video_folder
        self.sample_stride   = sample_stride
        self.sample_n_frames = sample_n_frames
        self.is_image        = is_image
        
        sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size)
        self.pixel_transforms = transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.Resize(sample_size[0]),
            transforms.CenterCrop(sample_size),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ])
    
    def get_batch(self, idx):
        video_dict = self.dataset[idx]
        videoid, name, _, _ = video_dict['videoid'], video_dict['name'], video_dict['template'], video_dict["placeholders"]
        
        video_dir    = os.path.join(self.video_folder, f"{videoid}.webm")
        video_reader = VideoReader(video_dir, num_threads=1)
        video_length = len(video_reader)
        
        # # Khởi tạo VideoReader
        # try:
        #     vr = VideoReader(video_dir)
        #     print(f"Video có {len(vr)} khung hình")
        #     # Lấy khung hình đầu tiên
        #     frame = vr[0]
        #     print("Frame đầu tiên:", frame.shape)
        # except decord.DECORDError as e:
        #     print(f"Không thể đọc video: {e}")
        
        if not self.is_image:
            clip_length = min(video_length, (self.sample_n_frames - 1) * self.sample_stride + 1)
            start_idx   = random.randint(0, video_length - clip_length)
            batch_index = np.linspace(start_idx, start_idx + clip_length - 1, self.sample_n_frames, dtype=int)
            print(batch_index)
            print(self.sample_n_frames)
        else:
            batch_index = [random.randint(0, video_length - 1)]

        # print(video_reader[1:4])
        pixel_values = torch.from_numpy(video_reader.get_batch(batch_index).numpy()).permute(0, 3, 1, 2).contiguous()
        pixel_values = pixel_values / 255.
        del video_reader

        if self.is_image:
            pixel_values = pixel_values[0]
        
        return pixel_values, name

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        # while True:
        #     try:
        pixel_values, name = self.get_batch(idx)
                # break

            # except Exception as e:
            #     idx = random.randint(0, self.length-1)

        pixel_values = self.pixel_transforms(pixel_values)
        sample = dict(pixel_values=pixel_values, text=name)
        return sample


In [33]:
dataset = Something_v2(
        csv_path="/home/ubuntu/video-generation/something_something_v2/data/labels/validation.json",
        video_folder="/home/ubuntu/video-generation/something_something_v2/data/20bn-something-something-v2",
        sample_size=256,
        sample_stride=1, sample_n_frames=20,
        is_image=False,
    )

print(dataset[0]['pixel_values'].shape)

[37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56]
20
torch.Size([20, 3, 256, 256])


In [21]:
dataset = Something_v2(
        csv_path="/home/ubuntu/video-generation/something_something_v2/data/labels/validation.json",
        video_folder="/home/ubuntu/video-generation/something_something_v2/data/20bn-something-something-v2",
        sample_size=256,
        sample_stride=4, sample_n_frames=10,
        is_image=True,
    )
# import pdb
# pdb.set_trace()


dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, num_workers=16,)
for idx, batch in enumerate(dataloader):
    print(batch["pixel_values"].shape, len(batch["text"]))
    # for i in range(batch["pixel_values"].shape[0]):
    #     save_videos_grid(batch["pixel_values"][i:i+1].permute(0,2,1,3,4), os.path.join(".", f"{idx}-{i}.mp4"), rescale=True)


AttributeError: Caught AttributeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/animated/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/ubuntu/anaconda3/envs/animated/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/ubuntu/anaconda3/envs/animated/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/tmp/ipykernel_31483/3085530526.py", line 100, in __getitem__
    pixel_values, name = self.get_batch(idx)
  File "/tmp/ipykernel_31483/3085530526.py", line 85, in get_batch
    pixel_values = torch.from_numpy(video_reader.get_batch(batch_index).asnumpy()).permute(0, 3, 1, 2).contiguous()
AttributeError: 'Tensor' object has no attribute 'asnumpy'. Did you mean: 'numpy'?


In [None]:
import json

annotation_file = '/home/ubuntu/video-generation/something_something_v2/data/labels/validation.json'
video_dict = []

with open(annotation_file, 'r') as f:
    annotations = json.load(f)

    for item in annotations:
        video_id = item['id']
        template = item['template']
        placeholders = item['placeholders']
        
        prompt = template
        for placeholder in placeholders:
            prompt = prompt.replace('[something]', placeholder, 1)
            
        # print(prompt)
        video_dict.append({"id": video_id, "prompt": prompt, "template": template, "placeholders": placeholders})

print(video_dict[2])


{'id': '198186', 'prompt': 'Wiping words off of a paper', 'template': 'Wiping [something] off of [something]', 'placeholders': ['words', 'a paper']}


In [1]:
import cv2
import os
from tqdm import tqdm

# Đường dẫn đến thư mục chứa video
video_folder = '/home/ubuntu/video-generation/something_something_v2/data/20bn-something-something-v2'

# Biến để đếm tổng số khung hình và số video
total_frames = 0
total_videos = 0
min_frames = 99999999999
max_frames = 0
max_fps = 99999999999
min_fps = 0
# Lặp qua tất cả các file trong thư mục
for id, filename in enumerate(tqdm(os.listdir(video_folder))):
    if id > 50000:
        break
    if filename.endswith(".webm"):
        video_path = os.path.join(video_folder, filename)
        
        # Mở video bằng OpenCV
        cap = cv2.VideoCapture(video_path)
        
        # Kiểm tra xem video có mở được không
        if not cap.isOpened():
            print(f"Không thể mở video: {filename}")
            continue
        
        # Lấy số lượng khung hình trong video
        frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        min_frames = min(min_frames, frames)
        max_frames = max(max_frames, frames)
        
        min_fps = min(min_fps, cv2.CAP_PROP_FPS)
        max_fps = max(max_fps, cv2.CAP_PROP_FPS)
        
        total_frames += frames
        total_videos += 1
        
        # Đóng video sau khi xử lý
        cap.release()

# Tính số khung hình trung bình trên mỗi video
if total_videos > 0:
    average_frames = total_frames / total_videos
    print(f"Trung bình số khung hình trên mỗi video: {average_frames}")
    print(f"Ít frames nhất: {min_frames}")
    print(f"Nhiều frames nhất: {max_frames}")
    print(f"Min FPS: {min_frames}")
    print(f"Max FPS: {max_frames}")

else:
    print("Không có video nào được tìm thấy trong thư mục.")


  0%|          | 381/220847 [00:02<23:20, 157.44it/s]


KeyboardInterrupt: 

In [14]:
import decord
from decord import VideoReader
decord.bridge.set_bridge('torch')  # nếu bạn muốn làm việc với Tensor trong PyTorch

video_path = '/home/ubuntu/video-generation/something_something_v2/data/20bn-something-something-v2/1.webm'

# Khởi tạo VideoReader
try:
    vr = VideoReader(video_path)
    print(f"Video có {len(vr)} khung hình")
    # Lấy khung hình đầu tiên
    frame = vr[0]
    frame = vr.get_batch([50])
    # frame = vr.get_batch([1, 4])
    print("Frame đầu tiên:", frame.shape)
except decord.DECORDError as e:
    print(f"Không thể đọc video: {e}")


Video có 47 khung hình


IndexError: Out of bound indices: [50]

In [36]:
# target_templates = [
#     'Approaching [something] with your camera',
#     'Closing [something]',
#     'Dropping [something]',
#     'Folding [something]',
#     'Holding [something]',
#     'Holding [something] next to [something]',
#     'Moving [something] away from [something]',
#     'Moving [something] away from the camera',
#     'Moving [something] closer to [something]',
#     'Moving [something] down',
#     'Moving [something] from left to right',
#     'Moving [something] from right to left',
#     'Moving [something] towards the camera',
#     'Moving away from [something] with your camera',
#     'Opening [something]',
#     'Picking [something]',
#     'Plugging [something] into [something]',
#     'Poking [something]',
#     'Pouring [something]',
#     'Pushing [something] so that it slightly moves',
# ]

target_templates = [
    'Approaching [something] with your camera',
    'Closing [something]',
    'Dropping [something]',
    'Holding [something] next to [something]',
    'Moving [something] away from [something]',
    'Moving [something] towards the camera',
    'Moving [something] from left to right',
    'Opening [something]',
    'Picking [something]',
    'Pouring [something]'
]

import json
with open('/home/ubuntu/video-generation/something_something_v2/data/labels/train.json', 'r') as file:
    train_data = json.load(file)
    
filtered_data = [entry for entry in train_data if entry["template"] in target_templates]

# Save the filtered data to a new JSON file
with open('/home/ubuntu/video-generation/something_something_v2/data/labels/10_class_train.json', 'w') as outfile:
    json.dump(filtered_data, outfile, indent=4)



In [37]:
print(len(filtered_data))

17754
