In [1]:
import os
import gc
import glob
import math
import json
from multiprocessing import Pool

import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option("display.max_columns",100)

In [2]:
SAVE_ROOT = "../data/v_and_l_data_224x224_video/"
SAVE_IMAGE_ROOT = os.path.join(SAVE_ROOT, "images")
SAMPLING_INTERVAL = 15 # frame, (video is 30 FPS, so 15 means sampling images by 2 Hz)

os.makedirs(SAVE_ROOT, exist_ok=True)
os.makedirs(SAVE_IMAGE_ROOT, exist_ok=True)

# resize frame
HEIGHT = 224
WIDTH = 224

In [3]:
# DataFrameを読み込む
df_caption_video_path = pd.read_csv('../data/df_caption_video_path.csv')

In [4]:
df_caption_video_path.head(2)

Unnamed: 0,mov_path,video_name,split,Input.Video,Answer.1start,Answer.1end,Answer.1action,Answer.1justification,Answer.2start,Answer.2end,Answer.2action,Answer.2justification,Answer.3start,Answer.3end,Answer.3action,Answer.3justification,Answer.4start,Answer.4end,Answer.4action,Answer.4justification,Answer.5start,Answer.5end,Answer.5action,Answer.5justification,Answer.6start,Answer.6end,Answer.6action,Answer.6justification,Answer.7start,Answer.7end,Answer.7action,Answer.7justification,Answer.8start,Answer.8end,Answer.8action,Answer.8justification,Answer.9start,Answer.9end,Answer.9action,Answer.9justification,Answer.10start,Answer.10end,Answer.10action,Answer.10justification,Answer.11start,Answer.11end,Answer.11action,Answer.11justification,Answer.12start,Answer.12end,Answer.12action,Answer.12justification,Answer.13start,Answer.13end,Answer.13action,Answer.13justification,Answer.14start,Answer.14end,Answer.14action,Answer.14justification,Answer.15start,Answer.15end,Answer.15action,Answer.15justification
0,/mnt/disks/disk0/bddx_video/video/bdd100k/vide...,16ee2f26-d1308565.mov,train,https://s3-us-west-2.amazonaws.com/sq8geewpqu/...,0,6.0,The car slows down,because it's making a right turn.,7.0,10.0,The car heads down the street,because the street is empty.,11.0,12.0,The car slows down,because there's a stop sign.,13.0,14.0,The car accelerates,because it's turning to the right.,15.0,20.0,The car heads down the street,because the street is empty.,21.0,24.0,The car slows down slightly,because it's turning to the left.,25.0,30.0,The car heads down the street,because the street is clear.,31.0,35.0,The car slows down,because it's turning to the right.,36.0,40.0,The car heads down the street,because the street is clear.,,,,,,,,,,,,,,,,,,,,,,,,
1,/mnt/disks/disk0/bddx_video/video/bdd100k/vide...,25c65ad4-2acd7459.mov,train,https://s3-us-west-2.amazonaws.com/sq8geewpqu/...,0,18.0,The car is driving reasonably fast in the righ...,because traffic is light and conditions are fa...,19.0,25.0,The car makes a fast right turn,because the light is green and the turn is clear,26.0,40.0,The car reduces speed and continues forward sl...,perhaps looking for the destination.,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [5]:
def get_caption_info(row, caption_index):
    start_time = row[f"Answer.{caption_index}start"]
    end_time = row[f"Answer.{caption_index}end"]
    action = row[f"Answer.{caption_index}action"]
    justification = row[f"Answer.{caption_index}justification"]
    return start_time, end_time, action, justification

# Get and Save Frames

In [6]:
def get_and_save_frames(i):
    img_path_list = []
    action_list = []
    justification_list = []

    row = df_caption_video_path.iloc[i].fillna("no_data")

    cap = cv2.VideoCapture(row.mov_path)
    cap_name = row.video_name.split('.')[0]

    # movの中で何個目のcaptionか
    caption_index = 1
    start_time, end_time, action, justification = get_caption_info(row, caption_index)

    # videoの中の何フレーム目かを保持, FPS30なので30のときに1秒
    video_frame_cnt = 0

    # captionが対応する時間に達しているかどうかを判定する
    reach_caption_time = True

    if action != "no_data":
        while True:
            ret, frame = cap.read()

            if ret:
                # save all images
                img_name = f"{cap_name}_{video_frame_cnt:0>5}.jpg"
                img_path = os.path.join(SAVE_IMAGE_ROOT, img_name)

                # resize images
                frame = cv2.resize(frame, (WIDTH, HEIGHT))
                if os.path.exists(img_path) == False:
                    cv2.imwrite(img_path, frame)


                # SAMPLING_INTERVALのときに画像やCaptionを保存
                if video_frame_cnt % SAMPLING_INTERVAL == 0 and ret == True:
                    # captionがまた始まっていない場合はスキップ
                    if reach_caption_time:
                        img_path_list.append(img_name)
                        action_list.append(action)
                        justification_list.append(justification)

                # captionの時間が過ぎていたら新しいCaptionを取得
                if end_time < video_frame_cnt / 30:
                    caption_index += 1
                    start_time, end_time, action, justification = get_caption_info(row, caption_index)

                    # Captionがあるのに時間が記載されていない時は前後の時間を入れる
                    if action != "no_data" and start_time == "no_data":
                        _, start_time, _, _ = get_caption_info(row, caption_index-1)
                    if action != "no_data" and end_time == "no_data":
                        end_time, _, _, _ = get_caption_info(row, caption_index+1)
                        # 次のCaptionのend_timeもno_dataのときは最後までいく
                        if end_time == "no_data":
                            end_time = 40.0

                # もしactionがNoneだったらこの動画は終了する
                if action == "no_data":
                    break

                # videoのFrameを1つ進める
                video_frame_cnt += 1

                # 次のcaptionが所定の時間内にあるかを確認、FalseのときまだCaptionはスタートしていない
                if start_time <= video_frame_cnt / 30:
                    reach_caption_time = True
                else:
                    reach_caption_time = False
            else:
                break
    del cap
    gc.collect()
    return img_path_list, action_list, justification_list

In [7]:
img_path_list = []
action_list = []
justification_list = []

pool = Pool(processes=10)

# indeces = range(100)
indeces = range(len(df_caption_video_path))

with tqdm(total=len(indeces)) as t:
    for ret in pool.imap(get_and_save_frames, indeces):
        img_path_list.extend(ret[0])
        action_list.extend(ret[1])
        justification_list.extend(ret[2])
        t.update(1)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5423/5423 [1:34:20<00:00,  1.04s/it]


In [8]:
df_v_and_l = pd.DataFrame({
    "img_path": img_path_list,
    "action": action_list,
    "justification": justification_list,
})
df_v_and_l.to_csv(os.path.join(SAVE_ROOT, 'df_v_and_l.csv'), index=False)

# Make json

In [9]:
# split train/val/test
with open('../data/BDD-X-Dataset/train.txt', "r") as f:
    train_txt = f.readlines()
train_txt = [i.replace('\n', '').split("_")[1] for i in train_txt]

with open('../data/BDD-X-Dataset/val.txt', "r") as f:
    val_txt = f.readlines()
val_txt = [i.replace('\n', '').split("_")[1] for i in val_txt]

with open('../data/BDD-X-Dataset/test.txt', "r") as f:
    test_txt = f.readlines()
test_txt = [i.replace('\n', '').split("_")[1] for i in test_txt]

split_list = []
for i in tqdm(range(len(df_v_and_l))):
    row = df_v_and_l.iloc[i]
    name = row.img_path.split("_")[0]
    if name in train_txt:
        split_list.append("train")
    elif name in val_txt:
        split_list.append("val")
    elif name in test_txt:
        split_list.append("test")
    else:
        split_list.append("train")

# update dataframe
df_v_and_l["split"] = split_list
df_v_and_l.split.value_counts()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 372517/372517 [00:35<00:00, 10568.88it/s]


train    297096
test      38689
val       36732
Name: split, dtype: int64

In [10]:
def extract_json_data(df_interest):
    result_list = []
    for i in tqdm(range(len(df_interest))):
        row = df_interest.iloc[i]

        data =   {
            "id": row.img_path.split('.')[0],
            "image": row.img_path,
            "conversations": [
              {
                "from": "human",
                "value": "Describe what the driver is doing and why is the driver doing that.\n<image>"
              },
              {
                "from": "gpt",
                "value": f"{row.action} {row.justification}"
              }
            ]
        }
        result_list.append(data)
    return result_list

In [11]:
train_list = extract_json_data(df_v_and_l[df_v_and_l.split == "train"])
val_list = extract_json_data(df_v_and_l[df_v_and_l.split == "val"])
test_list = extract_json_data(df_v_and_l[df_v_and_l.split == "test"])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 297096/297096 [00:17<00:00, 16616.84it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36732/36732 [00:02<00:00, 15880.73it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38689/38689 [00:02<00:00, 17468.79it/s]


In [12]:
with open(os.path.join(SAVE_ROOT, 'train.json'), "w") as f:
    json.dump(train_list, f)
    
with open(os.path.join(SAVE_ROOT, 'val.json'), "w") as f:
    json.dump(val_list, f)
    
with open(os.path.join(SAVE_ROOT, 'test.json'), "w") as f:
    json.dump(test_list, f)

In [None]:
df