In [1]:
import pandas as pd
import json

import os

In [None]:
# --- Load BDD100k-MOT -------------------------------------------------------
# Collect the ids of videos that have box-tracking labels.
# NOTE(review): this cell used to list './raw/bdd100k-MOT2020/...' while the
# caption-building cell opens the label files from '../data/bdd100k-MOT2020/...'.
# The listing now uses the same '../data' root so that ids found here refer to
# files the later cell actually reads — confirm this is the intended location.
mot_files = os.listdir('../data/bdd100k-MOT2020/labels/box_track_20/train/')
# Keep label files only; other directory entries (e.g. '.DS_Store') are not video ids.
mot_files = [x.split('.json')[0] for x in mot_files if x.endswith('.json')]

# --- Load the BDD-X train split ----------------------------------------------
# Each line is split on '_' and the second piece is kept as the video id.
# Lines without an underscore (e.g. a trailing blank line) are skipped instead
# of raising IndexError.
with open('../data/BDD-X-Dataset/train.txt', 'r') as f:
    bddx_files = [line.strip().split('_')[1] for line in f if '_' in line]

# Video ids present in both datasets (kept as a list, as before).
intersection = list(set(mot_files).intersection(bddx_files))

# --- Load the BDD-X annotations ----------------------------------------------
BDDX_VIDEO_URL_PREFIX = 'https://s3-us-west-2.amazonaws.com/sq8geewpqu/'

df_bddx = pd.read_csv('../data/BDD-X-Dataset/BDD-X-Annotations_v1.csv')

# Drop rows without a video URL, then keep only URLs under the expected prefix.
# `.copy()` makes the result an independent frame so that adding the
# 'video_id' column below does not trigger SettingWithCopyWarning.
df_bddx = df_bddx.dropna(subset=['Input.Video'])
df_bddx = df_bddx[df_bddx['Input.Video'].str.startswith(BDDX_VIDEO_URL_PREFIX)].copy()

# After the prefix, the second path component minus '.mov' is used as the video id.
df_bddx['video_id'] = df_bddx['Input.Video'].apply(
    lambda x: x.split(BDDX_VIDEO_URL_PREFIX)[1].split('/')[1].split('.mov')[0]
)

# --- Load BDD100k image labels (labels of 10th frame) -------------------------
# Train and val label lists are concatenated into a single `labels` list.
with open('../data/bdd100k-labels/labels/bdd100k_labels_images_train.json', 'r') as f:
    labels = json.load(f)

with open('../data/bdd100k-labels/labels/bdd100k_labels_images_val.json', 'r') as f:
    val_labels = json.load(f)

labels.extend(val_labels)

In [None]:
## Find sample image from MOT2020-images

# mot_img_files = os.listdir('../data/bdd100k-MOT2020/images/track/train/')
# list(set(intersection).intersection(mot_img_files))

## Find sample videos from train videos
# video_files = os.listdir("/Volumes/무제/bdd100k-2/videos/train/")
# video_files = [x.split('.mov')[0] for x in video_files]
# list(set(intersection).intersection(video_files))

In [438]:
def convert_to_time_format(start_second, end_second):
    """Format a time span given in seconds as ``"MM:SS-MM:SS"``.

    Args:
        start_second: Span start in whole seconds. May be an int, a float or a
            numeric string (e.g. ``5``, ``5.0``, ``'5'``); expected non-negative.
        end_second: Span end in whole seconds, same accepted forms.

    Returns:
        A string such as ``"00:05-00:12"``. Both fields are zero-padded to two
        digits. Seconds of 60 or more roll over into the minute field (the
        previous implementation always emitted ``00`` minutes, which produced
        invalid times such as ``"00:75"``). For inputs below 60 the output is
        unchanged.

    Raises:
        ValueError: If an argument cannot be interpreted as a number.
    """
    def _mmss(seconds):
        # int(float(...)) accepts ints, floats and numeric strings alike.
        minutes, secs = divmod(int(float(seconds)), 60)
        return str(minutes).zfill(2) + ":" + str(secs).zfill(2)

    return _mmss(start_second) + "-" + _mmss(end_second)

def find_dict_by_name(name, list_of_seqs, max_items=4):
    """Describe the first few labelled objects of the frame called ``name``.

    Args:
        name: Frame name to look up (compared against each entry's ``'name'``).
        list_of_seqs: Iterable of frame dicts. Each may carry a ``'labels'``
            list whose items have ``'category'``, ``'id'`` and a ``'box2d'``
            dict with ``'x1'``, ``'x2'``, ``'y1'``, ``'y2'``.
        max_items: How many leading labels of the frame to consider
            (defaults to 4, the previously hard-coded limit).

    Returns:
        A string like ``"car-7: [x1, x2, y1, y2], ..."`` with coordinates
        rounded to two decimals, an empty string if the frame has no usable
        labels, or ``None`` if no frame with that name exists.
    """
    for seq in list_of_seqs:
        if seq.get('name') != name:
            continue

        described = {}
        # A frame without a 'labels' key (or with labels set to None) is
        # treated as having no objects.
        for item in (seq.get('labels') or [])[:max_items]:
            box = item.get('box2d')
            if not box:
                # No 2D box to report for this label; skip it.
                continue
            # f-string formatting works whether 'id' is stored as str or int.
            key = f"{item['category']}-{item['id']}"
            # Coordinate order is x1, x2, y1, y2 (kept from the original output format).
            described[key] = [round(box[c], 2) for c in ('x1', 'x2', 'y1', 'y2')]

        return ', '.join(f"{k}: {v}" for k, v in described.items())

    return None

def find_label_by_name(name, list_of_labels):
    """Return a one-line description of the image-level attributes for ``name``.

    Args:
        name: Image name without the ``.jpg`` extension.
        list_of_labels: Iterable of label dicts, each expected to have a
            ``'name'`` and an ``'attributes'`` dict containing ``'weather'``,
            ``'scene'`` and ``'timeofday'``.

    Returns:
        ``"weather: W, scene: S, time of day: T"`` for the first entry whose
        name (up to ``.jpg``) equals ``name``, or ``None`` if there is no match.

    Raises:
        KeyError: If the matching entry lacks ``'attributes'`` or one of the
            three attribute keys.
    """
    for data in list_of_labels:
        image_name = data.get('name')
        # Entries without a name can never match; previously they raised
        # AttributeError on `None.split(...)`.
        if image_name is None or image_name.split('.jpg')[0] != name:
            continue

        attributes = data["attributes"]
        weather = attributes["weather"]
        scene = attributes["scene"]
        time_of_day = attributes["timeofday"]
        return f"weather: {weather}, scene: {scene}, time of day: {time_of_day}"

    return None

In [None]:
# Build one caption record per BDD-X annotation row.
# Each record holds the video id, whether MOT object tracks were available
# (`is_attr`), and a multi-line description with one line per annotated segment:
#   "MM:SS-MM:SS <action> [<justification>] [Objects at S sec] ... [Objects at E sec] ... \n"

VIDEO_SECONDS = 40  # clip length in seconds used to map a second onto a frame index
MAX_SEGMENTS = 15   # columns Answer.1* .. Answer.15* are scanned


def _frame_name(video_id, seq_len, second):
    """Return the MOT frame file name corresponding to `second` of the clip.

    The frame index is `seq_len * second / VIDEO_SECONDS`, rounded, then
    clamped to [1, seq_len]: the frame names used here start at 0000001, so an
    index of 0 (second 0) or one beyond the last frame would not resolve to
    any frame.
    """
    frame_idx = int(round(seq_len * int(second) / VIDEO_SECONDS, 0))
    frame_idx = max(1, min(frame_idx, seq_len))
    return video_id + '-' + str(frame_idx).zfill(7) + '.jpg'


video_all = []
mot_video_ids = set(intersection)  # set membership instead of scanning a list per row
total_rows = len(df_bddx)

for row_no, (i, row) in enumerate(df_bddx.iterrows()):
    # Periodic progress instead of one printed line per row.
    if row_no % 500 == 0:
        print(f'{row_no}/{total_rows}')

    video_id = row['video_id']
    # Decided per row, so no state leaks from the previous iteration.
    is_attr = video_id in mot_video_ids
    desc = ''

    if is_attr:
        with open(f'../data/bdd100k-MOT2020/labels/box_track_20/train/{video_id}.json', 'r') as f:
            objects_json = json.load(f)
        seq_len = len(objects_json)

    # label = find_label_by_name(video_id, labels)
    # if label:
    #     desc += label + "\n"

    for k in range(1, MAX_SEGMENTS + 1):

        start_raw = row[f'Answer.{k}start']
        if pd.isnull(start_raw):
            # Segments are filled in order; the first empty start ends the list.
            break
        start_time = str(int(start_raw))

        end_raw = row[f'Answer.{k}end']
        if pd.isnull(end_raw):
            end_time = str(VIDEO_SECONDS)
        else:
            # Clamp to the clip length (previously only the exact value 41 was clamped).
            end_time = str(min(int(end_raw), VIDEO_SECONDS))

        action = row[f'Answer.{k}action']
        if pd.isnull(action):
            continue
        justification = row[f'Answer.{k}justification']
        if not pd.isnull(justification):
            action = action + ' ' + justification

        time_span = convert_to_time_format(start_time, end_time)

        if is_attr:
            # The frame is chosen from the actual start/end second, so the
            # "[Objects at N sec]" tag always matches the frame that was read.
            # A frame that cannot be found yields an empty object list rather
            # than a None that would make ' '.join raise TypeError.
            start_seq_items = find_dict_by_name(_frame_name(video_id, seq_len, start_time), objects_json) or ''
            end_seq_items = find_dict_by_name(_frame_name(video_id, seq_len, end_time), objects_json) or ''

            seq_str = ' '.join([
                time_span,
                action,
                f'[Objects at {start_time} sec]',
                start_seq_items,
                f'[Objects at {end_time} sec]',
                end_seq_items,
                '\n',
            ])
        else:
            seq_str = ' '.join([time_span, action, '\n'])

        desc += seq_str

    video_all.append({
        "video_id": video_id,
        "is_attr": is_attr,
        "desc": desc,
    })

In [452]:
import json

# Save
# with open('../BDD-captions.json', 'w', encoding="utf-8") as f:
#     json.dump(video_all, f, ensure_ascii=False, indent=2)

# Load
# with open('../BDD-captions.json', 'r', encoding="utf-8") as f:
#     video_all = json.load(f)