# Generate dataset csv

Download the captions csv from: [Cap3D](https://huggingface.co/datasets/tiange/Cap3D) and place it inside the `root_directory` of the blenderproc generated dataset.

In [1]:
# %load_ext cudf.pandas
import autoroot
import autorootcwd
import glob
import os
import random
import h5py
import numpy as np
import json
import pickle
from typing import List
from pathlib import Path
from matplotlib import pyplot as plt

import pandas as pd

In [2]:
random.seed(0)

In [3]:
# the following structure is assumed
root_directory = Path("data/blenderproc")
objs_directory = root_directory / "hf-objaverse-v4"
objaverse_manual_captions = root_directory / "objaverse_cat_descriptions_64k.json"
captions_path = root_directory / "Cap3D_automated_Objaverse_full.csv"
objaverse_split_all = root_directory / "objaverse_split_all.csv"
abo_renderings = root_directory / "abo_v4"
abo_human = root_directory / "Cap3D_human_ABO.pkl" # not reliable since variable length
abo_automated = root_directory / "misc_Cap3D_automated_ABO.csv"
abo_classes = root_directory / "abo_metadata/metadata/abo_classes_3d.txt"
abo_captions_csv = root_directory / "abo_captions.csv"
abo_split_all = root_directory / "abo_split_all.csv"
abo_multiple_renderings = root_directory / "abo_multiple"
objaverse_hard_exclude = root_directory / "objaverse_hard_exclude.txt"
abo_objaverse_all = root_directory / "abo_objaverse_all.csv"

## ABO Captions exploration

In [5]:
with open(abo_human, "rb") as f:
    abo_human = pickle.load(f)
    print(abo_human)



In [6]:
captions_lengths = {}
for key, value in abo_human.items():
    if captions_lengths.get(len(value), None) is None:
        captions_lengths[len(value)] = 0
    captions_lengths[len(value)] += 1
print(captions_lengths)

{3: 1424, 1: 1586, 2: 1714, 5: 478, 4: 947, 8: 20, 7: 67, 6: 208, 10: 2, 9: 1}


In [7]:
print(len(abo_human))

6447


In [5]:
abo_automated_captions = pd.read_csv(abo_automated, names=["uid", "caption"])
print(abo_automated_captions.head(10))

          uid                                            caption
0  B01DJH742M  A rectilinear cabinet with a slanted roof and ...
1  B07B4N273D  3D model of a two-seater chair with a curved b...
2  B07DBK7KH8  3D model of a table lamp featuring a spherical...
3  B0842GQ5WJ  A cylinder with a spherical base and a spheric...
4  B07K7LVRSM  3D model of a trestle table and TV stand with ...
5  B07HKF28KS  A 3D lamp with a spherical shade, metal arm, s...
6  B07KSRJG2T  3D model of a rectangular bed frame with a tuf...
7  B07K7LVGHH  3D coffee table model with a rectangular top a...
8  B073P46SHL  A floor lamp with a spherical base, curved arm...
9  B084HV7BC3  A 3D skateboard with a concave bottom and conv...


In [7]:
len(abo_automated_captions)

6440

In [10]:
# print len of abo_human and abo_automated_captions union
print(len(set(abo_human.keys()) | set(abo_automated_captions.iloc[:, 0])))

6447


In [7]:
abo_classes_dict = {}
with open(abo_classes, "r") as f:
    for line in f:
        line = line.strip().split(',')
        abo_classes_dict[line[0]] = line[1]
print(len(abo_classes_dict))

7953


In [8]:
print(abo_classes_dict)

{'B07Y5Y1262': 'table', 'B07MJL3LW8': 'table', 'B07DBHC39X': 'lamp', 'B07HKGKSMM': 'lamp', 'B07B519FDQ': 'bed', 'B07QDNRBSX': 'chair', 'B07MQF8SF6': 'table', 'B07B4FZNM4': 'sofa', 'B07MSWZ5VP': 'bed', 'B002CM3J2A': 'dining set', 'B07VSTTTHV': 'chair', 'B084W2WJH2': 'chair', 'B07B51934S': 'rug', 'B07L8T38ND': 'sofa', 'B07JJYZ1LZ': 'chair', 'B07B8VJQG1': 'shelf', 'B075ZBW1QM': 'table', 'B074VLT2MB': 'pillow', 'B07BWLJR8S': 'sofa', 'B07R3TVZRV': 'chair', 'B07B4SF24F': 'rug', 'B07B8PXTVL': 'clock', 'B075YNHLD5': 'picture frame or painting', 'B07HSHG85J': 'rug', 'B0735X2M1J': 'chair', 'B003QTD4Y6': 'mouse pad', 'B0714MJKZW': 'rug', 'B07HK8YHB7': 'lamp', 'B07DBDY58B': 'lamp', 'B07QPZJ947': 'table', 'B07DTLWJ8F': 'lamp', 'B082QD61D9': 'chair', 'B07C8WMV19': 'pillow', 'B075HWMCDF': 'plant or flower pot', 'B0825DG9NH': 'lamp', 'B072Q1BNCQ': 'chair', 'B07HSF15FW': 'cabinet', 'B075QMGMYP': 'bed', 'B07KV3HBMR': 'table', 'B07FFWSBBF': 'dresser', 'B07HSHHBR9': 'rug', 'B07JYN8DBM': 'table', 'B07HSLQY

In [9]:
extra_uids = set(abo_classes_dict.keys()) - set(abo_automated_captions.iloc[:, 0].values)
print(len(extra_uids))

1513


In [10]:
# dict from abo_classes_dict with keys from extra_uids set
extra_classes = {k: abo_classes_dict[k] for k in extra_uids}
print(extra_classes)

{'B073P5YXCQ': 'picture frame or painting', 'B0735RHGT9': 'rug', 'B07NXTJV7L': 'rug', 'B073P13J4L': 'picture frame or painting', 'B072PW7DXQ': 'rug', 'B071ZJ6BGN': 'rug', 'B073P1LPPM': 'picture frame or painting', 'B073P1H6CN': 'picture frame or painting', 'B073P13P8T': 'picture frame or painting', 'B073WQ8JNK': 'picture frame or painting', 'B073P5KYCM': 'picture frame or painting', 'B07B4D88DY': 'sofa', 'B073P6BG6M': 'picture frame or painting', 'B073P51211': 'picture frame or painting', 'B075Z9CWQP': 'rug', 'B073P1LP93': 'picture frame or painting', 'B0732HXZQW': 'rug', 'B07HSNF921': 'rug', 'B073P5QKD9': 'picture frame or painting', 'B0735TH3VY': 'rug', 'B073P6HGTV': 'picture frame or painting', 'B07B4ZQVXT': 'rug', 'B07B52FS9W': 'rug', 'B07K8Z4VGB': 'sofa', 'B07B4WH25K': 'rug', 'B07HSN8783': 'rug', 'B0732DWG63': 'rug', 'B073P1BVBW': 'picture frame or painting', 'B073P6L988': 'picture frame or painting', 'B07HSL66QT': 'rug', 'B073NZPN2W': 'picture frame or painting', 'B07Q9TCY28': 'r

In [11]:
# concat extra_classes to abo_automated_captions
extra_classes_df = pd.DataFrame(extra_classes.items(), columns=["uid", "caption"])
abo_automated_captions = pd.concat([abo_automated_captions, extra_classes_df])

In [12]:
abo_automated_captions.head(10)

Unnamed: 0,uid,caption
0,B01DJH742M,A rectilinear cabinet with a slanted roof and ...
1,B07B4N273D,3D model of a two-seater chair with a curved b...
2,B07DBK7KH8,3D model of a table lamp featuring a spherical...
3,B0842GQ5WJ,A cylinder with a spherical base and a spheric...
4,B07K7LVRSM,3D model of a trestle table and TV stand with ...
5,B07HKF28KS,"A 3D lamp with a spherical shade, metal arm, s..."
6,B07KSRJG2T,3D model of a rectangular bed frame with a tuf...
7,B07K7LVGHH,3D coffee table model with a rectangular top a...
8,B073P46SHL,"A floor lamp with a spherical base, curved arm..."
9,B084HV7BC3,A 3D skateboard with a concave bottom and conv...


In [13]:
# save abo_automated_captions to csv
abo_captions_path = root_directory / "abo_captions.csv"
abo_automated_captions.to_csv(abo_captions_path, index=False)

## Generate splits

In [4]:
def find_hdf5_files(root_dir, relative_dir=None):
    # Find all .hdf5 files in the directory and subdirectories
    file_paths = glob.glob(os.path.join(root_dir, "**", "*.hdf5"), recursive=True)
    # Convert to relative paths
    if relative_dir is None:
        relative_dir = root_dir
    relative_paths = [Path(file).relative_to(relative_dir) for file in file_paths]
    return relative_paths


def create_train_test_split(
    paths: List[Path], train_ratio=None, test_objs=250, shuffle=True
):
    uids = set([path.parent.name for path in paths])
    # Shuffle the paths
    if shuffle:
        random.shuffle(paths)
    # create train test split of paths based on test_objs uids. test_objs num of uids should be in the test set.
    if train_ratio is not None:
        test_objs = int(len(uids) * (1 - train_ratio))
    test_uids = set(random.sample(sorted(uids), test_objs))
    train_paths = [path for path in paths if path.parent.name not in test_uids]
    test_paths = [path for path in paths if path.parent.name in test_uids]
    return train_paths, test_paths


def create_all_split(paths, captions_df, save_path, automated_captions_df=None):
    # df of all uids and captions
    df_all = pd.DataFrame(paths, columns=["path"])
    df_all["uid"] = df_all["path"].apply(lambda x: Path(x).parent.name)
    # Merge df_all DataFrame with captions_df DataFrame on 'uid' column
    all_merged_df = pd.merge(df_all, captions_df, on="uid")
    if automated_captions_df is None:
        automated_captions_df = captions_df
    # Merge all_merged_df DataFrame with automated_captions_df DataFrame on 'uid' column and keep only `caption` column of automated_captions_df renamed as `auto_caption`
    all_merged_df = pd.merge(all_merged_df, automated_captions_df[['uid', 'caption']], on='uid', suffixes=('', '_auto'))
    all_merged_df.rename(columns={'caption_auto': 'auto_caption'}, inplace=True)
    print(f"creating split at {save_path}")
    all_merged_df.to_csv(save_path, index=False)
    return all_merged_df

In [5]:
# import abo captions
abo_captions_df = pd.read_csv(abo_captions_csv)

In [6]:
objaverse_uid_captions = {}
# read objaverse json file
with open(objaverse_manual_captions, "r") as f:
    objaverse_manual_captions = json.load(f)
    for uid, values in objaverse_manual_captions.items():
        objaverse_uid_captions[uid] = values["description"]
print(len(objaverse_uid_captions))

63999


Ensure that all bad UIDs are removed from renderings

In [7]:
# Find all folders containing HDF5 files
abo_paths = find_hdf5_files(abo_renderings, root_directory)
print(f"Found {len(abo_paths)} HDF5 files. Example: {abo_paths[0]}")

Found 23859 HDF5 files. Example: abo_v4/B/B07JY4H14B/0.hdf5


In [8]:
# Find all folders containing HDF5 files
objaverse_paths = find_hdf5_files(objs_directory, root_directory)
print(f"Found {len(objaverse_paths)} HDF5 files. Example: {objaverse_paths[0]}")

Found 174321 HDF5 files. Example: hf-objaverse-v4/000-142/45ee52b34d314255a87af6f4d0cf7b27/0.hdf5


In [9]:
uids_generated = set([path.parent.name for path in objaverse_paths])
# keep only those uids in objaverse_uid_captions that are there in uids_generated
objaverse_uid_captions = {k: objaverse_uid_captions[k] for k in uids_generated}

In [10]:
objaverse_captions_df = pd.DataFrame(
    objaverse_uid_captions.items(), columns=["uid", "caption"]
)
objaverse_captions_path = root_directory / "objaverse_captions.csv"
# objaverse_captions_df.to_csv(objaverse_captions_path, index=False)

In [11]:
objaverse_auto_captions_df = pd.read_csv(captions_path, names=["uid", "caption"])

In [12]:
abo_all_df = create_all_split(abo_paths, abo_captions_df, abo_split_all)

creating split at data/blenderproc/abo_split_all.csv


In [13]:
objaverse_all_df = create_all_split(objaverse_paths, objaverse_captions_df, objaverse_split_all, objaverse_auto_captions_df)

creating split at data/blenderproc/objaverse_split_all.csv


In [14]:
objaverse_tags = {}
for uid, values in objaverse_uid_captions.items():
    objaverse_tags[uid] = objaverse_manual_captions[uid]["category"]

In [15]:
# category numbers
category_counts = {}
for uid, category in objaverse_tags.items():
    if category not in category_counts:
        category_counts[category] = 0
    category_counts[category] += 1

# sort category_counts in ascending order
category_counts = dict(sorted(category_counts.items(), key=lambda x: x[1]))

In [16]:
print(category_counts)

{'clam': 1, 'scarecrow': 1, 'topographical_map': 1, 'medallion': 1, 'fly': 1, 'fries': 1, 'cathedral': 1, 'lift': 1, 'control_panel': 1, 'tennis_shoe': 1, 'mandarin_orange': 1, 'people': 1, 'parking_meter': 1, 'waffle': 1, 'donuts': 1, 'belt_buckle': 1, 'otter': 1, 'gas_station': 1, 'spike': 1, 'atv': 1, 'foundation': 1, 'cookies': 1, 'french_fries': 1, 'demon': 1, 'steps': 1, 'crab': 1, 'sclupture': 1, 'nutcracker': 1, 'corkboard': 1, 'decor': 1, 'tables': 1, 'pencil_box': 1, 'fossils': 1, 'noodles': 1, 'juice': 1, 'cantaloup': 1, 'hot_dog': 1, 'cauliflower': 1, 'beachball': 1, 'boom_microphone': 1, 'gingerbread_house': 1, 'part': 1, 'chai': 1, 'bouquet': 1, 'crystal_ball': 1, 'bown': 1, 'drawing': 1, 'monument': 1, 'toothpick': 1, 'cheetah': 1, 'relief_sculpture': 1, 'ice': 1, 'axe_head': 1, 'ipod': 1, 'cartoon_character.': 1, 'tunnel': 1, 'dishtowel': 1, 'igloo': 1, 'chocolate_milk': 1, 'jawbone': 1, 'stat': 1, 'metal': 1, 'pond': 1, 'vegetable': 1, 'wagon_wheel': 1, 'missle': 1, 'c

In [17]:
# list of categories with sum of category counts till 500 objects
novel_categories = set()
cnt = 0
for category, count in category_counts.items():
    if cnt >= 500:
        break
    novel_categories.add(category)
    cnt += count

In [18]:
print(len(novel_categories))
print(novel_categories)

285
{'wrestling_singlet', 'croissant', 'tote', 'sugar_bowl', 'hockey_stick', 'skate_park', 'soda', 'storage_box', 'skateboard_half_pipe', 'bulldog', 'skate', 'hoverboard', 'sunhat', 'floor_plan', 'chai', 'doorframe', 'demon', 'rasp', 'spade', 'specimen', 'pickax', 'belt_buckle', 'fries', 'hot_air_balloon', 'hand_axe', 'flare', 'turntable', 'lights', 'gas_station', 'stethoscope', 'goose', 'spectacles', 'juice', 'puzzle', 'crab', 'monument', 'trash', 'dollar', 'archery', 'mud', 'toilet_tissue_holder', 'hospital_bed', 'shelving_unit', 'human_head', 'rollerblade', 'ketchup', 'electrical_box', 'pencil_box', 'cheetah', 'cathedral', 'drink', 'parachute', 'filing_cabinet', 'filter', 'sail', 'clam', 'noodles', 'cartoon_character.', 'bullets', 'rocket_launcher', 'razor', 'dynamite', 'lawn_chair', 't-shirt', 'swimming_pool', 'dart', 'cigar_box', 'cigarettes', 'plug', 'slingshot', 'pillows', 'quiver', 'bunk_bed', 'bowling_pin', 'gong', 'mill', 'cellphone', 'spike', 'handkerchief', 'ammunition', 's

In [19]:
# save novel categories to a text file
novel_categories_path = root_directory / "novel_categories.txt"
with open(novel_categories_path, "w") as f:
    for item in novel_categories:
        f.write("%s\n" % item)

In [20]:
novel_category_paths = []
seen_category_paths = []
for path in objaverse_paths:
    uid = path.parent.name
    if objaverse_tags[uid] in novel_categories:
        novel_category_paths.append(path)
    else:
        seen_category_paths.append(path)

In [21]:
print(len(novel_category_paths), len(seen_category_paths))

1503 172818


In [22]:
objaverse_train_paths, objaverse_test_paths = create_train_test_split(seen_category_paths)

In [23]:
print(len(objaverse_train_paths), len(objaverse_test_paths))

172068 750


In [24]:
# append novel_category_paths to test_paths
objaverse_test_paths += novel_category_paths

In [25]:
print(len(objaverse_test_paths))

2253


In [26]:
# select objaverse_train_paths from objaverse_all_df to create objaverse_train_df
objaverse_train_df = objaverse_all_df[objaverse_all_df["path"].isin(objaverse_train_paths)]
# objaverse_train_df save to csv
objaverse_train_path = root_directory / "train_objaverse.csv"
objaverse_train_df.to_csv(objaverse_train_path, index=False)

In [27]:
# select objaverse_test_paths from objaverse_all_df to create objaverse_test_df
objaverse_test_df = objaverse_all_df[objaverse_all_df["path"].isin(objaverse_test_paths)]
# objaverse_test_df save to csv
objaverse_test_path = root_directory / "test_objaverse.csv"
objaverse_test_df.to_csv(objaverse_test_path, index=False)

save abo train, test csv's

In [28]:
abo_train_paths, abo_test_paths = create_train_test_split(abo_paths)
abo_train_df = abo_all_df[abo_all_df["path"].isin(abo_train_paths)]
abo_train_path = root_directory / "train_abo.csv"
abo_train_df.to_csv(abo_train_path, index=False)

abo_test_df = abo_all_df[abo_all_df["path"].isin(abo_test_paths)]
abo_test_path = root_directory / "test_abo.csv"
abo_test_df.to_csv(abo_test_path, index=False)

In [5]:
# load abo splits (only if above step is done and train_abo.csv is created)
abo_train_path = root_directory / "train_abo.csv"
abo_train_df = pd.read_csv(abo_train_path)
abo_test_path = root_directory / "test_abo.csv"
abo_test_df = pd.read_csv(abo_test_path)

Merge df all individual split df's

In [29]:
# merge abo and objaverse splits to get final splits
final_train_df = pd.concat([abo_train_df, objaverse_train_df])
final_test_df = pd.concat([abo_test_df, objaverse_test_df])

In [5]:
abo_all_df = pd.read_csv(abo_split_all)
objaverse_all_df = pd.read_csv(objaverse_split_all)
merged_df = pd.concat([abo_all_df, objaverse_all_df])
merged_df.to_csv(abo_objaverse_all, index=False)

In [30]:
# create a column is_novel in the final_test_df where is_novel is true if the final_test_df["path"] is in novel_category_paths
final_test_df["is_novel"] = final_test_df["path"].apply(
    lambda x: x in novel_category_paths
)

In [31]:
final_test_df.head(10)

Unnamed: 0,path,uid,caption,auto_caption,is_novel
57,abo_v4/B/B07QTKCKVB/0.hdf5,B07QTKCKVB,3D model of a cubic bookshelf with a corner de...,3D model of a cubic bookshelf with a corner de...,False
58,abo_v4/B/B07QTKCKVB/2.hdf5,B07QTKCKVB,3D model of a cubic bookshelf with a corner de...,3D model of a cubic bookshelf with a corner de...,False
59,abo_v4/B/B07QTKCKVB/1.hdf5,B07QTKCKVB,3D model of a cubic bookshelf with a corner de...,3D model of a cubic bookshelf with a corner de...,False
81,abo_v4/B/B075X342PB/0.hdf5,B075X342PB,"Ottoman with a square base, cuboid seat, and s...","Ottoman with a square base, cuboid seat, and s...",False
82,abo_v4/B/B075X342PB/2.hdf5,B075X342PB,"Ottoman with a square base, cuboid seat, and s...","Ottoman with a square base, cuboid seat, and s...",False
83,abo_v4/B/B075X342PB/1.hdf5,B075X342PB,"Ottoman with a square base, cuboid seat, and s...","Ottoman with a square base, cuboid seat, and s...",False
111,abo_v4/B/B07MBFDKXB/0.hdf5,B07MBFDKXB,Swivel bar stool with a buttoned upholstered s...,Swivel bar stool with a buttoned upholstered s...,False
112,abo_v4/B/B07MBFDKXB/2.hdf5,B07MBFDKXB,Swivel bar stool with a buttoned upholstered s...,Swivel bar stool with a buttoned upholstered s...,False
113,abo_v4/B/B07MBFDKXB/1.hdf5,B07MBFDKXB,Swivel bar stool with a buttoned upholstered s...,Swivel bar stool with a buttoned upholstered s...,False
144,abo_v4/B/B07HSJQ3DB/0.hdf5,B07HSJQ3DB,"Three-tiered metal cart with shelves, handles ...","Three-tiered metal cart with shelves, handles ...",False


In [32]:
# save final splits
final_train_save = root_directory / "train.csv"
final_test_save = root_directory / "test.csv"
final_train_df.to_csv(final_train_save, index=False)
final_test_df.to_csv(final_test_save, index=False)