In [64]:
%load_ext autoreload
%autoreload 2
import os
import torch
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# NOTE: Need this to load tensors from .pkl on a CPU machine (.pkl tensors were originally saved on GPU)
# The last argument is the deserialization hook: whatever location tag was stored
# with the data, it returns `x.cpu()`, so loaded tensors end up on the CPU.
# NOTE: Run this cell only once at the beginning of the notebook
# NOTE(review): presumably every run registers one more handler - TODO confirm.
torch.serialization.register_package(0, lambda x: x.device.type, lambda x, _: x.cpu())

In [65]:
# base dataset path
# The trailing "/" matters: every path below (and in later cells) is built by plain
# string concatenation, not os.path.join.
DATASET_ENTITY_COUNT = 20_000 # only used to build the dataset folder name below
DATASET_BASE_PATH = f"/Users/yavuz/data/LAION-{DATASET_ENTITY_COUNT}-4-modalities/"

METADATA_PATH = DATASET_BASE_PATH + "metadata.parquet"
IMAGES_PATH = DATASET_BASE_PATH + "images/"

vector_path = DATASET_BASE_PATH + "vectors/"

# Fail fast if the expected inputs are missing.
# NOTE: a later cell renames metadata.parquet, so the first assertion fails if the
# notebook is re-run after that cell has executed.
assert os.path.exists(METADATA_PATH)
assert os.path.exists(IMAGES_PATH)
assert os.path.exists(vector_path)

In [66]:
# audio+video paths
AUDIO_PATH = DATASET_BASE_PATH + "audios/"
VIDEO_PATH = DATASET_BASE_PATH + "videos/"

In [67]:
import re
import pickle

def get_vectors_from_pkl(embeddings_path):
    """Unpickle the object stored at ``embeddings_path`` and report its size.

    Args:
        embeddings_path: Path of the pickle file to read.

    Returns:
        The unpickled object (it must support ``len``).
    """
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only point this at files you produced yourself.
    with open(embeddings_path, "rb") as handle:
        loaded = pickle.load(handle)
    count = len(loaded)
    print(f"Loaded {count} vectors from {embeddings_path}")
    return loaded

def extract_number_from_filename(filename, pattern=r"audio-tangoflux-(\d+).wav"):
    """Return the integer captured by the first group of ``pattern`` in ``filename``.

    Args:
        filename: Name (or path) to search.
        pattern: Regular expression whose first group captures the number.
            NOTE(review): the "." in the default is unescaped, so it matches any
            character, not only a literal dot.

    Returns:
        The captured digits converted to ``int``.

    Raises:
        ValueError: If ``pattern`` is not found anywhere in ``filename``.
    """
    found = re.search(pattern, filename)
    if found is None:
        raise ValueError(f"Filename {filename} does not match the expected pattern.")
    return int(found.group(1))

def print_num_files_in_directory(directory, extension):
    """Count and print how many entries of ``directory`` end with ``extension``.

    The comparison is a plain suffix test on the entry name (``str.endswith``),
    so ``extension`` is matched with or without a leading dot.

    Returns:
        The number of matching entries.
    """
    count = sum(1 for entry in os.listdir(directory) if entry.endswith(extension))
    print(f"Number of {extension} files in {directory}: {count}")
    return count

In [71]:
# Load the audio embeddings and derive the integer id of every embedded file
# from its key (audio-tangoflux-<id>.wav).
print_num_files_in_directory(AUDIO_PATH, "wav")
audio_dict = get_vectors_from_pkl(AUDIO_PATH + "audio_embeddings.pkl")
audio_ids = [extract_number_from_filename(filename, r"audio-tangoflux-(\d+).wav") for filename in audio_dict.keys()]
audio_ids_set = set(audio_ids)
# no two keys may map to the same id
assert len(audio_ids_set) == len(audio_ids)

# check that every embedded audio id has a matching .wav file in the folder
for i in audio_ids_set:
    assert os.path.exists(os.path.join(AUDIO_PATH, f"audio-tangoflux-{i}.wav")), f"Audio file {i} does not exist"

len(audio_ids_set)

Number of wav files in /Users/yavuz/data/LAION-20000-4-modalities/audios/: 9855
Loaded 9855 vectors from /Users/yavuz/data/LAION-20000-4-modalities/audios/audio_embeddings.pkl


9855

In [72]:
# Load the video embeddings and derive the integer id of every embedded file
# from its key (video-<id>.mp4).
# NOTE: the file count and the embedding count need not agree (the recorded run
# shows 9305 files but 9304 embeddings); only embedded ids are used below.
print_num_files_in_directory(VIDEO_PATH, "mp4")
video_dict = get_vectors_from_pkl(VIDEO_PATH + "video_embeddings.pkl")
video_ids = [extract_number_from_filename(filename, r"video-(\d+).mp4") for filename in video_dict.keys()]
video_ids_set = set(video_ids)
# no two keys may map to the same id
assert len(video_ids_set) == len(video_ids)

# check that every embedded video id has a matching .mp4 file in the folder
for i in video_ids_set:
    assert os.path.exists(os.path.join(VIDEO_PATH, f"video-{i}.mp4")), f"Video file {i} does not exist"

len(video_ids_set)

Number of mp4 files in /Users/yavuz/data/LAION-20000-4-modalities/videos/: 9305
Loaded 9304 vectors from /Users/yavuz/data/LAION-20000-4-modalities/videos/video_embeddings.pkl


9304

In [73]:
# ids that have both an audio and a video embedding, in ascending order
valid_ids_set = audio_ids_set & video_ids_set
valid_ids = sorted(valid_ids_set)
print(f"Number of valid ids: {len(valid_ids)}")

Number of valid ids: 9290


In [74]:
orig_metadata = pd.read_parquet(METADATA_PATH)
# the last index label + 1 must equal the row count (consistent with a 0..N-1 index)
num_orig_entities = orig_metadata.index[-1] + 1
assert num_orig_entities == len(orig_metadata)

orig_entity_ids = set(range(num_orig_entities))

# every valid id must refer to a row of the original metadata
assert not (valid_ids_set - orig_entity_ids)

# ids present in the metadata but missing an audio or video embedding, ascending
ids_to_remove = sorted(orig_entity_ids - valid_ids_set)
ids_to_remove

[47,
 140,
 183,
 195,
 210,
 261,
 267,
 332,
 374,
 401,
 424,
 428,
 456,
 517,
 530,
 560,
 613,
 620,
 631,
 718,
 792,
 798,
 942,
 960,
 1021,
 1023,
 1155,
 1175,
 1181,
 1223,
 1226,
 1417,
 1506,
 1512,
 1563,
 1650,
 1758,
 1851,
 1889,
 1961,
 1981,
 1988,
 1993,
 2003,
 2068,
 2201,
 2334,
 2603,
 2604,
 2635,
 2683,
 2737,
 2863,
 2966,
 3185,
 3271,
 3292,
 3402,
 3434,
 3449,
 3455,
 3494,
 3550,
 3568,
 3855,
 3863,
 4173,
 4208,
 4310,
 4311,
 4317,
 4319,
 4371,
 4414,
 4521,
 4532,
 4534,
 4536,
 4537,
 4539,
 4570,
 4643,
 4821,
 4822,
 4844,
 4871,
 4877,
 5029,
 5095,
 5108,
 5162,
 5269,
 5311,
 5414,
 5433,
 5759,
 5853,
 5883,
 5985,
 5988,
 5990,
 5994,
 6121,
 6126,
 6312,
 6410,
 6439,
 6490,
 6503,
 6793,
 6827,
 6921,
 6930,
 6980,
 6987,
 7071,
 7260,
 7281,
 7333,
 7359,
 7441,
 7692,
 7725,
 7802,
 7825,
 7866,
 7877,
 8019,
 8054,
 8057,
 8245,
 8563,
 8606,
 8712,
 8768,
 8787,
 8932,
 9079,
 9155,
 9158,
 9159,
 9160,
 9161,
 9162,
 9163,
 9164,
 91

In [75]:
# save cleaned-id (for two modality dataset): the row label of each entity becomes
# an explicit first column.
# Guarded so the cell can be re-run: DataFrame.insert raises ValueError when the
# column already exists.
if "2-modality-id" not in orig_metadata.columns:
    orig_metadata.insert(0, "2-modality-id", orig_metadata.index)
# rename column "index" to "orig-id" (a no-op when it has already been renamed)
orig_metadata = orig_metadata.rename(columns={"index": "orig-id"})
orig_metadata

Unnamed: 0,2-modality-id,orig-id,SAMPLE_ID,URL,TEXT,HEIGHT,WIDTH,LICENSE,NSFW,similarity
0,0,2,3.372497e+12,https://farm1.staticflickr.com/784/40182677504...,Anhui Mountains,800.0,514.0,?,UNLIKELY,0.316512
1,1,3,3.820200e+11,https://t2.ftcdn.net/jpg/00/58/35/35/240_F_583...,Acute pain in a woman knee,257.0,240.0,?,UNLIKELY,0.344278
2,2,5,2.179119e+12,https://i.pinimg.com/236x/03/38/05/0338055833e...,Essentials Barnwood 70-inch TV Media Stand,236.0,236.0,?,UNLIKELY,0.332799
3,3,7,1.727450e+11,http://cdn.pastemagazine.com/www/articles/2011...,Ben Affleck Could Be Latest Addition To <em>Th...,320.0,320.0,?,UNLIKELY,0.353303
4,4,8,3.138980e+12,https://chambermaster.blob.core.windows.net/im...,Minnesota Department of Transportation,200.0,112.0,?,UNLIKELY,0.341640
...,...,...,...,...,...,...,...,...,...,...
12619,12619,19994,4.063788e+12,https://publicauthordotcom.files.wordpress.com...,Letters over the Wall cover,205.0,300.0,?,UNLIKELY,0.303382
12620,12620,19995,1.530866e+12,https://www.digsdigs.com/photos/sweet-shabby-c...,Http Www Digsdigs Com 33 Sweet Shabby Chic Bed...,480.0,486.0,?,UNLIKELY,0.352294
12621,12621,19997,4.247173e+12,https://goalstudio.com/web/product/medium/2020...,TOTTENHAM 940 BALL CAP - GOLD,1100.0,1460.0,?,UNLIKELY,0.388634
12622,12622,19998,1.505120e+11,https://thumbs.dreamstime.com/m/clown-toy-colo...,Clown toy color vector illustration Royalty Fr...,92.0,130.0,?,UNLIKELY,0.349995


In [76]:
# spot check: the row at position 47 before filtering (id 47 is the first id to be removed)
orig_metadata.iloc[47]

2-modality-id                                                   47
orig-id                                                         74
SAMPLE_ID                                           135817000493.0
URL              http://rlv.zcache.ca/im_killing_you_off_next_b...
TEXT                       I'm Killing You Off Next Bumper Sticker
HEIGHT                                                       324.0
WIDTH                                                        324.0
LICENSE                                                          ?
NSFW                                                      UNLIKELY
similarity                                                0.305039
Name: 47, dtype: object

In [77]:
# keep only the rows at the valid positions and renumber them from 0
four_modality_metadata = orig_metadata.iloc[valid_ids].reset_index(drop=True)
four_modality_metadata.iloc[47]

2-modality-id                                                   48
orig-id                                                         76
SAMPLE_ID                                             8565018316.0
URL              https://www.etrailer.com/Merchant2/graphics/00...
TEXT             Derale 2005 Jeep Grand Cherokee Transmission C...
HEIGHT                                                       123.0
WIDTH                                                        150.0
LICENSE                                                          ?
NSFW                                                      UNLIKELY
similarity                                                0.309337
Name: 47, dtype: object

In [78]:
# save the new metadata
# Refuse to overwrite an existing output, so a second run cannot clobber it.
if os.path.exists(DATASET_BASE_PATH + "metadata-4-modalities.parquet"):
    raise ValueError("metadata-4-modalities.parquet already exists!")
four_modality_metadata.to_parquet(DATASET_BASE_PATH + "metadata-4-modalities.parquet")

# rename old modality metadata files
# NOTE: old_metadata_path is the same file as METADATA_PATH; once it is renamed,
# the existence assert and the read_parquet(METADATA_PATH) cell earlier in the
# notebook fail on a re-run.
old_metadata_path = DATASET_BASE_PATH + "metadata.parquet"
old_metadata_with_placeholder_path = DATASET_BASE_PATH + "metadata_with_placeholders.parquet"
# each rename is skipped when its source file is absent
if os.path.exists(old_metadata_path):
    os.rename(old_metadata_path, DATASET_BASE_PATH + "OLD-2-modality-metadata.parquet")
if os.path.exists(old_metadata_with_placeholder_path):
    os.rename(old_metadata_with_placeholder_path, DATASET_BASE_PATH + "OLD-2-modality-metadata-with-placeholders.parquet")

In [79]:
# make the dataset consistent
# valid_ids must be ascending: the metadata rows were selected in this order and
# the vector-saving functions below iterate it in the same order.
assert valid_ids == sorted(valid_ids)
valid_ids, ids_to_remove 

([0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  141,
  142,
  143,
  144,
  145,
  146,
  147,
  148,
  149,
  150,
  151,
  152,
  153,
  154,
  155,
  156,
  157,
  158,
  159,
  1

In [80]:
# inspect the ids that will be dropped (displays the full list)
ids_to_remove

[47,
 140,
 183,
 195,
 210,
 261,
 267,
 332,
 374,
 401,
 424,
 428,
 456,
 517,
 530,
 560,
 613,
 620,
 631,
 718,
 792,
 798,
 942,
 960,
 1021,
 1023,
 1155,
 1175,
 1181,
 1223,
 1226,
 1417,
 1506,
 1512,
 1563,
 1650,
 1758,
 1851,
 1889,
 1961,
 1981,
 1988,
 1993,
 2003,
 2068,
 2201,
 2334,
 2603,
 2604,
 2635,
 2683,
 2737,
 2863,
 2966,
 3185,
 3271,
 3292,
 3402,
 3434,
 3449,
 3455,
 3494,
 3550,
 3568,
 3855,
 3863,
 4173,
 4208,
 4310,
 4311,
 4317,
 4319,
 4371,
 4414,
 4521,
 4532,
 4534,
 4536,
 4537,
 4539,
 4570,
 4643,
 4821,
 4822,
 4844,
 4871,
 4877,
 5029,
 5095,
 5108,
 5162,
 5269,
 5311,
 5414,
 5433,
 5759,
 5853,
 5883,
 5985,
 5988,
 5990,
 5994,
 6121,
 6126,
 6312,
 6410,
 6439,
 6490,
 6503,
 6793,
 6827,
 6921,
 6930,
 6980,
 6987,
 7071,
 7260,
 7281,
 7333,
 7359,
 7441,
 7692,
 7725,
 7802,
 7825,
 7866,
 7877,
 8019,
 8054,
 8057,
 8245,
 8563,
 8606,
 8712,
 8768,
 8787,
 8932,
 9079,
 9155,
 9158,
 9159,
 9160,
 9161,
 9162,
 9163,
 9164,
 91

In [81]:
import numpy as np

def save_audio_vectors(audio_dict, valid_ids, save_path):
    """Stack the audio embeddings of ``valid_ids`` and write them to disk.

    Args:
        audio_dict: Lookup keyed by ``audio-tangoflux-<id>.wav`` giving each
            file's embedding.
        valid_ids: Ids to export; row ``r`` of the result belongs to
            ``valid_ids[r]``.
        save_path: Prefix that ``"audio_vectors.npy"`` is appended to by plain
            string concatenation, so a directory must end with a separator.

    Returns:
        The stacked embeddings as a float32 NumPy array, which is also saved to
        ``save_path + "audio_vectors.npy"``.
    """
    stacked = np.array(
        [audio_dict[f"audio-tangoflux-{clip_id}.wav"] for clip_id in valid_ids],
        dtype=np.float32,
    )
    np.save(save_path + "audio_vectors.npy", stacked)
    return stacked

def save_video_vectors(video_dict, valid_ids, save_path):
    """Stack the video embeddings of ``valid_ids`` and write them to disk.

    Args:
        video_dict: Lookup keyed by ``video-<id>.mp4`` giving each file's
            embedding.
        valid_ids: Ids to export; row ``r`` of the result belongs to
            ``valid_ids[r]``.
        save_path: Prefix that ``"video_vectors.npy"`` is appended to by plain
            string concatenation, so a directory must end with a separator.

    Returns:
        The stacked embeddings as a float32 NumPy array, which is also saved to
        ``save_path + "video_vectors.npy"``.
    """
    stacked = np.array(
        [video_dict[f"video-{clip_id}.mp4"] for clip_id in valid_ids],
        dtype=np.float32,
    )
    np.save(save_path + "video_vectors.npy", stacked)
    return stacked

def update_and_save_existing_vectors(valid_ids, old_file_path, new_file_path):
    """Copy the rows listed in ``valid_ids`` from one ``.npy`` file into another.

    Args:
        valid_ids: Row indices to keep, in the order they should appear.
        old_file_path: ``.npy`` file holding the original vectors.
        new_file_path: Destination for the selected rows.

    Returns:
        The selected rows as a float32 NumPy array (also saved to
        ``new_file_path``).
    """
    source_vectors = np.load(old_file_path)
    selected = np.array([source_vectors[row] for row in valid_ids], dtype=np.float32)
    print(f"Saving new vectors to {new_file_path}")
    np.save(new_file_path, selected)
    return selected


In [82]:
# save new, consistent vectors
FOUR_MODALITY_VECTORS_PATH = DATASET_BASE_PATH + "vectors-4-modalities/"

# create the output folder on first use
if not os.path.exists(FOUR_MODALITY_VECTORS_PATH):
    os.makedirs(FOUR_MODALITY_VECTORS_PATH)

# save audio and video vectors, one row per entry of valid_ids
# (the video call is the last expression, so its returned array is what the cell displays)
save_audio_vectors(audio_dict, valid_ids, save_path=FOUR_MODALITY_VECTORS_PATH)
save_video_vectors(video_dict, valid_ids, save_path=FOUR_MODALITY_VECTORS_PATH)

array([[-0.07232673,  0.24127662,  0.00408068, ...,  0.47421062,
        -0.12116038, -0.1425512 ],
       [-0.22986785,  0.2889499 ,  0.35783616, ...,  0.5551165 ,
        -0.02364332, -0.27442387],
       [-0.32830358,  0.02503386,  0.2840059 , ...,  0.24527553,
        -0.01655833, -0.34693652],
       ...,
       [-0.26286885,  0.18625045,  0.03648443, ...,  0.60773325,
        -0.100024  , -0.17280485],
       [ 0.01680909,  0.093916  , -0.15008621, ...,  0.6625208 ,
        -0.01455795,  0.06347294],
       [-0.24877423,  0.10646566,  0.11062509, ...,  0.8477432 ,
        -0.25061142,  0.11786871]], dtype=float32)

In [83]:
# save image vectors: keep only the rows of the existing file listed in valid_ids
update_and_save_existing_vectors(valid_ids,vector_path + "image_vectors.npy", FOUR_MODALITY_VECTORS_PATH + "image_vectors.npy")

# save text vectors: same selection, so rows stay aligned across modalities
update_and_save_existing_vectors(valid_ids,vector_path + "text_vectors.npy", FOUR_MODALITY_VECTORS_PATH + "text_vectors.npy")


Saving new vectors to /Users/yavuz/data/LAION-20000-4-modalities/vectors-4-modalities/image_vectors.npy
Saving new vectors to /Users/yavuz/data/LAION-20000-4-modalities/vectors-4-modalities/text_vectors.npy


array([[-0.05608249, -0.02341046,  0.01316727, ..., -0.0419376 ,
        -0.05870903,  0.04695863],
       [-0.07713753, -0.02388584,  0.01643499, ..., -0.09213278,
         0.01431111,  0.00712732],
       [-0.02392489, -0.00422221,  0.05140388, ..., -0.02613676,
         0.06291696,  0.04244908],
       ...,
       [ 0.00335214,  0.01006563,  0.06203266, ...,  0.0092228 ,
         0.05897678,  0.04107948],
       [-0.03117707, -0.03366623,  0.06108868, ...,  0.00172222,
        -0.00517997,  0.00838777],
       [-0.05676303, -0.01214353,  0.0009351 , ..., -0.053553  ,
        -0.00155859,  0.03433016]], dtype=float32)

In [84]:
# assert consistency of the vectors
# reload from disk so the checks cover what was actually written
audio_vectors = np.load(FOUR_MODALITY_VECTORS_PATH + "audio_vectors.npy")
video_vectors = np.load(FOUR_MODALITY_VECTORS_PATH + "video_vectors.npy")
image_vectors = np.load(FOUR_MODALITY_VECTORS_PATH + "image_vectors.npy")
text_vectors = np.load(FOUR_MODALITY_VECTORS_PATH + "text_vectors.npy")

# one row per valid id in every modality
assert audio_vectors.shape[0] == len(valid_ids)
assert video_vectors.shape[0] == len(valid_ids)
assert image_vectors.shape[0] == len(valid_ids)
assert text_vectors.shape[0] == len(valid_ids)

# everything is stored as float32
assert audio_vectors.dtype == np.float32
assert video_vectors.dtype == np.float32
assert image_vectors.dtype == np.float32
assert text_vectors.dtype == np.float32

# expected second dimension (vector width) of each file
assert audio_vectors.shape[1] == 768
assert video_vectors.shape[1] == 512
assert image_vectors.shape[1] == 768
assert text_vectors.shape[1] == 384

In [85]:
# check the old vectors
# Spot-check the re-indexing against the original files. The indices are specific
# to this dataset: id 0 is kept, and 47 is the first removed id, so old row 48
# must now be new row 47.
old_text_vectors = np.load(vector_path + "text_vectors.npy")
old_image_vectors = np.load(vector_path + "image_vectors.npy")

assert (old_text_vectors[0] == text_vectors[0]).all()
assert (old_image_vectors[0] == image_vectors[0]).all()
assert (old_text_vectors[48] == text_vectors[47]).all() # Note: 47 was removed in new one
assert (old_image_vectors[48] == image_vectors[47]).all() # Note: 47 was removed in new one
# assert (old_text_vectors[-1] == text_vectors[-1]).all() # Note: last item is the same for 150 dataset but not 20k
# assert (old_image_vectors[-1] == image_vectors[-1]).all() # Note: last item is the same for 150 dataset but not 20k

In [86]:
# Now, we update the raw data (images, audios, videos) to be consistent with the ids
import shutil
import glob
from tqdm import tqdm

In [87]:
# update images to be consistent with the ids
from src.dataset_processing.data_download import move_files

def move_invalid_images_to_subfolder(ids_to_remove, images_path):
    """Move the image and JSON sidecar of every removed id into ``invalid_images/``.

    Files are located as ``<images_path>/<shard>/<shard><index>.jpg`` (plus a
    ``.json`` file with the same stem), where ``shard`` is ``id // 10000``
    zero-padded to 5 digits and ``index`` is ``id % 10000`` zero-padded to 4.
    Both files are moved to ``<images_path>/<shard>/invalid_images/``.

    Args:
        ids_to_remove: Integer ids whose image files should be set aside.
        images_path: Root directory containing the shard folders.

    Raises:
        AssertionError: If a source file is missing or a destination file
            already exists. Ids processed before the failing one stay moved.
    """
    for i in ids_to_remove:
        # get name of the file for the image
        shard = str(i // 10000).zfill(5)
        index = str(i % 10000).zfill(4)
        stem = f"{shard}{index}"
        shard_directory = os.path.join(images_path, shard)

        # Build both names from the stem. Deriving the JSON path with
        # str.replace(".jpg", ".json") would also rewrite any ".jpg" that
        # happens to occur in a directory name.
        image_path = os.path.join(shard_directory, stem + ".jpg")
        json_path = os.path.join(shard_directory, stem + ".json")

        # assert the file exists
        assert os.path.exists(image_path), f"Image {image_path} does not exist."
        assert os.path.exists(json_path), f"Image {json_path} does not exist."

        # move to placeholder directory
        invalid_directory = os.path.join(shard_directory, "invalid_images")
        os.makedirs(invalid_directory, exist_ok=True)

        new_image_path = os.path.join(invalid_directory, stem + ".jpg")
        new_json_path = os.path.join(invalid_directory, stem + ".json")

        # assert that the new image path does not exist
        assert not os.path.exists(new_image_path), f"Image {new_image_path} already exists."
        assert not os.path.exists(new_json_path), f"Image {new_json_path} already exists."

        os.rename(image_path, new_image_path)
        os.rename(json_path, new_json_path)
    print("Moved images without audio/video to invalid_images/ subfolders")

In [88]:
# spot check: id 9155 is scheduled for removal
9155 in ids_to_remove

True

In [89]:
# move invalid images to subfolder
# (renames files on disk; a second run fails its "does not exist" assertion)
move_invalid_images_to_subfolder(ids_to_remove, IMAGES_PATH)

# rename files to be sequentially increasing, to be consistent with everything else
# NOTE(review): move_files comes from src.dataset_processing.data_download and is
# not visible here - confirm it ignores the invalid_images/ subfolders.
move_files(IMAGES_PATH)

Moved images without audio/video to invalid_images/ subfolders


100%|██████████| 9290/9290 [00:03<00:00, 2402.15it/s]


In [90]:
# scratch check of the <i>.mp4 naming scheme that move_videos renames files to
path = os.path.join(VIDEO_PATH, f"{47}.mp4")
path

'/Users/yavuz/data/LAION-20000-4-modalities/videos/47.mp4'

In [91]:
# update videos to be consistent with the ids (i.e. sequentially increasing ids)
def move_invalid_videos_to_subfolder(ids_to_remove, videos_path):
    """Move ``video-<id>.mp4`` of every removed id into ``invalid_videos/``.

    Ids without a file in ``videos_path`` are skipped. The ``invalid_videos``
    folder is only created once there is a file to move.

    Raises:
        AssertionError: If the destination file already exists.
    """
    quarantine_dir = os.path.join(videos_path, "invalid_videos")
    for invalid_id in ids_to_remove:
        file_name = f"video-{invalid_id}.mp4"
        source = os.path.join(videos_path, file_name)
        if not os.path.exists(source):
            continue
        os.makedirs(quarantine_dir, exist_ok=True)
        target = os.path.join(quarantine_dir, file_name)
        # never overwrite a file that was set aside earlier
        assert not os.path.exists(target), f"Video {target} already exists."
        os.rename(source, target)
    
def move_videos(videos_path):
    """Rename ``video-<k>.mp4`` files to ``<i>.mp4`` for continuous indexing.

    Every ``*.mp4`` directly inside ``videos_path`` is ordered by the integer
    ``k`` in its name and renamed to ``0.mp4``, ``1.mp4``, ... in that order.

    Args:
        videos_path: Directory containing the ``video-<k>.mp4`` files.

    Raises:
        ValueError: If a ``*.mp4`` file is not named like ``video-<k>.mp4``, for
            example because the directory has already been renumbered. Raised
            before any file is renamed.
    """
    # Match on the base name only, with the dot escaped and the end anchored, so
    # that a parent directory name can never be mistaken for the file id.
    name_pattern = re.compile(r"video-(\d+)\.mp4$")

    keyed_files = []
    for file_path in glob.glob(os.path.join(videos_path, "*.mp4")):
        match = name_pattern.search(os.path.basename(file_path))
        if match is None:
            raise ValueError(f"Unexpected video file name (expected video-<k>.mp4): {file_path}")
        keyed_files.append((int(match.group(1)), file_path))
    # sort the files by the integer k in video-k.mp4
    keyed_files.sort(key=lambda item: item[0])

    for i, (_, file_path) in enumerate(tqdm(keyed_files)):
        new_video_path = os.path.join(videos_path, f"{i}.mp4")
        shutil.move(file_path, new_video_path)
    print("Moved videos to consistent sequentially increasing ids")

In [92]:
# remove invalid videos (set aside in invalid_videos/, not deleted)
move_invalid_videos_to_subfolder(ids_to_remove, VIDEO_PATH)
# move videos to consistent sequentially increasing ids
# (must run after the step above so that only valid videos are renumbered)
move_videos(VIDEO_PATH)

100%|██████████| 9290/9290 [00:01<00:00, 5595.41it/s]

Moved videos to consistent sequentially increasing ids





In [93]:
# assert video path contains correct number of videos
# (the pattern only matches files directly in VIDEO_PATH, so invalid_videos/ is not counted)
video_files = glob.glob(os.path.join(VIDEO_PATH, "*.mp4"))
assert len(video_files) == len(valid_ids), f"Number of videos {len(video_files)} does not match number of valid ids {len(valid_ids)}"

In [94]:
def move_invalid_audios_to_subfolder(ids_to_remove, audios_path):
    """Move ``audio-tangoflux-<id>.wav`` of every removed id into ``invalid_audios/``.

    Ids without a file in ``audios_path`` are skipped. The ``invalid_audios``
    folder is only created once there is a file to move.

    Raises:
        AssertionError: If the destination file already exists.
    """
    quarantine_dir = os.path.join(audios_path, "invalid_audios")
    for invalid_id in ids_to_remove:
        file_name = f"audio-tangoflux-{invalid_id}.wav"
        source = os.path.join(audios_path, file_name)
        if not os.path.exists(source):
            continue
        os.makedirs(quarantine_dir, exist_ok=True)
        target = os.path.join(quarantine_dir, file_name)
        # never overwrite a file that was set aside earlier
        assert not os.path.exists(target), f"Audio {target} already exists."
        os.rename(source, target)

def move_audios(audios_path):
    """Rename ``audio-tangoflux-<k>.wav`` files to ``<i>.wav`` for continuous indexing.

    Every ``*.wav`` directly inside ``audios_path`` is ordered by the integer
    ``k`` in its name and renamed to ``0.wav``, ``1.wav``, ... in that order.

    Args:
        audios_path: Directory containing the ``audio-tangoflux-<k>.wav`` files.

    Raises:
        ValueError: If a ``*.wav`` file is not named like
            ``audio-tangoflux-<k>.wav``, for example because the directory has
            already been renumbered. Raised before any file is renamed.
    """
    # Match on the base name only, with the dot escaped and the end anchored, so
    # that a parent directory name can never be mistaken for the file id.
    name_pattern = re.compile(r"audio-tangoflux-(\d+)\.wav$")

    keyed_files = []
    for file_path in glob.glob(os.path.join(audios_path, "*.wav")):
        match = name_pattern.search(os.path.basename(file_path))
        if match is None:
            raise ValueError(f"Unexpected audio file name (expected audio-tangoflux-<k>.wav): {file_path}")
        keyed_files.append((int(match.group(1)), file_path))
    # sort the files by the integer k in audio-tangoflux-k.wav
    keyed_files.sort(key=lambda item: item[0])

    for i, (_, file_path) in enumerate(tqdm(keyed_files)):
        new_audio_path = os.path.join(audios_path, f"{i}.wav")
        shutil.move(file_path, new_audio_path)
    print("Moved audios to consistent sequentially increasing ids")

In [95]:
# remove invalid audios (set aside in invalid_audios/, not deleted)
move_invalid_audios_to_subfolder(ids_to_remove, AUDIO_PATH)

# move audios to consistent sequentially increasing ids
# (must run after the step above so that only valid audios are renumbered)
move_audios(AUDIO_PATH)

100%|██████████| 9290/9290 [00:01<00:00, 5346.12it/s]

Moved audios to consistent sequentially increasing ids





In [96]:
# assert audio path contains correct number of audio files
# (the pattern only matches files directly in AUDIO_PATH, so invalid_audios/ is not counted)
audio_files = glob.glob(os.path.join(AUDIO_PATH, "*.wav"))
assert len(audio_files) == len(valid_ids), f"Number of audios {len(audio_files)} does not match number of valid ids {len(valid_ids)}"