In [1]:
import os
import glob
import sys
import requests
import json
import pandas as pd
import math
from tqdm import tqdm
import shutil
import uuid
from pathlib import Path


# --- Configuration -----------------------------------------------------------
ROOT_DIR = '' # Path to the folder containing the audio files
# Folder where the UUID-named copies are stored. Built with os.path.join so it
# resolves inside ROOT_DIR whether or not ROOT_DIR ends with a path separator
# (the trailing '' keeps the trailing separator of the previous value).
UUID_DEST_DIR = os.path.join(ROOT_DIR, 'uuids_for_upload', '')

reference_transcription = None
host = "" # Host URL
is_marked_for_review = False
segmentations = []
api_key = '' # API key -- fill in locally; never commit a real key with the notebook
headers = {"Authorization": api_key}
# Read below via the columns 'segmented_filename', 'start_time' and 'end_time'.
audio_metadata_list = pd.read_csv('q2_segmented_audio_list.csv')

usernames = [f'annotator_{i}' for i in range(1, 6)]
# usernames = ['admin']

# Name of the UUID destination folder, so it is never mistaken for an audio
# folder when this cell is re-run after the copies have been created.
_uuid_dest_name = os.path.basename(os.path.normpath(UUID_DEST_DIR))

# One entry per audio folder: real directories whose name contains no '.'.
# Sorted because os.listdir returns entries in arbitrary order and the split
# of folders between annotators depends on this order.
audio_files = sorted(
    folder
    for folder in os.listdir(ROOT_DIR)
    if '.' not in folder
    and folder != _uuid_dest_name
    and os.path.isdir(os.path.join(ROOT_DIR, folder))
)
# Folder name -> list of paths of the .mp3 segments directly inside that folder.
sub_audio_files = {
    audio_file: glob.glob(os.path.join(ROOT_DIR, audio_file, '*.mp3'))
    for audio_file in audio_files
}

In [2]:
# Create a UUID named file for each audio file
# Maps the original file stem (name without extension) to the UUID file name.
original_to_uuid = {}

# Flatten the per-folder lists into a single list of audio paths
audio_file_paths = [
    audio_path
    for audio_file_list in sub_audio_files.values()
    for audio_path in audio_file_list
]

# Create the destination once, up front, instead of checking on every iteration.
os.makedirs(UUID_DEST_DIR, exist_ok=True)

# Stems that occur more than once; the mapping can only keep one UUID per stem.
duplicate_stems = []

for original_audio_file in tqdm(audio_file_paths):
    source_path = Path(original_audio_file)
    extension = source_path.suffix.lower()
    original_file_name = source_path.stem
    uuid_filename = f"{uuid.uuid4().hex}{extension}"

    if original_file_name in original_to_uuid:
        duplicate_stems.append(original_file_name)
    original_to_uuid[original_file_name] = uuid_filename

    uuid_file_path = Path(UUID_DEST_DIR).joinpath(uuid_filename)

    shutil.copy(original_audio_file, uuid_file_path.as_posix())

if duplicate_stems:
    # The last copy wins in original_to_uuid, so metadata rows for the earlier
    # files with the same stem would point at the wrong UUID file.
    print(
        f"Warning: {len(duplicate_stems)} audio file(s) share a file name with "
        f"another file, e.g. {duplicate_stems[:5]}; only the last UUID is kept "
        "for each of those names"
    )


100%|██████████| 99042/99042 [00:41<00:00, 2391.73it/s]


In [3]:
# Update the audio_metadata_list with uuid_filename and transcription

# Derive the lookup key with Path(...).stem, the same way original_to_uuid was
# keyed; splitting on the first '.' disagrees for names with more than one dot.
segment_stems = audio_metadata_list['segmented_filename'].map(
    lambda seg_file_name_with_ext: Path(seg_file_name_with_ext).stem
)

# Fail with an explicit message when a metadata row has no UUID copy.
stems_without_uuid = sorted(set(segment_stems).difference(original_to_uuid))
if stems_without_uuid:
    raise KeyError(
        f"{len(stems_without_uuid)} segmented_filename value(s) have no UUID "
        f"file, e.g. {stems_without_uuid[:5]}"
    )

# Every row gets an empty reference transcription. The module-level name is
# rebound here as before, because the per-file upload cell reads it.
reference_transcription = ''

# update the audio_metadata_list
audio_metadata_list['uuid_filename'] = segment_stems.map(original_to_uuid)
audio_metadata_list['reference_transcription'] = reference_transcription

In [4]:
# Save the metadata (now including uuid_filename and reference_transcription).
# The target is the current user's Downloads folder, resolved at run time so the
# notebook does not depend on one machine's absolute home path.
metadata_with_uuid_csv = Path.home() / 'Downloads' / 'q2_segmented_audio_list_with_uuid.csv'
metadata_with_uuid_csv.parent.mkdir(parents=True, exist_ok=True)
audio_metadata_list.to_csv(metadata_with_uuid_csv, index=False)

In [5]:
def split(a, n):
    """Lazily yield ``n`` consecutive slices of ``a`` that together cover it.

    Slice lengths differ by at most one; the first ``len(a) % n`` slices get
    the extra element. The result is a generator, so it can be iterated once.
    ``n == 0`` raises ``ZeroDivisionError`` when the function is called.
    """
    base_size, remainder = divmod(len(a), n)
    # Cut points: slice i starts at cuts[i] and ends just before cuts[i + 1].
    cuts = [idx * base_size + min(idx, remainder) for idx in range(n + 1)]
    return (a[start:stop] for start, stop in zip(cuts, cuts[1:]))

# One list of audio folders per username. split() returns a one-shot generator,
# so it is materialised here: otherwise the first upload loop that iterates it
# leaves nothing for any later loop (or for a re-run of the same cell).
assigned_audios = list(split(audio_files, len(usernames)))

In [None]:
# Upload the audio files one by one

# Index the metadata by file name once, so each file is a direct lookup instead
# of a scan over the whole table. keep='first' matches the previous behaviour
# of taking the first matching row.
metadata_by_filename = (
    audio_metadata_list
    .drop_duplicates(subset='segmented_filename', keep='first')
    .set_index('segmented_filename')
)

# NOTE(review): this cell posts over http:// while the bulk register cell uses
# https:// -- confirm which scheme the host expects.
for username, assigned_audio in zip(usernames, assigned_audios):
    for audio_name in assigned_audio:
        # sorted() returns a new list, leaving sub_audio_files unmodified.
        for audio_file in sorted(sub_audio_files[audio_name]):
            audio_path = Path(audio_file)
            audio_filename = audio_path.name

            if audio_filename not in metadata_by_filename.index:
                raise KeyError(f"No metadata row for audio file: {audio_filename}")
            start_time = metadata_by_filename.at[audio_filename, 'start_time']
            end_time = metadata_by_filename.at[audio_filename, 'end_time']

            if not audio_path.is_file():
                # Raise instead of calling exit(), which would stop the kernel.
                raise FileNotFoundError(f"Audio file does not exist: {audio_path}")

            values = {
                "reference_transcription": reference_transcription,
                "username": username,
                "segmentations": segmentations,
                "is_marked_for_review": is_marked_for_review,
                "youtube_start_time": start_time,
                "youtube_end_time": end_time
            }

            print("Creating datapoint")
            # The with-block closes the file handle once the request finishes.
            with open(audio_path.resolve(), "rb") as audio_obj:
                response = requests.post(
                    f"http://{host}/api/data",
                    files={"audio_file": (audio_filename, audio_obj)},
                    data=values,
                    headers=headers,
                )

            if response.status_code != 201:
                print(f"Error Code: {response.status_code}")

            # Error responses are not guaranteed to be JSON with a 'message'
            # key; fall back to the raw body rather than crashing mid-upload.
            try:
                message = response.json()['message']
            except (ValueError, KeyError, TypeError):
                message = response.text
            print(f"Message: {message}")

In [11]:
# Upload the audio files metadata in one go

# Index the metadata by file name once; the per-file boolean-mask lookups made
# this loop quadratic in the number of files. keep='first' matches the previous
# behaviour of taking the first matching row.
metadata_by_filename = (
    audio_metadata_list
    .drop_duplicates(subset='segmented_filename', keep='first')
    .set_index('segmented_filename')
)

# NOTE(review): this cell posts over https:// while the per-file upload cell
# uses http:// -- confirm which scheme the host expects.
for username, assigned_audio in zip(usernames, assigned_audios):
    # File names of every segment in the folders assigned to this user.
    audio_filenames = [
        Path(audio_file).name
        for audio_name in assigned_audio
        for audio_file in sub_audio_files[audio_name]
    ]

    filenames_without_metadata = [
        name for name in audio_filenames if name not in metadata_by_filename.index
    ]
    if filenames_without_metadata:
        raise KeyError(
            f"{len(filenames_without_metadata)} audio file(s) have no metadata "
            f"row, e.g. {filenames_without_metadata[:5]}"
        )

    # Rows come back in the order of audio_filenames, so all lists stay aligned.
    assigned_metadata = metadata_by_filename.loc[audio_filenames]
    uuid_filenames = assigned_metadata['uuid_filename'].tolist()
    reference_transcriptions = assigned_metadata['reference_transcription'].tolist()
    start_times = assigned_metadata['start_time'].tolist()
    end_times = assigned_metadata['end_time'].tolist()

    print(len(audio_filenames))
    print(len(uuid_filenames))

    values = {
        "reference_transcriptions": reference_transcriptions,
        "username": username,
        "audio_filenames": audio_filenames,
        "uuid_filenames": uuid_filenames,
        "youtube_start_times": start_times,
        "youtube_end_times": end_times
    }

    print("Creating dataset")
    response = requests.post(
        f"https://{host}/api/register-dataset", data=values, headers=headers
    )

    if response.status_code == 201:
        print("Message: success")
    else:
        print(f"Error Code: {response.status_code}")
        # Show the start of the body so failures can be diagnosed.
        print(f"Response body: {response.text[:500]}")

20732
20732
Creating dataset
Error Code: 502
21365
21365
Creating dataset
Error Code: 502
18862
18862
Creating dataset
Error Code: 502
19463
19463
Creating dataset
Error Code: 502
