## Download data from the web

https://mm.kaist.ac.kr/datasets/voxceleb/index.html#testlist

In [None]:
# download test data

# !curl -O https://mm.kaist.ac.kr/datasets/voxceleb/data/vox2_test_txt.zip
# !mkdir -p ../data/test
# !unzip vox2_test_txt.zip -d ../data/test

In [None]:
# download dev data

# !curl -O https://mm.kaist.ac.kr/datasets/voxceleb/data/vox2_dev_txt.zip
# !mkdir -p ../data/dev
# !unzip vox2_dev_txt.zip -d ../data/dev

## Get 2 sample files for each speaker in the test dataset

In [None]:
from pathlib import Path
import pandas as pd


IN_FOLDER = "../data/test/txt"
OUT_DIR = "../data/test_data"

# Zero-based index of the first frame row in an annotation file (its 8th line).
# NOTE(review): the preceding lines are presumably a fixed-size header — confirm.
FIRST_FRAME_LINE = 7
# Frame numbers are turned into seconds by dividing by this frame rate.
FPS = 25

# One record per annotation file:
# (speaker, video, txt_file, first frame number, last frame number).
data = []

# Expected layout: IN_FOLDER/<speaker>/<video>/<annotation>.txt
for speaker_dir in Path(IN_FOLDER).iterdir():
    if not speaker_dir.is_dir():
        continue
    for video_dir in speaker_dir.iterdir():
        if not video_dir.is_dir():
            continue
        for txt_file in video_dir.iterdir():
            # Ignore stray entries (sub-folders, OS metadata files, ...) that
            # are not annotation files instead of crashing on them.
            if not txt_file.is_file() or txt_file.suffix != ".txt":
                continue

            with open(txt_file, "r") as f:
                lines = f.read().splitlines()

            # Drop trailing blank lines so that lines[-1] is always the last
            # frame row, whether or not the file ends with a newline.
            while lines and not lines[-1].strip():
                lines.pop()

            if len(lines) <= FIRST_FRAME_LINE:
                print(f"Skipping {txt_file}: no frame rows found")
                continue

            # The first whitespace-separated token of a frame row is the
            # frame number.
            start_time = int(lines[FIRST_FRAME_LINE].split()[0])
            end_time = int(lines[-1].split()[0])
            data.append(
                (speaker_dir.name, video_dir.name, txt_file.name, start_time, end_time)
            )


df = pd.DataFrame(
    data, columns=["speaker", "video", "txt_file", "start_time", "end_time"]
)
# Length of each sample in seconds.
df["length"] = (df["end_time"] - df["start_time"]) / FPS

# Sort by speaker, longest samples first within each speaker.
df = df.sort_values(by=["speaker", "length"], ascending=[True, False])
df

In [None]:
import yt_dlp
from pydub import AudioSegment


def download_audio(youtube_url, output_path="audio.mp3"):
    """Fetch the audio track of a YouTube video with yt-dlp.

    Args:
        youtube_url: URL of the video to fetch.
        output_path: Value passed to yt-dlp as the ``outtmpl`` output template.
    """
    # Post-processing step: let FFmpeg extract the audio as MP3.
    extract_mp3 = {
        "key": "FFmpegExtractAudio",
        "preferredcodec": "mp3",
        "preferredquality": "192",
    }
    options = {
        "format": "bestaudio/best",
        "outtmpl": output_path,
        "postprocessors": [extract_mp3],
    }

    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([youtube_url])

    print(f"Audio downloaded as {output_path}")


def check_video_available(youtube_url):
    """Check whether yt-dlp can still resolve a YouTube video.

    Only metadata is requested (``download=False``); nothing is written to disk.

    Args:
        youtube_url: URL of the video to probe.

    Returns:
        bool: ``True`` when metadata was extracted, ``False`` when yt-dlp
        raised a ``DownloadError`` or returned no info.
    """
    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "format": "bestaudio/best",
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(youtube_url, download=False)
    except yt_dlp.utils.DownloadError:
        # Explicit False (not None) so the function always returns a bool.
        return False
    return info is not None


def trim_audio(input_path, output_path, start_time, end_time):
    """Cut the span from start_time to end_time (seconds) out of an audio file.

    The selected span is written to ``output_path`` in MP3 format.
    """
    ms_per_second = 1000

    source = AudioSegment.from_file(input_path)

    # The segment is sliced in milliseconds, so scale the second offsets first.
    begin_ms = start_time * ms_per_second
    finish_ms = end_time * ms_per_second
    clip = source[begin_ms:finish_ms]

    clip.export(output_path, format="mp3")
    print(f"Trimmed audio saved as {output_path}")

In [None]:
import pandas as pd

# create record for each file from data/test_data folder with name
from pathlib import Path

# One record per entry in ../data/test_data.
data = [{"name": file.name} for file in Path("../data/test_data").iterdir()]
# Naming the column explicitly keeps the lookups below valid for an empty folder.
df_dupl = pd.DataFrame(data, columns=["name"])

# File names are expected to look like <user>_<video>_<txt_file>_cut.mp3.
# The video id may itself contain "_", so take the user from the left and peel
# the "<txt_file>" and "cut.mp3" parts off the right instead of splitting on
# every underscore.
# NOTE(review): assumes neither the user id nor the txt file stem contains "_".
user_and_rest = df_dupl["name"].str.split("_", n=1)
df_dupl["user"] = user_and_rest.str[0]
df_dupl["video"] = user_and_rest.str[1].str.rsplit("_", n=2).str[0]

# Users that have at least one (user, video) pair occurring more than once.
users = (
    df_dupl[df_dupl.duplicated(subset=["user", "video"], keep="first")]
    .sort_values(by=["user", "video"])["user"]
    .unique()
)

In [None]:
# Display the users that have a repeated (user, video) pair in ../data/test_data.
users

In [None]:
# find all files in a folder whose name contains a given substring
from pathlib import Path


def find_files_with_substring(folder, substring):
    """Return the entries directly inside `folder` whose name contains `substring`.

    Args:
        folder: Directory to scan (not recursive).
        substring: Text that must occur in the entry's name.

    Returns:
        list of ``Path`` objects, in the order the directory yields them.
    """
    return [entry for entry in Path(folder).iterdir() if substring in entry.name]


# Example: list the entries in ../data/test_data whose name contains "id00017".
find_files_with_substring("../data/test_data", "id00017")

In [None]:
# Download the two longest samples for each speaker, each from a different video.
# Can take up to 1 hour.

# start_time / end_time are frame numbers; dividing by this gives seconds.
FRAMES_PER_SECOND = 25

# Make sure the export folder exists before spending time on any download.
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

# Build the lookup once instead of scanning `users` on every row.
selected_speakers = set(users)

active_speaker = None
downloaded_samples = 0
last_video = None

# `df` is expected to be sorted by speaker, longest samples first.
for index, row in df.iterrows():
    speaker = row["speaker"]
    video = row["video"]
    txt_file = row["txt_file"].replace(".txt", "")
    start_time = row["start_time"]
    end_time = row["end_time"]

    if active_speaker != speaker:
        # New speaker: restart the per-speaker bookkeeping, including the
        # last used video, so nothing carries over from the previous speaker.
        active_speaker = speaker
        downloaded_samples = 0
        last_video = None

    # only speakers listed in `users` are processed
    if speaker not in selected_speakers:
        continue

    # skip if already downloaded 2 samples
    if downloaded_samples >= 2:
        continue

    # skip if the same video for the same speaker
    if last_video == video:
        continue

    print(f"Speaker: {speaker}, Video: {video}, Txt file: {txt_file}")

    # get youtube url
    youtube_url = f"https://www.youtube.com/watch?v={video}"
    output_name = f"{speaker}_{video}_{txt_file}"
    # The download is requested without an extension; the ".mp3" file is
    # expected to be produced by the MP3 extraction step of download_audio.
    output_path = f"{output_name}.mp3"

    if not check_video_available(youtube_url):
        continue

    # download audio
    download_audio(youtube_url, output_name)

    # cut the audio
    # alternative: !ffmpeg -i {output_path} -ss {start_time} -to {end_time} -c copy {speaker}_{video}_cut.mp3
    try:
        trim_audio(
            output_path,
            f"{OUT_DIR}/{output_name}_cut.mp3",
            start_time / FRAMES_PER_SECOND,
            end_time / FRAMES_PER_SECOND,
        )
    finally:
        # Always remove the full-length download, even when trimming fails.
        Path(output_path).unlink()

    downloaded_samples += 1
    last_video = video