## Functions and imports

In [None]:
import time
from pathlib import Path

import pandas as pd

from kth_sr.utils import get_df_by_downloaded_folder


def get_df(dir: str) -> pd.DataFrame:
    """Create a DataFrame from downloaded VoxCeleb txt files.

    The folder is walked as ``<dir>/<speaker>/<video>/<file>.txt``. Entries
    that are not directories at the speaker/video level, and entries that are
    not ``.txt`` files at the file level, are ignored.

    The returned DataFrame contains the following columns:
    speaker: speaker id (name of the speaker folder),
    video: video id (name of the video folder),
    txt_file: txt file name,
    start_time: frame number in the first frame row of the txt file,
    end_time: frame number in the last frame row of the txt file,
    length: (end_time - start_time) / 25, i.e. the length in seconds
        assuming 25 frames per second.

    Args:
        dir (str): path to txt folder

    Returns:
        pd.DataFrame: one row per txt file, sorted by speaker (ascending)
        and length (descending).

    Raises:
        ValueError: if a txt file has no frame rows after its 7-line header,
            or a frame number cannot be parsed as an integer.
    """
    data = []
    for speaker_dir in Path(dir).iterdir():
        if not speaker_dir.is_dir():
            continue
        for video_dir in speaker_dir.iterdir():
            if not video_dir.is_dir():
                continue
            for txt_file in video_dir.iterdir():
                # Ignore sub-folders and stray files (e.g. ".DS_Store") that
                # would otherwise break the parsing below.
                if not txt_file.is_file() or txt_file.suffix != ".txt":
                    continue

                with open(txt_file, "r") as f:
                    lines = f.read().splitlines()

                # Frame rows start on the 8th line. Blank lines are dropped so
                # the last row is found whether or not the file ends with a
                # trailing newline.
                frame_rows = [line for line in lines[7:] if line.strip()]
                if not frame_rows:
                    raise ValueError(f"No frame rows found in {txt_file}")

                # The frame number is the first whitespace-separated token.
                start_time = int(frame_rows[0].split()[0])
                end_time = int(frame_rows[-1].split()[0])
                data.append(
                    (
                        speaker_dir.name,
                        video_dir.name,
                        txt_file.name,
                        start_time,
                        end_time,
                    )
                )

    df = pd.DataFrame(
        data, columns=["speaker", "video", "txt_file", "start_time", "end_time"]
    )
    # Convert the frame span to seconds (25 frames per second).
    df["length"] = (df["end_time"] - df["start_time"]) / 25

    # sort by speaker and length (longest first within each speaker)
    return df.sort_values(by=["speaker", "length"], ascending=[True, False])

In [None]:
import yt_dlp
from pydub import AudioSegment


def download_audio(youtube_url, output_path="audio.mp3"):
    """Fetch the audio track of a YouTube video with yt-dlp.

    The best available audio format is requested and handed to the
    ``FFmpegExtractAudio`` postprocessor configured for mp3 at quality "192".

    Args:
        youtube_url: URL of the video to download.
        output_path: value passed to yt-dlp as the output template
            (``outtmpl``).

    NOTE(review): download_df_audio passes a name without extension and then
    reads ``<name>.mp3``, so the postprocessor presumably appends the
    extension - confirm against yt-dlp's output template handling.
    """
    # Postprocessing step that converts the downloaded stream to mp3.
    extract_mp3 = {
        "key": "FFmpegExtractAudio",
        "preferredcodec": "mp3",
        "preferredquality": "192",
    }
    options = dict(
        format="bestaudio/best",
        outtmpl=output_path,
        postprocessors=[extract_mp3],
    )

    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([youtube_url])

    print(f"Audio downloaded as {output_path}")


def check_video_available(youtube_url):
    """Check if a YouTube video is available.

    Asks yt-dlp to extract the video's metadata without downloading it.

    Args:
        youtube_url: URL of the video to probe.

    Returns:
        bool: True when metadata extraction succeeds and yields a result,
        False when yt-dlp raises ``DownloadError`` or returns no info.
    """
    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "format": "bestaudio/best",
    }

    # Keep the try body limited to the call that can raise.
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(youtube_url, download=False)
    except yt_dlp.utils.DownloadError:
        return False
    return info is not None


def trim_audio(input_path, output_path, start_time, end_time):
    """Cut the span [start_time, end_time] out of an audio file and save it as mp3.

    Args:
        input_path: audio file to read.
        output_path: destination of the trimmed mp3.
        start_time: start of the span, in seconds.
        end_time: end of the span, in seconds.
    """
    source = AudioSegment.from_file(input_path)

    # The segment is sliced in milliseconds, so scale the second-based bounds.
    start_ms = start_time * 1000
    end_ms = end_time * 1000
    clip = source[start_ms:end_ms]

    clip.export(output_path, format="mp3")
    print(f"Trimmed audio saved as {output_path}")

In [None]:
def download_df_audio(
    df: pd.DataFrame, out_dir: str, n_samples: int = 2, start_index: int = 0
):
    """Download audio files from youtube urls in the DataFrame.

    Rows are processed per speaker from the longest to the shortest sample.
    For each speaker at most ``n_samples`` samples are downloaded, and each
    video is used at most once per speaker. Videos reported as unavailable
    are skipped.

    Naming convention for the output files: {speaker}_{sample_number}_{duration}_{video}_{txt_file}_cut.mp3
    where ``duration`` is the sample length in milliseconds.
    DataFrame should contain columns: speaker, video, txt_file, start_time,
    end_time, length (start_time/end_time in frames, length in seconds).

    Args:
        df (pd.DataFrame): DataFrame containing the data.
        out_dir (str): Output directory for the trimmed mp3 files; created
            if it does not exist.
        n_samples (int): Number of samples to download for each speaker.
        start_index (int): Start naming index for the output files.

    Raises:
        RuntimeError: if a download still fails after 3 attempts.
    """
    max_attempts = 3
    retry_delay_s = 5

    Path(out_dir).mkdir(parents=True, exist_ok=True)

    # sort by speaker and length (longest first within each speaker)
    df = df.sort_values(by=["speaker", "length"], ascending=[True, False])

    # Speaker whose rows are currently being processed.
    active_speaker = None
    # Number of samples downloaded so far for the active speaker.
    downloaded_samples = 0
    # Videos already used for the active speaker.
    used_videos = set()

    for _, row in df.iterrows():
        speaker = row["speaker"]
        video = row["video"]
        txt_file = row["txt_file"].replace(".txt", "")
        start_time = row["start_time"]  # in frames
        end_time = row["end_time"]  # in frames
        duration = int(row["length"] * 1000)  # in milliseconds

        # Reset the per-speaker state when a new speaker starts.
        if active_speaker != speaker:
            active_speaker = speaker
            downloaded_samples = 0
            used_videos = set()

        # skip if this speaker already has the requested number of samples
        if downloaded_samples >= n_samples:
            continue

        # skip if the same video was already used for the same speaker
        if video in used_videos:
            continue

        print(f"Speaker: {speaker}, Video: {video}, Txt file: {txt_file}")

        # get youtube url
        youtube_url = f"https://www.youtube.com/watch?v={video}"
        output_name = (
            f"{speaker}_{downloaded_samples+start_index}_{duration}_{video}_{txt_file}"
        )
        # Temporary full-length download, written relative to the working directory.
        output_path = f"{output_name}.mp3"

        if not check_video_available(youtube_url):
            continue

        # download audio, retrying on failure
        last_error = None
        for attempt in range(1, max_attempts + 1):
            try:
                download_audio(youtube_url, output_name)
                break
            except Exception as e:
                print(f"Failed to download audio: {e}")
                last_error = e
                # Only wait when another attempt follows.
                if attempt < max_attempts:
                    time.sleep(retry_delay_s)
        else:
            raise RuntimeError(
                "Failed to download audio after 3 attempts"
            ) from last_error

        # cut the audio; frame numbers are converted to seconds (25 fps)
        try:
            trim_audio(
                output_path,
                f"{out_dir}/{output_name}_cut.mp3",
                start_time / 25,
                end_time / 25,
            )
        finally:
            # Remove the full-length download even when trimming fails.
            temp_file = Path(output_path)
            if temp_file.exists():
                temp_file.unlink()

        downloaded_samples += 1
        used_videos.add(video)

## Download txt data from the web

https://mm.kaist.ac.kr/datasets/voxceleb/index.html#testlist

Download the test-set txt files:

```bash
curl -O https://mm.kaist.ac.kr/datasets/voxceleb/data/vox2_test_txt.zip
mkdir -p ../data/test
unzip vox2_test_txt.zip -d ../data/test
```

Download the dev-set txt files:

```bash
curl -O https://mm.kaist.ac.kr/datasets/voxceleb/data/vox2_dev_txt.zip
mkdir -p ../data/dev
unzip vox2_dev_txt.zip -d ../data/dev
```

## Test dataset

In [None]:
# Folder holding the unzipped test-set txt files (see the download step above).
IN_FOLDER = "../data/test/txt"

# Build the per-txt-file table (speaker, video, frame range, length) for the test set.
df_test = get_df(IN_FOLDER)
df_test

In [None]:
# Persist the test table as CSV, without the index column.
df_test.to_csv("../data/test.csv", index=False)

In [None]:
# Download and trim audio for the test table using the function defaults
# (n_samples=2 per speaker, start_index=0).
download_df_audio(df_test, "../data/test_data")

# Dev dataset



## Create dev dataframe

In [None]:
# Build the per-txt-file table for the dev set from the unzipped txt folder.
df_dev = get_df("../data/dev/txt/")
df_dev

In [None]:
# Persist the dev table as CSV, without the index column; later cells reload it
# from this file.
df_dev.to_csv("../data/dev.csv", index=False)

## Download Most Famous Celebrities

In [None]:
import pandas as pd

# Samples to download per speaker (passed to download_df_audio as n_samples).
NUM_OF_SAMPLES = 8
# First sample number used in output file names (passed as start_index).
START_INDEX = 2
# How many of the most-viewed dev-set celebrities to keep.
NUM_OF_CELEBS = 300

In [None]:
# Keep the NUM_OF_CELEBS dev-set celebrities with the highest
# "wiki_views_2024" value, ordered from most to least viewed.
df_meta = (
    pd.read_csv("../data/vox2_meta.csv")
    .loc[lambda meta: meta["Set"] == "dev"]
    .sort_values(by="wiki_views_2024", ascending=False)
    .head(NUM_OF_CELEBS)
)
df_meta

In [None]:
# Reload the dev table written earlier.
df_dev = pd.read_csv("../data/dev.csv")

# limit df_dev to celebs in df_meta (speaker id must appear in the
# "VoxCeleb2_ID" column of the selected metadata rows)
df_top_celebs = df_dev[df_dev["speaker"].isin(df_meta["VoxCeleb2_ID"])]
df_top_celebs

In [None]:
# Keep only rows that are shorter than the shortest sample already downloaded
# for the same speaker.

# NOTE(review): get_df_by_downloaded_folder is a project helper; it presumably
# returns one row per downloaded file with at least "speaker" and "duration_s"
# columns (both are used below) - confirm against kth_sr.utils.
already_downloaded = get_df_by_downloaded_folder("../data/download_df_audio/")
already_downloaded = already_downloaded.sort_values(by="speaker")
# Column-wise minimum per speaker; for "duration_s" this is the shortest
# downloaded sample of that speaker.
already_downloaded = already_downloaded.groupby("speaker").min().reset_index()

# Rows from df_top_celebs whose speaker is NOT present in already_downloaded.
# NOTE(review): filtered_df is not used by any later cell visible here.
ids = set(already_downloaded["speaker"].astype(str).str.strip())
filtered_df = df_top_celebs[~df_top_celebs["speaker"].astype(str).str.strip().isin(ids)]

# Attach each speaker's minimum downloaded values; columns that clash with
# df_top_celebs get the "_y" suffix.
df_top_celebs_filtered = df_top_celebs.merge(
    already_downloaded, on="speaker", how="left", suffixes=("", "_y")
)
# NOTE(review): speakers without any download get NaN in "duration_s" from the
# left merge; the comparison is then False, so all their rows are dropped -
# confirm this is intended.
df_top_celebs_filtered = df_top_celebs_filtered[
    df_top_celebs_filtered["length"] < df_top_celebs_filtered["duration_s"]
]

In [None]:
# Download NUM_OF_SAMPLES further samples per top celebrity; output file
# numbering starts at START_INDEX.
download_df_audio(
    df_top_celebs_filtered,
    "../data/top_celebs_data_2",
    n_samples=NUM_OF_SAMPLES,
    start_index=START_INDEX,
)

## Download all Dev audio files

In [None]:
import pandas as pd

# Load the dev table, ordered by speaker with the longest samples first.
df = pd.read_csv("../data/dev.csv").sort_values(
    by=["speaker", "length"], ascending=[True, False]
)

# Speaker ids in order of first appearance in the sorted table.
unique_speakers = df["speaker"].unique()

# Number of speakers handled per batch.
batch_size = 100

# Consecutive slices of at most batch_size speaker ids.
batches = [
    unique_speakers[start : start + batch_size]
    for start in range(0, len(unique_speakers), batch_size)
]

# One sub-table per batch, holding every row of that batch's speakers.
batch_dfs = [df[df["speaker"].isin(speaker_ids)] for speaker_ids in batches]

In [None]:
# TODO set start_batch
# Index of the first batch to process; batches with a lower index are skipped.
start_batch = 7
for i, batch_df in enumerate(batch_dfs):
    if i >= start_batch:
        # Each batch is written to its own numbered folder.
        out_dir = f"../data/dev_data/batch_{i}"
        download_df_audio(batch_df, out_dir)