<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#uberduck_ml_dev.exec.generate_filelist" data-toc-modified-id="uberduck_ml_dev.exec.generate_filelist-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>uberduck_ml_dev.exec.generate_filelist</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Supported-formats:" data-toc-modified-id="Supported-formats:-1.0.1"><span class="toc-item-num">1.0.1&nbsp;&nbsp;</span>Supported formats:</a></span></li><li><span><a href="#standard-multispeaker" data-toc-modified-id="standard-multispeaker-1.0.2"><span class="toc-item-num">1.0.2&nbsp;&nbsp;</span><code>standard-multispeaker</code></a></span></li><li><span><a href="#vctk" data-toc-modified-id="vctk-1.0.3"><span class="toc-item-num">1.0.3&nbsp;&nbsp;</span><code>vctk</code></a></span></li></ul></li></ul></li></ul></div>

In [1]:
# default_exp exec.generate_filelist

# uberduck_ml_dev.exec.generate_filelist

Convert a speech dataset to the standard dataset format.

Usage:

```
python -m uberduck_ml_dev.exec.generate_filelist \
    --input ~/multispeaker-root \
    --format standard-multispeaker \
    --ouput list.txt
```

### Supported formats:

### `standard-multispeaker`

```
root
  speaker1
    list.txt
    wavs
  speaker2
    list.txt
    wavs
```

### `vctk`

Format of the VCTK dataset as downloaded from the [University of Edinburgh](https://datashare.ed.ac.uk/handle/10283/3443).

```
root
  wav48_silence_trimmed
    p228
      p228_166_mic1.flac
      ...
  txt
    p228
      p228_166.txt
      ...
```







In [11]:
# export

import argparse
import os
from pathlib import Path

import sqlite3
from tqdm import tqdm

from uberduck_ml_dev.data.cache import ensure_speaker_table, insert_speaker
from uberduck_ml_dev.utils.utils import parse_vctk

CACHE_LOCATION = Path.home() / Path(".cache/uberduck/uberduck-ml-dev.db")
STANDARD_MULTISPEAKER = "standard-multispeaker"
STANDARD_SINGLESPEAKER = "standard-singlespeaker"
VCTK = "vctk"
FORMATS = [
    STANDARD_MULTISPEAKER,
    STANDARD_SINGLESPEAKER,
    VCTK,
]


def _convert_vctk(f, inp: str):
    vctk_data = parse_vctk(inp)
    speaker_id = 0
    conn = sqlite3.connect(str(CACHE_LOCATION))
    with conn:
        for speaker_name, speaker_data in tqdm(vctk_data.items()):
            insert_speaker(f.name, speaker_name, speaker_id, conn)
            speaker_out_path = Path(out_path) / speaker_name
            if not speaker_out_path.exists():
                os.makedirs(speaker_out_path)
            for transcription, flac_path in speaker_data:
                assert flac_path.endswith(".flac")
                wav_path = flac_path.replace(".flac", ".wav")
                convert_to_wav(flac_path, wav_path)
                full_path = Path(full_path).resolve()
                f.write(f"{full_path}|{transcription}|{speaker_id}\n")
            speaker_id += 1


def _convert_standard_multispeaker(f, inp: str, rel_path: str = None):
    speaker_id = 0
    speakers = os.listdir(inp)
    conn = sqlite3.connect(str(CACHE_LOCATION))
    with conn:
        for speaker in tqdm(speakers):
            path = Path(inp) / Path(speaker)
            if not path.is_dir() or path.parts[-1].startswith("."):
                continue
            files = os.listdir(path)
            try:
                transcriptions, *_ = [f for f in files if f.endswith(".txt")]
            except:
                print(files)
                raise
            insert_speaker(f.name, speaker, speaker_id, conn)
            with (path / transcriptions).open("r") as txn_f:
                transcriptions = txn_f.readlines()
            for line in transcriptions:
                line = line.strip("\n")
                try:
                    line_path, line_txn, *_ = line.split("|")
                except Exception as e:
                    print(e)
                    print(line)
                    raise
                if rel_path is not None:
                    out_path = (rel_path / line_path).resolve()
                else:
                    out_path = (path / line_path).resolve()
                f.write(f"{out_path}|{line_txn}|{speaker_id}\n")
            speaker_id += 1


def _convert_singlespeaker(f, inp: str, rel_path: str = None):
    speaker_id = 0
    conn = sqlite3.connect(str(CACHE_LOCATION))
    path = Path(inp)
    speaker = path.parts[-2]
    with conn:
        insert_speaker(str(f.name), speaker, speaker_id, conn)
        with (path).open("r") as txn_f:
            transcriptions = txn_f.readlines()
        for line in transcriptions:
            line = line.strip("\n")
            try:
                line_path, line_txn, *_ = line.split("|")
            except Exception as e:
                print(e)
                print(line)
                raise

            full_path = (Path(*path.parts[:-1]) / line_path).resolve()
            f.write(f"{full_path}|{line_txn}|{speaker_id}\n")


def _generate_filelist(input_, fmt, output_filelist, rel_path):
    full_path = Path(output_filelist).resolve()
    ensure_speaker_table()
    with open(full_path, "w") as f:
        print(f.name)
        # _convert_to_multispeaker(f, input_, fmt, rel_path)


def _convert_to_multispeaker(f, inp: str, fmt: str):
    assert fmt in [
        STANDARD_MULTISPEAKER,
        VCTK,
        STANDARD_SINGLESPEAKER,
    ], f"Supported formats: {STANDARD_MULTISPEAKER}, {VCTK}, {STANDARD_SINGLESPEAKER}"

    if fmt == STANDARD_MULTISPEAKER:
        return _convert_standard_multispeaker(f, inp)
    elif fmt == VCTK:
        return _convert_vctk(f, inp)
    elif fmt == STANDARD_SINGLESPEAKER:
        return _convert_singlespeaker(f, inp)

In [7]:
CACHE_LOCATION.parent.exists()

True

In [8]:
# export

from typing import List
import sys


def _parse_args(args: List[str]):
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="Path to input dataset directory")
    parser.add_argument(
        "-f", "--format", help="Input dataset format", default=STANDARD_MULTISPEAKER
    )
    parser.add_argument("-o", "--output", help="Output filename", default="list.txt")
    parser.add_argument(
        "--rel_path", help="Relative path of audiofiles in output", default=None
    )
    return parser.parse_args(args)


try:
    from nbdev.imports import IN_NOTEBOOK
except:
    IN_NOTEBOOK = False

if __name__ == "__main__" and not IN_NOTEBOOK:
    args = _parse_args(sys.argv[1:])
    _generate_filelist(
        args.input, args.format, args.output, args.rel_path
    )  # make this for vctk

In [None]:
# in the old workflow, 1) datasets are converted to multispeaker and relative path added
# 2) the actual filelist with speakers selected speakers is generated
# Relative path and multispeaker identity depend on dataset deployment,
# Thus, in the new workflow, we add speaker IDs and relative paths after pulling from database
# There is still need for format specific rel_path adds
# Therefore, rel_path info is kept in the db as well
# configs for filelist generation from database require

# to do
# add speaker name directly as catagorical variable rather than integer,
# make full db with transcripts

# python -m uberduck_ml_dev.exec.generate_filelist -i "/mnt/disks/uberduck-experiments-v0/data/zwf/zwf/list.txt" -o "/mnt/disks/uberduck-experiments-v0/data/zwf_multi" -f "standard-singlespeaker"

In [None]:
# 3 types of possible input
# input_ = "/mnt/disks/uberduck-experiments-v0/data/vctk/"
input_ = "/mnt/disks/uberduck-experiments-v0/data/uberduck-multispeaker/"
input_ = "/mnt/disks/uberduck-experiments-v0/data/zwf/zwf/list.txt"

In [12]:
output = "/mnt/disks/uberduck-experiments-v0/uberduck-ml-exp/filelists/zwf_rel.txt"
rel_path = "/root/bucket/data/zwf/"
fmt = "standard-singlespeaker"
_generate_filelist(
    input_,
    fmt,
    output,
)

TypeError: _generate_filelist() missing 1 required positional argument: 'rel_path'

In [9]:
speaker_data = _parse_vctk("/mnt/disks/uberduck-experiments-v0/data/vctk/")

NameError: name '_parse_vctk' is not defined

In [None]:
_log_filelists(file, fmt)
_generate_filelist(config)

In [170]:
output_filelist = "/mnt/disks/uberduck-experiments-v0/test2/outie2"
fmt = "standard-singlespeaker"
_generate_filelist(input_, fmt, output_filelist)

# _convert_singlespeaker(f, inp)

/mnt/disks/uberduck-experiments-v0/test2/outie2
zwf speaker
('/', 'mnt', 'disks', 'uberduck-experiments-v0', 'data', 'zwf', 'zwf') pathparts
/mnt/disks/uberduck-experiments-v0/data/zwf/zwf savename
/mnt/disks/uberduck-experiments-v0/data/zwf/zwf savename
/mnt/disks/uberduck-experiments-v0/data/zwf/zwf savename
/mnt/disks/uberduck-experiments-v0/data/zwf/zwf savename
/mnt/disks/uberduck-experiments-v0/data/zwf/zwf savename
/mnt/disks/uberduck-experiments-v0/data/zwf/zwf savename
/mnt/disks/uberduck-experiments-v0/data/zwf/zwf savename
/mnt/disks/uberduck-experiments-v0/data/zwf/zwf savename
/mnt/disks/uberduck-experiments-v0/data/zwf/zwf savename
/mnt/disks/uberduck-experiments-v0/data/zwf/zwf savename
/mnt/disks/uberduck-experiments-v0/data/zwf/zwf savename
/mnt/disks/uberduck-experiments-v0/data/zwf/zwf savename
/mnt/disks/uberduck-experiments-v0/data/zwf/zwf savename
/mnt/disks/uberduck-experiments-v0/data/zwf/zwf savename
/mnt/disks/uberduck-experiments-v0/data/zwf/zwf savename
/mnt

In [None]:
# hide
_parse_args(["-i", "foo/bar"])

Namespace(format='standard-multispeaker', input='foo/bar', output='list.txt')

In [None]:
# skip
from tempfile import NamedTemporaryFile, TemporaryFile

with NamedTemporaryFile("w") as f:
    _generate_filelist(
        str(Path("/Users/zwf/data/voice/dvc-managed/uberduck-multispeaker/").resolve()),
        "standard-multispeaker",
        f.name,
    )

with TemporaryFile("w") as f:
    _convert_to_multispeaker(
        f,
        str(Path("/Users/zwf/data/voice/dvc-managed/uberduck-multispeaker/").resolve()),
        "standard-multispeaker",
    )

  5%|▍         | 4/85 [00:00<00:02, 35.22it/s]

/private/var/folders/9n/w8qrq4mx4cl_j036z8wg_9nh0000gp/T/tmp8xrrhl9s


100%|██████████| 85/85 [00:00<00:00, 94.09it/s] 
100%|██████████| 85/85 [00:00<00:00, 96.20it/s] 
