<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#uberduck_ml_dev.exec.generate_filelist" data-toc-modified-id="uberduck_ml_dev.exec.generate_filelist-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>uberduck_ml_dev.exec.generate_filelist</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Supported-formats:" data-toc-modified-id="Supported-formats:-1.0.1"><span class="toc-item-num">1.0.1&nbsp;&nbsp;</span>Supported formats:</a></span></li><li><span><a href="#standard-multispeaker" data-toc-modified-id="standard-multispeaker-1.0.2"><span class="toc-item-num">1.0.2&nbsp;&nbsp;</span><code>standard-multispeaker</code></a></span></li><li><span><a href="#vctk" data-toc-modified-id="vctk-1.0.3"><span class="toc-item-num">1.0.3&nbsp;&nbsp;</span><code>vctk</code></a></span></li></ul></li></ul></li></ul></div>

In [None]:
# default_exp exec.generate_filelist

# uberduck_ml_dev.exec.generate_filelist

Convert a speech dataset to the standard dataset format.

Usage:

```
python -m uberduck_ml_dev.exec.generate_filelist \
    --input ~/multispeaker-root \
    --format standard-multispeaker \
    --output list.txt
```

### Supported formats:

### `standard-multispeaker`

```
root
  speaker1
    list.txt
    wavs
  speaker2
    list.txt
    wavs
```

### `vctk`

Format of the VCTK dataset as downloaded from the [University of Edinburgh](https://datashare.ed.ac.uk/handle/10283/3443).

```
root
  wav48_silence_trimmed
    p228
      p228_166_mic1.flac
      ...
  txt
    p228
      p228_166.txt
      ...
```







In [2]:
# export

import argparse
import os
from pathlib import Path

import sqlite3
from tqdm import tqdm
import pandas as pd

from uberduck_ml_dev.data.cache import ensure_speaker_table, insert_speaker
from uberduck_ml_dev.utils.utils import parse_vctk

CACHE_LOCATION = Path.home() / Path(".cache/uberduck/uberduck-ml-dev.db")

STANDARD_MULTISPEAKER = "standard-multispeaker"
STANDARD_SINGLESPEAKER = "standard-singlespeaker"
VCTK = "vctk"
FORMATS = [
    STANDARD_MULTISPEAKER,
    STANDARD_SINGLESPEAKER,
    VCTK,
]


def _convert_vctk(f, inp: str):
    """Write a filelist for a VCTK-format dataset rooted at ``inp``.

    ``parse_vctk(inp)`` is expected to yield, per speaker name, an iterable of
    ``(transcription, flac_path)`` pairs. Every FLAC path is handed to
    ``convert_to_wav`` together with a ``.wav`` target path next to the source
    file, and one ``<resolved wav path>|<transcription>|<speaker id>`` line is
    written to ``f`` per recording.

    Speakers receive consecutive integer ids starting at 0, in the iteration
    order of the parsed data, and each one is registered with
    ``insert_speaker`` under the filelist name ``f.name``.

    Args:
        f: Open, writable text file that receives the filelist lines; its
            ``name`` attribute identifies the filelist in the speaker cache.
        inp: Path to the root of the VCTK dataset.

    Raises:
        AssertionError: If a recording path does not end in ``.flac``.
    """
    vctk_data = parse_vctk(inp)
    conn = sqlite3.connect(str(CACHE_LOCATION))
    try:
        # ``with conn`` only commits or rolls back the transaction; the
        # connection itself has to be closed explicitly (see ``finally``).
        with conn:
            for speaker_id, (speaker_name, speaker_data) in enumerate(
                tqdm(vctk_data.items())
            ):
                insert_speaker(f.name, speaker_name, speaker_id, conn)
                for transcription, flac_path in speaker_data:
                    assert flac_path.endswith(".flac")
                    # Swap only the trailing extension, so a ".flac" that
                    # appears elsewhere in the path is left untouched.
                    wav_path = flac_path[: -len(".flac")] + ".wav"
                    # NOTE(review): ``convert_to_wav`` is not imported in the
                    # visible part of this file - confirm it is in scope.
                    convert_to_wav(flac_path, wav_path)
                    full_path = Path(wav_path).resolve()
                    f.write(f"{full_path}|{transcription}|{speaker_id}\n")
    finally:
        conn.close()


def _convert_standard_multispeaker(f, inp: str):
    """Write an absolute-path filelist for a ``standard-multispeaker`` dataset.

    Every non-hidden sub-directory of ``inp`` is treated as one speaker. The
    first ``.txt`` file found in a speaker directory is read as its
    transcription list, whose lines look like ``<relative path>|<text>[|...]``.
    For each such line, ``<resolved audio path>|<text>|<speaker id>`` is
    written to ``f``.

    Speaker ids are consecutive integers starting at 0, assigned in
    ``os.listdir`` order to the directories that are actually processed. Each
    speaker is registered with ``insert_speaker`` under the name ``f.name``.

    Args:
        f: Open, writable text file that receives the filelist lines; its
            ``name`` attribute identifies the filelist in the speaker cache.
        inp: Path to the dataset root containing one directory per speaker.

    Raises:
        ValueError: If a speaker directory contains no ``.txt`` file, or a
            non-blank transcription line has fewer than two ``|``-separated
            fields.
    """
    root = Path(inp)
    speaker_id = 0
    conn = sqlite3.connect(str(CACHE_LOCATION))
    try:
        # ``with conn`` only commits or rolls back the transaction; the
        # connection itself has to be closed explicitly (see ``finally``).
        with conn:
            for speaker in tqdm(os.listdir(inp)):
                speaker_dir = root / speaker
                # Skip stray files (README, LICENSE, ...) and hidden
                # directories such as ``.git`` or ``.dvc``.
                if not speaker_dir.is_dir() or speaker_dir.name.startswith("."):
                    continue

                files = os.listdir(speaker_dir)
                txt_files = [name for name in files if name.endswith(".txt")]
                if not txt_files:
                    raise ValueError(
                        f"No .txt transcription file in {speaker_dir}: {files}"
                    )
                txn_path = speaker_dir / txt_files[0]

                insert_speaker(f.name, speaker, speaker_id, conn)
                with txn_path.open("r") as txn_f:
                    for raw_line in txn_f:
                        line = raw_line.strip("\n")
                        if not line.strip():
                            # A blank (e.g. trailing) line carries no entry.
                            continue
                        try:
                            line_path, line_txn, *_ = line.split("|")
                        except ValueError as err:
                            raise ValueError(
                                f"Malformed line in {txn_path}: {line!r}"
                            ) from err
                        full_path = (speaker_dir / line_path).resolve()
                        f.write(f"{full_path}|{line_txn}|{speaker_id}\n")
                speaker_id += 1
    finally:
        conn.close()


def _convert_standard_multispeaker_rel(f, inp: str, rel_path: str):
    """Write a filelist whose audio paths are prefixed with ``rel_path``.

    Walks a ``standard-multispeaker`` dataset exactly like the absolute-path
    converter: every non-hidden sub-directory of ``inp`` is one speaker, and
    the first ``.txt`` file found in it is read as the transcription list with
    lines of the form ``<relative path>|<text>[|...]``. Instead of resolving
    the audio path on disk, each output line is
    ``<rel_path>/<speaker>/<relative path>|<text>|<speaker id>``.

    Speaker ids are consecutive integers starting at 0, assigned in
    ``os.listdir`` order to the directories that are actually processed. Each
    speaker is registered with ``insert_speaker`` under the name ``f.name``.
    The directory listing and every processed speaker name are printed to
    stdout as progress output.

    Args:
        f: Open, writable text file that receives the filelist lines; its
            ``name`` attribute identifies the filelist in the speaker cache.
        inp: Path to the dataset root containing one directory per speaker.
        rel_path: Prefix placed in front of ``<speaker>/<relative path>``.

    Raises:
        ValueError: If a speaker directory contains no ``.txt`` file, or a
            non-blank transcription line has fewer than two ``|``-separated
            fields.
    """
    root = Path(inp)
    speaker_id = 0
    speakers = os.listdir(inp)
    print(speakers)
    conn = sqlite3.connect(str(CACHE_LOCATION))
    try:
        # ``with conn`` only commits or rolls back the transaction; the
        # connection itself has to be closed explicitly (see ``finally``).
        with conn:
            for speaker in tqdm(speakers):
                speaker_dir = root / speaker
                # Skip stray files (README, LICENSE, ...) and hidden
                # directories such as ``.git`` or ``.dvc``.
                if not speaker_dir.is_dir() or speaker_dir.name.startswith("."):
                    continue
                print(speaker)

                files = os.listdir(speaker_dir)
                txt_files = [name for name in files if name.endswith(".txt")]
                if not txt_files:
                    raise ValueError(
                        f"No .txt transcription file in {speaker_dir}: {files}"
                    )
                txn_path = speaker_dir / txt_files[0]

                insert_speaker(f.name, speaker, speaker_id, conn)
                with txn_path.open("r") as txn_f:
                    for raw_line in txn_f:
                        line = raw_line.strip("\n")
                        if not line.strip():
                            # A blank (e.g. trailing) line carries no entry.
                            continue
                        try:
                            line_path, line_txn, *_ = line.split("|")
                        except ValueError as err:
                            raise ValueError(
                                f"Malformed line in {txn_path}: {line!r}"
                            ) from err
                        write_path = f"{rel_path}/{speaker}/{line_path}"
                        f.write(f"{write_path}|{line_txn}|{speaker_id}\n")
                speaker_id += 1
    finally:
        conn.close()


def _generate_filelist(input_dataset, fmt, output_filelist):
    """Convert ``input_dataset`` (laid out as ``fmt``) into a filelist file.

    The speaker table is ensured to exist first; then ``output_filelist`` is
    opened for writing (truncating any existing file), its name is printed,
    and the open handle is passed to ``_convert_to_multispeaker``.
    """
    destination = Path(output_filelist).resolve()
    ensure_speaker_table()
    with open(destination, "w") as out_file:
        print(out_file.name)
        _convert_to_multispeaker(out_file, input_dataset, fmt)


def _generate_filelist_rel(input_dataset, output_filelist, rel_path):
    """Write a filelist for ``input_dataset`` using ``rel_path``-prefixed paths.

    The speaker table is ensured to exist first; then ``output_filelist`` is
    opened for writing (truncating any existing file), its name is printed,
    and the open handle is passed to ``_convert_standard_multispeaker_rel``.
    """
    destination = Path(output_filelist).resolve()
    ensure_speaker_table()
    with open(destination, "w") as out_file:
        print(out_file.name)
        _convert_standard_multispeaker_rel(out_file, input_dataset, rel_path)


def _convert_to_multispeaker(f, inp: str, fmt: str):
    """Dispatch to the converter that handles the dataset format ``fmt``.

    Only the ``standard-multispeaker`` and ``vctk`` formats are accepted; any
    other value trips the assertion below. The selected converter writes the
    filelist for the dataset at ``inp`` into the open file ``f``, and its
    return value is passed through.
    """
    supported = (STANDARD_MULTISPEAKER, VCTK)
    assert fmt in supported, f"Supported formats: {STANDARD_MULTISPEAKER}, {VCTK}"

    if fmt == VCTK:
        return _convert_vctk(f, inp)
    if fmt == STANDARD_MULTISPEAKER:
        return _convert_standard_multispeaker(f, inp)

In [3]:
CACHE_LOCATION.parent.exists()

True

In [33]:
# export

from typing import List
import sys


def _parse_args(args: List[str]) -> argparse.Namespace:
    """Parse the command-line arguments of the filelist generator.

    Args:
        args: Argument strings without the program name, e.g.
            ``sys.argv[1:]``.

    Returns:
        Namespace with the attributes ``input``, ``format`` (defaults to the
        standard multispeaker format), ``output`` (defaults to ``list.txt``)
        and ``rel_path`` (defaults to ``None``).
    """
    parser = argparse.ArgumentParser(
        description="Convert a speech dataset to the standard filelist format."
    )
    parser.add_argument("-i", "--input", help="Path to input dataset directory")
    parser.add_argument(
        "-f", "--format", help="Input dataset format", default=STANDARD_MULTISPEAKER
    )
    parser.add_argument(
        "-o", "--output", help="Path of the filelist to write", default="list.txt"
    )
    parser.add_argument(
        "--rel_path",
        help=(
            "Write audio paths as <rel_path>/<speaker>/<file> instead of "
            "resolved absolute paths"
        ),
        default=None,
    )
    return parser.parse_args(args)


# nbdev is optional at runtime: when it (or the flag) cannot be imported we
# are not inside a notebook, so fall back to False. ``Exception`` keeps the
# import best-effort without also swallowing KeyboardInterrupt/SystemExit.
try:
    from nbdev.imports import IN_NOTEBOOK
except Exception:
    IN_NOTEBOOK = False

# Script entry point: only runs for ``python -m ...``, never while the
# notebook itself is being executed.
if __name__ == "__main__" and not IN_NOTEBOOK:
    args = _parse_args(sys.argv[1:])
    if args.rel_path:
        # The relative-path variant always walks the standard multispeaker
        # layout; ``args.format`` is not consulted on this branch.
        _generate_filelist_rel(
            args.input, args.output, args.rel_path
        )  # make this for vctk / synthesize
    else:
        _generate_filelist(args.input, args.format, args.output)

In [32]:
# output = '/mnt/disks/uberduck-experiments-v0/test2/asdf'
# inp = '/mnt/disks/uberduck-experiments-v0/data/uberduck-multispeaker/'
# # rel_path = 'asdf'
# with open(output, "w") as f:
#     _convert_standard_multispeaker_rel(f, inp, rel_path)

['angelica', 'caesar', 'aether-genshin-impact', 'Parappa', 'Cow', 'cocoa-bandicoot', 'Chop_Chop_Master_Onion', 'Chicken', 'daisy-duck', 'benny', 'LICENSE', 'AshKetchum', '.dvc', 'CDiLink', 'kanye-rap', 'cersei', 'comic-book-guy', 'Chalmers', 'Daffy_Duck', 'brak', 'ClancyWiggum', 'avicii', 'Bubs', 'Bubble_Buddy', 'Blaze', 'cait', 'sam-lachow', 'command-conquer-eva', 'curie', 'Bubbles', 'big-gay-speaking', 'amy-rose', 'README.md', 'Chef', 'daenerys', 'Carl_Carlson', 'Bubsy', '.gitignore', 'CarolTea', 'eminem', 'b-la-b', 'KyleBrovlofski', 'CDiZelda', 'Numbuh1', 'Agnes_Skinner', 'Buttercup', 'DarkViperAU', 'relik-rapping', 'chihiro-fujisaki', 'bop-it-announcer', 'cream-the-rabbit', 'Adoring_Fan', 'cosmo', '.dvcignore', 'TAG', 'bullwinkle', '.git', 'big-gay-rapping', 'Carl_Wheezer', 'ideal-minded', 'jay-z', 'brother-bear', 'Blossom', 'alfur', 'BarneyGumble', 'jsxi', 'list.txt', 'Anne_Robinson', 'Candy_Cadet', 'da-games-singing', 'Chuckie_Finster', 'Barney', 'Baldi', 'TATII', 'cabal', 'brock

100%|████████████████████████████████████████████| 85/85 [00:00<00:00, 2190.21it/s]

angelica
caesar
aether-genshin-impact
Parappa
Cow
cocoa-bandicoot
Chop_Chop_Master_Onion
Chicken
daisy-duck
benny
AshKetchum
CDiLink
kanye-rap
cersei
comic-book-guy
Chalmers
Daffy_Duck
brak
ClancyWiggum
avicii
Bubs
Bubble_Buddy
Blaze
cait
sam-lachow
command-conquer-eva
curie
Bubbles
big-gay-speaking
amy-rose
Chef
daenerys
Carl_Carlson
Bubsy
CarolTea
eminem
b-la-b
KyleBrovlofski
CDiZelda
Numbuh1
Agnes_Skinner
Buttercup
DarkViperAU
relik-rapping
chihiro-fujisaki
bop-it-announcer
cream-the-rabbit
Adoring_Fan
cosmo
TAG
bullwinkle
big-gay-rapping
Carl_Wheezer
ideal-minded
jay-z
brother-bear
Blossom
alfur
BarneyGumble
jsxi
Anne_Robinson
Candy_Cadet
da-games-singing
Chuckie_Finster
Barney
Baldi
TATII
cabal
brock-samson
Bender_Rodriguez
Copy_Protector
Coach_Z
bam_margera
CaptFussenpepper
Amethyst
eeyore
Mandark
Brain





In [None]:
# hide
_parse_args(["-i", "foo/bar"])

Namespace(format='standard-multispeaker', input='foo/bar', output='list.txt')

In [None]:
# skip
# Manual smoke test against a dataset on the author's machine; the hard-coded
# /Users/zwf/... path has to be changed before this cell can run elsewhere.
from tempfile import NamedTemporaryFile, TemporaryFile

# Path-based entry point: only the temporary file's *name* is passed on, and
# _generate_filelist opens that path again for writing.
with NamedTemporaryFile("w") as f:
    _generate_filelist(
        str(Path("/Users/zwf/data/voice/dvc-managed/uberduck-multispeaker/").resolve()),
        "standard-multispeaker",
        f.name,
    )

# Handle-based entry point: the already-open temporary file is passed straight
# to the format dispatcher.
with TemporaryFile("w") as f:
    _convert_to_multispeaker(
        f,
        str(Path("/Users/zwf/data/voice/dvc-managed/uberduck-multispeaker/").resolve()),
        "standard-multispeaker",
    )

  5%|▍         | 4/85 [00:00<00:02, 35.22it/s]

/private/var/folders/9n/w8qrq4mx4cl_j036z8wg_9nh0000gp/T/tmp8xrrhl9s


100%|██████████| 85/85 [00:00<00:00, 94.09it/s] 
100%|██████████| 85/85 [00:00<00:00, 96.20it/s] 
