<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#New-heading" data-toc-modified-id="New-heading-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>New heading</a></span></li></ul></div>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#New-heading" data-toc-modified-id="New-heading-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>New heading</a></span></li></ul></div>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#New-heading" data-toc-modified-id="New-heading-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>New heading</a></span></li></ul></div>

In [None]:
# default_exp exec.select_speakers

In [None]:
from dataclasses import dataclass
from collections import namedtuple


@dataclass
class Foo:
    """Scratch dataclass used to check positional construction with defaulted fields."""

    # Every field defaults to None, so any leading subset can be passed positionally.
    # NOTE(review): the annotations say `str` although the defaults are None.
    foo: str = None
    bar: str = None
    baz: str = None


# Foo = namedtuple("Foo", ["bar", "baz", "blah"], defaults=(None,) * 3)
f = Foo("hi")
assert f.foo == "hi"
assert f.bar is None
assert f.baz is None

# New heading

In [None]:
# export
import argparse
from collections import namedtuple
from dataclasses import dataclass
import json
import os
from pathlib import Path
from shutil import copyfile, copytree
import sys
from typing import List

from uberduck_ml_dev.utils.audio import convert_to_wav
from uberduck_ml_dev.utils.utils import parse_vctk

STANDARD_MULTISPEAKER = "standard-multispeaker"
STANDARD_SINGLESPEAKER = "standard-singlespeaker"
VCTK = "vctk"
FORMATS = [
    STANDARD_MULTISPEAKER,
    STANDARD_SINGLESPEAKER,
    VCTK,
]


@dataclass
class Dataset:
    """Description of one input dataset to be merged into the combined output."""

    # Root directory of the dataset on disk.
    path: str
    # Layout of the dataset; the converters in this module accept
    # STANDARD_MULTISPEAKER and VCTK.
    format: str = STANDARD_MULTISPEAKER
    # Comma-separated speaker names to keep. None (or an empty string) means
    # "use every speaker found in the dataset".
    speakers: str = None


def _convert_vctk(f, out_path: str, ds: Dataset, start_speaker_id: int):
    """Convert selected VCTK speakers to wav and append their rows to the filelist.

    Args:
        f: Writable text file object. One line of the form
            ``<speaker>/<file>.wav|<transcription>|<speaker_id>`` is written
            per utterance.
        out_path: Output dataset directory. Each speaker gets a sub-directory
            named after the speaker, holding the converted wav files.
        ds: Dataset whose ``format`` is ``VCTK``. ``ds.speakers`` is an
            optional comma-separated list of speaker names; when it is empty
            every speaker returned by ``parse_vctk`` is converted.
        start_speaker_id: Id assigned to the first converted speaker. Later
            speakers receive consecutive ids.

    Returns:
        The number of speakers that were converted.
    """
    assert ds.format == VCTK, "VCTK is the only format supported by this function!"
    vctk_data = parse_vctk(ds.path)
    # None means "no filter"; a set keeps the per-speaker membership test O(1).
    wanted = set(ds.speakers.split(",")) if ds.speakers else None
    speaker_id = start_speaker_id
    for speaker_name, speaker_data in vctk_data.items():
        if wanted is not None and speaker_name not in wanted:
            continue
        speaker_out_path = Path(out_path) / speaker_name
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        speaker_out_path.mkdir(parents=True, exist_ok=True)
        for transcription, flac_path in speaker_data:
            assert flac_path.endswith(".flac")
            # Swap only the extension: str.replace would also rewrite a
            # ".flac" that happens to occur in the middle of the file name.
            basename = Path(flac_path).with_suffix(".wav").name
            rel_path = Path(speaker_name) / basename
            convert_to_wav(flac_path, str(speaker_out_path / basename))
            f.write(f"{rel_path}|{transcription}|{speaker_id}\n")
        speaker_id += 1
    return speaker_id - start_speaker_id


def _convert_standard_multispeaker(
    f, out_path: str, ds: Dataset, start_speaker_id: int
):
    speaker_id = start_speaker_id
    if ds.speakers:
        speakers = ds.speakers.split(",")
    else:
        speakers = os.listdir(root)
    for speaker in speakers:
        path = Path(root) / Path(speaker)
        files = os.listdir(path)
        transcriptions, *_ = [f for f in files if f.endswith(".txt")]
        with (Path(root) / speaker / transcriptions).open("r") as txn_f:
            transcriptions = txn_f.readlines()
        for line in transcriptions:
            line = line.strip("\n")
            try:
                line_path, line_txn, *_ = line.split("|")
            except Exception as e:
                print(e)
                print(line)
                raise
            line = f"{str(Path(speaker) / Path(line_path))}|{line_txn}"
            f.write(f"{line}|{speaker_id}\n")
        wavs_dvc = path / "wavs.dvc"
        speaker_out_path = Path(out_path) / speaker
        if not speaker_out_path.exists():
            os.makedirs(speaker_out_path)
        # if wavs_dvc.exists():
        #     copyfile(wavs_dvc, speaker_out_path / "wavs.dvc")
        wavs_dir = path / "wavs"
        if wavs_dir.exists():
            # copytree(wavs_dir, speaker_out_path / "wavs")
            os.symlink(wavs_dir, speaker_out_path / "wavs", target_is_directory=True)
        speaker_id += 1
    return speaker_id - start_speaker_id


def _convert_to_multispeaker(f, out_path: str, ds: Dataset, start_speaker_id: int):
    """Dispatch a dataset to the converter that matches its format.

    Prints the dataset's format, path and speaker selection, then forwards all
    arguments to the format-specific converter and returns its result (the
    number of speakers that converter processed).
    """
    supported_formats = [STANDARD_MULTISPEAKER, VCTK]
    assert (
        ds.format in supported_formats
    ), f"Supported formats: {STANDARD_MULTISPEAKER}, {VCTK}"

    # Progress output: one line each for format, path and speaker selection.
    for described in (ds.format, ds.path, ds.speakers):
        print(described)

    if ds.format == STANDARD_MULTISPEAKER:
        converter = _convert_standard_multispeaker
    elif ds.format == VCTK:
        converter = _convert_vctk
    else:
        # Only reachable when assertions are disabled.
        return None
    return converter(f, out_path, ds, start_speaker_id)


def select_speakers(datasets: List[Dataset], out_dir):
    """Merge the given datasets into ``out_dir`` with one combined ``list.txt``.

    The output directory is created when missing. Datasets are converted in
    order, and each one starts numbering its speakers where the previous one
    stopped, so speaker ids are unique across the whole output.
    """
    out_path = Path(out_dir)
    if not out_path.exists():
        os.makedirs(out_path)

    next_speaker_id = 0
    with open(out_path / "list.txt", "w") as f:
        for ds in datasets:
            converted = _convert_to_multispeaker(f, out_path, ds, next_speaker_id)
            next_speaker_id = next_speaker_id + converted


def parse_args(args):
    """Parse the command-line arguments of the speaker-selection script.

    Recognised options:
        -o/--out: output dataset directory (defaults to ``./dataset``).
        --config: path to a JSON config.
        -d/--dataset: may be repeated; each occurrence collects its values
            into one list, so the parsed attribute is a list of lists.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-o",
        "--out",
        default="./dataset",
        help="Path to dataset out directory",
    )
    cli.add_argument("--config", help="path to JSON config")
    cli.add_argument("-d", "--dataset", nargs="*", action="append")
    parsed = cli.parse_args(args)
    return parsed


try:
    from nbdev.imports import IN_NOTEBOOK
except ImportError:
    # nbdev is only needed to detect notebook execution; without it this
    # module is assumed to run as a plain script.
    IN_NOTEBOOK = False

if __name__ == "__main__" and not IN_NOTEBOOK:
    # Command-line entry point: build the dataset list from either a JSON
    # config or repeated -d/--dataset options, then merge the datasets.
    args = parse_args(sys.argv[1:])
    if args.config:
        # json.load needs an open file object, not a path.
        with open(args.config) as config_file:
            config = json.load(config_file)
        # assumes config["dataset"] is a list of argument lists accepted by
        # Dataset(*d), e.g. [path, format, speakers] -- TODO confirm
        dataset = config["dataset"]
    elif args.dataset:
        dataset = args.dataset
    else:
        raise SystemExit("Dataset must be specified")
    dataset_collection = [Dataset(*d) for d in dataset]
    select_speakers(dataset_collection, args.out)

In [None]:
# Each -d occurrence collects its own values into a separate list.
args = parse_args(["-d", "test/one/two", "three,four,five", "-d", "foo/bar/baz"])
assert len(args.dataset) == 2
assert args.dataset == [["test/one/two", "three,four,five"], ["foo/bar/baz"]]
# A single -d with one value still yields a list of lists.
args = parse_args(["-d", "foo"])
assert len(args.dataset) == 1
assert args.dataset == [["foo"]]
# Dataset fields are filled positionally in the order (path, format, speakers),
# so the second value of a -d group is taken as the format, not the speakers.
[Dataset(*d) for d in [["test/one/two", "three,four,five"], ["foo/bar/baz"]]]

[Dataset(path='test/one/two', format='three,four,five', speakers=None),
 Dataset(path='foo/bar/baz', format='standard-multispeaker', speakers=None)]

In [None]:
args = parse_args(
    ["--dataset", "~/data/voice/dvc-managed", "standard-multispeaker", "brock-sampson"]
)

In [None]:
[Dataset(*d) for d in args.dataset]

[Dataset(path='~/data/voice/dvc-managed', format='standard-multispeaker', speakers='brock-sampson')]

In [None]:
# sam stuff
import os
import sqlite3

import numpy as np
import pandas as pd


def parse_vctk2(folder):
    """Collect (wav path, transcription, speaker index) rows from a VCTK-style folder.

    Args:
        folder: Path prefix of the corpus. It is joined to the sub-directory
            names by plain string concatenation, so it must already end with a
            path separator.

    Returns:
        A tuple ``(output, namelist)``. ``output`` is the concatenation of one
        DataFrame per speaker with columns 0 (wav path), 1 (text) and
        2 (speaker counter). ``namelist`` is an array with one
        ``[speaker, counter]`` row per speaker that contributed rows.
    """

    wav_dir = folder + "wav48_silence_trimmed"
    txt_dir = folder + "txt"
    speaker_wavs = os.listdir(wav_dir)
    speaker_txts = os.listdir(txt_dir)
    # Only speakers present in both the audio and the text tree are used.
    speakers = np.intersect1d(speaker_wavs, speaker_txts)

    output_dict = {}
    # wav_dict = {}
    # txt_dict = {}
    # speaker_dict = {}
    # counter is the running speaker index; it only advances for speakers that
    # actually contribute at least one row.
    counter = 0
    namelist = np.zeros((0, 2), dtype=object)
    for speaker in speakers:

        speaker_wav_dir = wav_dir + "/" + speaker
        speaker_txt_dir = txt_dir + "/" + speaker
        wav_files_speaker = np.asarray(os.listdir(speaker_wav_dir))
        txt_files_speaker = np.asarray(os.listdir(speaker_txt_dir))
        # data_dict[wav_dir] = pd.DataFrame()

        wav_files = np.asarray([])
        nwavfiles = len(wav_files_speaker)
        # The first 8 characters of a file name are used as the utterance key
        # (presumably "<speaker>_<utterance>" such as "p225_001" -- TODO confirm).
        list1 = np.asarray(
            [txt_files_speaker[i][:8] for i in range(len(txt_files_speaker))]
        )
        list2 = np.asarray([wav_files_speaker[i][:8] for i in range(nwavfiles)])
        # Character 12 of the wav file name is read as the microphone number
        # (presumably names look like "p225_001_mic1.flac" -- TODO confirm).
        # NOTE(review): raises IndexError for names shorter than 13 characters.
        mic = np.asarray([wav_files_speaker[i][12] for i in range(nwavfiles)])
        # Keep only the recordings whose microphone character is "1".
        mic1_ind = mic == "1"
        wav_files_speaker = wav_files_speaker[mic1_ind]
        list2 = list2[mic1_ind]
        # Align text and audio files: keep the keys present on both sides and
        # order both lists by key so index g refers to the same utterance.
        combined_files = np.intersect1d(list1, list2)
        matching_inds1 = np.where(np.isin(list1, combined_files))[0]
        matching_inds2 = np.where(np.isin(list2, combined_files))[0]
        inds1 = matching_inds1[list1[matching_inds1].argsort()]
        inds2 = matching_inds2[list2[matching_inds2].argsort()]
        txt_files_speaker = txt_files_speaker[inds1]
        wav_files_speaker = wav_files_speaker[inds2]
        texts = list()
        for g in range(len(txt_files_speaker)):
            text_file = speaker_txt_dir + "/" + txt_files_speaker[g]
            with open(text_file) as f:
                contents = f.read().splitlines()
            # print(contents)
            # NOTE(review): every line of the text file is appended, so a file
            # with more than one line makes texts longer than wav_files.
            texts = np.append(texts, contents)

            wav_file = speaker_wav_dir + "/" + wav_files_speaker[g]
            wav_files = np.append(wav_files, wav_file)

        if wav_files.shape[0] > 0:
            # Rows are built as three parallel sequences and then transposed
            # into one row per utterance.
            output_dict[speaker] = pd.DataFrame(
                [wav_files, texts, np.repeat(counter, wav_files.shape[0])]
            ).transpose()
            # NOTE(review): np.asarray on a mixed [str, int] list yields a
            # string array, so counter is presumably stored as text -- confirm.
            namelist = np.vstack([namelist, np.asarray([[speaker, counter]])])
            counter = counter + 1

    # NOTE(review): pd.concat raises ValueError when no speaker produced rows.
    output = pd.concat(list(output_dict.values()))
    return (output, namelist)


def load_filepaths_and_text(filename, split="|"):
    """Read a UTF-8 filelist and return its lines as lists of fields.

    Each line is stripped of surrounding whitespace and then split on
    ``split``; the result has one list of strings per line of the file.
    """
    rows = []
    with open(filename, encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.strip().split(split)
            rows.append(fields)
    return rows


def synthesize_speakerids2(filelists, fix_indices_index=None):
    """Load several filelists and renumber speaker ids so they do not collide.

    Every filelist is read with ``load_filepaths_and_text`` into a DataFrame
    whose third column holds the speaker id. The filelist at
    ``fix_indices_index`` keeps its ids. In every other filelist, an id that
    is already used by a previously processed filelist (or by the fixed one)
    is replaced by the smallest non-negative integer that is not in use.

    Speaker ids are compared and written as strings, because that is how they
    come out of the filelist loader.

    Args:
        filelists: Sequence of filelist paths.
        fix_indices_index: Position in ``filelists`` of the filelist whose
            speaker ids must stay unchanged. ``None`` means no filelist is
            pinned; negative positions count from the end.

    Returns:
        Dict mapping each filelist path to its DataFrame with the speaker id
        column (third column) renumbered as described.
    """
    data_dict = {}
    for filelist in filelists:
        data_dict[filelist] = pd.DataFrame(load_filepaths_and_text(filelist))

    def speaker_ids(frame):
        # Rows that lack a third field show up as missing values; skip them.
        return {str(value) for value in frame.iloc[:, 2].dropna().unique()}

    if fix_indices_index is not None and fix_indices_index < 0:
        # Normalise so the position comparison in the loop below works.
        fix_indices_index += len(filelists)

    if fix_indices_index is None:
        reserved_speakers = set()
    else:
        reserved_speakers = speaker_ids(data_dict[filelists[fix_indices_index]])

    data_dict_out = {}
    for position, source_file in enumerate(filelists):
        data = data_dict[source_file]
        if position != fix_indices_index:
            speakers = speaker_ids(data)
            # Ids (not positions) that clash with ids already handed out.
            overlap = sorted(speakers & reserved_speakers)
            if overlap:
                taken = speakers | reserved_speakers
                replacements = {}
                candidate = 0
                for old_id in overlap:
                    while str(candidate) in taken:
                        candidate += 1
                    replacements[old_id] = str(candidate)
                    candidate += 1
                # Map every row exactly once so a freshly assigned id is never
                # remapped a second time.
                column = data.columns[2]
                data[column] = [
                    replacements.get(str(value), value) for value in data[column]
                ]
            reserved_speakers |= speaker_ids(data)
        data_dict_out[source_file] = data
    return data_dict_out


def get_filelist(database, speakerjson):
    """Placeholder for building a filelist from a list of speakers.

    Not implemented yet: the arguments are ignored and None is returned.
    """
    return None


def subset_speakers(database, seed, speakerlist=None, nspeakers=None):
    """Placeholder for writing a filelist restricted to a subset of speakers.

    The intended behavior is to take a filelist and save another one that
    contains either a random subset of speakers or the speakers named in
    ``speakerlist``. Not implemented yet: the arguments are ignored and None
    is returned.
    """
    return None

In [None]:
vctk_filelist2, namelist = parse_vctk2(vctk_folder)
print(namelist)

In [None]:
# Prototype of a small SQLite catalogue that records, per speaker, where its
# filelist lives. The connection is left open so later cells can keep using it.
conn = sqlite3.connect("test.db")

# IF NOT EXISTS keeps the cell re-runnable once test.db has been created.
conn.execute(
    """CREATE TABLE IF NOT EXISTS DATAINFO
         (ID INT PRIMARY KEY     NOT NULL,
         NAME           TEXT     NOT NULL,
         SOURCE         TEXT     NOT NULL,
         FILELIST       TEXT,
         SPEAKERID      INT);"""
)

# Parameterized insert; OR REPLACE avoids a primary-key conflict on re-runs.
conn.execute(
    "INSERT OR REPLACE INTO DATAINFO (ID,NAME,SOURCE,FILELIST,SPEAKERID) "
    "VALUES (?, ?, ?, ?, ?)",
    (
        1,
        "eminem",
        "uberduck",
        "/mnt/disks/uberduck-experiments-v0/uberduck-ml-dev/experiments/processed_metadata/eminem_all_processed.txt",
        0,
    ),
)
# Without an explicit commit the inserted row is rolled back when the
# connection goes away.
conn.commit()

# The ID column is not strictly required, but it is convenient to have.
cursor = conn.execute("SELECT ID, NAME, SOURCE, FILELIST from DATAINFO")
for row in cursor:
    print("ID = ", row[0])
    print("NAME = ", row[1])
    print("SOURCE = ", row[2])
    print("FILELIST = ", row[3])

In [None]:
# This is an example of a speaker list that would be loaded from a JSON file.
# Such a JSON file could also be generated by querying the database, e.g. with
# subset_speakers(database, seed, speakerlist=None, nspeakers=None)
speakerlist = np.asarray([["vctk", "p302"], ["uberduck", "eminem"]])
print(speakerlist)

[['vctk' 'p302']
 ['uberduck' 'eminem']]


In [None]:
# Next step: call get_filelist(database, speakerjson).
# The filelist locations used below would eventually be fetched from the database.
metalist_dir = "/mnt/disks/uberduck-experiments-v0/uberduck-ml-dev/experiments/metadata_collections"
metalist_files = os.listdir(metalist_dir)
# print(metalist_files)
train_ratios = np.ones(4) * 1.0
print(np.load(metalist_dir + "/" + metalist_files[0]))
print(np.load(metalist_dir + "/libritts_processed_file.npy"))
print(np.load(metalist_dir + "/uberduck_processed_files.npy", allow_pickle=True))
print(
    np.load(
        "/mnt/disks/uberduck-experiments-v0/uberduck-ml-dev/experiments/metadata_collections/vctk_processed_file.npy"
    )
)

In [None]:
# Merge the speaker ids of the filelists recorded in the metadata collections,
# then keep only some of the resulting tables (here positions 2, 4 and 5).
# Eventually this should also be able to take only the rows of a multispeaker
# dataset that belong to particular speakers.
filelists = np.asarray([])
# files = np.asarray([])
# Only the first four metadata collection files are read.
for r in range(4):
    files = np.load(metalist_dir + "/" + metalist_files[r], allow_pickle=True)
    filelist = np.asarray([])

    # A collection with ndim > 0 is iterated entry by entry; otherwise the
    # loaded value itself is used as the single entry.
    if files.ndim > 0:
        nfiles = files.shape[0]
        for s in range(nfiles):
            filelist = np.append(filelist, files[s])
    else:
        filelist = files
    filelists = np.append(filelists, filelist)

print(filelists)
# The filelist at position 1 is the one whose speaker ids are kept fixed.
dd = synthesize_speakerids2(filelists, 1)
ad = list(dd.values())
# Select the tables at positions 2, 4 and 5 and stack them into one frame.
ad2 = [ad[i] for i in [2, 4, 5]]
alldata = pd.concat(ad2)

['/mnt/disks/uberduck-experiments-v0/uberduck-ml-dev/experiments/processed_metadata/lj7all_processed.txt'
 '/mnt/disks/uberduck-experiments-v0/uberduck-ml-dev/experiments/processed_metadata/librittsall_processed.txt'
 '/mnt/disks/uberduck-experiments-v0/uberduck-ml-dev/experiments/processed_metadata/eminem_all_processed.txt'
 '/mnt/disks/uberduck-experiments-v0/uberduck-ml-dev/experiments/processed_metadata/ben-shapiro_all_processed.txt'
 '/mnt/disks/uberduck-experiments-v0/uberduck-ml-dev/experiments/processed_metadata/jay-z_all_processed.txt'
 '/mnt/disks/uberduck-experiments-v0/uberduck-ml-dev/experiments/processed_metadata/vctkall_processed.txt']


  mask &= (ar1 != a)


In [None]:
print(alldata)

                                                       0  \
0      /mnt/disks/uberduck-experiments-v0/data/uberdu...   
1      /mnt/disks/uberduck-experiments-v0/data/uberdu...   
2      /mnt/disks/uberduck-experiments-v0/data/uberdu...   
3      /mnt/disks/uberduck-experiments-v0/data/uberdu...   
4      /mnt/disks/uberduck-experiments-v0/data/uberdu...   
...                                                  ...   
43880  /mnt/disks/uberduck-experiments-v0/data/vctk/w...   
43881  /mnt/disks/uberduck-experiments-v0/data/vctk/w...   
43882  /mnt/disks/uberduck-experiments-v0/data/vctk/w...   
43883  /mnt/disks/uberduck-experiments-v0/data/vctk/w...   
43884  /mnt/disks/uberduck-experiments-v0/data/vctk/w...   

                                                       1    2  
0            It's like this and like that and like this.    0  
1      I'm the illest rapper to hold the cordless, pa...    0  
2      I'm meaner in action than Roscoe beatin' James...    0  
3      I grew up in a w