Other examples of dataset implementations:
* [torchvision](https://github.com/pytorch/vision/blob/master/torchvision/datasets/mnist.py)

In [144]:
import torch
import torchvision 
import torchaudio

import os

import random
from functools import reduce, partial
from warnings import warn

# Cache

In [145]:
import pickle

class Cache:
    """Iterator that stores every item drawn from a generator on disk.

    Each new item pulled from ``generator`` is pickled to its own file inside
    ``location``. Items already drawn can be read back by index, and a fresh
    iteration replays the stored items before pulling new ones.
    """

    def __init__(self, generator, location):
        self.generator = generator
        self.location = location

        # Used as the file-name prefix; presumably to keep the files of
        # different Cache instances apart.
        self._id = id(self)
        # Paths of the pickled items, in the order they were produced.
        self._cache = []
        # Position of the next item handed out by __next__.
        self._internal_index = 0

    def __iter__(self):
        # Every new iteration starts again from the first stored item.
        self._internal_index = 0
        return self

    def __next__(self):
        position = self._internal_index

        if position < len(self):
            # Already stored: read it back from disk.
            item = self[position]
        else:
            # StopIteration from the generator ends the iteration.
            item = next(self.generator)

            name = "{}-{}".format(self._id, len(self))
            target = os.path.join(self.location, name)
            self._cache.append(target)

            os.makedirs(self.location, exist_ok=True)
            with open(target, 'wb') as stream:
                pickle.dump(item, stream)

        self._internal_index += 1
        return item

    def __getitem__(self, index):
        """Unpickle and return the stored item at ``index``."""
        with open(self._cache[index], 'rb') as stream:
            return pickle.load(stream)

    def __len__(self):
        """Return the number of items stored so far."""
        return len(self._cache)

In [146]:
def gen():
    """Yield 0 and then 1; a tiny generator used to exercise Cache."""
    yield from range(0, 2)
        
# Wrap the toy generator in a Cache that stores its items under "tmp/".
cache = Cache(gen(), "tmp/")

# Iterating pulls each item from the generator and pickles it to disk.
for c in cache:
    print(c)

0
1


# Common tools

In [147]:
def compose(*funcs):
    """Chain ``funcs`` left to right into one single-argument callable.

    ``compose(f, g)(x)`` evaluates ``g(f(x))``. With no functions the
    argument is returned unchanged.
    """
    def pipeline(x):
        value = x
        for func in funcs:
            value = func(value)
        return value

    return pipeline

def download(urls, root_path):
    """Map ``(url, folder)`` pairs to ``(local archive path, folder)`` pairs.

    The local path is ``root_path`` joined with the last component of the
    URL. The actual download call is commented out, so nothing is fetched.
    """
    for url, folder in urls:
        # torchvision.datasets.utils.download_url(url, root_path)
        archive_name = os.path.basename(url)
        yield os.path.join(root_path, archive_name), folder
    
def extract(files):
    """Map ``(archive path, folder)`` pairs to the folder next to the archive.

    Yields the archive's directory joined with ``folder``. The extraction
    call itself is commented out, so nothing is unpacked.
    """
    for archive, folder in files:
        # torchvision.datasets.utils.extract_archive(archive)
        parent = os.path.dirname(archive)
        yield os.path.join(parent, folder)
            
def walk(paths, extension):
    """Yield ``(path, filename)`` for every file under ``path`` with ``extension``.

    Each entry of ``paths`` is searched recursively. The yielded ``path`` is
    the search root that was passed in, not the directory that contains the
    file; ``filename`` is the bare file name.

    Only names that END with ``extension`` match, so a file such as
    ``clip.wav.bak`` is not picked up when looking for ``.wav``.
    """
    for path in paths:
        for _, _, filenames in os.walk(path):
            for filename in filenames:
                if filename.endswith(extension):
                    yield path, filename

def shuffle(generator):
    """Yield the items of ``generator`` in random order.

    The whole input is loaded into memory first, because shuffling needs
    every item at once.
    """
    items = [*generator]
    random.shuffle(items)
    yield from items

def filtering(fileids, reference):
    """Keep only the ``(path, fileid)`` pairs whose id appears in a reference file.

    The reference file is ``reference`` joined onto ``path``. Its text is
    read once per run of identical consecutive paths and reused until the
    path changes. A ``fileid`` is kept when it occurs anywhere in that text
    (plain substring test).
    """
    # None can never equal a real path, so the first item always loads.
    previous_path = None
    content = ""

    for path, fileid in fileids:
        if path != previous_path:
            with open(os.path.join(path, reference)) as handle:
                content = handle.read()
            # Remember the path so the file is not reloaded for every item.
            previous_path = path

        # It would be more efficient to loop through the reference file instead
        if fileid in content:
            yield path, fileid

In [148]:
def load_yesno(fileids):
    """Load each ``(path, fileid)`` audio file and yield it with its labels.

    Yields ``(label, waveform, sample_rate)`` where ``label`` is the file
    name without extension, split on underscores.
    """
    for path, fileid in fileids:
        waveform, sample_rate = torchaudio.load(os.path.join(path, fileid))
        stem = os.path.basename(fileid).split(".")[0]

        yield stem.split("_"), waveform, sample_rate
        

def YESNO(root):
    """Build the YESNO pipeline rooted at ``root`` and return it wrapped in a Cache.

    The stages run in order: download, extract, walk for ".wav" files,
    shuffle, load. Loaded samples are cached under "tmp/".
    """
    sources = [
        ("http://www.openslr.org/resources/1/waves_yesno.tar.gz", "waves_yesno")
    ]

    stages = (
        partial(download, root_path=root),
        extract,
        partial(walk, extension=".wav"),
        shuffle,
        load_yesno,
    )
    samples = compose(*stages)(sources)

    return Cache(samples, "tmp/")


# Build the cached YESNO pipeline over a local dataset folder.
data = YESNO("/Users/vincentqb/yesnotest")

# Draw the first sample, then read the same sample back from the cache.
next(data)
data[0]

(['1', '1', '1', '1', '1', '1', '1', '1'],
 tensor([[-3.0518e-05,  6.1035e-05,  3.0518e-05,  ...,  3.9673e-04,
           5.4932e-04,  9.1553e-04]]),
 8000)

In [149]:
def load_vctk(fileids):
    """Load VCTK utterances from ``(path, fileid)`` pairs.

    The transcript is looked up under ``txt/<speaker>/`` and the audio under
    ``wav48/<speaker>/``, where the speaker is the part of the file id
    before the first underscore. Entries whose transcript file is missing
    are skipped with a warning.

    Yields ``(fileid, content, waveform, sample_rate)``; ``content`` is the
    first line of the transcript file.
    """
    transcript_dir, transcript_suffix = "txt", ".txt"
    wav_dir, wav_suffix = "wav48", ".wav"

    for path, fileid in fileids:
        utterance = os.path.basename(fileid).split(".")[0]
        speaker = utterance.split("_")[0]
        transcript_path = os.path.join(
            path, transcript_dir, speaker, utterance + transcript_suffix
        )
        wav_path = os.path.join(path, wav_dir, speaker, utterance + wav_suffix)

        try:
            with open(transcript_path) as handle:
                content = handle.readlines()[0]
        except FileNotFoundError:
            warn("Translation not found for {}".format(wav_path))
            continue

        waveform, sample_rate = torchaudio.load(wav_path)

        yield utterance, content, waveform, sample_rate
        
        
def VCTK(root):
    """Build the VCTK pipeline rooted at ``root`` and return it wrapped in a Cache.

    The stages run in order: download, extract, walk for ".wav" files,
    shuffle, load. Loaded samples are cached under "tmp/".
    """
    sources = [
        ('http://homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz', "VCTK-Corpus/")
    ]

    stages = (
        partial(download, root_path=root),
        extract,
        partial(walk, extension=".wav"),
        shuffle,
        load_vctk,
    )
    samples = compose(*stages)(sources)

    return Cache(samples, "tmp/")


# Build the cached VCTK pipeline over a local dataset folder.
data = VCTK("/Users/vincentqb/vctktest/")

# Draw the first sample, then read the same sample back from the cache.
next(data)
data[0]

('p282_342',
 'In addition, it was in breach of natural justice.\n',
 tensor([[-0.0035, -0.0047, -0.0041,  ..., -0.0043, -0.0050, -0.0036]]),
 48000)

In [150]:
def load_librispeech(fileids):
    """Load LibriSpeech utterances from ``(data_path, fileid)`` pairs.

    A file id has the form ``<folder1>-<folder2>-<file>``. The audio and the
    chapter transcript (``<folder1>-<folder2>.trans.txt``) are both expected
    in ``data_path/<folder1>/<folder2>/``.

    The transcript is searched first and closed as soon as the search ends.
    The audio is only decoded once a transcript line was found. Entries
    without one are skipped with a warning.

    Yields ``(fileid, waveform, sample_rate, content)``.
    """
    text_extension = ".trans.txt"
    audio_extension = ".flac"

    for data_path, fileid in fileids:
        fileid = os.path.basename(fileid).split(".")[0]
        folder1, folder2, _ = fileid.split("-")
        chapter_dir = os.path.join(data_path, folder1, folder2)
        file_text = os.path.join(chapter_dir, folder1 + "-" + folder2 + text_extension)
        file_audio = os.path.join(chapter_dir, fileid + audio_extension)

        content = None
        with open(file_text) as transcripts:
            for line in transcripts:
                # partition tolerates lines without a space (blank lines or
                # ids with no text), where unpacking a split would raise.
                fileid_text, _, text = line.strip().partition(" ")
                if fileid_text == fileid:
                    content = text
                    break

        if content is None:
            warn("Translation not found for {}.".format(fileid))
            continue

        waveform, sample_rate = torchaudio.load(file_audio)

        yield fileid, waveform, sample_rate, content
        

def LIBRISPEECH(root, selection="dev-clean"):
    """Build the LibriSpeech pipeline for one subset and return it wrapped in a Cache.

    Args:
        root: Folder that holds the archive and the extracted data.
        selection: Subset name; one of "dev-clean", "test-clean",
            "test-other", "train-clean-100", "train-clean-360",
            "train-other-500". The archive URL is
            ``http://www.openslr.org/resources/12/<selection>.tar.gz``.

    Raises:
        ValueError: If ``selection`` is not one of the known subsets.

    The stages run in order: download, extract, walk for ".flac" files,
    shuffle, load. Loaded samples are cached under "tmp/".
    """
    selections = [
        "dev-clean",
        "test-clean",
        "test-other",
        "train-clean-100",
        "train-clean-360",
        "train-other-500",
    ]
    # Fail early: an unknown subset would otherwise give an empty pipeline.
    if selection not in selections:
        raise ValueError(
            "Unknown LibriSpeech selection {!r}; expected one of: {}.".format(
                selection, ", ".join(selections)
            )
        )

    base = "http://www.openslr.org/resources/12/"
    url = [
        # URLs always use "/", so concatenate rather than use os.path.join.
        (base + selection + ".tar.gz", os.path.join("LibriSpeech", selection))
    ]

    pipeline = compose(
        partial(download, root_path=root),
        extract,
        partial(walk, extension=".flac"),
        shuffle,
        load_librispeech,
    )

    return Cache(pipeline(url), "tmp/")


# Build the cached LibriSpeech pipeline with the default "dev-clean" subset.
data = LIBRISPEECH("/Users/vincentqb/librispeechtest/")

# Draw the first sample, then read the same sample back from the cache.
next(data)
data[0]

('7976-105575-0003',
 tensor([[ 3.0518e-05, -3.0518e-05,  0.0000e+00,  ...,  2.4414e-04,
           2.4414e-04,  1.2207e-04]]),
 16000,
 "THERE WERE NO BREASTWORKS YET THAT ONE LITTLE BRIGADE OF HAMILTON'S DIVISION STOOD THERE IN THE OPEN AND REPULSED ASSAULT AFTER ASSAULT")

In [151]:
def load_commonvoice(fileids, tsv):
    """Load CommonVoice clips from ``(path, fileid)`` pairs.

    For each pair, the TSV file ``path/tsv`` is scanned for the first line
    that contains ``fileid``. That line's tab-separated fields are yielded
    as a list, followed by the values returned by ``torchaudio.load`` for
    ``path/clips/fileid``. Clips with no matching line are skipped silently.

    Args:
        fileids: Iterable of ``(path, fileid)`` pairs.
        tsv: Name of the TSV file, relative to each ``path``.
    """
    for path, fileid in fileids:
        filename = os.path.join(path, "clips", fileid)
        # Use a separate name: rebinding ``tsv`` would make every later
        # iteration join ``path`` onto an already joined path.
        tsv_path = os.path.join(path, tsv)

        row = None
        with open(tsv_path) as table:
            for line in table:
                if fileid in line:
                    row = line.strip().split("\t")
                    break
        if row is None:
            continue

        row.extend(torchaudio.load(filename))
        yield row
        

def COMMONVOICE(root, language="tatar", tsv="train.tsv"):
    """Build the CommonVoice pipeline for one language and return it wrapped in a Cache.

    ``language`` is a key of the table below and selects the archive name;
    an unknown language raises KeyError. ``tsv`` names the metadata file
    handed to the loader. The stages run in order: download, extract, walk
    for ".mp3" files, shuffle, load. Loaded samples are cached under "tmp/".
    """
    web = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/"

    # Language name -> code used in the archive file name.
    languages = {
        "tatar": "tt",
        "english": "en",
        "german": "de",
        "french": "fr",
        "welsh": "cy",
        "breton": "br",
        "chuvash": "cv",
        "turkish": "tr",
        "kyrgyz": "ky",
        "irish": "ga-IE",
        "kabyle": "kab",
        "catalan": "ca",
        "taiwanese": "zh-TW",
        "slovenian": "sl",
        "italian": "it",
        "dutch": "nl",
        "hakha chin": "cnh",
        "esperanto": "eo",
        "estonian": "et",
        "persian": "fa",
        "basque": "eu",
        "spanish": "es",
        "chinese": "zh-CN",
        "mongolian": "mn",
        "sakha": "sah",
        "dhivehi": "dv",
        "kinyarwanda": "rw",
        "swedish": "sv-SE",
        "russian": "ru",
    }

    code = languages[language]
    # The empty folder name makes extract() yield the archive's own directory.
    sources = [("{}{}.tar.gz".format(web, code), "")]

    stages = (
        partial(download, root_path=root),
        extract,
        partial(walk, extension=".mp3"),
        # partial(filtering, reference=tsv),
        shuffle,
        partial(load_commonvoice, tsv=tsv),
    )
    samples = compose(*stages)(sources)

    return Cache(samples, "tmp/")


# Build the cached CommonVoice pipeline with the defaults (Tatar, train.tsv).
data = COMMONVOICE("/Users/vincentqb/commonvoicetest/")

# Draw the first sample, then read the same sample back from the cache.
next(data)
data[0]

['bb10e83bdf015da18144f427509d8cb56cfa4884527dc0cb3da927c845b733e48d3c451ae9538723b747fd6e34b15a863635e71b09a7611b7484f09e4cd109be',
 'common_voice_tt_17733179.mp3',
 'Белгән мең бәладән котылган.',
 '2',
 '0',
 'thirties',
 'male',
 tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.8723e-05,
          -3.7115e-05, -5.9720e-05]]),
 48000]

# yesno

In [None]:
# yesno
# filename = "waves_yesno.tar.gz"
# url = "http://www.openslr.org/resources/1/" + filename

# Archive URL, its file name, and the local working folder.
url = "http://www.openslr.org/resources/1/waves_yesno.tar.gz"
filename = os.path.basename(url)
root = "/Users/vincentqb/yesnotest/"

# Plain concatenation is enough here because root ends with a slash.
inpath = root + filename
dset_abs_path = root + "waves_yesno/"

In [None]:
# Fetch the archive; root is passed as the destination argument.
torchvision.datasets.utils.download_url(url, root)

# torchvision.datasets.utils.extract_archive(inpath, root)  # FIXME no extraction with root
# Unpack without an explicit destination (see the FIXME above).
torchvision.datasets.utils.extract_archive(inpath)

In [None]:
def iterate_yesno(root):
    """Yield the full path of every entry of ``root`` whose name contains ".wav".

    Only the top level of ``root`` is listed; sub-folders are not searched.
    """
    wav_names = (name for name in os.listdir(root) if ".wav" in name)
    for name in wav_names:
        yield os.path.join(root, name)

def load_yesno(file_generator):
    """Load each audio file path and yield it with its labels.

    Yields ``(label, waveform, sample_rate)`` where ``label`` is the part of
    the file name before the first dot, split on underscores.
    """
    for file in file_generator:
        waveform, sample_rate = torchaudio.load(file)

        stem = os.path.basename(file).split(".", 1)[0]

        yield stem.split("_"), waveform, sample_rate

# Chain discovery and loading; nothing is read until g is iterated.
g = load_yesno(iterate_yesno(dset_abs_path))

In [None]:
# Exhaust the generator to count the samples it yields.
len(list(g))

# vctk

In [None]:
# Two archive names are used; presumably the downloaded zip contains the
# corpus zip — TODO confirm.
filename1 = "DS_10283_2651.zip"
filename2 = "VCTK-Corpus.zip"
url = "http://datashare.is.ed.ac.uk/download/" + filename1
root = "/Users/vincentqb/vctktest/"

# Plain concatenation is enough here because root ends with a slash.
inpath1 = root + filename1
inpath2 = root + filename2
dset_abs_path = root + "VCTK-Corpus/"

In [None]:
# torchvision.datasets.utils.download_url(url, root)

# torchvision.datasets.utils.extract_archive(inpath, root)  # FIXME no extraction with root
# Unpack both archives in order, without an explicit destination.
torchvision.datasets.utils.extract_archive(inpath1)
torchvision.datasets.utils.extract_archive(inpath2)

In [None]:
def iterate_vctk(root):
    """Walk ``root/txt/`` and pair every transcript with its audio path.

    Yields ``(base, file_txt, file_audio)`` where ``base`` is the transcript
    file name up to the first dot, and the audio path is
    ``root/wav48/<speaker>/<base>.wav``. The speaker is the part of ``base``
    before the first underscore. The audio file is not checked for existence.
    """
    txt_root = os.path.join(root, "txt/")
    wav_root = os.path.join(root, "wav48/")

    for dirpath, _, filenames in os.walk(txt_root):
        for filename in filenames:
            if ".txt" not in filename:
                continue

            utterance = filename.split(".", 1)[0]
            speaker = utterance.split("_")[0]

            yield (
                utterance,
                os.path.join(dirpath, filename),
                os.path.join(wav_root, speaker, utterance) + ".wav",
            )


def load_vctk(file_generator):
    """Read the transcript and audio for each ``(base, file_txt, file_audio)``.

    Yields ``(base, content, waveform, sample_rate)``; ``content`` is the
    first line of the transcript file.
    """
    for base, transcript_path, audio_path in file_generator:

        with open(transcript_path) as handle:
            content = handle.readlines()[0]

        waveform, sample_rate = torchaudio.load(audio_path)

        yield base, content, waveform, sample_rate


# Chain discovery and loading; nothing is read until g is iterated.
g = load_vctk(iterate_vctk(dset_abs_path))

In [None]:
# Pull one sample to check the VCTK generator end to end.
next(g)

# LibriSpeech

In [None]:
# http://www.openslr.org/12
# http://www.openslr.org/resources/12/dev-clean.tar.gz
# http://www.openslr.org/resources/12/test-clean.tar.gz
# http://www.openslr.org/resources/12/test-other.tar.gz
# http://www.openslr.org/resources/12/train-clean-100.tar.gz
# http://www.openslr.org/resources/12/train-clean-360.tar.gz
# http://www.openslr.org/resources/12/train-other-500.tar.gz

class LibriSpeech(object):
    """Iterator over the utterances of one LibriSpeech subset.

    The processing stages (download, extract, walk, optional shuffle, load)
    are chained with ``compose`` when the object is built. Each ``next()``
    yields ``(fileid, waveform, sample_rate, content)``.
    """

    url_base = "http://www.openslr.org/resources/12/"
    url_extension = ".tar.gz"
    audio_extension = ".flac"
    text_extension = ".trans.txt"

    # Subset names accepted by the constructor.
    _selection = [
        "dev-clean",
        "test-clean",
        "test-other",
        "train-clean-100",
        "train-clean-360",
        "train-other-500"
    ]

    # Sub-folder of root_path that holds the extracted subsets.
    in_archive_folder = "LibriSpeech"

    def __init__(self, selection, root_path, extracted=True, shuffle=False):
        """Set up the pipeline for ``selection`` stored under ``root_path``.

        Args:
            selection: Subset name; must be one of ``_selection``.
            root_path: Folder that holds the archive and the extracted data.
            extracted: Stored on the instance; not used by the code here.
            shuffle: When true, file ids are shuffled before loading.

        Raises:
            ValueError: If ``selection`` is not a known subset.
        """
        if selection not in self._selection:
            raise ValueError(
                "Unknown LibriSpeech selection {!r}; expected one of: {}.".format(
                    selection, ", ".join(self._selection)
                )
            )

        self.root_path = root_path
        self.selection = selection
        self.extracted = extracted

        self.url = self.url_base + self.selection + self.url_extension

        # ``shuffle`` is the boolean argument; ``self.shuffle`` is the stage.
        stages = [self.download, self.extract, self.walk]
        if shuffle:
            stages.append(self.shuffle)
        stages.append(self.load)

        # Initialize here or in __iter__
        self._iterator = compose(*stages)([selection])

    def __iter__(self):
        return self

    def __next__(self):
        return next(self._iterator)

    def download(self, selections):
        """Pass each selection through; the download call is disabled."""
        for selection in selections:
            # url = self.url_base + selection + self.url_extension
            # torchvision.datasets.utils.download_url(url, self.root_path)
            yield selection

    def extract(self, selections):
        """Yield the extracted folder of each selection; extraction is disabled."""
        for selection in selections:
            # archive_path = os.path.join(self.root_path, selection + self.url_extension)
            # torchvision.datasets.utils.extract_archive(archive_path)
            yield os.path.join(self.root_path, self.in_archive_folder, selection)

    def walk(self, paths):
        """Yield ``(path, fileid)`` for every audio file found under ``path``.

        ``fileid`` is the file name up to its first dot. ``path`` is the
        search root, not the directory that contains the file.
        """
        for path in paths:
            for _, _, filenames in os.walk(path):
                for filename in filenames:
                    if filename.endswith(self.audio_extension):
                        yield path, os.path.basename(filename).split(".")[0]

    @staticmethod
    def shuffle(generator):
        """Yield the items of ``generator`` in random order (loads them all)."""
        items = list(generator)
        random.shuffle(items)
        for item in items:
            yield item

    def load(self, fileids):
        """Load the audio and transcript text for each ``(data_path, fileid)``.

        A file id has the form ``<folder1>-<folder2>-<file>``; audio and
        chapter transcript are expected in ``data_path/<folder1>/<folder2>/``.
        The transcript is searched first, so the audio is only decoded for
        entries that are yielded. Entries without a transcript line are
        skipped with a warning.
        """
        for data_path, fileid in fileids:
            folder1, folder2, _ = fileid.split("-")
            chapter_dir = os.path.join(data_path, folder1, folder2)
            file_text = os.path.join(
                chapter_dir, folder1 + "-" + folder2 + self.text_extension
            )
            # Full path to the audio file, not just its bare name.
            file_audio = os.path.join(chapter_dir, fileid + self.audio_extension)

            content = None
            with open(file_text) as transcripts:
                for line in transcripts:
                    # partition tolerates lines without a space.
                    fileid_text, _, text = line.strip().partition(" ")
                    if fileid_text == fileid:
                        content = text
                        break

            if content is None:
                warn("Audio file without translation: {}.".format(fileid))
                continue

            waveform, sample_rate = torchaudio.load(file_audio)

            yield fileid, waveform, sample_rate, content


# Build the iterator for the "dev-clean" subset and pull its first sample.
ls = LibriSpeech("dev-clean", "/Users/vincentqb/librispeechtest/")
next(ls) 

# librispeech

In [None]:
# http://www.openslr.org/12
# http://www.openslr.org/resources/12/dev-clean.tar.gz
# http://www.openslr.org/resources/12/test-clean.tar.gz
# http://www.openslr.org/resources/12/test-other.tar.gz
# http://www.openslr.org/resources/12/train-clean-100.tar.gz
# http://www.openslr.org/resources/12/train-clean-360.tar.gz
# http://www.openslr.org/resources/12/train-other-500.tar.gz


url_base = "http://www.openslr.org/resources/12/"
url_extension = ".tar.gz"

# Available subsets, each listed once.
selection = [
    "dev-clean",
    "test-clean",
    "test-other",
    "train-clean-100",
    "train-clean-360",
    "train-other-500"
]

# Work with the first subset ("dev-clean").
url = url_base + selection[0] + url_extension
root = "/Users/vincentqb/librispeechtest/"

# Plain concatenation is enough here because root ends with a slash.
filename = os.path.basename(url)
inpath = root + filename

# Archive name up to its first dot, i.e. the subset name.
base = filename.split(".")[0]
dset_abs_path = root + "LibriSpeech/" + base + "/"

In [None]:
# torchvision.datasets.utils.download_url(url, root)
# torchvision.datasets.utils.extract_archive(inpath)

In [None]:
def iterate_librispeech(root):
    """Walk ``root`` and load every utterance listed in its transcript files.

    Every file whose name contains ".txt" is read line by line as
    ``<fileid> <content>``. The matching ``<fileid>.flac`` is loaded from
    the same directory. Each transcript file is closed once its lines have
    been consumed.

    Yields ``(fileid, waveform, sample_rate, content)``.
    """
    for dirpath, _, filenames in os.walk(root):
        for filename in filenames:
            if ".txt" not in filename:
                continue

            with open(os.path.join(dirpath, filename)) as transcripts:
                for line in transcripts:
                    fileid, content = line.strip().split(" ", 1)
                    audio = os.path.join(dirpath, fileid) + ".flac"
                    waveform, sample_rate = torchaudio.load(audio)
                    yield fileid, waveform, sample_rate, content

# Create the generator; nothing is read until g is iterated.
g = iterate_librispeech(dset_abs_path)

In [None]:
def iterate_librispeech(root):
    """Walk ``root`` and yield ``(root, fileid)`` for every transcript line.

    Every file whose name contains ".txt" is read line by line as
    ``<fileid> <content>``; only the id is kept. Each transcript file is
    closed once its lines have been consumed.
    """
    for dirpath, _, filenames in os.walk(root):
        for filename in filenames:
            if ".txt" not in filename:
                continue

            with open(os.path.join(dirpath, filename)) as transcripts:
                for line in transcripts:
                    fileid, _ = line.strip().split(" ", 1)
                    yield root, fileid

def load_librispeech(file_generator):
    """Load the audio and transcript text for each ``(root, fileid)``.

    A file id has the form ``<folder1>-<folder2>-<file>``; audio and chapter
    transcript are expected in ``root/<folder1>/<folder2>/``. The transcript
    is searched first and closed before the audio is decoded.

    Yields ``(fileid, waveform, sample_rate, content)``.

    Raises:
        ValueError: If the transcript file has no line for ``fileid``.
    """
    for root, fileid in file_generator:
        folder1, folder2, _ = fileid.split("-")
        chapter_dir = os.path.join(root, folder1, folder2)
        file_text = os.path.join(chapter_dir, folder1 + "-" + folder2 + ".trans.txt")
        file_audio = os.path.join(chapter_dir, fileid + ".flac")

        content = None
        with open(file_text) as transcripts:
            for line in transcripts:
                # partition tolerates lines without a space.
                fileid_text, _, text = line.strip().partition(" ")
                if fileid_text == fileid:
                    content = text
                    break

        if content is None:
            # Raise a real exception; the name ``Error`` was never defined.
            raise ValueError(
                "Translation not found for {} in {}.".format(fileid, file_text)
            )

        waveform, sample_rate = torchaudio.load(file_audio)

        yield fileid, waveform, sample_rate, content

# Chain discovery and loading; nothing is read until g is iterated.
g = load_librispeech(iterate_librispeech(dset_abs_path))

In [None]:
# Pull one sample to check the LibriSpeech generator end to end.
next(g)

# CommonVoice

In [None]:
# Base URL of the CommonVoice archives.
web = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/"

# Language name -> code used in the archive file name.
languages = {
    "tatar": "tt",
    "english": "en",
    "german": "de",
    "french": "fr",
    "welsh": "cy",
    "breton": "br",
    "chuvash": "cv",
    "turkish": "tr",
    "kyrgyz": "ky",
    "irish": "ga-IE",
    "kabyle": "kab",
    "catalan": "ca",
    "taiwanese": "zh-TW",
    "slovenian": "sl",
    "italian": "it",
    "dutch": "nl",
    "hakha chin": "cnh",
    "esperanto": "eo",
    "estonian": "et",
    "persian": "fa",
    "basque": "eu",
    "spanish": "es",
    "chinese": "zh-CN",
    "mongolian": "mn",
    "sakha": "sah",
    "dhivehi": "dv",
    "kinyarwanda": "rw",
    "swedish": "sv-SE",
    "russian": "ru",
}

# Work with the Tatar archive.
url = web + languages["tatar"] + ".tar.gz"
root = "/Users/vincentqb/commonvoicetest/"

# Plain concatenation is enough here because root ends with a slash.
filename = os.path.basename(url)
inpath = root + filename

# NOTE(review): base does not appear to be used by the cells below — confirm.
base = filename.split(".")[0]
# The dataset folder is root itself here.
dset_abs_path = root
tsv = "train.tsv"

In [None]:
# torchvision.datasets.utils.download_url(url, root)
# torchvision.datasets.utils.extract_archive(inpath)

In [None]:
def iterate_commonvoice(root, tsv):
    """Yield ``(root, header, line)`` for every data row of ``root/tsv``.

    ``header`` is the list of column names from the first line; ``line`` is
    the list of tab-separated fields of one row. Only the line ending is
    removed before splitting, so empty trailing columns stay as empty
    strings and rows keep the same length as the header. An empty file
    yields nothing.
    """
    tsv_path = os.path.join(root, tsv)
    with open(tsv_path) as table:
        first = next(table, None)
        if first is None:
            # Empty file: no header, so no rows either.
            return
        header = first.rstrip("\n").split("\t")

        for line in table:
            # strip() would also eat trailing tabs and drop empty last fields.
            yield root, header, line.rstrip("\n").split("\t")

def load_commonvoice(line_generator):
    """Turn ``(root, header, line)`` rows into dicts that include the audio.

    Header names are zipped with the row's fields. The clip named by the
    "path" column is loaded from ``root/clips/`` and stored under the
    "waveform" and "sample_rate" keys.
    """
    for root, header, line in line_generator:
        record = dict(zip(header, line))

        clip = os.path.join(root, "clips", record["path"])
        waveform, sample_rate = torchaudio.load(clip)

        # Columns seen in the data: client_id, path, sentence, up_votes,
        # down_votes, age, gender, accent; then the two keys added here.
        record["waveform"] = waveform
        record["sample_rate"] = sample_rate

        yield record

# Chain row iteration and loading; nothing is read until g is iterated.
g = load_commonvoice(iterate_commonvoice(dset_abs_path, tsv))

In [None]:
# Pull one sample to check the CommonVoice generator end to end.
next(g)