Other examples of dataset:
* [torchvision](https://github.com/pytorch/vision/blob/master/torchvision/datasets/mnist.py)

In [3]:
import torch
import torchvision 
import torchaudio

import os

# yesno

In [283]:
# yesno
# filename = "waves_yesno.tar.gz"
# url = "http://www.openslr.org/resources/1/" + filename

url = "http://www.openslr.org/resources/1/waves_yesno.tar.gz"
filename = os.path.basename(url)
root = "/Users/vincentqb/yesnotest/"

inpath = root + filename
dset_abs_path = root + "waves_yesno/"

In [3]:
torchvision.datasets.utils.download_url(url, root)

# torchvision.datasets.utils.extract_archive(inpath, root)  # FIXME no extraction with root
torchvision.datasets.utils.extract_archive(inpath)

Using downloaded and verified file: /Users/vincentqb/yesnotest/waves_yesno.tar.gz


In [292]:
def iterate_yesno(root):
    for f in os.listdir(root):
        if ".wav" in f:
            yield os.path.join(root, f)

def load_yesno(file_generator):
    for file in file_generator:

        waveform, sample_rate = torchaudio.load(file)
        label = os.path.basename(file).split(".", 1)[0].split("_")
    
        yield label, waveform, sample_rate

g = load_yesno(iterate_yesno(dset_abs_path))

In [293]:
len(list(g))

60

# vctk

In [298]:
filename1 = "DS_10283_2651.zip"
filename2 = "VCTK-Corpus.zip"
url = "http://datashare.is.ed.ac.uk/download/" + filename
root = "/Users/vincentqb/vctktest/"

inpath1 = root + filename1
inpath2 = root + filename2
dset_abs_path = root + "VCTK-Corpus/"

In [258]:
# torchvision.datasets.utils.download_url(url, root)

# torchvision.datasets.utils.extract_archive(inpath, root)  # FIXME no extraction with root
torchvision.datasets.utils.extract_archive(inpath1)
torchvision.datasets.utils.extract_archive(inpath2)

In [301]:
def iterate_vctk(root):
    folder_txt = os.path.join(root, "txt/")
    folder_wav = os.path.join(root, "wav48/")
    
    for dp, _, fn in os.walk(folder_txt):
        for f in fn:
            if ".txt" in f:
                file_txt = os.path.join(dp, f)
                base = os.path.basename(file_txt).split(".", 1)[0]
                folder = base.split("_")[0]
                file_audio = os.path.join(folder_wav, folder, base) + ".wav"
            
                yield base, file_txt, file_audio


def load_vctk(file_generator):
    for base, file_txt, file_audio in file_generator:
            
        with open(file_txt) as file_txt:
            content = file_txt.readlines()[0]

        waveform, sample_rate = torchaudio.load(file_audio)

        yield base, content, waveform, sample_rate


g = load_vctk(iterate_vctk(dset_abs_path))

In [309]:
next(g)

('p304_153',
 'You need a trademark.',
 tensor([[0.0001, 0.0005, 0.0005,  ..., 0.0037, 0.0040, 0.0027]]),
 48000)

# LibriSpeech

In [180]:
# http://www.openslr.org/12
# http://www.openslr.org/resources/12/dev-clean.tar.gz
# http://www.openslr.org/resources/12/test-clean.tar.gz
# http://www.openslr.org/resources/12/test-other.tar.gz
# http://www.openslr.org/resources/12/train-clean-100.tar.gz
# http://www.openslr.org/resources/12/train-clean-360.tar.gz
# http://www.openslr.org/resources/12/train-other-500.tar.gz

import random
from functools import reduce


def compose(*funcs):
    return lambda x: reduce(lambda f, g: g(f), list(funcs), x)


class LibriSpeech(object):

    url_base = "http://www.openslr.org/resources/12/"
    url_extension = ".tar.gz"
    audio_extension = ".flac"
    text_extension = ".trans.txt"

    _selection = [
        "dev-clean",
        "test-clean",
        "test-other",
        "train-clean-100",
        "train-clean-360",
        "train-other-500"
    ]
    
    in_archive_folder = "LibriSpeech"

    def __init__(self, selection, root_path, extracted=True, shuffle=False):

        if selection not in self._selection:
            raise ValueError

        self.root_path = root_path
        self.selection = selection
        self.extracted = extracted

        self.url = self.url_base + self.selection + self.url_extension
        
        if shuffle:
            c = compose(self.download, self.extract, self.walk, self.shuffle, self.load)
        else:
            c = compose(self.download, self.extract, self.walk, self.load)

        # Initialize here or in __iter__
        self._iterator = c([selection])

    def __iter__(self):
        return self

    def __next__(self):
        return self._iterator.__next__()
        
    def download(self, selections):
        for selection in selections:
            url = os.path.join(self.url_base, selection + self.url_extension)
            # torchvision.datasets.utils.download_url(url, self.root_path)
            yield selection
    
    def extract(self, selections):
        for selection in selections:
            archive_path = os.path.join(self.root_path, selection + self.url_extension)
            # torchvision.datasets.utils.extract_archive(archive_path)
            yield os.path.join(self.root_path, self.in_archive_folder, selection)

    def walk(self, paths):
        for path in paths:
            for dp, dn, fn in os.walk(path):
                for f in fn:
                    if self.audio_extension in f:
                        yield path, os.path.basename(f).split(".")[0]
    
    @staticmethod
    def shuffle(generator):
        generator = list(generator)
        random.shuffle(generator)
        for g in generator:
            yield g
        
    def load(self, fileids):
        for data_path, fileid in fileids:
            folder1, folder2, file = fileid.split("-")
            file_text = folder1 + "-" + folder2 + self.text_extension
            file_text = os.path.join(data_path, folder1, folder2, file_text)
            file_audio = folder1 + "-"+ folder2 + "-" + file + self.audio_extension
            filase_audio = os.path.join(data_path, folder1, folder2, file_audio)
            waveform, sample_rate = torchaudio.load(file_audio)
        
            found = False
            for line in open(file_text):
                fileid_text, content = line.strip().split(" ", 1)
                if fileid == fileid_text:
                    found = True
                    break
            if not found:
                from warnings import warn
                warn("Audio file without translation: {}.".format(fileid))
                continue

            yield fileid, waveform, sample_rate, content


ls = LibriSpeech("dev-clean", "/Users/vincentqb/librispeechtest/")
next(ls) 

('2412-153954-0019',
 tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 3.0518e-05, 0.0000e+00,
          0.0000e+00]]),
 16000,
 'BUT BY AND BY THEY CAME TO MY WATCH WHICH I HAD HIDDEN AWAY IN THE INMOST POCKET THAT I HAD AND HAD FORGOTTEN WHEN THEY BEGAN THEIR SEARCH')

In [103]:
import pickle

class Cache:
    def __init__(self, generator, location):
        self.generator = generator
        self.location = location

        self._id = id(self)
        self._cache = []
        self._internal_index = 0

    def __iter__(self):
        self._internal_index = 0
        return self

    def __next__(self):
        if self._internal_index < len(self):
            item = self[self._internal_index]
        else:
            item = next(self.generator)
        
            file = str(self._id) + "-" + str(len(self))
            file = os.path.join(self.location, file)
            self._cache.append(file)
        
            os.makedirs(self.location, exist_ok=True)
            with open(file, 'wb') as file:
                pickle.dump(item, file)

        self._internal_index += 1
        return item
    
    def __getitem__(self, index):
        file = self._cache[index]
        with open(file, 'rb') as file:
            item = pickle.load(file)
        return item
    
    def __len__(self):
        return len(self._cache)

In [108]:
def gen():
    for i in range(0,2):
        yield i
        
cache = Cache(gen(), "tmp/")

In [106]:
for c in cache:
    print(c)

0
1


In [110]:
bool(cache)

True

In [101]:
len(cache)

2

# librispeech

In [4]:
# http://www.openslr.org/12
# http://www.openslr.org/resources/12/dev-clean.tar.gz
# http://www.openslr.org/resources/12/test-clean.tar.gz
# http://www.openslr.org/resources/12/test-clean.tar.gz
# http://www.openslr.org/resources/12/test-other.tar.gz
# http://www.openslr.org/resources/12/train-clean-100.tar.gz
# http://www.openslr.org/resources/12/train-clean-360.tar.gz
# http://www.openslr.org/resources/12/train-other-500.tar.gz


url_base = "http://www.openslr.org/resources/12/"
url_extension = ".tar.gz"

selection = [
    "dev-clean",
    "test-clean",
    "test-clean",
    "test-other",
    "train-clean-100",
    "train-clean-360",
    "train-other-500"
]

url = url_base + selection[0] + url_extension
root = "/Users/vincentqb/librispeechtest/"

filename = os.path.basename(url)
inpath = root + filename

base = filename.split(".")[0]
dset_abs_path = root + "LibriSpeech/" + base + "/"

In [163]:
# torchvision.datasets.utils.download_url(url, root)
# torchvision.datasets.utils.extract_archive(inpath)

In [333]:
def iterate_librispeech(root):
    for dp, dn, fn in os.walk(root):
        for f in fn:
            if ".txt" not in f:
                continue
        
            fp = os.path.join(dp, f)
            for line in open(fp):
                fileid, content = line.strip().split(" ", 1)
                audio = os.path.join(dp, fileid) + ".flac"
                waveform, sample_rate = torchaudio.load(audio)
                yield fileid, waveform, sample_rate, content

g = iterate_librispeech(dset_abs_path)

In [18]:
def iterate_librispeech(root):
    for dp, dn, fn in os.walk(root):
        for f in fn:
            if ".txt" not in f:
                continue
        
            fp = os.path.join(dp, f)
            for line in open(fp):
                fileid, _ = line.strip().split(" ", 1)
                yield root, fileid

def load_librispeech(file_generator):
    for root, fileid in file_generator:
        folder1, folder2, file = fileid.split("-")
        file_text = os.path.join(root, folder1, folder2, folder1) + "-" + folder2 + ".trans.txt"
        file_audio = os.path.join(root, folder1, folder2, folder1) + "-" + folder2 + "-" + file + ".flac"
        waveform, sample_rate = torchaudio.load(file_audio)
        
        found = False
        for line in open(file_text):
            fileid_text, content = line.strip().split(" ", 1)
            if fileid == fileid_text:
                found = True
                break
        if not found:
            raise Error

        yield fileid, waveform, sample_rate, content

g = load_librispeech(iterate_librispeech(dset_abs_path))

In [21]:
next(g)

('2412-153954-0002',
 tensor([[-1.5259e-04, -2.1362e-04, -2.1362e-04,  ..., -3.0518e-05,
           0.0000e+00, -3.0518e-05]]),
 16000,
 'THE STREETS WERE NARROW AND UNPAVED BUT VERY FAIRLY CLEAN')

# CommonVoice

In [311]:
web = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/"

languages = {
    "tatar": "tt",
    "english": "en",
    "german": "de",
    "french": "fr",
    "welsh": "cy",
    "breton": "br",
    "chuvash": "cv",
    "turkish": "tr",
    "kyrgyz": "ky",
    "irish": "ga-IE",
    "kabyle": "kab",
    "catalan": "ca",
    "taiwanese": "zh-TW",
    "slovenian": "sl",
    "italian": "it",
    "dutch": "nl",
    "hakha chin": "cnh",
    "esperanto": "eo",
    "estonian": "et",
    "persian": "fa",
    "basque": "eu",
    "spanish": "es",
    "chinese": "zh-CN",
    "mongolian": "mn",
    "sakha": "sah",
    "dhivehi": "dv",
    "kinyarwanda": "rw",
    "swedish": "sv-SE",
    "russian": "ru",
}

url = web + languages["tatar"] + ".tar.gz"
root = "/Users/vincentqb/commonvoicetest/"

filename = os.path.basename(url)
inpath = root + filename

base = filename.split(".")[0]
dset_abs_path = root
tsv = "train.tsv"

In [210]:
# torchvision.datasets.utils.download_url(url, root)
# torchvision.datasets.utils.extract_archive(inpath)

In [330]:
def iterate_commonvoice(root, tsv):
    tsv = os.path.join(root, tsv)
    with open(tsv) as tsv:
        
        header = next(tsv)
        header = header.strip().split("\t")
        
        for line in tsv:
            line = line.strip().split("\t")
            
            yield root, header, line

def load_commonvoice(line_generator):
        for root, header, line in line_generator:
            output = dict(zip(header, line))
            
            # filename = line[1]
            filename = output["path"]
            filename = os.path.join(root, "clips", filename)
            waveform, sample_rate = torchaudio.load(filename)
            
            output["waveform"] = waveform
            output["sample_rate"] = sample_rate

            # client_id, path, sentence, up_votes, down_votes, age, gender, accent, waveform, sample_rate
            # line.extend((waveform, sample_rate))
            # yield line
            
            yield output

g = load_commonvoice(iterate_commonvoice(dset_abs_path, tsv))

In [331]:
next(g)

{'client_id': '11d5e99f7bd5b4f8492a06bb1ec22aa9110bba6ea9918f2a9adec05d686304d568ab7063daf8915d3fccfb4dd44b81646bd13a33ca130ac4014560bba4c2db0b',
 'path': 'common_voice_tt_17343438.mp3',
 'sentence': 'Баш өсте, хан хәзрәтләре.',
 'up_votes': '2',
 'down_votes': '0',
 'age': 'thirties',
 'gender': 'male',
 'waveform': tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -5.3644e-06,
          -1.7546e-06,  2.0936e-06]]),
 'sample_rate': 48000}