In [8]:
import os 
from gnutools.fs import listfiles, name, parent
from asr_deepspeech.audio import duration, fq
from tqdm import tqdm
from pprint import pprint
import pandas as pd
from concurrent.futures import ProcessPoolExecutor

In [9]:
# Directory that is scanned for the recordings.
root = "/srv/sync/cdata/ASR/en/CLEAN/raw/GOOGLE"
# listfiles() is already given the ".wav" filter; the explicit suffix check is
# kept as a second guard on whatever it returns.
files = list(filter(lambda path: path.endswith(".wav"), listfiles(root, [".wav"])))

In [32]:
class GoogleDataset():
    """Tabular index of audio files: path, duration, ``fq`` value and text label.

    Every file contributes one row keyed by ``name(file)``. The table is built
    eagerly in the constructor and can then be sliced by duration.

    NOTE(review): rows are keyed by ``name(file)`` alone. If two files in
    different directories share that name, only the last one is kept --
    confirm that names are unique across the dataset.
    """

    # Column order of the entries table. Passing it explicitly also guarantees
    # that an empty dataset still produces a frame carrying these columns.
    _COLUMNS = ("audio_filepath", "duration", "fq", "text")

    def __init__(self, files, fq=16000):
        """Index ``files`` and verify each one against the expected ``fq``.

        Args:
            files: Sized iterable of audio file paths.
            fq: Value every file must report through the module-level ``fq``
                helper (presumably the sampling rate in Hz -- TODO confirm).
                The parameter shadows that helper only inside this method.

        Raises:
            AssertionError: If a file reports a different value.
        """
        self._files = files
        self._fq = fq
        self._entries = self.entries()

    def entries(self):
        """Build the table describing every file.

        Returns:
            pandas.DataFrame: One row per entry name with the columns listed
            in ``_COLUMNS``. With no files the frame is empty but still has
            those columns, so ``filter_duration`` keeps working.
        """
        records = dict(
            self.entry(f) for f in tqdm(self._files, total=len(self._files))
        )
        # orient="index" turns each record into a row directly. Building the
        # frame column-wise and transposing would leave every column with
        # dtype object.
        return pd.DataFrame.from_dict(
            records, orient="index", columns=list(self._COLUMNS)
        )

    def entry(self, file):
        """Describe a single audio file.

        Args:
            file: Path of the audio file.

        Returns:
            tuple: ``(name(file), record)`` where ``record`` holds the path,
            ``duration(file)``, ``fq(file)`` and, as ``"text"``, the name of
            the file's parent directory.

        Raises:
            AssertionError: If ``fq(file)`` differs from the expected value.
        """
        _name_file = name(file)
        _duration = duration(file)
        _fq = fq(file)
        _text = name(parent(file))
        # Name the offending file so a bad recording can be located.
        assert _fq == self._fq, "{}: expected fq {}, got {}".format(
            file, self._fq, _fq
        )
        record = {
            "audio_filepath": file,
            "duration": _duration,
            "fq": _fq,
            "text": _text,
        }
        return (_name_file, record)

    def filter_duration(self, start=1, stop=5, to_dict=False):
        """Select the entries whose duration lies in ``[start, stop]``.

        Args:
            start: Inclusive lower bound, in the unit returned by
                ``duration`` (presumably seconds -- TODO confirm).
            stop: Inclusive upper bound, same unit.
            to_dict: When true, return ``{entry name: {column: value}}``
                instead of a DataFrame.

        Returns:
            pandas.DataFrame or dict: The selected entries.
        """
        durations = self._entries["duration"]
        df = self._entries[(durations >= start) & (durations <= stop)]
        return df.transpose().to_dict() if to_dict else df
    

In [33]:
# Index every collected file; the expected fq stays at the class default.
dataset = GoogleDataset(files=files)

100%|██████████| 7478/7478 [00:00<00:00, 12971.69it/s]


In [34]:
import json

# Select the entries whose duration lies between 1 and 2 (inclusive) before
# touching the output file, so a failure here cannot truncate an existing one.
train_entries = dataset.filter_duration(1, 2, to_dict=True)
# The context manager closes (and therefore flushes) the file as soon as the
# dump is done instead of leaving the handle to the garbage collector.
with open("train.json", "w") as manifest:
    json.dump(train_entries, manifest, indent=4)