In [1]:
import os 
from gnutools.fs import listfiles, name, parent
from asr_deepspeech.audio import duration, fq
from tqdm import tqdm
from pprint import pprint
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
from pprint import pprint

In [2]:
# Root directory of the processed JSUT corpus.
root = "/srv/sync/cdata/ASR/ja/processed/CLEAN/JSUT/jsut_ver1.1"
# Ask listfiles for ".wav" matches, then keep only the paths that really end
# with the ".wav" suffix.
files = list(filter(lambda path: path.endswith(".wav"), listfiles(root, [".wav"])))

In [3]:
class JSUTDataset():
    """Table of audio utterances with one row per file.

    Rows are indexed by the value ``name(file)`` returns for each path and
    carry the columns ``audio_filepath``, ``duration``, ``fq`` and ``text``.
    """

    def __init__(self, files, dictionary, fq=16000):
        """Store the inputs and build the entries table eagerly.

        Args:
            files: sized iterable of audio file paths.
            dictionary: mapping from ``name(file)`` to the transcript text.
            fq: value every file's ``fq(file)`` must equal
                (presumably the sampling rate in Hz -- confirm against
                ``asr_deepspeech.audio.fq``).
        """
        self._files = files
        self._fq = fq
        self._dictionary = dictionary
        self._entries = self.entries()

    def entries(self):
        """Return a DataFrame with one row per file in ``self._files``.

        The frame is built from a ``{name: entry}`` dict and transposed so
        that names become the index and entry keys become the columns.
        """
        d = dict(self.entry(f) for f in tqdm(self._files, total=len(self._files)))
        return pd.DataFrame.from_dict(d).transpose()

    def entry(self, file):
        """Describe a single audio file.

        Returns:
            A ``(name, info)`` tuple where ``info`` holds the file path, its
            duration, its ``fq`` value and its transcript.

        Raises:
            KeyError: if the file's name has no transcript in the dictionary.
            AssertionError: if the file's ``fq`` differs from the expected one.
        """
        _name_file = name(file)
        _duration = duration(file)
        _fq = fq(file)
        _text = self._dictionary[_name_file]
        # Fail loudly, naming the offending file, instead of a bare assert.
        assert _fq == self._fq, "{}: expected fq {}, got {}".format(file, self._fq, _fq)
        d = {
            "audio_filepath": file,
            "duration": _duration,
            "fq": _fq,
            "text": _text
        }
        return (_name_file,  d)

    def filter_duration(self, start=1, stop=5, blacklist=None):
        """Select the entries whose duration lies in ``[start, stop]``.

        Args:
            start: inclusive lower bound on ``duration``.
            stop: inclusive upper bound on ``duration``.
            blacklist: optional iterable of entry names to exclude.
                ``None`` (the default) excludes nothing.

        Returns:
            A new DataFrame with the same columns as the entries table,
            containing only the selected rows. ``self._entries`` is not
            modified.
        """
        table = self._entries
        if table.empty:
            # No rows (and possibly no columns) to filter on.
            return table.copy()
        excluded = list(blacklist) if blacklist is not None else []
        durations = table["duration"]
        in_range = (durations >= start) & (durations <= stop)
        # Index.isin does a hashed lookup instead of scanning a list per row.
        blocked = table.index.isin(excluded)
        return table.loc[in_range & ~blocked].copy()

In [4]:
# Filled below with one cleaned transcript per utterance key.
transcripts = {}
# The raw JSUT tree is searched for the "transcript_utf8.txt" files.
root_transcripts = "/srv/nfs/cdata/ASR/ja/raw/CLEAN/JSUT/jsut_ver1.1"
transcript_files = listfiles(root_transcripts, ["transcript_utf8.txt"])
def filter_text(t):
    """Clean the text half of a ``(key, text)`` pair.

    Spaces, newlines and the punctuation marks 、 and 。 are removed from the
    text; the key is passed through unchanged. Returns a ``(key, text)`` tuple.
    """
    key, cleaned = t
    for unwanted in (" ", "\n", "、", "。"):
        cleaned = cleaned.replace(unwanted, "")
    return (key, cleaned)
# Parse every transcript file into the shared `transcripts` mapping.
for file in transcript_files:
    # The files are named transcript_utf8.txt: decode explicitly as UTF-8
    # rather than with the platform default, and close the handle when done.
    with open(file, encoding="utf-8") as transcript_fp:
        for line in transcript_fp:
            if not line.strip():
                # Blank lines carry no "key:text" pair.
                continue
            # Split on the first colon only, so a colon inside the transcript
            # text does not break the (key, text) unpacking in filter_text.
            key, text = filter_text(line.split(":", 1))
            transcripts[key] = text
# Building the dataset processes every audio file once (see JSUTDataset.entries).
dataset = JSUTDataset(files, transcripts)

100%|██████████| 7696/7696 [00:00<00:00, 13892.12it/s]


In [5]:
# Inspect the entries table built in JSUTDataset.__init__ (one row per utterance).
dataset._entries

Unnamed: 0,audio_filepath,duration,fq,text
UT-PARAPHRASE-sent055-phrase1,/srv/sync/cdata/ASR/ja/processed/CLEAN/JSUT/js...,2.15,16000,今はせわしなく世知辛い
UT-PARAPHRASE-sent196-phrase1,/srv/sync/cdata/ASR/ja/processed/CLEAN/JSUT/js...,5.84,16000,練り方が足りないと出来上がりがぱさつき口に入れるとポロポロと崩れる感じになる
UT-PARAPHRASE-sent175-phrase1,/srv/sync/cdata/ASR/ja/processed/CLEAN/JSUT/js...,4.11,16000,手書きの作業が不要になり記帳の間違いもなくなる
UT-PARAPHRASE-sent226-phrase2,/srv/sync/cdata/ASR/ja/processed/CLEAN/JSUT/js...,4.23,16000,しかし秘書には決して取り合うなと指示していた
UT-PARAPHRASE-sent274-phrase1,/srv/sync/cdata/ASR/ja/processed/CLEAN/JSUT/js...,3.71,16000,より手厚い看護ができるようになるのは望ましいことだ
...,...,...,...,...
BASIC5000_4332,/srv/sync/cdata/ASR/ja/processed/CLEAN/JSUT/js...,7.08775,16000,勝手な行動を続けたショウと仲たがいすることになるが結局神狗との銃撃戦で殉職
BASIC5000_4033,/srv/sync/cdata/ASR/ja/processed/CLEAN/JSUT/js...,3.75,16000,この様子を見てつるぎは夜露との取引に乗った
BASIC5000_4903,/srv/sync/cdata/ASR/ja/processed/CLEAN/JSUT/js...,3.1,16000,彼女はいつも床を綺麗に掃いています
BASIC5000_0930,/srv/sync/cdata/ASR/ja/processed/CLEAN/JSUT/js...,5,16000,土日月の午前１０時半から午後４時まで開館


In [6]:
import json
import random
from sklearn.model_selection import train_test_split

# Keep the utterances whose duration lies in [6, 10] (presumably seconds --
# the unit is whatever asr_deepspeech.audio.duration returns).
d = dataset.filter_duration(6, 10)

# Hold out 10% for validation. The seed is fixed so that re-running the cell
# reproduces the same split and therefore the same manifests.
dtrain, dtest = train_test_split(d, test_size=0.1, random_state=42)

# Convert each split once into the {utterance name: entry} manifest mapping.
train_manifest = dtrain.transpose().to_dict()
val_manifest = dtest.transpose().to_dict()
print(len(train_manifest))
print(len(val_manifest))

# ensure_ascii=False writes the Japanese text verbatim, so the files must be
# opened as UTF-8 explicitly; the context managers flush and close them.
with open("jsut_ids_6_10-train.json", "w", encoding="utf-8") as train_fp:
    json.dump(train_manifest, train_fp, indent=4, ensure_ascii=False)
with open("jsut_ids_6_10-val.json", "w", encoding="utf-8") as val_fp:
    json.dump(val_manifest, val_fp, indent=4, ensure_ascii=False)

1275
142


In [7]:
# Preview the first rows of the held-out (validation) split.
dtest.head()

Unnamed: 0,audio_filepath,duration,fq,text
BASIC5000_3246,/srv/sync/cdata/ASR/ja/processed/CLEAN/JSUT/js...,7.99,16000,ガス風船とは一般には浮揚性のある水素ガスやヘリウムガスを注入したゴム風船やマイラーバルーンのこと
REPEAT500_set2_072,/srv/sync/cdata/ASR/ja/processed/CLEAN/JSUT/js...,6.32931,16000,ブラッグはビューエル軍よりも劣勢だったためにこの機会を生かすことを躊躇した
BASIC5000_4347,/srv/sync/cdata/ASR/ja/processed/CLEAN/JSUT/js...,6.55931,16000,衰えると肉体を炎と化して灰の中から雛として生まれ変わることが出来る
BASIC5000_3786,/srv/sync/cdata/ASR/ja/processed/CLEAN/JSUT/js...,7.81,16000,建築の分野では周に入ってからそれまでの茅葺きから瓦が一般的になったことがわかっている
BASIC5000_3776,/srv/sync/cdata/ASR/ja/processed/CLEAN/JSUT/js...,7.39,16000,アイゼンをつけると滑りにくくなるが雪が軟らかい場合アイゼン装着でキックステップを使う場合もある


In [15]:
import json
import random
from sklearn.model_selection import train_test_split

# Utterances already used in the 4-5 validation manifest must not be selected again.
val = "/srv/sync/cpj/CADIC/cadic-asr-deepspeech/__data__/manifests/jsut_ids_4_5-val.json"
# The manifest holds non-ASCII text; read it as UTF-8 and close the handle.
with open(val, "r", encoding="utf-8") as val_fp:
    blacklist = list(json.load(val_fp).keys())

# Entries with duration in [10, 15], minus the blacklisted utterance names.
dtrain = dataset.filter_duration(10, 15, blacklist=blacklist)
# The index holds unique utterance names, so the row count equals the number
# of manifest entries.
print(len(dtrain))
# NOTE(review): the train/val split and manifest export for this range were
# commented out in the original cell (and named other ranges, 2_4 / 6_10);
# nothing is written to disk here.

277


In [13]:
# Number of utterance names loaded into the blacklist.
# NOTE(review): this cell's execution count (13) is lower than that of the cell
# defining `blacklist` (15); confirm the notebook still runs top to bottom.
len(blacklist)

147