In [1]:
import os
import argparse
import time
import pickle
from random import shuffle
from tqdm import tqdm

import numpy as np
import tensorflow as tf

from utils import text_process_aihub as aihub_text
from utils.transform import transform_mfcc_from_file

In [2]:
def get_file_list(path, audio_ext='.pcm', trans_ext='.txt'):
    if not os.path.exists(path):
        return []
    
    file_list = []
    for _path, _dir, _files in os.walk(path):
        for f in _files:
            if f[-len(audio_ext):] == audio_ext:
                f_name = os.path.join(_path, f[:-len(audio_ext)])
                if os.path.exists(f_name + trans_ext):
                    file_list.append(f_name)
    file_list.sort()
    
    return file_list

In [3]:
def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

In [4]:
data_dir = "data/KsponSpeech_sample/"
audio_ext = '.pcm'
trans_ext = '.txt'

file_list = get_file_list(data_dir, audio_ext, trans_ext)
file_list[:2]

['data/KsponSpeech_sample/KsponSpeech_000001',
 'data/KsponSpeech_sample/KsponSpeech_000002']

In [5]:
rule_in, rule_out = aihub_text.g2p.readRules(aihub_text.ver_info[0], './g2p/rulebook.txt')
df_korSym = aihub_text.get_korean_symbol_dataframe()

In [6]:
text, prons, labels = aihub_text.get_prons_and_labels_from_file(file_list[0], rule_in, rule_out, df_korSym, 'euc-kr', True, True)

In [7]:
feature = transform_mfcc_from_file(file_list[0] + audio_ext, \
                                       endian='int16', sr=16000)

In [8]:
print(file_list[0])
print(text)
print(prons)
print(labels)

data/KsponSpeech_sample/KsponSpeech_000001
아/ 몬 소리야, 그건 또. b/
|aa [GANTU] |mm oo nf |s0 oo rr ii ya |k0 xx k0 vv nf |tt oo | [NOISE:b]
[8, 57, 3, 0, 24, 61, 40, 0, 18, 61, 26, 54, 64, 0, 15, 58, 15, 59, 40, 0, 14, 61, 0, 4, 8]


## Save dataset as TFRecords

In [9]:
options = tf.python_io.TFRecordOptions(compression_type=tf.python_io.TFRecordCompressionType.GZIP)
writer = tf.python_io.TFRecordWriter(path="./tfrecord/tfrecord_practice.tfrecords", options=options)

In [10]:
print("Start converting...")
for f in tqdm(file_list):
    feat = transform_mfcc_from_file(f + audio_ext, \
                                       endian='int16', sr=16000)
    text, prons, labels = aihub_text.get_prons_and_labels_from_file(file_list[0], rule_in, rule_out, df_korSym, 'euc-kr', True, True)
    
    example = tf.train.Example(
        features=tf.train.Features(
            feature={
                "file_name": _bytes_feature(f.encode('utf-8')),
                "feature": _bytes_feature(feat.tostring()),
                "trans": _bytes_feature(text.encode('utf-8')),
                "prons": _bytes_feature(prons.encode('utf-8')),
                "label": _bytes_feature(np.array(labels).tostring())
            }
        )
    )
    print(f, feat.shape)
    writer.write(example.SerializeToString())
    
writer.close()
print("Done...")
    

  2%|▏         | 2/100 [00:00<00:05, 16.86it/s]

Start converting...
data/KsponSpeech_sample/KsponSpeech_000001 (314, 39)
data/KsponSpeech_sample/KsponSpeech_000002 (1058, 39)
data/KsponSpeech_sample/KsponSpeech_000003 (1231, 39)


  7%|▋         | 7/100 [00:00<00:05, 17.65it/s]

data/KsponSpeech_sample/KsponSpeech_000004 (912, 39)
data/KsponSpeech_sample/KsponSpeech_000005 (506, 39)
data/KsponSpeech_sample/KsponSpeech_000006 (385, 39)
data/KsponSpeech_sample/KsponSpeech_000007 (1208, 39)
data/KsponSpeech_sample/KsponSpeech_000008 (128, 39)


 14%|█▍        | 14/100 [00:00<00:03, 22.48it/s]

data/KsponSpeech_sample/KsponSpeech_000009 (632, 39)
data/KsponSpeech_sample/KsponSpeech_000010 (305, 39)
data/KsponSpeech_sample/KsponSpeech_000011 (354, 39)
data/KsponSpeech_sample/KsponSpeech_000012 (586, 39)
data/KsponSpeech_sample/KsponSpeech_000013 (284, 39)
data/KsponSpeech_sample/KsponSpeech_000014 (458, 39)
data/KsponSpeech_sample/KsponSpeech_000015 (647, 39)


 20%|██        | 20/100 [00:00<00:03, 23.60it/s]

data/KsponSpeech_sample/KsponSpeech_000016 (278, 39)
data/KsponSpeech_sample/KsponSpeech_000017 (370, 39)
data/KsponSpeech_sample/KsponSpeech_000018 (336, 39)
data/KsponSpeech_sample/KsponSpeech_000019 (492, 39)
data/KsponSpeech_sample/KsponSpeech_000020 (828, 39)
data/KsponSpeech_sample/KsponSpeech_000021 (113, 39)


 27%|██▋       | 27/100 [00:01<00:02, 25.48it/s]

data/KsponSpeech_sample/KsponSpeech_000022 (516, 39)
data/KsponSpeech_sample/KsponSpeech_000023 (926, 39)
data/KsponSpeech_sample/KsponSpeech_000024 (772, 39)
data/KsponSpeech_sample/KsponSpeech_000025 (216, 39)
data/KsponSpeech_sample/KsponSpeech_000026 (159, 39)
data/KsponSpeech_sample/KsponSpeech_000027 (480, 39)


 30%|███       | 30/100 [00:01<00:02, 24.46it/s]

data/KsponSpeech_sample/KsponSpeech_000028 (270, 39)
data/KsponSpeech_sample/KsponSpeech_000029 (112, 39)
data/KsponSpeech_sample/KsponSpeech_000030 (1496, 39)
data/KsponSpeech_sample/KsponSpeech_000031 (485, 39)
data/KsponSpeech_sample/KsponSpeech_000032 (128, 39)


 38%|███▊      | 38/100 [00:01<00:02, 28.16it/s]

data/KsponSpeech_sample/KsponSpeech_000033 (728, 39)
data/KsponSpeech_sample/KsponSpeech_000034 (433, 39)
data/KsponSpeech_sample/KsponSpeech_000035 (269, 39)
data/KsponSpeech_sample/KsponSpeech_000036 (127, 39)
data/KsponSpeech_sample/KsponSpeech_000037 (882, 39)
data/KsponSpeech_sample/KsponSpeech_000038 (173, 39)


 41%|████      | 41/100 [00:01<00:02, 27.11it/s]

data/KsponSpeech_sample/KsponSpeech_000039 (955, 39)
data/KsponSpeech_sample/KsponSpeech_000040 (399, 39)
data/KsponSpeech_sample/KsponSpeech_000041 (321, 39)
data/KsponSpeech_sample/KsponSpeech_000042 (137, 39)
data/KsponSpeech_sample/KsponSpeech_000043 (169, 39)
data/KsponSpeech_sample/KsponSpeech_000044 (168, 39)
data/KsponSpeech_sample/KsponSpeech_000045 (161, 39)


 49%|████▉     | 49/100 [00:01<00:01, 25.78it/s]

data/KsponSpeech_sample/KsponSpeech_000046 (805, 39)
data/KsponSpeech_sample/KsponSpeech_000047 (193, 39)
data/KsponSpeech_sample/KsponSpeech_000048 (434, 39)
data/KsponSpeech_sample/KsponSpeech_000049 (993, 39)
data/KsponSpeech_sample/KsponSpeech_000050 (248, 39)
data/KsponSpeech_sample/KsponSpeech_000051

 53%|█████▎    | 53/100 [00:02<00:01, 28.31it/s]

 (258, 39)
data/KsponSpeech_sample/KsponSpeech_000052 (178, 39)
data/KsponSpeech_sample/KsponSpeech_000053 (419, 39)
data/KsponSpeech_sample/KsponSpeech_000054 (211, 39)
data/KsponSpeech_sample/KsponSpeech_000055 (594, 39)
data/KsponSpeech_sample/KsponSpeech_000056 (841, 39)


 59%|█████▉    | 59/100 [00:02<00:02, 19.53it/s]

data/KsponSpeech_sample/KsponSpeech_000057 (202, 39)
data/KsponSpeech_sample/KsponSpeech_000058 (2914, 39)
data/KsponSpeech_sample/KsponSpeech_000059 (225, 39)


 66%|██████▌   | 66/100 [00:02<00:01, 24.39it/s]

data/KsponSpeech_sample/KsponSpeech_000060 (509, 39)
data/KsponSpeech_sample/KsponSpeech_000061 (208, 39)
data/KsponSpeech_sample/KsponSpeech_000062 (429, 39)
data/KsponSpeech_sample/KsponSpeech_000063 (207, 39)
data/KsponSpeech_sample/KsponSpeech_000064 (364, 39)
data/KsponSpeech_sample/KsponSpeech_000065 (662, 39)
data/KsponSpeech_sample/KsponSpeech_000066 (583, 39)


 69%|██████▉   | 69/100 [00:02<00:01, 23.31it/s]

data/KsponSpeech_sample/KsponSpeech_000067 (456, 39)
data/KsponSpeech_sample/KsponSpeech_000068 (1460, 39)
data/KsponSpeech_sample/KsponSpeech_000069 (537, 39)
data/KsponSpeech_sample/KsponSpeech_000070 (769, 39)
data/KsponSpeech_sample/KsponSpeech_000071 (219, 39)


 75%|███████▌  | 75/100 [00:03<00:01, 22.42it/s]

data/KsponSpeech_sample/KsponSpeech_000072 (424, 39)
data/KsponSpeech_sample/KsponSpeech_000073 (371, 39)
data/KsponSpeech_sample/KsponSpeech_000074 (426, 39)
data/KsponSpeech_sample/KsponSpeech_000075 (1706, 39)
data/KsponSpeech_sample/KsponSpeech_000076 (600, 39)


 81%|████████  | 81/100 [00:03<00:00, 24.19it/s]

data/KsponSpeech_sample/KsponSpeech_000077 (160, 39)
data/KsponSpeech_sample/KsponSpeech_000078 (1042, 39)
data/KsponSpeech_sample/KsponSpeech_000079 (235, 39)
data/KsponSpeech_sample/KsponSpeech_000080 (1162, 39)
data/KsponSpeech_sample/KsponSpeech_000081 (316, 39)
data/KsponSpeech_sample/KsponSpeech_000082 (227, 39)


 90%|█████████ | 90/100 [00:03<00:00, 29.32it/s]

data/KsponSpeech_sample/KsponSpeech_000083 (324, 39)
data/KsponSpeech_sample/KsponSpeech_000084 (384, 39)
data/KsponSpeech_sample/KsponSpeech_000085 (575, 39)
data/KsponSpeech_sample/KsponSpeech_000086 (316, 39)
data/KsponSpeech_sample/KsponSpeech_000087 (126, 39)
data/KsponSpeech_sample/KsponSpeech_000088 (612, 39)
data/KsponSpeech_sample/KsponSpeech_000089 (147, 39)
data/KsponSpeech_sample/KsponSpeech_000090 (259, 39)


 95%|█████████▌| 95/100 [00:03<00:00, 32.10it/s]

data/KsponSpeech_sample/KsponSpeech_000091 (262, 39)
data/KsponSpeech_sample/KsponSpeech_000092 (407, 39)
data/KsponSpeech_sample/KsponSpeech_000093 (273, 39)
data/KsponSpeech_sample/KsponSpeech_000094 (264, 39)
data/KsponSpeech_sample/KsponSpeech_000095 (208, 39)
data/KsponSpeech_sample/KsponSpeech_000096 (617, 39)
data/KsponSpeech_sample/KsponSpeech_000097 (672, 39)


100%|██████████| 100/100 [00:03<00:00, 25.86it/s]

data/KsponSpeech_sample/KsponSpeech_000098 (297, 39)
data/KsponSpeech_sample/KsponSpeech_000099 (1276, 39)
data/KsponSpeech_sample/KsponSpeech_001000 (358, 39)
Done...





## Load dataset from TFRecords file

In [72]:
def from_tfrecord(serialized):
    features = tf.parse_single_example(
        serialized=serialized,
#         features={
#             "file_name": tf.VarLenFeature(tf.string),
#             "feature": tf.VarLenFeature(tf.string),
#             "trans": tf.VarLenFeature(tf.string),
#             "prons": tf.VarLenFeature(tf.string),
#             "label": tf.VarLenFeature(tf.string)
#         }
        features={
            "file_name": tf.FixedLenFeature([], tf.string),
            "feature": tf.FixedLenFeature([], tf.string),
            "trans": tf.FixedLenFeature([], tf.string),
            "prons": tf.FixedLenFeature([], tf.string),
            "label": tf.FixedLenFeature([], tf.string)
        }
    )
#     file_name = tf.decode_raw(features['file_name'], tf.string)
    file_name = features['file_name']
    feature = tf.decode_raw(features['feature'], tf.float32)
    #feature = tf.reshape(tf.decode_raw(features['feature'], tf.float32), [-1, 39])
#     trans = tf.decode_raw(features['trans'], tf.string)
    trans = features['trans']
#     prons = tf.decode_raw(features['prons'], tf.string)
    prons = features['prons']
    labels = tf.decode_raw(features['label'], tf.int16)
    
    return file_name, feature, trans, prons, labels

In [73]:
dataset = tf.data.TFRecordDataset(filenames="tfrecord/tfrecord_practice.tfrecords",
                                 compression_type="GZIP").map(from_tfrecord)
g_fname, g_feature, g_trans, g_prons, g_labels = dataset.batch(2).shuffle(10).repeat(10).make_one_shot_iterator().get_next()

In [74]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [75]:
_labels = sess.run([g_labels])

InvalidArgumentError: Cannot batch tensors with different shapes in component 1. First element had shape [24492] and element 1 had shape [82524].
	 [[Node: IteratorGetNext_11 = IteratorGetNext[output_shapes=[[?], [?,?], [?], [?], [?,?]], output_types=[DT_STRING, DT_FLOAT, DT_STRING, DT_STRING, DT_INT16], _device="/job:localhost/replica:0/task:0/device:CPU:0"](OneShotIterator_11)]]

Caused by op 'IteratorGetNext_11', defined at:
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/traitlets/config/application.py", line 664, in launch_instance
    app.start()
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 583, in start
    self.io_loop.start()
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 149, in start
    self.asyncio_loop.run_forever()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/asyncio/base_events.py", line 421, in run_forever
    self._run_once()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/asyncio/base_events.py", line 1426, in _run_once
    handle._run()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/asyncio/events.py", line 127, in _run
    self._callback(*self._args)
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/tornado/ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/tornado/ioloop.py", line 743, in _run_callback
    ret = callback()
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/tornado/gen.py", line 787, in inner
    self.run()
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/tornado/gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 361, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 268, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 541, in execute_request
    user_expressions, allow_stdin,
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 300, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2858, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2886, in _run_cell
    return runner(coro)
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3063, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3254, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-73-4bb0f19d8678>", line 3, in <module>
    g_fname, g_feature, g_trans, g_prons, g_labels = dataset.batch(2).shuffle(10).repeat(10).make_one_shot_iterator().get_next()
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/tensorflow/python/data/ops/iterator_ops.py", line 410, in get_next
    name=name)), self._output_types,
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/tensorflow/python/ops/gen_dataset_ops.py", line 2069, in iterator_get_next
    output_shapes=output_shapes, name=name)
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 454, in new_func
    return func(*args, **kwargs)
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3155, in create_op
    op_def=op_def)
  File "/Users/kimsu/workspace/stt/env/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1717, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): Cannot batch tensors with different shapes in component 1. First element had shape [24492] and element 1 had shape [82524].
	 [[Node: IteratorGetNext_11 = IteratorGetNext[output_shapes=[[?], [?,?], [?], [?], [?,?]], output_types=[DT_STRING, DT_FLOAT, DT_STRING, DT_STRING, DT_INT16], _device="/job:localhost/replica:0/task:0/device:CPU:0"](OneShotIterator_11)]]


In [65]:
_labels

[array([[ 8,  0,  0,  0, 57,  0,  0,  0,  3,  0,  0,  0,  0,  0,  0,  0,
         24,  0,  0,  0, 61,  0,  0,  0, 40,  0,  0,  0,  0,  0,  0,  0,
         18,  0,  0,  0, 61,  0,  0,  0, 26,  0,  0,  0, 54,  0,  0,  0,
         64,  0,  0,  0,  0,  0,  0,  0, 15,  0,  0,  0, 58,  0,  0,  0,
         15,  0,  0,  0, 59,  0,  0,  0, 40,  0,  0,  0,  0,  0,  0,  0,
         14,  0,  0,  0, 61,  0,  0,  0,  0,  0,  0,  0,  4,  0,  0,  0,
          8,  0,  0,  0]], dtype=int16)]

In [60]:
np.shape(_labels)

(1, 1, 100)

In [46]:
_label

[array([[ 8,  0,  0,  0, 57,  0,  0,  0,  3,  0,  0,  0,  0,  0,  0,  0,
         24,  0,  0,  0, 61,  0,  0,  0, 40,  0,  0,  0,  0,  0,  0,  0,
         18,  0,  0,  0, 61,  0,  0,  0, 26,  0,  0,  0, 54,  0,  0,  0,
         64,  0,  0,  0,  0,  0,  0,  0, 15,  0,  0,  0, 58,  0,  0,  0,
         15,  0,  0,  0, 59,  0,  0,  0, 40,  0,  0,  0,  0,  0,  0,  0,
         14,  0,  0,  0, 61,  0,  0,  0,  0,  0,  0,  0,  4,  0,  0,  0,
          8,  0,  0,  0]], dtype=int16)]