In [1]:
# environment: tensorflow 1.2

import tensorflow as tf
import numpy as np
import utils
import cells

In [2]:
########## Read data ##########

# Directories holding the TIMIT phoneme-level MFCC features and their labels.
train_data_dir = "./data/TIMIT/phn/train/mfcc/"
train_label_dir = "./data/TIMIT/phn/train/label/"
test_data_dir = "./data/TIMIT/phn/test/mfcc/"
test_label_dir = "./data/TIMIT/phn/test/label/"

# Every loaded object is a list of 2D ([feature_num, time_step]) numpy arrays.
train_data = utils.read_ndarray_from(train_data_dir)
train_label = utils.read_ndarray_from(train_label_dir)
test_data = utils.read_ndarray_from(test_data_dir)
test_label = utils.read_ndarray_from(test_label_dir)

# --------- debugging aid: keep only the first 100 samples of each list ---------
train_data, train_label = train_data[:100], train_label[:100]
test_data, test_label = test_data[:100], test_label[:100]
# -------------------------------------------------------------------------------

# Quick look at what was loaded: container type, sample count and element type.
summary_template = "{}'s {}, samples num: {}, each element has {}"
for _name, _samples in (("train_data", train_data),
                        ("train_label", train_label),
                        ("test_data", test_data),
                        ("test_label", test_label)):
    print(summary_template.format(_name, type(_samples), len(_samples), type(_samples[0])))


train_data's <type 'list'>, samples num: 100, each element has <type 'numpy.ndarray'>
train_label's <type 'list'>, samples num: 100, each element has <type 'numpy.ndarray'>
test_data's <type 'list'>, samples num: 100, each element has <type 'numpy.ndarray'>
test_label's <type 'list'>, samples num: 100, each element has <type 'numpy.ndarray'>


In [3]:
########## Define Hyper-parameters ##########
class Argument(object):
    """Hyper-parameters and data-derived sizes used to build and train the model.

    ``num_feature`` and ``max_timestep`` are computed from the notebook-level
    ``train_data`` / ``test_data`` lists (and ``utils.get_max_timestep``), so the
    data-loading cell must have been run before an instance is created.
    """

    def __init__(self):
        # --- training schedule ---
        self.max_epoch = 500
        self.batch_size = 32
        self.learning_rate = 0.001

        # --- network size ---
        self.num_layers = 2
        self.num_hidden = 256
        # NOTE(review): presumably the 61 TIMIT phone labels plus one extra
        # class for the CTC blank -- confirm against the label encoding.
        self.num_classes = 62

        # --- regularisation / cell options ---
        self.layer_norm = True  # was assigned twice in the original; set once here
        self.dropout_prob = 0.1
        self.dropout_keep_prob = 1 - self.dropout_prob
        self.cell_type = 'LSTMCell'  # option: LSTMCell, RNNCell, GRUCell
        # self.activation = 'tanh'  # option: tanh, ReLU

        # --- sizes derived from the loaded data ---
        # First axis of a sample is the feature axis.
        self.num_feature = train_data[0].shape[0]
        self.max_timestep = utils.get_max_timestep(train_data, test_data)

In [4]:
########## Define a model ##########

args = Argument()
graph = tf.Graph()
with graph.as_default():
    # Acoustic features, fed as a fixed-size [batch, time, feature] tensor.
    inputs = tf.placeholder(tf.float32,
                            [args.batch_size, args.max_timestep, args.num_feature],
                            name="inputs")

    # Label sequences are fed as the three components of a SparseTensor
    # (indices, values, dense shape), which is the label format ctc_loss takes.
    targetsIdx = tf.placeholder(tf.int64)
    targetsVal = tf.placeholder(tf.int32)
    targetsShape = tf.placeholder(tf.int64)
    targets = tf.SparseTensor(targetsIdx, targetsVal, targetsShape)

    # One sequence length per sample in the batch.
    seq_len = tf.placeholder(tf.int32, [args.batch_size], name="seq_len")

    # Build one cell per layer and stack them into a single multi-layer cell.
    stacked_cells = [cells.select_cell(args) for _layer in range(args.num_layers)]
    mul_cells = tf.contrib.rnn.MultiRNNCell(stacked_cells)

    # Unroll with dynamic_rnn; only the per-step outputs are kept, the final
    # state is discarded.
    # output shape: [batch_size, time_steps, num_hidden]
    # (assuming cells.select_cell builds cells of size args.num_hidden)
    output, _ = tf.nn.dynamic_rnn(mul_cells, inputs, sequence_length=seq_len, dtype=tf.float32)

    # Fully connected projection onto the output classes.
    logits = tf.layers.dense(output, args.num_classes)

    # Switch to time-major layout: [time_steps, batch_size, num_classes].
    logits = tf.transpose(logits, perm=[1, 0, 2])

    # CTC loss per sample, averaged over the batch, minimised with Adam.
    loss = tf.nn.ctc_loss(labels=targets, inputs=logits, sequence_length=seq_len)
    cost = tf.reduce_mean(loss)

    optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate).minimize(cost)

    # Beam-search decoding; keep the first decoded path as int32 predictions.
    decoded, _log_probs = tf.nn.ctc_beam_search_decoder(logits, seq_len, merge_repeated=False)
    predictions = tf.to_int32(decoded[0])
    



In [5]:
######### 3. Start training     #########

batch_size = args.batch_size
max_epoch = args.max_epoch
num_samples = len(train_data)
level = 'phn'
# data_idx = np.random.permutation(num_samples)

# Batch both splits once up front; only the first element of each returned
# pair (the list of batches) is used in this cell.
(train_dataBatches, _) = utils.data_lists_to_batches(train_data, train_label, batch_size, level)
train_num_batches = len(train_dataBatches)
train_batchErrors = np.zeros(train_num_batches)

(test_dataBatches, _) = utils.data_lists_to_batches(test_data, test_label, batch_size, level)
test_num_batches = len(test_dataBatches)
test_batchErrors = np.zeros(test_num_batches)

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(max_epoch):
        for i in range(train_num_batches):
            # Each batch unpacks to (inputs, sparse-target triple, sequence lengths).
            batched_inputs, batched_sparse_targets, batched_seq_len = train_dataBatches[i]
            batched_targets_idx, batched_targets_val, batched_targets_shape = batched_sparse_targets
            feed_dict = {inputs: batched_inputs,
                         targetsIdx: batched_targets_idx,
                         targetsVal: batched_targets_val,
                         targetsShape: batched_targets_shape,
                         seq_len: batched_seq_len}
            # One optimisation step; also fetch the loss, the decoded output and
            # the fed targets so the error can be computed below.
            _, cost_val, pre_y, y = sess.run([optimizer, cost, predictions, targets], feed_dict=feed_dict)

            er = utils.get_edit_distance([pre_y.values], [y.values], True, level)
            # The format string below was reconstructed: in the original it was
            # split in two by a stray copy of the commented-out evaluation loop.
            print('\n{} mode, total:{},batch:{}/{},epoch:{}/{},train loss={:.3f},mean train PER={:.3f}\n'.format(
                level, num_samples, i+1, train_num_batches, epoch+1, max_epoch, cost_val, er))
            # Scale the batch error by the number of samples in the batch.
            # NOTE(review): train_batchErrors is filled but not read in this cell.
            train_batchErrors[i] = er * len(batched_seq_len)
            if i % 30 == 0:
                # Show a reference/decoded pair every 30th batch.
                print('Truth:\n' + utils.output_to_sequence(y, type=level))
                print('Output:\n' + utils.output_to_sequence(pre_y, type=level))

# Evaluation loop, currently disabled:
#         for i in range(test_num_batches):
#             test_batched_inputs, test_batched_sparse_targets, test_batched_seq_len = test_dataBatches[i]
#             test_batched_targets_idx, test_batched_targets_val, test_batched_targets_shape = test_batched_sparse_targets
#             feed_dict = {inputs:test_batched_inputs, targetsIdx:test_batched_targets_idx, targetsVal:test_batched_targets_val,\
#                          targetsShape:test_batched_targets_shape,seq_len:test_batched_seq_len}
#             cost_val, pre_y, y = sess.run([cost, predictions, targets], feed_dict=feed_dict)
#             er = utils.get_edit_distance([pre_y.values], [y.values], True, level)
#             print('\n{} mode, batch:{}/{},epoch:{}/{},test loss={:.3f},mean test PER={:.3f}\n'.format(
#                 level , i+1, test_num_batches, epoch+1, max_epoch, cost_val, er))
#             test_batchErrors[i] = er * len(test_batched_seq_len)
            

        


phn mode, total:100,batch:1/3,epoch:1/500,train loss=1128.488,mean train PER=3.388

Truth:
h# n iy eh ae dx jh y er dx d ae r dx k s jh dx ix ng dx g r iy s iy w ae n dx w ae dx ch dx ae l y el ch h#
Output:
ux ix ax-h epi dx epi ax-h zh g n f ay nx aa nx aa nx eng w r bcl d eng eh eng eh th tcl th m th dcl axr tcl ax-h tcl ow tcl p ax-h p hv p uw hv t ah ow tcl uw tcl h# ux s ao s ao h# ao s er ch dcl ix t pau ix tcl ax-h tcl er tcl k tcl ax-h tcl axr nx axr eh l el l zh p dcl eh q f th er bcl er w s aw ih ay h# ay ax-h h# ax-h k ax-h k pau k ux k ch s ux ix r uh en zh m dx r dx eh dx eh hv q ix ch ix dx ix ao p zh p pau zh pau ao k hh k pcl k dx aa uw n dh d dh ix ax q jh sh jh sh dh ah ix ux ng ax th ay y ay y

phn mode, total:100,batch:2/3,epoch:1/500,train loss=788.384,mean train PER=2.620


phn mode, total:100,batch:3/3,epoch:1/500,train loss=368.575,mean train PER=0.793


phn mode, total:100,batch:1/3,epoch:2/500,train loss=187.955,mean train PER=0.945

Truth:
h# n iy eh ae dx 

In [6]:
# Full phone label set (61 entries).
phn = ['aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ax-h',
       'axr', 'ay', 'b', 'bcl', 'ch', 'd', 'dcl',
       'dh', 'dx', 'eh', 'el', 'em', 'en', 'eng',
       'epi', 'er', 'ey', 'f', 'g', 'gcl', 'h#',
       'hh', 'hv', 'ih', 'ix', 'iy', 'jh', 'k',
       'kcl', 'l', 'm', 'n', 'ng', 'nx', 'ow',
       'oy', 'p', 'pau', 'pcl', 'q', 'r', 's',
       'sh', 't', 'tcl', 'th', 'uh', 'uw', 'ux',
       'v', 'w', 'y', 'z', 'zh']

# Folding table: each of these 22 labels is replaced by a label from group_phn.
mapping = {'ah': 'ax', 'ax-h': 'ax', 'ux': 'uw', 'aa': 'ao', 'ih': 'ix',
           'axr': 'er', 'el': 'l', 'em': 'm', 'en': 'n', 'nx': 'n',
           'eng': 'ng', 'sh': 'zh', 'hv': 'hh', 'bcl': 'h#', 'pcl': 'h#',
           'dcl': 'h#', 'tcl': 'h#', 'gcl': 'h#', 'kcl': 'h#',
           'q': 'h#', 'epi': 'h#', 'pau': 'h#'}

# Reduced label set (39 entries) that remains after folding.
group_phn = ['ae', 'ao', 'aw', 'ax', 'ay', 'b', 'ch', 'd', 'dh', 'dx', 'eh',
             'er', 'ey', 'f', 'g', 'h#', 'hh', 'ix', 'iy', 'jh', 'k', 'l',
             'm', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's', 't', 'th', 'uh', 'uw',
             'v', 'w', 'y', 'z', 'zh']

# Consistency checks: every folded label lands in the reduced set, and the
# reduced set is exactly the full set minus the folded labels (same order).
assert all(target in group_phn for target in mapping.values())
assert [label for label in phn if label not in mapping] == group_phn

# The original cell defined `mapping` and `group_phn` a second time with
# identical contents and printed their sizes again; each is now defined and
# reported once.
print(len(phn))
print(len(mapping))
print(len(group_phn))

61
22
39
22
39
