In [1]:
%matplotlib inline
import os
import tensorflow as tf
import numpy as np
import pandas as pd
from scipy.fftpack import dct
import matplotlib.pyplot as plt
from python_speech_features.base import mfcc
from seq2seq_model import seq2seq_model
import scipy.io.wavfile

In [2]:
root = 'gs://wsj-data/wsj0/'
keep_prob = 0.8
max_output_length = 100
rnn_size = 256
num_layers = 2
batch_size = 16
learning_rate = 0.0005

learning_rate_decay = 0.95
min_learning_rate = 0.0005
display_step = 20 # Check training loss after every 20 batches
stop_early = 0 
stop = 3 # If the update loss does not decrease in 3 consecutive update checks, stop training

checkpoint = "gs://wsj-data/best_model.ckpt" 

In [3]:
vocab = np.asarray(list(" '+-.ABCDEFGHIJKLMNOPQRSTUVWXYZ_"))
vocab_to_int = {}

for ch in vocab:
    vocab_to_int[ch] = len(vocab_to_int)

vocab_to_int['<GO>'] = len(vocab_to_int)
vocab_to_int['<EOS>'] = len(vocab_to_int)


def onehot(x):
    x = np.asarray(x)
    return np.tile(x, (32, 1)).T == vocab # 32 = vocab length

In [4]:
# A custom class inheriting tf.gfile.Open for providing seek with whence
class FileOpen(tf.gfile.Open):
    def seek(self, position, whence = 0):
        if (whence == 0):
            tf.gfile.Open.seek(self, position)
        elif (whence == 1):
            tf.gfile.Open.seek(self, self.tell() + position)
        else:
            raise FileError

In [5]:
# https://github.com/zszyellow/WER-in-python/blob/master/wer.py
def wer(r, h):
    """
    This is a function that calculate the word error rate in ASR.
    You can use it like this: wer("what is it".split(), "what is".split()) 
    """
    #build the matrix
    d = numpy.zeros((len(r)+1)*(len(h)+1), dtype=numpy.uint8).reshape((len(r)+1, len(h)+1))
    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0: d[0][j] = j
            elif j == 0: d[i][0] = i
    for i in range(1,len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
            else:
                substitute = d[i-1][j-1] + 1
                insert = d[i][j-1] + 1
                delete = d[i-1][j] + 1
                d[i][j] = min(substitute, insert, delete)
    return float(d[len(r)][len(h)]) / len(r) * 100

In [6]:
out_file = tf.gfile.Open(root + 'transcripts/wsj0/wsj0.trans')
numcep = 13

def get_next_input():
    trans = out_file.readline()
    cont, file = trans.split('(')
    file = file[:-2]
    sample_rate, signal = scipy.io.wavfile.read(FileOpen(root + 'wav/' + file.rstrip('\n'), 'rb'))
    Y = onehot(list(cont))
    X = mfcc(signal, sample_rate, numcep=numcep)
    return X, Y

In [7]:
# Make a graph and it's session
train_graph = tf.Graph()
train_session = tf.InteractiveSession(graph=train_graph)

# Set the graph to default to ensure that it is ready for training
with train_graph.as_default():
    model_input = tf.placeholder(tf.float32, [None, None, numcep], name='model_input')
    model_output = tf.placeholder(tf.int32, [None, None], name='model_output')
    input_lengths = tf.placeholder(tf.int32, [None], name='input_lengths')
    output_lengths = tf.placeholder(tf.int32, [None], name='output_lengths')
    learning_rate_tensor = tf.placeholder(tf.float32, name='learning_rate')
    
    # Create the training and inference logits
    training_logits, inference_logits = seq2seq_model(input_data=model_input,
                                                      target_data=model_output,
                                                      keep_prob=keep_prob,
                                                      input_lengths=input_lengths,
                                                      output_lengths=output_lengths,
                                                      max_output_length=max_output_length,
                                                      vocab_size=len(vocab_to_int),
                                                      rnn_size=rnn_size,
                                                      num_layers=num_layers,
                                                      vocab_to_int=vocab_to_int,
                                                      batch_size=batch_size)
    
    # Create tensors for the training logits and inference logits
    training_logits = tf.identity(training_logits.rnn_output, 'logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')
    
    # Create the weights for sequence_loss
    masks = tf.sequence_mask(output_lengths, max_output_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            model_output,
            masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate_tensor)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

In [8]:
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    # If we want to continue training a previous session
    #loader = tf.train.import_meta_graph("./" + checkpoint + '.meta')
    #loader.restore(sess, checkpoint)
    
    for epoch_i in range(1, epochs+1):
        batch_loss = 0
        for batch_i, (summaries_batch, texts_batch, summaries_lengths, texts_lengths) in enumerate(
                get_batches(sorted_summaries_short, sorted_texts_short, batch_size)):
            start_time = time.time()
            _, loss = sess.run(
                [train_op, cost],
                {model_input: input_batch,
                 model_output: output_batch,
                 learning_rate_tensor: learning_rate,
                 output_lengths: output_lengths_batch,
                 input_lengths: input_lengths_batch,
                 keep_prob: keep_probability})

            batch_loss += loss
            end_time = time.time()
            batch_time = end_time - start_time
            

            if batch_i % display_step == 0 and batch_i > 0:
                print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(sorted_texts_short) // batch_size, 
                              batch_loss / display_step, 
                              batch_time*display_step))
                batch_loss = 0            
                    
        # Reduce learning rate, but not below its minimum value
        learning_rate *= learning_rate_decay
        if learning_rate < min_learning_rate:
            learning_rate = min_learning_rate

NameError: name 'sorted_texts_short' is not defined