In [1]:
import math
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
import tensorflow as tf
# Always print arrays in full (no "..." summarisation).
# np.nan is rejected as a threshold by newer NumPy releases (ValueError);
# np.inf disables truncation on old and new versions alike.
np.set_printoptions(threshold=np.inf)

  from ._conv import register_converters as _register_converters


In [2]:
def get_max_phonemes(durations):
    """Find the longest phoneme duration and the longest sentence in the data.

    Args:
        durations: iterable of sentences, where each sentence is a sequence
            of entries whose element [1] is the frame count of one phoneme
            (element [0] is the phoneme itself).

    Returns:
        A tuple ``(max_frames, max_phonemes)``: the largest frame count seen
        for any single phoneme, followed by the largest number of phonemes in
        any sentence. Both are 0 when ``durations`` is empty.
    """
    max_phonemes = 0
    max_frames = 0
    for sentence in durations:
        max_phonemes = max(max_phonemes, len(sentence))
        # Index each entry directly rather than via zip(*sentence)[1]: that
        # form is not subscriptable on Python 3 and fails on empty sentences.
        for entry in sentence:
            max_frames = max(max_frames, entry[1])
    return max_frames, max_phonemes

In [3]:
"""Set of hyperparameters"""
# --- Data -------------------------------------------------------------------
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load pickles produced by a trusted pipeline.
# Pickles are binary data, so they are opened in 'rb' mode (required on
# Python 3 and on Windows; harmless on Python 2 / POSIX).
with open("duration_pred.pkl", 'rb') as f:
    durations, speaker_dur = pickle.load(f)

with open("max_frames_speakers_count.pkl", 'rb') as f:
    max_frames, speakers, count = pickle.load(f)

# Largest per-phoneme frame count and longest sentence (in phonemes),
# both measured from the loaded data.
max_phoneme_frames, max_num_phonemes = get_max_phonemes(durations)

# --- Phoneme inventory ------------------------------------------------------
silence = "SIL"

phonemes = [silence,'AA','AE','AH','AO','AW','AY','B','CH','D','DH','EH',
            'ER','EY','F','G','HH','IH','IY','JH','K','L','M','N','NG','OW','OY',
            'P','R','S','SH','T','TH','UH','UW','V','W','Y','Z','ZH']

# Derived from the list (40 entries) so the input placeholder width can never
# drift away from the phoneme embedding, which is sized with len(phonemes).
phoneme_count = len(phonemes)

phoneme_dict = dict()

# --- Model sizes ------------------------------------------------------------
batch_size = 32
speak_feats = 16 #features in speaker embedding
phoneme_feats = 16 #features in phoneme embedding

num_time_buckets = 30 #number of duration classes predicted by the CRF

# NOTE(review): presumably this should equal len(speakers) loaded above;
# kept hard-coded because the saved checkpoint fixes the embedding shape.
num_speakers = 108

mlp_layers = 4 #number of layers in MLP
mlp_units = 256 #number of units in each MLP layer

num_rnn_layers = 4  #number of layers in RNN network
num_hidden_gru = 512 #number of hidden units in each RNN layer


In [4]:
# Graph inputs. All shapes are static, so every feed must contain exactly
# batch_size sentences padded to max_num_phonemes positions.

# One-hot phonemes whose durations are to be predicted:
# (batch_size, max_num_phonemes, phoneme_count).
x = tf.placeholder(tf.float32, [batch_size, max_num_phonemes,phoneme_count])

# Ground-truth BUCKETED durations of the phonemes, obtained from the
# segmentation model. Stored as indices from 0 to num_time_buckets - 1.
y_inp = tf.placeholder(tf.int32, [batch_size,max_num_phonemes])

# Number of real (unpadded) phonemes in each sentence of the batch.
y_length = tf.placeholder(tf.int32, [batch_size])
# One-hot speaker selector matrix: (batch_size, num_speakers).
speaker_selector = tf.placeholder(tf.float32, [batch_size, num_speakers])

In [5]:
"""16 feature speaker embedding for better distinction in speakers"""
with tf.variable_scope("speaker_embedding"):
    # Trainable lookup table with one row of speak_feats values per speaker.
    speaker = tf.get_variable("speaker",
                              shape=(num_speakers,speak_feats),
                              dtype= tf.float32,
                              initializer=tf.contrib.layers.xavier_initializer(),
                              trainable=True)
    # Multiplying the one-hot selector by the table picks each sentence's
    # speaker row, giving a (batch_size, speak_feats) tensor.
    speaker_matrix = tf.matmul(speaker_selector, speaker)
# Bare expression: displays the tensor (and its static shape) as cell output.
speaker_matrix

<tf.Tensor 'speaker_embedding/MatMul:0' shape=(32, 16) dtype=float32>

In [6]:
"""16 feature phoneme embedding for better distinction in phonemes"""
with tf.variable_scope("phoneme_embedding"):
    # Trainable lookup table with one row of phoneme_feats values per phoneme.
    phoneme_emb = tf.get_variable("phoneme_emb",
                              shape=(len(phonemes),phoneme_feats),
                              dtype= tf.float32,
                              initializer=tf.contrib.layers.xavier_initializer(),
                              trainable=True)
    # Flatten batch and time into one axis so a single matmul embeds every
    # one-hot phoneme. This relies on len(phonemes) matching the last
    # dimension of x (phoneme_count).
    unstacked_x = tf.reshape(x, [-1,len(phonemes)])
    phoneme_array = tf.matmul(unstacked_x, phoneme_emb)
    # Restore the (batch_size, max_num_phonemes, phoneme_feats) layout.
    phoneme_matrix = tf.reshape(phoneme_array,[batch_size, max_num_phonemes,phoneme_feats])
# Bare expression: displays the tensor (and its static shape) as cell output.
phoneme_matrix

<tf.Tensor 'phoneme_embedding/Reshape_1:0' shape=(32, 106, 16) dtype=float32>

In [7]:
"""fully connected layer that takes speaker as the input and sends state to phonemes before MLP"""
with tf.variable_scope("MLP_speaker"):
    # Project the speaker embedding to phoneme_feats units so it has the same
    # width as the phoneme embedding it is later concatenated with.
    mlp_speaker_outputs = phoneme_feats
    # activation_fn=None skips the default ReLU, leaving a linear projection.
    mlp_speaker = tf.contrib.layers.fully_connected(speaker_matrix,
                                                    mlp_speaker_outputs,
                                                    activation_fn= None)
# Bare expression: displays the tensor (and its static shape) as cell output.
mlp_speaker

<tf.Tensor 'MLP_speaker/fully_connected/BiasAdd:0' shape=(32, 16) dtype=float32>

In [8]:
"""appending phonemes to speaker embedding."""
with tf.variable_scope("MLP_preprocessing"):
    # Repeat the projected speaker vector once per phoneme position:
    # expand_dims adds a leading axis of size 1, tile_batch repeats along that
    # axis max_num_phonemes times, and the transpose swaps the first two axes
    # so the result lines up with phoneme_matrix (batch first, position second).
    speaker_tiled = tf.contrib.seq2seq.tile_batch(tf.expand_dims(mlp_speaker,0), multiplier = max_num_phonemes)
    speaker_tiled = tf.transpose(speaker_tiled, [1,0,2])
    # Concatenate speaker and phoneme features along the feature axis.
    mlp_input = tf.concat([speaker_tiled,phoneme_matrix],axis=2)

# Function-call form of print is valid on both Python 2 and Python 3.
print(mlp_input)

Tensor("MLP_preprocessing/concat:0", shape=(32, 106, 32), dtype=float32)


In [9]:
"""creating the MLP network which takes phoneme sequence appended to speaker embedding as input"""
with tf.variable_scope("MLP"):
    # Placeholder value only; overwritten with the last layer's output below.
    mlp_output = []
    MLP = [mlp_input,]  #index 0 is the input, index 1 to n correspond to layer 1 to n respectively
    for i in range(mlp_layers):
        # Each layer consumes the output of the previous one (MLP[i]).
        layer = tf.contrib.layers.fully_connected(MLP[i],
                                                  mlp_units,
                                                  activation_fn=None)  ## activation_fn=None disables the default ReLU, so
                                                                             ## every layer here is purely linear.
                                                                             ## NOTE(review): an earlier comment claimed tanh was
                                                                             ## used; the code does not apply it. Confirm the
                                                                             ## intended activation before retraining. Changing
                                                                             ## it invalidates the saved checkpoints.
        MLP.append(layer)
    mlp_output = MLP[mlp_layers]

# Bare expression: displays the tensor (and its static shape) as cell output.
mlp_output

<tf.Tensor 'MLP/fully_connected_3/BiasAdd:0' shape=(32, 106, 256) dtype=float32>

In [10]:
"""fully connected layer that takes speaker as the input and sends state to the Bidirectional Gru layers"""
with tf.variable_scope("rnn_speaker"):
    # Project the speaker embedding to num_hidden_gru units so it can be used
    # directly as the initial hidden state of the GRU cells.
    rnn_speaker_outputs = num_hidden_gru
    # activation_fn=None skips the default ReLU, leaving a linear projection.
    rnn_speaker = tf.contrib.layers.fully_connected(speaker_matrix,
                                                    rnn_speaker_outputs,
                                                    activation_fn= None)
# Bare expression: displays the tensor (and its static shape) as cell output.
rnn_speaker

<tf.Tensor 'rnn_speaker/fully_connected/BiasAdd:0' shape=(32, 512) dtype=float32>

In [11]:
def gru_cell():
    """Return a fresh GRU cell with num_hidden_gru hidden units."""
    return tf.nn.rnn_cell.GRUCell(num_hidden_gru)

# Dropout keep probability as a feedable scalar instead of a baked-in constant.
# The default (0.85) reproduces the previous behaviour when nothing is fed;
# feed {dropout_keep_prob: 1.0} at evaluation/inference time to switch dropout
# off and obtain deterministic predictions.
dropout_keep_prob = tf.placeholder_with_default(0.85, shape=[], name="dropout_keep_prob")

inputs = tf.nn.dropout(mlp_output, keep_prob = dropout_keep_prob)

"""creating the BiDirectional GRU layers"""

for i in range(num_rnn_layers):
    # One variable scope per layer keeps each layer's GRU weights separate.
    with tf.variable_scope("RNN_BI"+str(i+1)):
        # Both directions start from the projected speaker vector; positions
        # beyond y_length are treated as padding by the dynamic RNN.
        outputs, output_states = tf.nn.bidirectional_dynamic_rnn(gru_cell(),
                                                                 gru_cell(),
                                                                 inputs,
                                                                 sequence_length=y_length,
                                                                 initial_state_fw = rnn_speaker,
                                                                 initial_state_bw = rnn_speaker)

        # Concatenate forward and backward outputs along the feature axis;
        # the result feeds the next layer.
        inputs = tf.concat(outputs, axis = 2)

rnn_output = inputs
# Bare expression: displays the tensor (and its static shape) as cell output.
rnn_output

<tf.Tensor 'RNN_BI4/concat:0' shape=(32, 106, 1024) dtype=float32>

In [12]:
"""Creating FC layer to reduce RNN output dimension to the number of Time Buckets. SOftmax to convert to probabilities """
with tf.variable_scope("CRF_preprocessing"):
    # Per-position scores over the num_time_buckets duration classes; these
    # are handed to the CRF below.
    # NOTE(review): the scores are softmax-normalised before the CRF sees
    # them. Confirm this is intended rather than feeding unnormalised scores.
    crf_input = tf.contrib.layers.fully_connected(rnn_output,
                                                    num_time_buckets,
                                                    activation_fn= tf.nn.softmax)
    
# Bare expression: displays the tensor (and its static shape) as cell output.
crf_input

<tf.Tensor 'CRF_preprocessing/fully_connected/Reshape_1:0' shape=(32, 106, 30) dtype=float32>

In [13]:
"""CRF Implementation"""
with tf.variable_scope("CRF"):
    # Log-likelihood of the ground-truth bucket sequence y_inp under a CRF
    # whose per-position scores are crf_input. y_length gives the real length
    # of each sequence. transition_params is reused by the decoder cell.
    log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(crf_input,
                                                          y_inp, 
                                                          y_length)
    # Training objective: mean negative log-likelihood over the batch.
    crf_loss = tf.reduce_mean(-log_likelihood)


In [14]:
"""Learning rate and annealing rate are set for the adam optimizer"""

# Step counter; non-trainable so the optimizer never updates it via gradients.
global_step = tf.Variable(0, trainable=False)

starter_learning_rate = 6e-4

# Staircase decay: the learning rate is multiplied by 0.9 once every 400
# optimizer steps, starting from starter_learning_rate.
learning_rate = tf.train.exponential_decay(starter_learning_rate, 
                                           global_step,
                                           400,
                                           0.9,
                                           staircase=True)

# Passing global_step to minimize() will increment it at each step.
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate)

# Training op: running it performs one Adam update on crf_loss.
minimize = optimizer.minimize(crf_loss,global_step = global_step)

In [15]:
"""CRF Decoder to get bucketed durations"""
with tf.variable_scope("Decode_Durations"):
    # Decode the highest-scoring bucket sequence for each sentence, using the
    # same transition parameters as the CRF loss.
    # "vetrebi" is a misspelling of "Viterbi"; the names are kept because
    # other cells reference them.
    vetrebi_sequence, vetrebi_score = tf.contrib.crf.crf_decode(crf_input,
                                                                transition_params,
                                                                y_length)
# Bare expression: displays the tensor (and its static shape) as cell output.
vetrebi_sequence

<tf.Tensor 'Decode_Durations/ReverseSequence_1:0' shape=(32, 106) dtype=int32>

In [16]:
def encode_phonemes(a, phonemes):
    """Build the one-hot vector for phoneme `a` within the `phonemes` list.

    The vector has len(phonemes) entries, all zero except a 1 at the position
    of `a`. Raises ValueError when `a` is not in `phonemes`.
    """
    position = phonemes.index(a)
    one_hot = np.zeros(len(phonemes))
    one_hot[position] = 1
    return one_hot
        
def get_phonemes_encoded(durations,phonemes):
    """One-hot encode every phoneme of every sentence, padded with silence.

    Each sentence in `durations` is a sequence of entries whose element [0]
    is a phoneme. Sentences shorter than the global max_num_phonemes are
    padded with the encoding of phonemes[0] (silence). Returns the batch as a
    NumPy array.
    """
    batch = []
    for sentence in durations:
        rows = [encode_phonemes(entry[0], phonemes) for entry in sentence]
        # Pad up to the fixed sequence length; a non-positive count adds nothing.
        pad_count = max_num_phonemes - len(sentence)
        rows.extend(encode_phonemes(phonemes[0], phonemes) for _ in range(pad_count))
        batch.append(rows)

    return np.array(batch)
                            
    
def get_durations_in_buckets(durations):
    """Convert per-phoneme frame counts into log-spaced bucket indices.

    Args:
        durations: iterable of sentences; element [1] of every entry in a
            sentence is the phoneme's duration in frames.

    Returns:
        Integer NumPy array with one row per sentence. Each row holds the
        bucket index of every phoneme, padded with 0 up to the global
        max_num_phonemes. Indices are clamped to [0, num_time_buckets - 1].

    The bucket edges are spaced evenly in log-space between
    log(min_frame_len) and log(0.95 * max_phoneme_frames), using
    num_time_buckets - 2 increments.
    """
    # NOTE(review): min_frame_len is annotated as 10 ms (0.01) while the
    # durations are frame counts, so the lower bound sits far below any real
    # duration. Left unchanged because get_durations() and the saved
    # checkpoints depend on this exact mapping.
    min_frame_len = 0.01 #### 10ms
    asn_upper = np.log(.95 * max_phoneme_frames)
    asn_lower = np.log(min_frame_len)
    inc = (asn_upper - asn_lower)/(num_time_buckets-2)

    def _bucket(frames):
        # log() is undefined for non-positive durations; map them to bucket 0.
        if frames <= 0:
            return 0
        # int() keeps indices integral on Python 2, where math.ceil returns float.
        index = int(math.ceil((np.log(frames) - asn_lower) / inc))
        return min(max(index, 0), num_time_buckets - 1)

    ans = []
    for sentence in durations:
        # Index entries directly: zip(*sentence)[1] is not subscriptable on
        # Python 3 and fails on empty sentences.
        buckets = [_bucket(entry[1]) for entry in sentence]
        # Pad with bucket 0; a non-positive count adds nothing.
        buckets.extend([0] * (max_num_phonemes - len(buckets)))
        ans.append(buckets)
    return np.array(ans)
      
    
def get_durations(buckets):
    """Map bucket indices back to durations (inverse of the bucketing step).

    Each index b becomes ceil(e ** (b * inc + asn_lower) * 10), where inc and
    asn_lower are the same log-space step and lower bound used when bucketing.
    Returns a NumPy array with one value per element of `buckets`.
    """
    min_frame_len = 0.01 #### 10ms
    asn_lower = np.log(min_frame_len)
    asn_upper = np.log(.95 * max_phoneme_frames)
    inc = (asn_upper - asn_lower)/(num_time_buckets-2)

    decoded = [np.ceil(np.e**((bucket*inc)+asn_lower)*10) for bucket in buckets]
    return np.array(decoded)


def get_true_durations(frames):
    """Turn frame counts into milliseconds by multiplying each by 10."""
    frame_counts = np.array(frames)
    return frame_counts * 10
 

def get_length(durations):
    """Return an array holding the phoneme count of each sentence."""
    return np.array([len(sentence) for sentence in durations])

def get_speaker_one_hot(speaker_list, speakers):
    """One-hot encode each speaker in `speaker_list` against `speakers`.

    Returns a (len(speaker_list), len(speakers)) array of zeros with a single
    1 per row at the index of that row's speaker. Raises ValueError if a
    speaker is missing from `speakers`.
    """
    encoded = np.zeros((len(speaker_list), len(speakers)))
    for row, speaker_id in enumerate(speaker_list):
        column = speakers.index(speaker_id)
        encoded[row, column] = 1
    return encoded


In [17]:
def MOS(test_durations, test_speaker):
    """Returns the mean absolute error, in ms, of the predicted durations.

    Args:
        test_durations: list of sentences; each sentence is a sequence of
            entries whose element [1] is the true duration in frames.
        test_speaker: per-sentence speaker records, aligned with
            `test_durations`; element [0] of each record is the speaker id.

    Returns:
        float: total absolute error divided by the number of phonemes that
        were actually compared. The last phoneme of every sentence is
        excluded from both the numerator and the denominator.

    Raises:
        ValueError: if fewer than batch_size sentences are supplied, so no
            full batch can be evaluated.

    Only complete batches are evaluated; trailing sentences that do not fill
    a batch are ignored. For inspection, the first sentence of the last batch
    is printed with its predicted and true durations.
    """
    # Floor division keeps the batch count an int on Python 3 as well.
    no_of_batches = len(test_durations) // batch_size
    if no_of_batches == 0:
        raise ValueError("MOS needs at least batch_size (%d) sentences, got %d"
                         % (batch_size, len(test_durations)))

    error = 0
    total = 0
    for j in range(no_of_batches):
        ptr = j * batch_size
        inp_durations = test_durations[ptr:ptr+batch_size]
        inp_speakers = [record[0] for record in test_speaker[ptr:ptr+batch_size]]

        x_test = get_phonemes_encoded(inp_durations,phonemes)
        y_length_test = get_length(inp_durations)
        speaker_selector_test = get_speaker_one_hot(inp_speakers, speakers)

        pred = sess.run(vetrebi_sequence,{x:x_test, y_length:y_length_test,speaker_selector:speaker_selector_test})
        for i in range(batch_size):
            # Drop the final phoneme of the sentence from both sequences.
            pred_temp = get_durations(pred[i])[:y_length_test[i]-1]
            true_temp = get_true_durations([entry[1] for entry in inp_durations[i]])[:-1]
            error_temp = np.abs(np.subtract(true_temp,pred_temp))
            error += np.sum(error_temp)
            # Count exactly the phonemes that contributed to the error, so the
            # mean is not diluted by the excluded final phoneme.
            total += len(error_temp)

    # Sample output from the last batch, reusing its predictions instead of
    # running the session a second time.
    print(inp_durations[0])
    print(get_durations(pred[0])[:y_length_test[0]-1])
    print(get_true_durations([entry[1] for entry in inp_durations[0]])[:-1])
    print("MOS:")
    if total == 0:
        # Every sentence had at most one phoneme, so nothing was compared.
        return float('nan')
    return float(error)/float(total)

In [18]:
"""Initialize session"""

# Must run after every graph-building cell so the initializer covers all
# variables created above.
init_op = tf.global_variables_initializer()
# Global session, used by MOS() and get_durations_of_phonemes().
sess = tf.Session()
sess.run(init_op)

In [19]:
"""For using previous models, restore, else comment"""
# Saver created without arguments, i.e. over the variables defined so far.
saver = tf.train.Saver()
# Overwrites the freshly initialised values in `sess` with the saved weights.
# The graph must define the same variable names and shapes as the checkpoint.
saver.restore(sess, "saved_models/epoch36/model.ckpt")

INFO:tensorflow:Restoring parameters from saved_models/epoch36/model.ckpt


In [20]:
"""For Training only- set number of epochs. Calculates Mean Absolute Error at every epoch
    Uncomment for training"""

# %%time
# epoch = 100
# cur_error=0
# min_error=100


# train_durations, test_durations, train_speaker, test_speaker = train_test_split(durations,speaker_dur, test_size=0.1, random_state=20)
# no_of_batches=len(train_durations)/batch_size

# for i in range(epoch):

#     ptr=0
#     for j in range(no_of_batches):
#         if j%100 == 0:
#             print j
#         inp_durations = train_durations[ptr:ptr+batch_size]

#         inp_speakers = list(zip(*train_speaker[ptr:ptr+batch_size])[0])

#         x_train = get_phonemes_encoded(inp_durations,phonemes)

#         y_inp_train = get_durations_in_buckets(inp_durations)

#         y_length_train = get_length(inp_durations)

#         speaker_selector_train = get_speaker_one_hot(inp_speakers, speakers)

#         ptr+=batch_size
#         sess.run(minimize,{x:x_train, y_inp:y_inp_train, y_length:y_length_train,speaker_selector:speaker_selector_train})
#     print "Epoch - ",str(i)

#     print sess.run(crf_loss,{x:x_train, y_inp:y_inp_train, y_length:y_length_train,speaker_selector:speaker_selector_train})
    
#     cur_error = MOS(test_durations[:10*batch_size], test_speaker[:10*batch_size])

#     print "ERROR- ",str(cur_error)
#     if cur_error < min_error:
# #         save_path = saver.save(sess,"saved_models/epoch"+str(i)+"/model.ckpt")
# #         print "Saved @ ", save_path
#         min_error = cur_error

'For Training only- set number of epochs. Calculates Mean Absolute Error at every epoch\n    Uncomment for training'

In [23]:
def get_durations_of_phonemes(sentence, speaker_index):
    """Predict, print and return a duration for every phoneme of a sentence.

    Args:
        sentence: sequence of phoneme symbols (members of `phonemes`). The
            caller's sequence is not modified.
        speaker_index: position in the global `speakers` list of the voice to
            predict for.

    Returns:
        NumPy array of predicted durations, one per phoneme in `sentence`
        (the trailing silence added internally is excluded). The same array
        is also printed, as before.
    """
    # Work on a copy: appending to the argument would grow the caller's list
    # by one silence on every call.
    padded_sentence = list(sentence) + [silence]

    # get_phonemes_encoded only reads element [0] of each entry, so the
    # phoneme is duplicated to stand in for the missing duration.
    pseudo_durations = [[phoneme, phoneme] for phoneme in padded_sentence]
    # The graph has a fixed batch size, so the sentence is replicated to fill it.
    x_batch = get_phonemes_encoded([pseudo_durations]*batch_size, phonemes)

    speaker_id = speakers[speaker_index]
    speaker_batch = get_speaker_one_hot([speaker_id]*batch_size, speakers)
    length_batch = [len(padded_sentence)]*batch_size

    decoded = sess.run(vetrebi_sequence,
                       {x:x_batch, y_length:length_batch, speaker_selector:speaker_batch})
    # All rows are copies of the same sentence; keep the first and drop the
    # appended silence.
    predicted = get_durations(decoded[0])[:length_batch[0]-1]
    print(predicted)
    return predicted

[ 41.  14. 124.  28. 124.  14.  41.  86.  14.  41.  14.  41.  86.  86.
  14.  41.  86.  86.  28.  41.  86.  28. 124.  28.  41. 124.  86.  14.
 124.  86.  14.]
[ 41.  14. 124.  28. 124.  14.  41.  86.  14.  41.  14.  41.  86. 124.
  14.  41.  86.  86.  28.  41.  86.  28. 124.  28.  41. 124.  86.  14.
 124.  86.  14.]
[ 41.  14. 124.  28. 124.  14.  41.  86.  14.  41.  14.  41.  86.  86.
  14.  41.  86.  86.  28.  41.  86.  28. 124.  28.  86.  14.  86.  14.
 124.  86.  14.]
[ 41.  14. 124.  28. 124.  14.  41.  86.  14.  41.  14.  41.  14. 124.
  14.  41.  86.  86.  28.  41.  86.  28. 124.  28.  86.  14.  86.  14.
 124.  86.  14.]
[ 41.  14. 124.  28. 124.  14.  41.  86.  14.  41.  14.  41.  86.  86.
  14.  41.  86.  86.  28.  41.  86.  28. 124.  28.  41. 124.  86.  14.
 124.  86.  14.]
[ 41.  14. 124.  28. 124.  14.  41.  86.  14.  86.  14.  86.  14. 124.
  14.  41.  86.  86.  28.  41.  86.  28. 124.  28.  86.  14.  86.  14.
 124.  86.  14.]
[ 41.  14.  86.  14. 124.  14.  28.  86.  14. 

In [None]:
# Example sentence as phoneme symbols, with SIL marking the pauses.
demo_sentence = ['IH', 'T', 'SIL', 'T', 'ER', 'N', 'D', 'SIL', 'M', 'IY', 'SIL',
                 'AH', 'G', 'EH', 'N', 'S', 'T', 'SIL', 'DH', 'AH', 'SIL',
                 'HH', 'OW', 'L', 'SIL', 'S', 'IH', 'S', 'T', 'AH', 'M']

# Print the predicted durations of the same sentence for the first ten speakers.
for speaker_index in range(10):
    # Pass a fresh copy each time so every call starts from the same list.
    get_durations_of_phonemes(list(demo_sentence), speaker_index)