In [1]:
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
import tensorflow as tf
import sys

# Print arrays in full instead of summarising them with "...".
# NumPy rejects threshold=np.nan with a ValueError; sys.maxsize is the
# value its documentation recommends for untruncated output.
np.set_printoptions(threshold=sys.maxsize)

  from ._conv import register_converters as _register_converters


In [2]:
# ---- Batch / sequence dimensions -------------------------------------------
max_num_phonemes = 10   # get from data... set to 10 for example only
batch_size = 32
speak_feats = 16        # features in speaker embedding
phoneme_feats = 16      # features in phoneme embedding
num_time_buckets = 250  # TODO: needs to be set to the right quantity

# TODO: read speaker/file counts from the dataset instead of hard-coding them.
# with open("dataset/speakers_count.pkl") as f:
#     max_frames, speakers, count = pickle.load(f)
#     print len(speakers)  #number of speakers
#     print count    #no. of dataset files

# ---- Phoneme inventory ------------------------------------------------------
silence = "SIL"

phonemes = [silence,'AA','AE','AH','AO','AW','AY','B','CH','D','DH','EH',
            'ER','EY','F','G','HH','IH','IY','JH','K','L','M','N','NG','OW','OY',
            'P','R','S','SH','T','TH','UH','UW','V','W','Y','Z','ZH']

# Derived from the list so the placeholder width and the embedding table
# (which uses len(phonemes)) can never disagree.
phoneme_count = len(phonemes)

num_speakers = 108  # TODO: get from file

# ---- Network sizes ----------------------------------------------------------
mlp_layers = 4      # number of layers in MLP
mlp_units = 256     # number of units in each MLP layer

num_rnn_layers = 3      # number of layers in RNN network
num_hidden_gru = 512    # number of hidden units in each RNN layer

output_size = 32

conv_filter_widths = [3,6,15,30]

# ---- Dataset statistics -----------------------------------------------------
dataset_mean = 0     # TODO: READ FROM FILE
dataset_std_dev = 1  # TODO: READ FROM FILE

max_frames = 0  # get from duration prediction model

In [3]:
# Phonemes whose frequency is to be predicted: one vector of width
# phoneme_count for every position of every sequence in the batch.
x = tf.placeholder(
    dtype=tf.float32,
    shape=[batch_size, max_num_phonemes, phoneme_count],
)

# Ground-truth fundamental frequencies, one value per phoneme position.
y_inp = tf.placeholder(dtype=tf.float32, shape=[batch_size, max_num_phonemes])

# Length of the phoneme sequence for each element of the batch.
y_length = tf.placeholder(dtype=tf.int32, shape=[batch_size])

# One-hot speaker selector matrix, one row per batch element.
speaker_selector = tf.placeholder(
    dtype=tf.float32,
    shape=[batch_size, num_speakers],
)

In [4]:
# Speaker embedding: a trainable (num_speakers, speak_feats) table. Multiplying
# the selector matrix by the table yields one embedding row per batch element.
with tf.variable_scope("speaker_embedding"):
    speaker = tf.get_variable(
        name="speaker",
        shape=(num_speakers, speak_feats),
        dtype=tf.float32,
        initializer=tf.contrib.layers.xavier_initializer(),
        trainable=True,
    )
    speaker_matrix = tf.matmul(speaker_selector, speaker)
speaker_matrix

<tf.Tensor 'speaker_embedding/MatMul:0' shape=(32, 16) dtype=float32>

In [5]:
# Phoneme embedding: a trainable (len(phonemes), phoneme_feats) table applied
# to every position of `x`.
with tf.variable_scope("phoneme_embedding"):
    phoneme_emb = tf.get_variable(
        name="phoneme_emb",
        shape=(len(phonemes), phoneme_feats),
        dtype=tf.float32,
        initializer=tf.contrib.layers.xavier_initializer(),
        trainable=True,
    )
    # Collapse batch and position into one axis so a single 2-D matmul embeds
    # every position, then restore the (batch, position, feature) layout.
    unstacked_x = tf.reshape(x, [-1, len(phonemes)])
    phoneme_array = tf.matmul(unstacked_x, phoneme_emb)
    phoneme_matrix = tf.reshape(
        phoneme_array,
        [batch_size, max_num_phonemes, phoneme_feats],
    )
phoneme_matrix

<tf.Tensor 'phoneme_embedding/Reshape_1:0' shape=(32, 10, 16) dtype=float32>

In [6]:
# Linear projection of the speaker embedding to num_hidden_gru units; the
# result is used as the initial state of the bidirectional GRU layers.
with tf.variable_scope("rnn_speaker"):
    rnn_speaker_outputs = num_hidden_gru
    rnn_speaker = tf.contrib.layers.fully_connected(
        inputs=speaker_matrix,
        num_outputs=rnn_speaker_outputs,
        activation_fn=None,
    )
rnn_speaker

<tf.Tensor 'rnn_speaker/fully_connected/BiasAdd:0' shape=(32, 512) dtype=float32>

In [7]:
def gru_cell():
    """Return a new GRU cell with `num_hidden_gru` hidden units."""
    return tf.nn.rnn_cell.GRUCell(num_hidden_gru)

# Feed the learned phoneme embeddings into the recurrent stack. Using the raw
# placeholder `x` here leaves `phoneme_emb` disconnected from every downstream
# tensor, so the embedding table would never receive a gradient.
inputs = phoneme_matrix

# Stack of bidirectional GRU layers. Every layer (both directions) starts from
# the speaker-derived state `rnn_speaker`; each layer's forward and backward
# outputs are concatenated on the feature axis and fed to the next layer.
# NOTE(review): `y_length` is not passed as `sequence_length`, so padded
# positions are processed like real input — confirm whether that is intended.
for i in range(num_rnn_layers):
    with tf.variable_scope("RNN_BI"+str(i+1)):
        output_rnn1, output_states = tf.nn.bidirectional_dynamic_rnn(
            gru_cell(),
            gru_cell(),
            inputs,
            initial_state_fw=rnn_speaker,
            initial_state_bw=rnn_speaker)
        inputs = tf.concat(output_rnn1, axis=2)

rnn_output1 = inputs
rnn_output1

<tf.Tensor 'RNN_BI3/concat:0' shape=(32, 10, 1024) dtype=float32>

In [8]:
# Sigmoid-activated fully connected layer on the stacked Bi-GRU output
# (rnn_output1), producing one value in (0, 1) per position: omega.
with tf.variable_scope("FC_1"):
    omega = tf.contrib.layers.fully_connected(
        inputs=rnn_output1,
        num_outputs=1,
        activation_fn=tf.nn.sigmoid,
    )

omega

<tf.Tensor 'FC_1/fully_connected/Sigmoid:0' shape=(32, 10, 1) dtype=float32>

In [9]:
# Voice prediction: one sigmoid output per position, computed from the stacked
# Bi-GRU output (rnn_output1).
with tf.variable_scope("Voice_Prob"):
    prob_voiced = tf.contrib.layers.fully_connected(
        inputs=rnn_output1,
        num_outputs=1,
        activation_fn=tf.nn.sigmoid,
    )

prob_voiced

<tf.Tensor 'Voice_Prob/fully_connected/Sigmoid:0' shape=(32, 10, 1) dtype=float32>

In [10]:
# Additional bidirectional GRU layer on top of rnn_output1. Both directions
# start from an all-zero state; their outputs are concatenated on the feature
# axis.
with tf.variable_scope("BI_GRU"):
    g_cell1 = gru_cell()
    g_cell2 = gru_cell()
    output_rnn2, output_state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=g_cell1,
        cell_bw=g_cell2,
        inputs=rnn_output1,
        initial_state_fw=g_cell1.zero_state(batch_size, tf.float32),
        initial_state_bw=g_cell2.zero_state(batch_size, tf.float32),
    )
    output_rnn2 = tf.concat(output_rnn2, axis=2)
output_rnn2

<tf.Tensor 'BI_GRU/concat:0' shape=(32, 10, 1024) dtype=float32>

In [11]:
# Linear (no activation) fully connected layer on the BI_GRU output
# (output_rnn2), producing one value per position: f_GRU.
with tf.variable_scope("FC_2"):
    num_output_rnn2 = output_size  # not consumed by the layer below
    f_GRU = tf.contrib.layers.fully_connected(
        inputs=output_rnn2,
        num_outputs=1,
        activation_fn=None,
    )

f_GRU

<tf.Tensor 'FC_2/fully_connected/BiasAdd:0' shape=(32, 10, 1) dtype=float32>

In [12]:
# Per-speaker multiplier for the mean: 1 + softsign(FC(speaker embedding)).
with tf.variable_scope("Speaker_Mean"):
    mean_offset = tf.contrib.layers.fully_connected(
        inputs=speaker_matrix,
        num_outputs=1,
        activation_fn=tf.nn.softsign,
    )
    speaker_mean = mean_offset + 1
speaker_mean

<tf.Tensor 'Speaker_Mean/add:0' shape=(32, 1) dtype=float32>

In [13]:
# Per-speaker multiplier for the std dev: 1 + softsign(FC(speaker embedding)).
with tf.variable_scope("Speaker_Std_Dev"):
    std_dev_offset = tf.contrib.layers.fully_connected(
        inputs=speaker_matrix,
        num_outputs=1,
        activation_fn=tf.nn.softsign,
    )
    speaker_std_dev = std_dev_offset + 1
speaker_std_dev

<tf.Tensor 'Speaker_Std_Dev/add:0' shape=(32, 1) dtype=float32>

In [14]:
# Filter bank: one single-output-channel 1-D convolution per width in
# conv_filter_widths, each applied to rnn_output1 with stride 1 and SAME
# padding. The per-width results are summed into f_conv.
with tf.variable_scope("Filter_Bank"):
    filter_bank = []
    for i in conv_filter_widths:
        filterx = tf.get_variable(
            name="filter{}".format(i),
            shape=[i, num_hidden_gru * 2, 1],
            initializer=tf.contrib.layers.xavier_initializer(),
        )
        conv = tf.nn.conv1d(
            value=rnn_output1,
            filters=filterx,
            stride=1,
            padding="SAME",
        )
        filter_bank.append(conv)


f_conv = tf.add_n(filter_bank)

f_conv

<tf.Tensor 'AddN:0' shape=(32, 10, 1) dtype=float32>

In [15]:
# Blend the two frequency estimates position by position:
#   freq = omega * f_GRU + (1 - omega) * f_conv
gamma = 1 - omega

gru_term = tf.multiply(omega, f_GRU)
conv_term = tf.multiply(gamma, f_conv)
freq = tf.add(gru_term, conv_term)
freq

<tf.Tensor 'Add:0' shape=(32, 10, 1) dtype=float32>

In [16]:
# Dataset standard deviation and mean, each held in a one-element float32
# variable initialised from the configuration values.
std_dev = tf.Variable(initial_value=[dataset_std_dev], dtype=tf.float32)
mean = tf.Variable(initial_value=[dataset_mean], dtype=tf.float32)

In [24]:
# Final F0 profile:
#   F0 = mean * speaker_mean + std_dev * speaker_std_dev * freq
with tf.variable_scope("Frequency_Prediction"):
    # speaker_std_dev / speaker_mean hold one value per batch element. Insert a
    # position axis and repeat along it so they line up with `freq`.
    per_position = [1, max_num_phonemes, 1]
    sp_std_dev = tf.tile(tf.expand_dims(speaker_std_dev, 1), per_position)
    sp_mean = tf.tile(tf.expand_dims(speaker_mean, 1), per_position)

    # Scaled prediction term: std_dev * speaker_std_dev * freq.
    temp1 = tf.multiply(sp_std_dev, freq)
    temp1 = tf.scalar_mul(std_dev[0], temp1)

    # Offset term: mean * speaker_mean.
    temp2 = tf.scalar_mul(mean[0], sp_mean)

    # Sum offset and scaled prediction. Adding `sp_mean` here instead of
    # `temp1` would drop the network's prediction from the result entirely.
    F0_profile = tf.add(temp2, temp1)

F0_profile

Tensor("Frequency_Prediction_7/Add:0", shape=(32, 10, 1), dtype=float32)
