In [None]:
import numpy as np

import tensorflow as tf
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio

import scipy.io.wavfile as wav

In [None]:
# Helper function to load frozen graphs
def load_graph(frozen_graph_filename):
    """Load a frozen TensorFlow graph from disk.

    The file is read in binary mode, parsed into a ``tf.GraphDef`` and
    imported into a freshly created ``tf.Graph``. The import uses the name
    scope ``"prefix"``, so every tensor of the returned graph is addressed
    as ``"prefix/<original_name>"``.

    Args:
        frozen_graph_filename: Path of the serialized (frozen) graph file.

    Returns:
        The new ``tf.Graph`` containing the imported graph definition.
    """
    # Read the whole serialized protobuf first, then parse it.
    with tf.gfile.GFile(frozen_graph_filename, "rb") as pb_file:
        serialized_graph = pb_file.read()
    parsed_graph_def = tf.GraphDef()
    parsed_graph_def.ParseFromString(serialized_graph)

    # Import into a dedicated graph rather than the global default one.
    restored_graph = tf.Graph()
    with restored_graph.as_default():
        tf.import_graph_def(parsed_graph_def, name="prefix")
    return restored_graph

In [None]:
# Output class names; the position in the list is the class index
# predicted by the model.
labels = [
    "_silence_",  # 0 --> "Silence"
    "_unknown_",  # 1 --> "Unknown"
    "yes",        # 2
    "no",         # 3
    "up",         # 4
    "down",       # 5
    "left",       # 6
    "right",      # 7
    "on",         # 8
    "off",        # 9
    "stop",       # 10
    "go",         # 11
]

# Target Model A

In [None]:
# Frozen graph of Target Model A and the names of the tensors used below.
# The "prefix/" scope comes from the name passed to tf.import_graph_def
# inside load_graph.
model_path = "target_model_A.pb"
input_layer_name = "prefix/input_audio:0"       # input tensor name
logits_layer_name = "prefix/add_3:0"            # logits tensor name
softmax_layer_name = "prefix/labels_softmax:0"  # softmax tensor name

# Build the graph and open a session bound to it.
print("Loading model graph...")
graph = load_graph(model_path)
sess = tf.Session(graph=graph)
print("Graph loaded!")

# Resolve the tensor handles from their names.
input_tensor = graph.get_tensor_by_name(input_layer_name)      # input tensor
logits_tensor = graph.get_tensor_by_name(logits_layer_name)    # logits tensor
softmax_tensor = graph.get_tensor_by_name(softmax_layer_name)  # softmax tensor
print("Tensors restored! Initialization successfully completed!")

#### Load the audio signal:

In [None]:
audio_path = "audio_sample.wav"
# wav.read returns the sampling rate and the array of samples.
fs, audio = wav.read(audio_path)

# Scale the audio signal to the range [-1, 1] by dividing by 2**15.
# NOTE(review): this assumes 16-bit PCM samples -- confirm for other WAV formats.
# The float literal forces true division: with "1/(1<<15)" a Python 2 kernel
# performs integer division, giving a scale factor of 0 and a silent signal.
scale_factor = 1.0 / (1 << 15)
audio_scaled = audio * scale_factor

#### Inference:

In [None]:
# The scaled clip is fed as a single row of 16000 samples.
model_input = audio_scaled.reshape(1, 16000)

# Run the softmax output and flatten it to a 1-D vector of class scores.
softmax = sess.run(softmax_tensor, feed_dict={input_tensor: model_input}).flatten()
prediction = np.argmax(softmax)
predicted_label = labels[prediction]

print("Predicted class: %d (%s)" % (prediction, predicted_label))
print("Softmax:")
print(softmax)

# Target Model B

In [None]:
# Frozen graph of Target Model B and the names of the tensors used below.
# The "prefix/" scope comes from the name passed to tf.import_graph_def
# inside load_graph.
model_path = "target_model_B.pb"
input_layer_name = "prefix/wav_data:0"          # input tensor name
logits_layer_name = "prefix/add_2:0"            # logits tensor name
softmax_layer_name = "prefix/labels_softmax:0"  # softmax tensor name

# Build the graph and open a session bound to it.
print("Loading model graph...")
graph = load_graph(model_path)
sess = tf.Session(graph=graph)
print("Graph loaded!")

# Resolve the tensor handles from their names.
input_tensor = graph.get_tensor_by_name(input_layer_name)      # input tensor
logits_tensor = graph.get_tensor_by_name(logits_layer_name)    # logits tensor
softmax_tensor = graph.get_tensor_by_name(softmax_layer_name)  # softmax tensor
print("Tensors restored! Initialization successfully completed!")

#### Load the audio signal

In [None]:
audio_path = "audio_sample.wav"
# This model takes the binary content of the WAV file as input, so the file
# is read as raw bytes. The "with" block closes the file handle as soon as
# the read completes instead of leaving it open until garbage collection.
with open(audio_path, 'rb') as wav_file:
    audio = wav_file.read()

#### Inference

In [None]:
# The raw file bytes are fed directly to the model's input tensor.
model_feed = {input_tensor: audio}

# Run the softmax output and flatten it to a 1-D vector of class scores.
softmax = sess.run(softmax_tensor, feed_dict=model_feed).flatten()
prediction = np.argmax(softmax)
predicted_label = labels[prediction]

print("Predicted class: %d (%s)" % (prediction, predicted_label))
print("Softmax:")
print(softmax)