Speech keyword detector tutorial

Adds a basic training script for a simple audio model to our examples. See third_party/docs_src/tutorials/audio_recognition.md for full documentation. PiperOrigin-RevId: 165025732
tensorflow · Aug 11, 2017 · 0c6fd17 · 0c6fd17
1 parent e9a8d75
commit 0c6fd17
Show file tree

Hide file tree

Showing 28 changed files with 4,321 additions and 13 deletions.
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
@@ -370,6 +370,7 @@ filegroup(
         "//tensorflow/examples/label_image:all_files",
         "//tensorflow/examples/learn:all_files",
         "//tensorflow/examples/saved_model:all_files",
+        "//tensorflow/examples/speech_commands:all_files",
         "//tensorflow/examples/tutorials/estimators:all_files",
         "//tensorflow/examples/tutorials/mnist:all_files",
         "//tensorflow/examples/tutorials/word2vec:all_files",

diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
@@ -28,6 +28,7 @@ tf_custom_op_py_library(
         "python/framework/tensor_util.py",
         "python/ops/__init__.py",
         "python/ops/arg_scope.py",
+        "python/ops/audio_ops.py",
         "python/ops/checkpoint_ops.py",
         "python/ops/ops.py",
         "python/ops/prettyprint_ops.py",
@@ -50,6 +51,7 @@ tf_custom_op_py_library(
         ":gen_variable_ops",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:audio_ops_gen",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",

diff --git a/tensorflow/contrib/framework/python/ops/audio_ops.py b/tensorflow/contrib/framework/python/ops/audio_ops.py
@@ -0,0 +1,36 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# pylint: disable=g-short-docstring-punctuation
+"""Audio processing and decoding ops.
+
+@@decode_wav
+@@encode_wav
+@@audio_spectrogram
+@@mfcc
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+# go/tf-wildcard-import
+# pylint: disable=wildcard-import
+from tensorflow.python.ops.gen_audio_ops import *
+# pylint: enable=wildcard-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+remove_undocumented(__name__, [])
diff --git a/tensorflow/core/lib/wav/wav_io.cc b/tensorflow/core/lib/wav/wav_io.cc
@@ -118,6 +118,17 @@ Status ReadValue(const string& data, T* value, int* offset) {
   return Status::OK();
 }
 
+Status ReadString(const string& data, int expected_length, string* value,
+                  int* offset) {
+  const int new_offset = *offset + expected_length;
+  if (new_offset > data.size()) {
+    return errors::InvalidArgument("Data too short when trying to read string");
+  }
+  *value = string(data.begin() + *offset, data.begin() + new_offset);
+  *offset = new_offset;
+  return Status::OK();
+}
+
 }  // namespace
 
 Status EncodeAudioAsS16LEWav(const float* audio, size_t sample_rate,
@@ -254,17 +265,33 @@ Status DecodeLin16WaveAsFloatVector(const string& wav_string,
     // Skip over this unused section.
     offset += 2;
   }
-  TF_RETURN_IF_ERROR(ExpectText(wav_string, kDataChunkId, &offset));
-  uint32 data_size;
-  TF_RETURN_IF_ERROR(ReadValue<uint32>(wav_string, &data_size, &offset));
-  *sample_count = data_size / bytes_per_sample;
-  const uint32 data_count = *sample_count * *channel_count;
-  float_values->resize(data_count);
-  for (int i = 0; i < data_count; ++i) {
-    int16 single_channel_value = 0;
-    TF_RETURN_IF_ERROR(
-        ReadValue<int16>(wav_string, &single_channel_value, &offset));
-    (*float_values)[i] = Int16SampleToFloat(single_channel_value);
+
+  bool was_data_found = false;
+  while (offset < wav_string.size()) {
+    string chunk_id;
+    TF_RETURN_IF_ERROR(ReadString(wav_string, 4, &chunk_id, &offset));
+    uint32 chunk_size;
+    TF_RETURN_IF_ERROR(ReadValue<uint32>(wav_string, &chunk_size, &offset));
+    if (chunk_id == kDataChunkId) {
+      if (was_data_found) {
+        return errors::InvalidArgument("More than one data chunk found in WAV");
+      }
+      was_data_found = true;
+      *sample_count = chunk_size / bytes_per_sample;
+      const uint32 data_count = *sample_count * *channel_count;
+      float_values->resize(data_count);
+      for (int i = 0; i < data_count; ++i) {
+        int16 single_channel_value = 0;
+        TF_RETURN_IF_ERROR(
+            ReadValue<int16>(wav_string, &single_channel_value, &offset));
+        (*float_values)[i] = Int16SampleToFloat(single_channel_value);
+      }
+    } else {
+      offset += chunk_size;
+    }
+  }
+  if (!was_data_found) {
+    return errors::InvalidArgument("No data chunk found in WAV");
   }
   return Status::OK();
 }

diff --git a/tensorflow/core/ops/audio_ops.cc b/tensorflow/core/ops/audio_ops.cc
@@ -62,7 +62,7 @@ Status DecodeWavShapeFn(InferenceContext* c) {
 Status EncodeWavShapeFn(InferenceContext* c) {
   ShapeHandle unused;
   TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &unused));
-  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused));
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
   c->set_output(0, c->Scalar());
   return Status::OK();
 }
@@ -104,7 +104,7 @@ Status MfccShapeFn(InferenceContext* c) {
   ShapeHandle spectrogram;
   TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &spectrogram));
   ShapeHandle unused;
-  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused));
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
 
   int32 dct_coefficient_count;
   TF_RETURN_IF_ERROR(