added speech: synthesis via Amazon Polly, output via vlc library (for now)

sjentzsch · sjentzsch · commit a51b7dc42579 · 2017-10-30T15:35:32.000+01:00
diff --git a/ansible/local.yml b/ansible/local.yml
@@ -10,6 +10,8 @@
 
   - { role: tensorflow_models, tags: ['tensorflow_models'] }
 
+  - { role: aws_polly, tags: ['aws_polly'] }
+
   - { role: tf_object_detection, tags: ['tf_object_detection,'] }
 
   - { role: provisioning_metadata_end, tags: ['always'] }
diff --git a/ansible/roles/aws_polly/tasks/main.yml b/ansible/roles/aws_polly/tasks/main.yml
@@ -0,0 +1,12 @@
+---
+
+- name: Install boto3 (AWS SDK for Python)
+  pip:
+    name: "{{ item }}"
+    state: latest  # upgrade to the newest release on every provisioning run
+    executable: pip3  # install for Python 3
+    extra_args: --user  # passed through to pip: install into the user's site-packages
+  with_items:
+    - boto3
+
+# TODO: handle credentials (stuff/speech_synthesis.py expects an AWS profile named "mylespolly")
diff --git a/stuff/helper.py b/stuff/helper.py
@@ -7,6 +7,8 @@
 from utils import label_map_util
 from utils import visualization_utils as vis_util
 
+from stuff.speech_synthesis import SpeechSynthesizer
+
 # Loading label map (mapping indices to category names, e.g. 5 -> airplane)
 NUM_CLASSES = 90
 PATH_TO_LABELS = os.path.join('../tensorflow_models/research/object_detection/data', 'mscoco_label_map.pbtxt')
@@ -85,12 +87,14 @@ def cleanup(self):
 
 class Processor:
     def __init__(self):
-        pass
+        self._speech = SpeechSynthesizer()
 
     def process(self, boxes, scores, classes, num, image_shape):
 
         # TODO: There is the chance of overlapping detections, i.e., a caw and a dog are recognized either with exactly or very similar bounding boxes => filter those?
 
+        obj = []
+
         print('*****')
         for i in range(boxes.shape[0]):
           if scores[i] > 0.5:
@@ -100,8 +104,19 @@ def process(self, boxes, scores, classes, num, image_shape):
               class_name = 'N/A'
             ymin, xmin, ymax, xmax = tuple(boxes[i].tolist())
             (left, right, top, bottom) = (int(xmin * image_shape[1]), int(xmax * image_shape[1]), int(ymin * image_shape[0]), int(ymax * image_shape[0]))
+            obj.append([class_name, int(100*scores[i]), left, top, right, bottom])
             display_str = '{}: {}% at image coordinates (({}, {}) to ({}, {}))'.format(class_name, int(100*scores[i]), left, top, right, bottom)
             print(display_str)
 
+        def getIndefiniteArticle(word):
+            """Return 'an' if word starts with a vowel letter, otherwise 'a' (also for an empty word).
+            Simplified: strictly the sound (vowel vs. consonant) decides, not the letter, and there are many
+            exceptions; for the COCO dataset there should not be any. See also https://www.englishclub.com/pronunciation/a-an.htm
+            """
+            return 'an' if word[:1].lower() in ('a', 'e', 'i', 'o', 'u') else 'a'
+
+        if(len(obj) > 0):
+            self._speech.request("I am " + str(obj[0][1]) + "% certain I see " + getIndefiniteArticle(obj[0][0]) + " " + obj[0][0])
+
     def cleanup(self):
         pass
diff --git a/stuff/speech_synthesis.py b/stuff/speech_synthesis.py
@@ -0,0 +1,91 @@
+"""Speech synthesis (resp. TTS) via Amazon Polly
+
+Text is synthesized to an MP3 file in the temp directory and played back via VLC.
+
+"""
+from boto3 import Session
+from botocore.exceptions import BotoCoreError, ClientError
+from contextlib import closing
+import os
+import sys
+from contextlib import suppress
+from threading import Thread
+from queue import LifoQueue, Empty
+from time import sleep
+from tempfile import gettempdir
+
+from stuff import vlc
+
+class SpeechSynthesizer:
+    """Speak the most recently requested text via Amazon Polly and VLC on a background thread."""
+    def __init__(self):
+        # Queue holding the last speech utterance
+        self._speak_queue = LifoQueue(1)
+
+        self._session = Session(profile_name="mylespolly")
+        self._polly = self._session.client("polly", region_name="eu-west-1")
+
+        self._thread = Thread(target=self.run, args=(), daemon=True)
+        self._thread.start()
+
+    def request(self, text):
+        """Clear queue (ignore it being empty) and add text, both non-blocking"""
+        with suppress(Empty):
+            self._speak_queue.get_nowait()
+        self._speak_queue.put_nowait(text)
+
+    def run(self):
+        """Continuously process the queue and trigger speech outputs.
+
+        Failures are printed and the utterance is skipped; sys.exit() here would only end this thread silently.
+        """
+        while True:
+            text = self._speak_queue.get(True, None)
+            print(text)
+            output = self._synthesize(text)
+            if output is not None:
+                self._play(output)
+
+    def _synthesize(self, text):
+        """Write Polly's MP3 rendering of text to a temp file (kept afterwards); return its path, or None on failure."""
+        try:
+            response = self._polly.synthesize_speech(Text=text, OutputFormat="mp3", VoiceId="Salli")
+        except (BotoCoreError, ClientError) as error:
+            print(error)
+            return None
+        if "AudioStream" not in response:
+            print("Could not stream audio")
+            return None
+        output = os.path.join(gettempdir(), "speech.mp3")
+        print(output)
+        # Closing the stream is important as the service throttles on the number of parallel connections
+        with closing(response["AudioStream"]) as stream:
+            try:
+                with open(output, "wb") as file:
+                    file.write(stream.read())
+            except IOError as error:
+                print(error)
+                return None
+        return output
+
+    def _play(self, output):
+        """Play the audio file using VLC and block until playback has finished."""
+        # see https://wiki.videolan.org/Python_bindings
+        # see https://www.olivieraubert.net/vlc/python-ctypes/doc/index.html
+        p = vlc.MediaPlayer(output)
+        sleep(0.1)
+        p.play()
+        sleep(0.1)
+        while p.is_playing():
+            sleep(0.05)  # poll instead of spinning: a bare `pass` loop pegs a CPU core
+        p.release()  # free the native player; a new one is created per utterance
+
+
+## alternative:
+#from pygame import mixer
+#mixer.init(frequency=22050, size=-16, channels=2, buffer=4096)
+#mixer.music.load(output)
+#mixer.music.play()
+#while mixer.music.get_busy():
+#    pass
+#mixer.quit()
diff --git a/stuff/vlc.py b/stuff/vlc.py