Skip to content

Commit a51b7dc

Browse files
committed
added speech: synthesis via Amazon Polly, output via vlc library (for now)
1 parent 561c347 commit a51b7dc

File tree

5 files changed

+7245
-1
lines changed

5 files changed

+7245
-1
lines changed

ansible/local.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010

1111
- { role: tensorflow_models, tags: ['tensorflow_models'] }
1212

13+
- { role: aws_polly, tags: ['aws_polly'] }
14+
1315
# The tag must match the role name exactly (no trailing comma inside the quotes),
# otherwise `--tags tf_object_detection` will not select this role.
- { role: tf_object_detection, tags: ['tf_object_detection'] }
1416

1517
- { role: provisioning_metadata_end, tags: ['always'] }
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
---
2+
3+
# Installs/upgrades the packages listed under `with_items` through pip3,
# passing `--user` so they go into the user site-packages.
- name: Install boto3 (AWS SDK for Python)
  with_items:
    - boto3
  pip:
    executable: pip3
    extra_args: --user
    name: "{{ item }}"
    state: latest
11+
12+
# TODO: handle credentials

stuff/helper.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
from utils import label_map_util
88
from utils import visualization_utils as vis_util
99

10+
from stuff.speech_synthesis import SpeechSynthesizer
11+
1012
# Loading label map (mapping indices to category names, e.g. 5 -> airplane)
1113
NUM_CLASSES = 90
1214
PATH_TO_LABELS = os.path.join('../tensorflow_models/research/object_detection/data', 'mscoco_label_map.pbtxt')
@@ -85,12 +87,14 @@ def cleanup(self):
8587

8688
class Processor:
8789
def __init__(self):
88-
pass
90+
self._speech = SpeechSynthesizer()
8991

9092
def process(self, boxes, scores, classes, num, image_shape):
9193

9294
# TODO: Detections may overlap, i.e., a cow and a dog may be recognized with identical or very similar bounding boxes => filter those?
9395

96+
obj = []
97+
9498
print('*****')
9599
for i in range(boxes.shape[0]):
96100
if scores[i] > 0.5:
@@ -100,8 +104,19 @@ def process(self, boxes, scores, classes, num, image_shape):
100104
class_name = 'N/A'
101105
ymin, xmin, ymax, xmax = tuple(boxes[i].tolist())
102106
(left, right, top, bottom) = (int(xmin * image_shape[1]), int(xmax * image_shape[1]), int(ymin * image_shape[0]), int(ymax * image_shape[0]))
107+
obj.append([class_name, int(100*scores[i]), left, top, right, bottom])
103108
display_str = '{}: {}% at image coordinates (({}, {}) to ({}, {}))'.format(class_name, int(100*scores[i]), left, top, right, bottom)
104109
print(display_str)
105110

111+
def getIndefiniteArticle(word):
    """Return the indefinite article ('a' or 'an') to put in front of *word*.

    Simplified heuristic: only the first letter is inspected. Strictly speaking
    the sound (vowel vs. consonant) decides, not the letter, so there are many
    exceptions in general English - but the COCO label names should not hit any.
    See also https://www.englishclub.com/pronunciation/a-an.htm

    An empty *word* yields 'a'.
    """
    # Membership in a tuple instead of the string 'aeiou': the substring test
    # `'' in 'aeiou'` is True, which made an empty word come back as 'an'.
    return 'an' if word[:1].lower() in ('a', 'e', 'i', 'o', 'u') else 'a'
117+
118+
if(len(obj) > 0):
119+
self._speech.request("I am " + str(obj[0][1]) + "% certain I see " + getIndefiniteArticle(obj[0][0]) + " " + obj[0][0])
120+
106121
def cleanup(self):
    """No-op: there is currently nothing to clean up here."""
    return None

stuff/speech_synthesis.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
"""Speech synthesis (resp. TTS) via Amazon Polly
2+
3+
Text is sent to Amazon Polly, the returned MP3 is written to the temp directory and played back through VLC on a background thread.
4+
5+
"""
6+
from boto3 import Session
7+
from botocore.exceptions import BotoCoreError, ClientError
8+
from contextlib import closing
9+
import os
10+
import sys
11+
from contextlib import suppress
12+
from threading import Thread
13+
from queue import LifoQueue, Empty
14+
from time import sleep
15+
from tempfile import gettempdir
16+
17+
from stuff import vlc
18+
19+
class SpeechSynthesizer:
    """Speak text in the background: Amazon Polly synthesizes, VLC plays.

    A daemon worker thread is started on construction. Callers hand text to
    `request()`; the worker picks it up, asks Polly for an MP3, writes it to
    `<tempdir>/speech.mp3` and plays that file. Only one pending utterance is
    kept, so a newer request replaces an older one that has not started yet.
    """

    def __init__(self, profile_name="mylespolly", region_name="eu-west-1", voice_id="Salli"):
        """Create the Polly client and start the worker thread.

        Args:
            profile_name: AWS credentials profile used for the boto3 session.
            region_name: AWS region of the Polly endpoint.
            voice_id: Polly voice used for every utterance.
        """
        # Queue holding the last speech utterance (capacity 1)
        self._speak_queue = LifoQueue(1)
        self._voice_id = voice_id

        self._session = Session(profile_name=profile_name)
        self._polly = self._session.client("polly", region_name=region_name)

        # Daemon thread: it must not keep the process alive on shutdown.
        self._thread = Thread(target=self.run, args=())
        self._thread.daemon = True
        self._thread.start()

    def request(self, text):
        """Clear queue (ignore it being empty) and add text, both non-blocking"""
        with suppress(Empty):
            self._speak_queue.get_nowait()
        self._speak_queue.put_nowait(text)

    def run(self):
        """Continuously process the queue and trigger speech outputs.

        Runs forever in the worker thread. A failed utterance is reported and
        skipped. Calling sys.exit() here would only end this thread (SystemExit
        does not terminate the process from a non-main thread), silently
        disabling all further speech output.
        """
        while True:
            text = self._speak_queue.get(True, None)

            print(text)

            output = self._synthesize(text)
            if output is not None:
                self._play(output)

    def _synthesize(self, text):
        """Request speech for *text* from Polly and store it as an MP3 file.

        Returns:
            Path of the written file, or None if the request failed, the
            response carried no audio, or the file could not be written.
        """
        try:
            response = self._polly.synthesize_speech(Text=text, OutputFormat="mp3", VoiceId=self._voice_id)
        except (BotoCoreError, ClientError) as error:
            # The service returned an error; skip this utterance
            print(error)
            return None

        if "AudioStream" not in response:
            # The response didn't contain audio data
            print("Could not stream audio")
            return None

        # Note: Closing the stream is important as the service throttles on the
        # number of parallel connections. contextlib.closing ensures the close
        # method of the stream object is called at the end of the with scope.
        with closing(response["AudioStream"]) as stream:
            output = os.path.join(gettempdir(), "speech.mp3")
            print(output)
            try:
                # Open a file for writing the output as a binary stream
                with open(output, "wb") as file:
                    file.write(stream.read())
            except IOError as error:
                # Could not write to file; skip this utterance
                print(error)
                return None
        return output

    def _play(self, path):
        """Play the audio file at *path* with VLC and block until it finished.

        see https://wiki.videolan.org/Python_bindings
        see https://www.olivieraubert.net/vlc/python-ctypes/doc/index.html
        """
        p = vlc.MediaPlayer(path)
        try:
            sleep(0.1)
            p.play()
            # Short pause before the first is_playing() poll.
            sleep(0.1)
            while p.is_playing():
                # Sleep between polls instead of spinning a CPU core.
                sleep(0.05)
        finally:
            # One player is created per utterance; release it to avoid leaking.
            p.release()
        # The MP3 file is left in place; the next utterance overwrites it.
82+
83+
84+
## alternative:
85+
#from pygame import mixer
86+
#mixer.init(frequency=22050, size=-16, channels=2, buffer=4096)
87+
#mixer.music.load(output)
88+
#mixer.music.play()
89+
#while mixer.music.get_busy():
90+
# pass
91+
#mixer.quit()

0 commit comments

Comments
 (0)