Return audio response as mp3

tronikos · Jan 6, 2023 · dc27db4
1 parent fca6adb
commit dc27db4
Show file tree

Hide file tree

Showing 5 changed files with 32 additions and 24 deletions.
diff --git a/README.md b/README.md
@@ -61,7 +61,7 @@ pytest
 
 # Run command line interactive tool
 python -m pip install click
-python demo.py
+python demo.py --display --audio_out
 
 # Build package
 python -m pip install build

diff --git a/browser_helpers.py b/browser_helpers.py
@@ -16,18 +16,16 @@
 import tempfile
 import webbrowser
 
-ASSISTANT_HTML_FILE = 'google-assistant-sdk-screen-out.html'
-
 
 class SystemBrowser(object):
     def __init__(self):
         self.tempdir = tempfile.mkdtemp()
-        self.filename = os.path.join(self.tempdir, ASSISTANT_HTML_FILE)
 
-    def display(self, html):
-        with open(self.filename, 'wb') as f:
-            f.write(html)
-        webbrowser.open(self.filename, new=0)
+    def display(self, contents, filename):
+        full_filename = os.path.join(self.tempdir, filename)
+        with open(full_filename, 'wb') as f:
+            f.write(contents)
+        webbrowser.open(full_filename, new=0)
 
 
 system_browser = SystemBrowser()
diff --git a/demo.py b/demo.py
@@ -47,14 +47,17 @@
               help='Language code of the Assistant')
 @click.option('--display', is_flag=True, default=False,
               help='Enable visual display of Assistant responses in HTML.')
+@click.option('--audio_out', is_flag=True, default=False,
+              help='Enable audio response.')
 @click.option('--verbose', '-v', is_flag=True, default=False,
               help='Verbose logging.')
 @click.option('--grpc-deadline', default=DEFAULT_GRPC_DEADLINE,
               metavar='<grpc deadline>', show_default=True,
               help='gRPC deadline in seconds')
 def main(api_endpoint, credentials,
-         device_model_id, device_id, lang, display, verbose,
+         device_model_id, device_id, lang, display, audio_out, verbose,
          grpc_deadline, *args, **kwargs):
+    system_browser = browser_helpers.system_browser
     # Setup logging.
     logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
 
@@ -71,17 +74,18 @@ def main(api_endpoint, credentials,
                       'new OAuth 2.0 credentials.')
         return
 
-    with TextAssistant(credentials, lang, device_model_id, device_id, display,
+    with TextAssistant(credentials, lang, device_model_id, device_id, display, audio_out,
                        grpc_deadline, api_endpoint) as assistant:
         while True:
             query = click.prompt('')
             click.echo('<you> %s' % query)
-            response_text, response_html = assistant.assist(text_query=query)
-            if display and response_html:
-                system_browser = browser_helpers.system_browser
-                system_browser.display(response_html)
+            response_text, response_html, audio_response = assistant.assist(text_query=query)
             if response_text:
                 click.echo('<@assistant> %s' % response_text)
+            if response_html:
+                system_browser.display(response_html, 'google-assistant-sdk-screen-out.html')
+            if audio_response:
+                system_browser.display(audio_response, 'google-assistant-sdk-audio-out.mp3')
 
 
 if __name__ == '__main__':

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "gassist-text"
-version = "0.0.7"
+version = "0.0.8"
 authors = [
     { name="tronikos", email="tronikos@gmail.com" },
 ]

diff --git a/src/gassist_text/textinput.py b/src/gassist_text/textinput.py
@@ -21,6 +21,7 @@
 #   - Added default values
 #   - Moved creation of the authorized gRPC channel in the constructor
 # - Parse HTML response
+# - Return audio response as mp3
 # - Extracted command line tool to demo.py
 
 import google.auth.transport.grpc
@@ -55,19 +56,22 @@ class TextAssistant(object):
       device_model_id: identifier of the device model.
       device_id: identifier of the registered device instance.
       display: enable visual display of assistant response.
+      audio_out: enable audio response.
       deadline_sec: gRPC deadline in seconds for Google Assistant API call.
       api_endpoint: Address of Google Assistant API service.
     """
 
     def __init__(self, credentials, language_code='en-US', device_model_id='default', device_id='default',
-                 display=False, deadline_sec=DEFAULT_GRPC_DEADLINE, api_endpoint=ASSISTANT_API_ENDPOINT):
+                 display=False, audio_out=False,
+                 deadline_sec=DEFAULT_GRPC_DEADLINE, api_endpoint=ASSISTANT_API_ENDPOINT):
         self.language_code = language_code
         self.device_model_id = device_model_id
         self.device_id = device_id
         self.conversation_state = None
         # Force reset of first conversation.
         self.is_new_conversation = True
         self.display = display
+        self.audio_out = audio_out
         # Create an authorized gRPC channel.
         channel = google.auth.transport.grpc.secure_authorized_channel(
             credentials, google.auth.transport.requests.Request(), api_endpoint)
@@ -84,14 +88,13 @@ def __exit__(self, etype, e, traceback):
             return False
 
     def assist(self, text_query):
-        """Send a text request to the Assistant and playback the response.
-        """
+        """Send a text request to the Assistant and return the response as a tuple of: [text, html, audio]."""
         def iter_assist_requests():
             config = embedded_assistant_pb2.AssistConfig(
                 audio_out_config=embedded_assistant_pb2.AudioOutConfig(
-                    encoding='LINEAR16',
-                    sample_rate_hertz=16000,
-                    volume_percentage=0,
+                    encoding='MP3',
+                    sample_rate_hertz=24000,
+                    volume_percentage=100,
                 ),
                 dialog_state_in=embedded_assistant_pb2.DialogStateIn(
                     language_code=self.language_code,
@@ -113,18 +116,21 @@ def iter_assist_requests():
 
         text_response = None
         html_response = None
+        audio_response = b''
         for resp in self.assistant.Assist(iter_assist_requests(),
                                           self.deadline):
             assistant_helpers.log_assist_response_without_audio(resp)
             if resp.screen_out.data:
                 if self.display:
                     html_response = resp.screen_out.data
                 soup = BeautifulSoup(resp.screen_out.data, "html.parser")
-                divs = soup.find_all("div", class_="show_text_content")
-                text_response = '\n'.join(map(lambda div : div.text, divs))
+                divs = soup.find_all("div", id="assistant-card-content")
+                text_response = '\n'.join(map(lambda div : div.text, divs)).strip()
             if resp.dialog_state_out.conversation_state:
                 conversation_state = resp.dialog_state_out.conversation_state
                 self.conversation_state = conversation_state
             if resp.dialog_state_out.supplemental_display_text:
                 text_response = resp.dialog_state_out.supplemental_display_text
-        return text_response, html_response
+            if self.audio_out and resp.audio_out.audio_data:
+                audio_response += resp.audio_out.audio_data
+        return text_response, html_response, audio_response