<b>Talk to Gemini with the Speech-to-Text API</b>

Having a spoken conversation with Gemini, Google's latest and most advanced model, is simple in a Colab notebook.

In [4]:
#@title Install Google Cloud's speech library

!pip install -q google-cloud-speech
from google.cloud import speech


<b>[Required] Set up a Google Cloud account</b>

Okay so we get it, this part is hard, but in order to use the Cloud speech-to-text API you need to set up a Cloud account, project, and billing. Start [here](https://console.cloud.google.com/getting-started).

Once you've done that, come back here.

In [5]:
#@title Authenticate with Google Cloud and your project ID

from google.colab import auth

gcp_project_id = '' # @param {type: "string"}

auth.authenticate_user(project_id=gcp_project_id)

CalledProcessError: Command '['gcloud', 'config', 'set', 'project', '']' returned non-zero exit status 1.

In [None]:
#@title [Run once per project] Enable the Google Cloud speech-to-text API

!gcloud services enable speech.googleapis.com

In [None]:
#@title Configure Gemini API key

#Access your Gemini API key

import google.generativeai as genai
from google.colab import userdata

gemini_api_secret_name = 'GOOGLE_API_KEY'  # @param {type: "string"}

try:
  GOOGLE_API_KEY=userdata.get(gemini_api_secret_name)
  genai.configure(api_key=GOOGLE_API_KEY)
except userdata.SecretNotFoundError as e:
   print(f'Secret not found\n\nThis expects you to create a secret named {gemini_api_secret_name} in Colab\n\nVisit https://makersuite.google.com/app/apikey to create an API key\n\nStore that in the secrets section on the left side of the notebook (key icon)\n\nName the secret {gemini_api_secret_name}')
   raise e
except userdata.NotebookAccessError as e:
  print(f'You need to grant this notebook access to the {gemini_api_secret_name} secret in order for the notebook to access Gemini on your behalf.')
  raise e
except Exception as e:
  # unknown error
  print(f"There was an unknown error. Ensure you have a secret {gemini_api_secret_name} stored in Colab and it's a valid key from https://makersuite.google.com/app/apikey")
  raise e

model = genai.GenerativeModel('gemini-pro')

In [None]:
#@title Setup

# noting here that a lot of this code is forked from https://codelabs.developers.google.com/codelabs/cloud-speech-text-python3#0

# set up cloud speech detection functions

from google.cloud import speech

def speech_to_text(
    config: speech.RecognitionConfig,
    audio: speech.RecognitionAudio,
) -> speech.RecognizeResponse:
    client = speech.SpeechClient()

    # Synchronous speech recognition request
    response = client.recognize(config=config, audio=audio)

    return response

def print_response(response: speech.RecognizeResponse):
    for result in response.results:
        print_result(result)

def print_result(result: speech.SpeechRecognitionResult):
    best_alternative = result.alternatives[0]
    print("-" * 80)
    print(f"language_code: {result.language_code}")
    print(f"transcript:    {best_alternative.transcript}")
    print(f"confidence:    {best_alternative.confidence:.0%}")

# config for speech recognition; modify language here & other params
config = speech.RecognitionConfig(
    language_code="en",
    enable_automatic_punctuation=True,
)

# required set up to enable recording audio in your browser

!pip install ipywebrtc
import io
from ipywebrtc import AudioRecorder, CameraStream

# required in Colab to enable 3rd party widgets
from google.colab import output
output.enable_custom_widget_manager()

# set up helper functions for displaying text nicely

from IPython.display import Markdown
import textwrap

def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))


In [None]:
#@title Record your speech

# create a microphone stream
camera = CameraStream(constraints={'audio': True, 'video':False})

# create an audio recorder that uses the microphone stream
recorder = AudioRecorder(stream=camera)

# display the recorder widget
recorder

In [None]:
#@title Transcribe and send to Gemini

recorded_audio = recorder.audio.value

# if you ever want to save the output, uncomment the next two lines
#with open("output.wav", "wb") as f:
#    f.write(recorder.audio.value)

audio = speech.RecognitionAudio(
    content=recorded_audio,
)

processing_results = speech_to_text(config, audio)
audio_text = processing_results.results[0].alternatives[0].transcript

response = model.generate_content(audio_text)

to_markdown(f'**You**: {audio_text}\n\n**Gemini**:\n{response.text}')