<a href="https://colab.research.google.com/github/tymothy6/colab/blob/main/Real_time_(ish)_Whisper_transcription_%5BGradio%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1. Install dependencies

The OpenAI Python library is used to make API calls to Whisper and GPT. Gradio is used to create the user interface for audio input and transcription outputs.

In [None]:
!pip install gradio
!pip install openai

### 2. Define Whisper and GPT API calls and run the Gradio app:

In [None]:
import os
# The value assigned here is the literal string "OPENAI_API_KEY", i.e. a
# placeholder: replace the right-hand side with a real key before running.
# NOTE(review): set before `import openai`, presumably so the library picks the
# key up from the environment - confirm.
os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"
import openai
import gradio as gr

In [None]:
# Whisper API call
def transcribe(audio):
  """Transcribe a recorded audio file with the Whisper API.

  Args:
    audio: Path of the audio file to upload. May be None or empty when
      nothing was recorded.

  Returns:
    A ``(transcription, error)`` tuple of strings. On success ``error`` is
    empty; on failure ``transcription`` is empty and ``error`` holds a
    human-readable message. This is the order ``main_function`` unpacks.
  """
  if not audio:
    # Nothing was recorded; open(None) would raise before any handler runs.
    return "", "No audio was provided."

  # Bound before the try so the handler can log it even if the request raised.
  response = None
  try:
    with open(audio, "rb") as audio_file:
      response = openai.Audio.transcribe("whisper-1", audio_file)
    transcription = response['text']
  except Exception as e:
    # Boundary of the UI callback: report the failure instead of crashing.
    print(f"Error: {e}")
    print(f"API Response: {response if response is not None else 'No response'}")
    return "", f"An error occurred: {e}"

  if not transcription:
    return "", "Error transcribing audio."

  return transcription, ""


# Post-processing with GPT
def generate_correction(transcription):
  """Ask GPT-4 to fix spelling, grammar and punctuation in a transcription.

  Args:
    transcription: Text sent as the user message of the chat request.

  Returns:
    The message content of the first choice in the chat completion response.
  """
  system_prompt = "You are a helpful assistant. Your task is to correct spelling, grammar, and appropriate punctuation in the transcribed text."

  # The system message carries the instructions; the user message carries the text.
  messages = [
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": transcription},
  ]
  completion = openai.ChatCompletion.create(model="gpt-4", messages=messages)

  first_choice = completion['choices'][0]
  return first_choice['message']['content']

def post_process(transcription):
  """Return the GPT-corrected version of ``transcription``.

  Thin wrapper that delegates to ``generate_correction``.
  """
  corrected_text = generate_correction(transcription)
  return corrected_text

def main_function(audio, option):
    """Gradio callback: transcribe the audio and optionally post-process it.

    Args:
        audio: File path of the recorded audio, passed on to ``transcribe``.
        option: Selected mode. "Speech-to-text with post-processing" also runs
            the GPT correction; any other value (including None when nothing
            is selected) yields the plain transcription.

    Returns:
        A ``(text, processed_text)`` tuple of strings, one per output textbox.
        When transcription reports an error, the error message is returned as
        ``text`` and ``processed_text`` is empty.
    """
    transcription, error = transcribe(audio)
    if error:
        return error, ""

    if option != "Speech-to-text with post-processing":
        # Covers "Speech-to-text" and also a missing/unknown selection, which
        # previously fell through and returned None instead of two values.
        return transcription, ""

    try:
        processed_text = post_process(transcription)
    except Exception as e:
        # Keep the transcription visible even if the GPT request fails.
        print(f"Error: {e}")
        return transcription, f"An error occurred: {e}"
    return transcription, processed_text

# Gradio app

# Microphone recording, handed to the callback as a file path.
audio_input = gr.Audio(source="microphone", type="filepath", label="Speech input")
# Default to plain transcription so the callback never receives an empty
# selection when the user submits without touching the dropdown.
option_dropdown = gr.Dropdown(
    choices=["Speech-to-text", "Speech-to-text with post-processing"],
    value="Speech-to-text",
    label="Options",
)
# One textbox per value returned by the callback.
textbox_output = gr.Textbox(label="Text")
textbox_processed_output = gr.Textbox(label="Processed text")

def combined_interface(audio):
  """Transcribe ``audio`` and always run GPT post-processing on the result.

  Args:
    audio: File path of the recorded audio, passed on to ``transcribe``.

  Returns:
    A ``(transcription, processed_transcription)`` tuple of strings. When
    ``transcribe`` reports an error, the error message is returned in the
    first slot and the second is empty, mirroring ``main_function``.
  """
  # transcribe returns a 2-tuple; unpack it so that only the text (not the
  # whole tuple) is sent to GPT and shown in the output.
  transcription, error = transcribe(audio)
  if error:
    return error, ""
  # Post-process the transcription
  processed_transcription = post_process(transcription)
  return transcription, processed_transcription

# Wire the callback and the components defined above into a single Interface.
whisper = gr.Interface(
    title="Whisper API",
    description="This is a demo of Whisper live transcription with GPT post-processing using the OpenAI API.",
    fn=main_function,
    inputs=[audio_input, option_dropdown],
    outputs=[textbox_output, textbox_processed_output],
    live=False,  # True would call the function in real time, without pressing the button
    allow_flagging="never",  # turns off the Flag feature
    theme="dark",
)

# Start the app with debug mode enabled.
whisper.launch(debug=True)
