<a href="https://colab.research.google.com/github/tymothy6/colab/blob/main/Real_time_(ish)_Whisper_transcription_%5BGradio%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1. Install dependencies

The OpenAI Python library is used to make API calls to Whisper and GPT. Gradio is used to create the user interface for audio input and transcription outputs. The PyDub library is used to chunk the recorded audio into segments to mimic real-time transcription.

In [None]:
%pip install gradio
%pip install openai
%pip install pydub

### 2. Define Whisper and GPT API calls and run the Gradio app

In [None]:
import os
# NOTE: the value below looks like a placeholder (it is just the variable's own name) —
# replace it with a real API key before running. As written, this assignment overwrites
# any OPENAI_API_KEY that is already present in the environment.
# It runs before `import openai`, presumably so the library picks the key up from the
# environment — confirm against the SDK documentation.
os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"
import openai
import gradio as gr

In [None]:
# Whisper API call
def transcribe(audio):
  """Transcribe a recorded audio file with the Whisper API.

  Args:
    audio: Path to the audio file to transcribe. May be None or empty when
      nothing was recorded.

  Returns:
    A `(transcription, error)` pair of strings. On success `transcription`
    holds the recognized text and `error` is "". On any failure
    `transcription` is "" and `error` holds a human-readable message, so the
    caller can tell the two cases apart.
  """
  # Nothing recorded: report it instead of letting open(None) raise.
  if not audio:
    return "", "No audio file provided."

  try:
    with open(audio, "rb") as audio_file:
      response = openai.Audio.transcribe("whisper-1", audio_file)
    transcription = response['text']
  except Exception as e:
    # Do not touch `response` here: it is unbound when the API call itself failed.
    print(f"Error: {e}")
    return "", f"An error occurred: {e}"

  if not transcription:
    return "", "Error transcribing audio."

  return transcription, ""


# Post-processing with GPT4
def generate_correction(transcription):
  """Ask GPT-4 to correct spelling, grammar and punctuation in a transcript.

  Args:
    transcription: The raw transcribed text, sent as the user message.

  Returns:
    The content of the first choice's message in the chat completion response.
  """
  instructions = "You are a helpful assistant. Your task is to correct spelling, grammar, and appropriate punctuation in the transcribed text."

  # System message carries the task description; user message carries the text to fix.
  conversation = [
      {"role": "system", "content": instructions},
      {"role": "user", "content": transcription},
  ]
  completion = openai.ChatCompletion.create(model="gpt-4", messages=conversation)

  first_choice = completion['choices'][0]
  return first_choice['message']['content']

def post_process(transcription):
  """Return the corrected version of `transcription`.

  Thin wrapper that delegates to `generate_correction`.
  """
  corrected = generate_correction(transcription)
  return corrected

def main_function(audio, option):
    """Gradio callback: transcribe the recording and optionally post-process it.

    Args:
        audio: Path to the recorded audio file, as passed to `transcribe`.
        option: The selected mode, "Speech-to-text" or
            "Speech-to-text with post-processing".

    Returns:
        A `(text, processed_text)` pair of strings, one per output textbox.
        When `transcribe` reports an error, the error message is returned as
        `text` and `processed_text` is "".
    """
    transcription, error = transcribe(audio)
    if error:
        return error, ""

    if option == "Speech-to-text with post-processing":
        return transcription, post_process(transcription)

    # "Speech-to-text", and any unrecognized or cleared option, yields the plain
    # transcript. Always returning a 2-tuple avoids handing None to the two outputs.
    return transcription, ""

# Gradio app

# Input components: microphone recording (type="filepath") and the mode selector.
audio_input = gr.Audio(
    label="Speech input",
    source="microphone",
    type="filepath",
)
option_dropdown = gr.Dropdown(
    label="Options",
    choices=["Speech-to-text", "Speech-to-text with post-processing"],
    value="Speech-to-text",
)

# Output components: raw transcript and post-processed transcript.
textbox_output = gr.Textbox(label="Text")
textbox_processed_output = gr.Textbox(label="Processed text")

# Wire the components to the callback.
whisper = gr.Interface(
    title="Whisper API",
    description="This is a demo of Whisper live transcription with GPT post-processing using the OpenAI API.",
    fn=main_function,
    inputs=[audio_input, option_dropdown],
    outputs=[textbox_output, textbox_processed_output],
    live=False,  # when this is true the function is called in real-time, without needing to press the button
    allow_flagging="never",  # disables the Flag feature
)

whisper.launch(debug=True)


### 3. Real-time transcription using Gradio state variables


We use the `live` parameter of `gr.Interface` combined with state to mimic real-time transcription with the Whisper API. The idea is to periodically process audio chunks and accumulate the transcribed text without disrupting the user experience.

In [None]:
import os
# NOTE: the value below looks like a placeholder (it is just the variable's own name) —
# replace it with a real API key before running. As written, this assignment overwrites
# any OPENAI_API_KEY that is already present in the environment.
os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"
import openai
import gradio as gr
# AudioSegment is used below to slice the recording and export the new chunk.
from pydub import AudioSegment

def transcribe_from_point(audio, last_point):
    """Transcribe the part of a recording that starts at `last_point`.

    Args:
        audio: Path to the WAV recording. May be None or empty when nothing
            has been recorded yet.
        last_point: Offset into the recording where the previous call stopped,
            used as the slice start on the loaded `AudioSegment`.

    Returns:
        A `(text, point)` pair. On success `text` is the transcription of the
        new chunk and `point` is `last_point` advanced by the chunk length.
        If there is no audio, the chunk is shorter than 1000 units (one
        second), or the transcription is empty, `text` is "" and `point` is
        the unchanged `last_point`. On any error `text` is an error message
        and `point` is the unchanged `last_point`.
    """
    # Function-scope import: the surrounding cell does not import tempfile.
    import tempfile

    # Error handling when audio is empty
    if not audio:
        print("No audio file provided.")
        return "", last_point

    chunk_filename = None
    try:
        # Load the audio file from the last point
        audio_segment = AudioSegment.from_wav(audio)[last_point:]

        # If the new chunk is empty or too small, return without transcribing
        if len(audio_segment) < 1000:  # less than 1 second
            return "", last_point

        # Use a unique temporary file per call so overlapping invocations cannot
        # overwrite or delete each other's chunk.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            chunk_filename = tmp.name
        audio_segment.export(chunk_filename, format="wav")

        # Transcribe this chunk; the handle is closed before the file is deleted.
        with open(chunk_filename, "rb") as audio_file:
            response = openai.Audio.transcribe("whisper-1", audio_file)

        text = response['text']
        if text:
            return text, last_point + len(audio_segment)
        return "", last_point
    except Exception as e:
        print(f"Error: {e}")
        return f"An error occurred: {e}", last_point
    finally:
        # Always delete the temporary file, on success and on failure alike.
        if chunk_filename is not None and os.path.exists(chunk_filename):
            os.remove(chunk_filename)
        
# Post-processing with GPT4
def generate_correction(transcription):
  """Send a transcript to GPT-4 for spelling, grammar and punctuation fixes.

  Args:
    transcription: The transcribed text to correct, sent as the user message.

  Returns:
    The message content of the first choice in the chat completion response.
  """
  system_message = {
      "role": "system",
      "content": "You are a helpful assistant. Your task is to correct spelling, grammar, and appropriate punctuation in the transcribed text.",
  }
  user_message = {"role": "user", "content": transcription}

  completion = openai.ChatCompletion.create(
      model="gpt-4",
      messages=[system_message, user_message],
  )
  reply = completion['choices'][0]['message']
  return reply['content']

def post_process(transcription):
  """Run the transcript through `generate_correction` and return the result."""
  corrected_text = generate_correction(transcription)
  return corrected_text

def main_function(audio, option, last_point=0):
    """Gradio callback: transcribe the newest audio chunk, optionally post-processed.

    Args:
        audio: Path to the recording, as passed to `transcribe_from_point`.
        option: The selected mode, "Speech-to-text" or
            "Speech-to-text with post-processing".
        last_point: Offset returned by the previous call (0 on the first call).

    Returns:
        A `(chunk_text, processed_text, updated_point)` triple: the text for
        the new chunk, its corrected version (or "" when post-processing is
        not selected or there is nothing to correct), and the offset to pass
        to the next call.
    """
    chunk_transcription, updated_point = transcribe_from_point(audio, last_point)

    wants_correction = option == "Speech-to-text with post-processing"
    # Skip the GPT call when the chunk is empty: transcribe_from_point returns ""
    # whenever there is no new audio, and correcting nothing is a wasted API call.
    if wants_correction and chunk_transcription.strip():
        total_transcription = post_process(chunk_transcription)
        return chunk_transcription, total_transcription, updated_point

    # "Speech-to-text", and any unrecognized or cleared option, yields the plain chunk.
    # Always returning a 3-tuple avoids handing None to the three outputs.
    return chunk_transcription, "", updated_point

# Gradio component definition
# Input components: microphone recording (type="filepath") and the mode selector.
audio_input = gr.Audio(
    label="Speech input",
    source="microphone",
    type="filepath",
)
option_dropdown = gr.Dropdown(
    label="Options",
    choices=["Speech-to-text", "Speech-to-text with post-processing"],
    value="Speech-to-text",
)

# Output components: chunk transcript and its post-processed version.
textbox_output = gr.Textbox(label="Transcript")
textbox_processed_output = gr.Textbox(label="Processed transcript")

# State component, starting at 0; it is both an input and an output of the callback,
# so the offset returned by one call is fed into the next.
last_processed_point = gr.State(value=0)

whisper = gr.Interface(
    title="Whisper API",
    description="This is a demo of Whisper live transcription with GPT post-processing using the OpenAI API.",
    fn=main_function,
    inputs=[audio_input, option_dropdown, last_processed_point],
    outputs=[textbox_output, textbox_processed_output, last_processed_point],
    live=True,  # the transcribe function is called repeatedly without manual input
    allow_flagging="never",
)

whisper.launch(debug=True)
