Converting speech to text through an automatic speech recognition pipeline

In [1]:
# installs
!pip install -q transformers torch safetensors accelerate bitsandbytes
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline,AutoModelForCausalLM,AutoTokenizer,TextStreamer,BitsAndBytesConfig

In [2]:
# Sanity check: the cells below move the models and inputs to 'cuda'
# unconditionally, so this must report True before continuing.
print(torch.cuda.is_available())

True


In [3]:
AUDIO_MODEL = "openai/whisper-medium"

# Load the speech model in half precision and place it on the GPU in one step.
speech_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    AUDIO_MODEL,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to('cuda')

# The processor bundles the tokenizer and the feature extractor handed to the pipeline.
processor = AutoProcessor.from_pretrained(AUDIO_MODEL)

# Speech-to-text pipeline assembled from the model and processor parts above.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=speech_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch.float16,
    device='cuda',
)

Device set to use cuda


Summarising the text through a second pipeline

In [4]:
# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

In [5]:
# Model id of the chat LLM that writes the minutes; process_audio reads this
# global and passes it to generate(), which loads it with from_pretrained().
model="meta-llama/Meta-Llama-3.1-8B-Instruct"

In [6]:
# Use a pipeline as a high-level helper
# from transformers import pipeline

# messages = [
#     {"role": "user", "content": "Who are you?"},
# ]
# pipe = pipeline("text-generation", model="deepseek-ai/DeepSeek-R1", trust_remote_code=True)
# pipe(messages)

In [7]:
def messages_for(audio_title,transcription):
  """Build the two-message chat prompt that asks for meeting minutes.

  Args:
    audio_title: Description of the recording, inserted into the user prompt.
    transcription: Transcript text, appended to the user prompt on a new line.

  Returns:
    A list of two dicts with "role" and "content" keys: the system
    instruction first, then the user request containing the transcript.
  """
  system_prompt = "You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown."
  user_prompt = f"Below is an extract transcript of a {audio_title}. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\n{transcription}"
  return [
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": user_prompt},
  ]

In [8]:
def quant_config():
  """Return the BitsAndBytesConfig used to load the LLM in 4-bit.

  The settings enable 4-bit loading with the "nf4" quantization type,
  double quantization, and float16 as the compute dtype.
  """
  settings = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
  )
  return BitsAndBytesConfig(**settings)

In [9]:
# (tokenizer, model) pairs already loaded in this kernel session, keyed by model id,
# so repeated generate() calls do not reload the weights every time.
_LOADED_LLMS = {}


def _load_llm(model_name):
    """Return the cached (tokenizer, quantized model) pair for model_name, loading it on first use."""
    if model_name not in _LOADED_LLMS:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Reuse EOS as the padding token, as the original code did.
        tokenizer.pad_token = tokenizer.eos_token
        llm = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=quant_config(),
            # NOTE(review): trust_remote_code=True allows code shipped in the model
            # repository to run locally; only pass model ids you trust.
            trust_remote_code=True,
        )
        _LOADED_LLMS[model_name] = (tokenizer, llm)
    return _LOADED_LLMS[model_name]


def generate(model, messages, max_new_tokens=80):
    """Generate the assistant's reply to a list of chat messages.

    Args:
        model: Model id (or path) accepted by ``from_pretrained``. The tokenizer
            and quantized model are loaded once per id and then reused.
        messages: Chat messages as a list of {"role", "content"} dicts.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 80, the value that was previously hard-coded.

    Returns:
        The newly generated text only, decoded with special tokens removed.
        The prompt is not echoed back.
    """
    tokenizer, llm = _load_llm(model)

    # Send the prompt to wherever device_map="auto" placed the model rather
    # than assuming a fixed device string.
    inputs = tokenizer.apply_chat_template(
        messages, return_tensors="pt", add_generation_prompt=True
    ).to(llm.device)

    outputs = llm.generate(inputs, max_new_tokens=max_new_tokens)

    # outputs[0] starts with the prompt tokens; keep only what came after them.
    # Assumes apply_chat_template returned a (1, prompt_len) tensor, which the
    # original generate(inputs) call also relied on -- TODO confirm on upgrade.
    reply_tokens = outputs[0][inputs.shape[-1]:]
    return tokenizer.decode(reply_tokens, skip_special_tokens=True)

In [10]:
import re

In [11]:
def process_audio(audio_title="",audio_file=None):
    """Transcribe an audio file and summarize it into meeting minutes.

    Args:
        audio_title: Description of the recording; inserted into the prompt
            built by ``messages_for``.
        audio_file: Path of the audio file to transcribe. ``None`` (the
            default) or an empty value means no audio was supplied.

    Returns:
        A ``(transcription, summary)`` tuple of strings. When no audio was
        supplied, the transcription is empty and the summary holds a short
        notice instead of minutes.
    """
    # Without this guard a missing file is passed straight to the ASR pipeline
    # and surfaces as an opaque error instead of a readable message.
    if not audio_file:
        return "", "Please upload or record an audio file first."

    # Step 1: speech -> text.
    # NOTE(review): return_timestamps=True is presumably there so long
    # recordings are transcribed in full -- confirm before removing.
    transcription = pipe(audio_file, return_timestamps=True)["text"]

    # Step 2: build the chat prompt and let the LLM write the minutes.
    messages = messages_for(audio_title, transcription)
    summary = generate(model, messages)

    # Step 3: drop everything before the first "minutes of the" (any case),
    # if present, so the output starts at the minutes heading.
    match = re.search(r"minutes of the", summary, re.IGNORECASE)
    if match:
        summary = summary[match.start():]

    return transcription, summary

In [12]:
! pip install -q gradio
import gradio as gr

In [13]:
# JavaScript snippet handed to gr.Interface(js=...). It defines a function that
# reloads the page with the `__theme=dark` query parameter unless the current
# URL already carries it.
force_dark_mode = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""

In [15]:
# Web UI: a title box and an audio input go in; the transcript and the
# markdown minutes returned by process_audio come out.
interface = gr.Interface(
    title="AI Meeting Minutes Generator",
    description="Upload an audio file or enable microphone, and this tool will convert speech to text and summarize it into meeting minutes.",
    fn=process_audio,
    inputs=[
        gr.Textbox(label="Audio Title"),  # becomes process_audio's audio_title
        gr.Audio(type="filepath"),  # becomes process_audio's audio_file
    ],
    outputs=[
        gr.Textbox(label="Transcription"),  # first return value
        gr.Markdown(),  # second return value (the minutes)
    ],
    flagging_mode="never",
    js=force_dark_mode,
)

# Start the server. With debug=True the cell keeps running so that errors and
# logs stay visible; interrupt the cell to shut the server down.
interface.launch(debug=True)

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://ec71eb0cd52f2cc41f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/gradio/queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/blocks.py", line 2103, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/blocks.py", line 1650, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/anyio/to_thread.py", line 33, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(
           ^^^^^^^^^^

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://ec71eb0cd52f2cc41f.gradio.live


