# Meeting Assistant using Whisper & Llama3.2

# Initial Setup

In [1]:
import os
import shutil
import requests
import torch
import transformers
import gradio as gr
from dotenv import load_dotenv
from openai import OpenAI
from huggingface_hub import login
from transformers import BitsAndBytesConfig, AutoModelForCausalLM
from transformers import AutoTokenizer, TextStreamer
from IPython.display import Markdown, display, update_display

In [2]:
# Choose the compute device: CUDA when PyTorch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
# Where the recording is stored: <current working directory>/audio/user_audio.mp3
AUDIO_DIR = "audio"
RECORDING_NAME = "user_audio.mp3"
ABSOLUTE_PATH = os.path.abspath(os.getcwd())

# Folder that holds the recording, and the full path of the recording itself
RECORDING_PATH = os.path.join(ABSOLUTE_PATH, AUDIO_DIR)
RECORDING_FILE = os.path.join(ABSOLUTE_PATH, AUDIO_DIR, RECORDING_NAME)

In [4]:
# Load variables from a local .env file; override=True lets values from .env
# replace any that are already present in the process environment.
load_dotenv(override=True)

# Get OpenAI creds
# No placeholder default: a fake key would only surface later as a confusing
# authentication failure, so fail right here with an actionable message.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or environment."
    )
# Pass the key explicitly so the client uses exactly the value validated above
openai_client = OpenAI(api_key=OPENAI_API_KEY)

# Login Hugging Face
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
if not HUGGINGFACE_TOKEN:
    raise RuntimeError(
        "HUGGINGFACE_TOKEN is not set. Add it to your .env file or environment."
    )
login(token=HUGGINGFACE_TOKEN)

In [5]:
# Model identifiers
# Name of the model requested from the OpenAI audio transcription endpoint
AUDIO_MODEL = "whisper-1"
# Checkpoint id used to load the Llama model and its tokenizer
LLAMA = "meta-llama/Llama-3.2-3B-Instruct"

In [6]:
# 4-bit NF4 quantization settings: double quantization enabled, bfloat16 compute dtype
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the Llama checkpoint with the quantization settings above;
# device_map="auto" leaves weight placement to the library
llama_model = AutoModelForCausalLM.from_pretrained(
    LLAMA,
    quantization_config=quantization_config,
    device_map="auto",
)

# Tokenizer for the same checkpoint; the EOS token is reused as the padding token
llama_tokenizer = AutoTokenizer.from_pretrained(LLAMA)
llama_tokenizer.pad_token = llama_tokenizer.eos_token

# Text streamer bound to the Llama tokenizer
streamer = TextStreamer(llama_tokenizer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Speech Recognition and Transcription

In [7]:
def transcribe_audio() -> str:
    """Transcribe the saved recording with the OpenAI transcription endpoint.

    Opens ``RECORDING_FILE`` in binary mode and sends it to the model named by
    ``AUDIO_MODEL``, requesting ``response_format="text"``.

    Returns:
        The transcript returned by the endpoint.

    Raises:
        OSError: If the recording file cannot be opened.
    """
    # The context manager closes the file handle even when the API call raises,
    # so repeated transcriptions do not leak open descriptors.
    with open(RECORDING_FILE, "rb") as recording_file:
        # Speech Recognition and Transcription
        transcription = openai_client.audio.transcriptions.create(
            model=AUDIO_MODEL,
            file=recording_file,
            response_format="text"
        )

    return transcription

# Testing
# transcribe_audio()

# Prompting

In [8]:
# System instruction: sets the assistant's role and the sections the minutes must contain
system_prompt = (
    "You are a meeting assistant that generates concise, "
    "well-structured meeting minutes in markdown from transcripts. "
    "Include: summary, key discussion points, takeaways, and action items with owners."
)

# User-turn preamble; the transcript text is appended right after the trailing newline
user_prompt = (
    "Given the transcript below, generate meeting minutes in markdown with: "
    "summary, key discussion points, takeaways, and action items with owners. "
    "Transcript:\n"
)

def generate_messages(
    transcription: str,
    user_prompt: str=user_prompt,
    system_prompt: str=system_prompt
) -> list:
    """Build the chat messages for the model: one system turn, then one user turn.

    Args:
        transcription: Transcript text, appended directly to ``user_prompt``.
        user_prompt: Preamble for the user turn.
        system_prompt: Content of the system turn.

    Returns:
        A two-item list of ``{"role": ..., "content": ...}`` dicts,
        system message first.
    """
    # Insertion order of the dict fixes the message order: system, then user
    content_by_role = {
        "system": system_prompt,
        "user": user_prompt + transcription,
    }
    return [
        {"role": role, "content": text}
        for role, text in content_by_role.items()
    ]

In [9]:
def extract_text(decoded: str) -> str:
    """Return the assistant's reply from decoded model output, without special tokens.

    When the assistant header is present, the reply is the text between the
    last assistant header and the next ``<|eot_id|>`` (or the end of the
    string), stripped of surrounding whitespace. When the header is absent,
    the whole input is returned stripped.
    """
    header = "<|start_header_id|>assistant<|end_header_id|>"

    # Split on the LAST header so earlier turns are discarded
    _, found, tail = decoded.rpartition(header)
    if not found:
        # Fallback when the expected chat format is not present
        return decoded.strip()

    # Keep only what precedes the first end-of-turn token
    reply, _, _ = tail.partition("<|eot_id|>")
    return reply.strip()

def generate_mom(transcription: str, max_new_tokens: int = 2000) -> str:
    """Generate minutes of meeting (markdown) from a transcript with the Llama model.

    Args:
        transcription: Meeting transcript text.
        max_new_tokens: Upper bound on the number of tokens the model may
            generate. Defaults to 2000.

    Returns:
        The assistant's reply, as cleaned by ``extract_text``.
    """
    # Compile messages
    messages = generate_messages(transcription)

    # Tokenize
    # add_generation_prompt=True ends the prompt with the assistant header so the
    # model writes a reply instead of continuing the user turn.
    # return_dict=True also yields the attention mask, which is passed to generate().
    inputs = llama_tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    ).to(llama_model.device)  # inputs must live on the same device as the model

    # Inference
    outputs = llama_model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        # Explicit pad token (the tokenizer reuses EOS for padding)
        pad_token_id=llama_tokenizer.eos_token_id
    )

    # Extract response
    # The decoded text holds prompt + reply; extract_text keeps the text after
    # the last assistant header.
    decoded = llama_tokenizer.decode(outputs[0])
    response = extract_text(decoded)

    return response

In [10]:
# Testing
# mom = generate_mom(transcription)

In [11]:
def save_audio(file_path):
    """Move an uploaded audio file to ``RECORDING_FILE``.

    Args:
        file_path: Path of the uploaded audio file.

    Raises:
        ValueError: If ``file_path`` is empty/None or does not exist.
        RuntimeError: If the destination folder cannot be created or the
            file cannot be moved; the original exception is chained.
    """
    if not file_path or not os.path.exists(file_path):
        raise ValueError("No audio file uploaded or file does not exist.")

    try:
        # The destination folder may not exist yet (e.g. on a fresh checkout);
        # create it so the move does not fail.
        target_dir = os.path.dirname(RECORDING_FILE)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        # NOTE(review): move removes the source file; confirm the caller does
        # not need the uploaded file at its original path afterwards.
        shutil.move(file_path, RECORDING_FILE)
    except Exception as e:
        raise RuntimeError(f"Failed to save audio: {str(e)}") from e
        
def process(file_path):
    """Run the full pipeline for one uploaded audio file.

    Saves the upload, transcribes it, and generates the minutes of meeting.

    Args:
        file_path: Path of the uploaded audio file.

    Returns:
        The generated minutes, or a markdown ``**Error:** ...`` message if any
        step raises.
    """
    try:
        # Store the upload where the transcription step expects it
        save_audio(file_path)

        # Transcribe, then feed the transcript straight into the minutes generator
        return generate_mom(transcribe_audio())
    except Exception as exc:
        # Report the failure as text instead of raising
        return f"**Error:** {exc!s}"

In [15]:
# Gradio UI: a title row, then a row with the audio input + button column and the output markdown
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("# Your Meeting Assistant")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="filepath")
            submit_btn = gr.Button("Generate MoM")
        output_md = gr.Markdown()

    # Clicking the button passes the audio file path to process() and renders its return value
    submit_btn.click(fn=process, inputs=[audio_input], outputs=[output_md])

demo.launch()

* Running on local URL:  http://127.0.0.1:7863
* To create a public link, set `share=True` in `launch()`.


