<a href="https://colab.research.google.com/github/thias42/offline-speech-summarization/blob/main/whisper_diarization_summarization_offline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget https://raw.githubusercontent.com/thias42/offline-speech-summarization/refs/heads/main/requirements.txt
!pip install -r requirements.txt

In [None]:
import llm
import torch
import typer
import logging
import whisper
from os import environ
from dotenv import load_dotenv
from pyannote.audio import Pipeline

# Read settings (HF_AUTH_TOKEN, LLM_MODEL) from a .env file into the process
# environment before anything below looks them up.
load_dotenv()

# Speech-to-text model: the "base" Whisper checkpoint.
whisper_model = whisper.load_model("base")

# Speaker-diarization pipeline, authenticated with the token from the environment.
diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization@2.1",
    use_auth_token=environ.get("HF_AUTH_TOKEN"),
)

# Run the diarization pipeline on CUDA when available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
diarization_pipeline.to(torch.device(device))

# LLM used for summarisation; the model name comes from the LLM_MODEL variable.
llm_model = llm.get_model(environ.get("LLM_MODEL"))

def transcribe_audio(audio_path):
    """Run the module-level Whisper model on ``audio_path``.

    Returns whatever ``whisper_model.transcribe`` produces, unchanged; the
    merge step in this file reads its ``"segments"`` entry.
    """
    return whisper_model.transcribe(audio_path)

def diarize_audio(audio_path):
    """Apply the module-level pyannote pipeline to ``audio_path``.

    Returns the pipeline's output unchanged; the merge step in this file
    iterates it via ``itertracks(yield_label=True)``.
    """
    return diarization_pipeline(audio_path)

def merge_transcription_and_diarization(transcription, diarization, margin=0.2):
    """Attribute each transcribed segment to a speaker turn.

    Every entry of ``transcription["segments"]`` is assigned to the single
    diarization turn it overlaps the most, so a segment that straddles a turn
    boundary is neither dropped nor reported under two speakers. Segments that
    do not touch any turn (after widening by ``margin``) are left out because
    no speaker can be attributed to them.

    Args:
        transcription: Mapping with a ``"segments"`` list; each item provides
            ``"start"``, ``"end"`` and ``"text"``.
        diarization: Object whose ``itertracks(yield_label=True)`` yields
            ``(segment, track, speaker)`` with ``segment.start`` and
            ``segment.end``.
        margin: Tolerance added on both sides of every turn, in the same unit
            as the timestamps.

    Returns:
        One ``"Speaker <label>: <text>"`` line per turn that received text,
        in diarization order, joined with newlines ("" when nothing matched).
    """
    # Materialise the turns once, already widened by the tolerance.
    turns = [
        (segment.start - margin, segment.end + margin, speaker)
        for segment, _, speaker in diarization.itertracks(yield_label=True)
    ]
    texts_per_turn = [[] for _ in turns]

    for piece in transcription["segments"]:
        # Whisper segment texts usually start with a space; strip it so the
        # join below does not produce double spaces.
        text = piece["text"].strip()
        if not text:
            continue

        best_index = None
        best_overlap = None
        for index, (turn_start, turn_end, _) in enumerate(turns):
            overlap = min(piece["end"], turn_end) - max(piece["start"], turn_start)
            if overlap < 0:
                continue  # disjoint from this turn
            # Strict ">" keeps the earliest turn on ties, so each segment is
            # emitted exactly once.
            if best_overlap is None or overlap > best_overlap:
                best_index, best_overlap = index, overlap

        if best_index is not None:
            texts_per_turn[best_index].append(text)

    return "\n".join(
        f"Speaker {speaker}: {' '.join(texts)}"
        for (_, _, speaker), texts in zip(turns, texts_per_turn)
        if texts
    )

def generate_summary_with_chatgpt(text):
    """Summarise ``text`` with the module-level ``llm_model``.

    Sends ``text`` as the prompt together with a fixed system instruction
    asking for a bullet-point summary, and returns the response's text.
    """
    system_prompt = "You are a helpful assistant, who creates a summary of a given conversation. Capture the essence and summarize in bullet points."
    reply = llm_model.prompt(text, system=system_prompt)
    return reply.text()

In [2]:
# Recording to process; the diarization cell reuses this path.
audio_path = 'audio_recording.wav'
# Whisper transcription result, consumed later by the merge step.
transcription = transcribe_audio(audio_path)

In [5]:
# Speaker diarization of the same recording that was transcribed above.
diarization = diarize_audio(audio_path)

In [6]:
# Combine transcript and speaker turns into "Speaker <label>: <text>" lines.
full_text = merge_transcription_and_diarization(transcription, diarization)

In [None]:
# Ask the configured LLM for a bullet-point summary of the merged transcript.
generate_summary_with_chatgpt(full_text)