In [None]:
pip install pyannote.audio torch soundfile requests huggingface_hub



In [None]:
!pip install -U git+https://github.com/m-bain/whisperX.git
!pip install requests pandas soundfile openai-whisper

Collecting git+https://github.com/m-bain/whisperX.git
  Cloning https://github.com/m-bain/whisperX.git to /tmp/pip-req-build-fzccggwi
  Running command git clone --filter=blob:none --quiet https://github.com/m-bain/whisperX.git /tmp/pip-req-build-fzccggwi
  Resolved https://github.com/m-bain/whisperX.git to commit f10dbf6ab1717e84db7733df9c0b21658ee68f9b
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting ctranslate2>=4.5.0 (from whisperx==3.3.1)
  Downloading ctranslate2-4.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting faster-whisper>=1.1.1 (from whisperx==3.3.1)
  Downloading faster_whisper-1.1.1-py3-none-any.whl.metadata (16 kB)
Collecting onnxruntime==1.19 (from whisperx==3.3.1)
  Downloading onnxruntime-1.19.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.3 kB)
Collecting pandas>=2.2.

In [None]:
import requests
import os
import torch
import whisperx # WhisperX for ASR and alignment
import pandas as pd # For handling diarization results for WhisperX
from pyannote.audio import Pipeline
from huggingface_hub import HfFolder

# --- 1. Configuration ---

# Hugging Face access token used to download the pyannote pipeline.
# Preferred: export HF_TOKEN in the environment so the secret is never saved in
# the notebook. If the variable is unset or empty, the placeholder below is used;
# you may still replace it by hand, but do not commit a real token.
HF_TOKEN = os.environ.get("HF_TOKEN") or "YOUR_HUGGINGFACE_TOKEN"

if HF_TOKEN == "YOUR_HUGGINGFACE_TOKEN":
    print("WARNING: Placeholder Hugging Face token detected. Pipeline loading might fail.")
    print("Please set the HF_TOKEN environment variable or replace 'YOUR_HUGGINGFACE_TOKEN' with your actual token.")
    # Consider raising an error here if a valid token is strictly required:
    # raise ValueError("Hugging Face token not set.")

# Audio source: remote sample and the local filename it is saved under.
AUDIO_URL = "https://github.com/pyannote/pyannote-audio/raw/develop/tutorials/assets/sample.wav"
AUDIO_FILENAME = "sample_audio.wav"

# Pyannote diarization pipeline identifier on the Hugging Face Hub.
DIARIZATION_PIPELINE_NAME = "pyannote/speaker-diarization-3.1"

# WhisperX configuration
WHISPER_MODEL_SIZE = "medium"  # e.g., "tiny", "base", "small", "medium", "large-v2"
BATCH_SIZE = 16  # Reduce if low memory, affects transcription speed
COMPUTE_TYPE = "float16"  # "float16", "float32", "int8"; the device setup may override this

# --- 2. Setup Device ---

# Select the execution device and adjust COMPUTE_TYPE to something usable on it.
if torch.cuda.is_available():
    device = "cuda"
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    # GPUs with a major compute capability below 7 are treated as lacking
    # usable float16 support, so fall back to float32 on them.
    if torch.cuda.get_device_capability(0)[0] < 7:
        print("Warning: GPU does not support float16, using float32 instead.")
        COMPUTE_TYPE = "float32"
else:
    device = "cpu"
    # This notebook always runs float32 on CPU, regardless of the configured value.
    # NOTE(review): other CPU compute types (e.g. "int8") may also work — confirm
    # against the CTranslate2 documentation before enabling them.
    COMPUTE_TYPE = "float32"
    print("Using CPU. Transcription will be significantly slower.")

# Report the compute type only after any fallback so the log shows the value
# that will actually be passed to the ASR model.
print(f"Compute Type: {COMPUTE_TYPE}")


# --- 3. Download Audio File ---

# Fetch the sample audio once; later runs reuse the local copy.
print(f"\n--- Downloading Audio ---")
if not os.path.exists(AUDIO_FILENAME):
    print(f"Attempting to download audio file from: {AUDIO_URL}")
    # Stream into a temporary ".part" file and rename it only on success, so an
    # interrupted download is never mistaken for a complete file on the next run.
    partial_filename = AUDIO_FILENAME + ".part"
    try:
        # The context manager releases the streamed connection even on errors.
        with requests.get(AUDIO_URL, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(partial_filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_filename, AUDIO_FILENAME)
        print(f"Audio file downloaded successfully: {AUDIO_FILENAME}")
    except (requests.exceptions.RequestException, OSError) as e:
        print(f"Error downloading audio file: {e}")
        # Remove any partially written data before aborting.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
        # Raise instead of calling exit(): an exception stops the notebook run
        # without shutting down the kernel, and a script exits with a non-zero status.
        raise RuntimeError(f"Could not download audio file from {AUDIO_URL}") from e
else:
    print(f"Audio file already exists: {AUDIO_FILENAME}")

# --- 4. Speaker Diarization (Pyannote) ---

# Load the pretrained pyannote pipeline and run it on the audio file.
# On success `diarization_result` holds the pipeline output, which is iterated
# later with `itertracks(yield_label=True)` to obtain (turn, track, speaker) items.
print(f"\n--- Running Speaker Diarization ---")
diarization_pipeline = None
diarization_result = None
try:
    print(f"Loading diarization pipeline: {DIARIZATION_PIPELINE_NAME}...")
    diarization_pipeline = Pipeline.from_pretrained(
        DIARIZATION_PIPELINE_NAME,
        use_auth_token=HF_TOKEN  # Token that grants access to the gated model
    )
    # NOTE(review): from_pretrained appears to return None (instead of raising)
    # when the gated model cannot be downloaded — fail with a clear message
    # rather than an AttributeError on the next line.
    if diarization_pipeline is None:
        raise RuntimeError(
            f"Could not load pipeline '{DIARIZATION_PIPELINE_NAME}' (check token and model access)."
        )
    diarization_pipeline.to(torch.device(device))  # Move pipeline to selected device
    print("Diarization pipeline loaded.")

    print(f"Applying diarization pipeline to {AUDIO_FILENAME}...")
    diarization_result = diarization_pipeline(AUDIO_FILENAME)
    print("Diarization complete.")

    # If WhisperX's own speaker assignment is wanted later, the turns can be
    # converted to a DataFrame with 'start', 'end' and 'speaker' columns built
    # from diarization_result.itertracks(yield_label=True).

except Exception as e:
    print(f"Error during speaker diarization: {e}")
    print("Please ensure:")
    print("1. You have replaced 'YOUR_HUGGINGFACE_TOKEN' with a valid token.")
    print(f"2. You have accepted the user agreement on the model page: https://huggingface.co/{DIARIZATION_PIPELINE_NAME}")
    print("3. Your internet connection is stable and dependencies are correct.")
    # Stop here because the final transcript depends on the diarization result.
    # Raise instead of calling exit(): an exception stops the notebook run
    # without shutting down the kernel, and a script exits with a non-zero status.
    raise RuntimeError("Speaker diarization failed; see the messages above.") from e

# --- 5. Speech Recognition & Alignment (WhisperX) ---

# Transcribe the audio with WhisperX, then align the transcript to obtain
# word-level timestamps. On success `asr_result_aligned` holds the aligned result.
print(f"\n--- Running Speech Recognition & Alignment ---")
asr_result_aligned = None
try:
    # 1. Load WhisperX model (model files are downloaded on first use of each size)
    print(f"Loading WhisperX ASR model: {WHISPER_MODEL_SIZE}...")
    asr_model = whisperx.load_model(WHISPER_MODEL_SIZE, device, compute_type=COMPUTE_TYPE)
    print("WhisperX ASR model loaded.")

    # 2. Load audio using WhisperX loader
    print(f"Loading audio with WhisperX: {AUDIO_FILENAME}...")
    audio = whisperx.load_audio(AUDIO_FILENAME)
    print("Audio loaded.")

    # 3. Transcribe audio; the result also carries the detected language
    print("Transcribing audio...")
    result_asr = asr_model.transcribe(audio, batch_size=BATCH_SIZE)
    print(f"Transcription complete. Detected language: {result_asr['language']}")

    # 4. Align transcriptions using the alignment model for the detected language
    print("Aligning transcriptions...")
    model_a, metadata = whisperx.load_align_model(language_code=result_asr["language"], device=device)
    asr_result_aligned = whisperx.align(
        result_asr["segments"], model_a, metadata, audio, device, return_char_alignments=False
    )
    print("Alignment complete.")
    # The aligned result contains segments with word-level timestamps, e.g.:
    # {'segments': [{'start': 0.01, 'end': 2.55, 'text': ' Hello there.', 'words': [{'word': 'Hello', 'start': 0.01, 'end': 0.51, 'score': 0.9}, ...]}, ...]}

except Exception as e:
    print(f"Error during WhisperX processing: {e}")
    print("Please ensure WhisperX and its dependencies (like ffmpeg, torch with CUDA) are correctly installed.")
    # Stop here because the final transcript needs the aligned ASR output.
    # Raise instead of calling exit(): an exception stops the notebook run
    # without shutting down the kernel, and a script exits with a non-zero status.
    raise RuntimeError("WhisperX transcription/alignment failed; see the messages above.") from e


# --- 6. Combine Diarization and Transcription ---

# Attribute every aligned word to a single speaker turn and print one transcript
# line per turn, in order of turn start time.
print(f"\n--- Generating Final Transcript ---")

if diarization_result is None or asr_result_aligned is None:
    print("Error: Cannot generate final transcript due to previous errors.")
    # Raise instead of calling exit(): an exception stops the notebook run
    # without shutting down the kernel, and a script exits with a non-zero status.
    raise RuntimeError("Diarization or aligned transcription result is missing.")

# Flatten the word-level entries of all aligned segments into one list.
word_segments = []
for segment in asr_result_aligned["segments"]:
    if 'words' in segment:  # Check if word timings exist
        word_segments.extend(segment['words'])
    else:
        # Segments without word timings cannot be mapped word by word; report and skip.
        print(f"Warning: Segment '{segment['text']}' lacks word timestamps, skipping for detailed mapping.")

# Sort pyannote turns by start time
sorted_turns = sorted(
    diarization_result.itertracks(yield_label=True), key=lambda track: track[0].start
)

# Assign each word to exactly ONE turn. Speaker turns may overlap in time, and
# testing each turn independently would print the same word under two speakers.
# Among the turns that contain the word's midpoint, the one with the largest
# time overlap with the word wins (ties go to the earlier turn).
words_by_turn = [[] for _ in sorted_turns]
current_turn_index = None  # turn chosen for the most recent timestamped word
for word_info in word_segments:
    if 'start' in word_info and 'end' in word_info:
        word_start = word_info['start']
        word_end = word_info['end']
        word_mid_time = word_start + (word_end - word_start) / 2
        best_index = None
        best_overlap = -1.0
        for index, (turn, _, _) in enumerate(sorted_turns):
            if turn.start <= word_mid_time < turn.end:
                overlap = min(turn.end, word_end) - max(turn.start, word_start)
                if overlap > best_overlap:
                    best_index = index
                    best_overlap = overlap
        # None when the word's midpoint falls outside every speaker turn.
        current_turn_index = best_index
    # A word without timestamps keeps the turn of the preceding timestamped word
    # instead of being dropped; it is skipped only when no such turn exists.
    if current_turn_index is not None:
        words_by_turn[current_turn_index].append(word_info['word'])

# Print one line per speaker turn that received at least one word.
print("\nFinal Transcript:")
for (turn, _, speaker), turn_words in zip(sorted_turns, words_by_turn):
    if turn_words:
        transcript_segment = " ".join(turn_words).strip()
        print(f"{speaker} ({turn.start:.2f}s - {turn.end:.2f}s): {transcript_segment}")
    # else:
    #     # Optionally print turns with no detected words (e.g., short pauses, non-speech)
    #     print(f"{speaker} ({turn.start:.2f}s - {turn.end:.2f}s): [No speech detected in segment]")


# Optional: Clean up downloaded file
# try:
#     os.remove(AUDIO_FILENAME)
#     print(f"\nCleaned up downloaded file: {AUDIO_FILENAME}")
# except OSError as e:
#     print(f"Error removing file {AUDIO_FILENAME}: {e}")

print("\n--- Finished ---")

Using CPU. Transcription will be significantly slower.
Compute Type: float32

--- Downloading Audio ---
Audio file already exists: sample_audio.wav

--- Running Speaker Diarization ---
Loading diarization pipeline: pyannote/speaker-diarization-3.1...
Diarization pipeline loaded.
Applying diarization pipeline to sample_audio.wav...


  std = sequences.std(dim=-1, correction=1)


Diarization complete.

--- Running Speech Recognition & Alignment ---
Loading WhisperX ASR model: medium...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.26k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/1.53G [00:00<?, ?B/s]

No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../usr/local/lib/python3.11/dist-packages/whisperx/assets/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.6.0+cu124. Bad things might happen unless you revert torch to 1.x.
WhisperX ASR model loaded.
Loading audio with WhisperX: sample_audio.wav...
Audio loaded.
Transcribing audio...
Detected language: en (1.00) in first 30s of audio...
Transcription complete. Detected language: en
Aligning transcriptions...


Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth" to /root/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960_asr_ls960.pth
100%|██████████| 360M/360M [00:06<00:00, 55.5MB/s]


Alignment complete.

--- Generating Final Transcript ---

Final Transcript:
SPEAKER_01 (6.73s - 7.17s): Hello?
SPEAKER_01 (7.59s - 8.32s): Hello.
SPEAKER_02 (8.32s - 9.92s): Oh, hello. I didn't know you were there.
SPEAKER_01 (9.92s - 10.93s): Neither did
SPEAKER_02 (10.46s - 14.75s): did I. OK, I thought, you know, I heard a beep. This is Diane in New Jersey. And I'm
SPEAKER_00 (10.93s - 10.98s): I.
SPEAKER_00 (14.31s - 17.88s): And I'm Sheila in Texas, originally from Chicago. Oh,
SPEAKER_02 (18.02s - 21.51s): I'm originally from Chicago also. I'm in New Jersey now, though.
SPEAKER_00 (21.77s - 28.50s): Well, there isn't that much difference. At least, you know, they all call me a Yankee down here, so what can I say? Oh,
SPEAKER_02 (27.86s - 29.97s): can I say? Oh, I don't hear that in New Jersey now.

--- Finished ---
