In [1]:
import os

# The Hugging Face access token is taken from the HF_TOKEN environment variable
# (set it with e.g. `export HF_TOKEN=your_token`); an empty string is used when
# the variable is not set. Never commit tokens to the repository.
HUGGINGFACE_TOKEN = os.getenv("HF_TOKEN", "")

In [2]:
# Extract the audio track from the debate video.
import os
from pathlib import Path
from debate_analyzer.transcriber import AudioExtractor

# Default input video. This location is machine-specific, so it can be
# overridden without editing the notebook by setting the DEBATE_VIDEO_PATH
# environment variable (an empty value falls back to the default).
DEFAULT_VIDEO_PATH = "/Users/tjirsik/Repository/debate_analyzer/data/videos/oVmunv8iUKI_Panelová diskuze： Výuka ČDJ v praxi  a systémové zajištění jazykové přípravy (krátká verze).mp4"

video_path = Path(os.environ.get("DEBATE_VIDEO_PATH") or DEFAULT_VIDEO_PATH).expanduser()

# Fail early with a readable message rather than an error raised from inside
# the extractor when the input file is missing.
if not video_path.is_file():
    raise FileNotFoundError(
        f"Video file not found: {video_path}. "
        "Set the DEBATE_VIDEO_PATH environment variable to the video to analyze."
    )

# Request 16 kHz, single-channel audio from the extractor.
extractor = AudioExtractor(sample_rate=16000, channels=1)

# extract_audio returns the path of the extracted audio file
# (a temporary .wav file in the recorded run of this notebook).
audio_path = extractor.extract_audio(video_path)
print(f"Audio extracted to: {audio_path}")

  from .autonotebook import tqdm as notebook_tqdm
torchcodec is not installed correctly so built-in audio decoding will fail. Solutions are:
* use audio preloaded in-memory as a {'waveform': (channel, time) torch.Tensor, 'sample_rate': int} dictionary;
* fix torchcodec installation. Error message was:

Could not load libtorchcodec. Likely causes:
          1. FFmpeg is not properly installed in your environment. We support
             versions 4, 5, 6 and 7.
          2. The PyTorch version (2.8.0) is not compatible with
             this version of TorchCodec. Refer to the version compatibility
             table:
             https://github.com/pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec.
          3. Another runtime dependency; see exceptions below.
        The following exceptions were raised as we tried to load libtorchcodec:
        
[start of libtorchcodec loading traceback]
FFmpeg version 7: dlopen(/Users/tjirsik/Repository/debate_analyzer/.venv/lib/python3.12/

Audio extracted to: /var/folders/jy/t1q3dvcx5yz9lzcnh7drbpzm0000gn/T/debate_audio_0vzsvher.wav


In [3]:
# Run speaker diarization using pyannote on the audio extracted above and
# save the resulting speaker turns as an RTTM file next to the audio file.
from pyannote.audio import Pipeline
import torch
import soundfile as sf

# Load audio into memory (workaround for torchcodec issues): the pipeline is
# given a preloaded {"waveform", "sample_rate"} dictionary instead of a path.
print(f"Loading audio from: {audio_path}")
# dtype="float32" avoids a float64 intermediate; always_2d=True returns
# (frames, channels) for mono files too, so a single transpose handles
# mono and multi-channel input alike.
waveform, sample_rate = sf.read(str(audio_path), dtype="float32", always_2d=True)

if waveform.shape[0] == 0:
    raise ValueError(f"No audio samples found in: {audio_path}")

# Reorder to (channels, samples) and make the tensor memory contiguous.
waveform = waveform.T
waveform_tensor = torch.from_numpy(waveform).contiguous()

# Create the audio dictionary format that pyannote expects
audio_dict = {
    "waveform": waveform_tensor,
    "sample_rate": sample_rate
}

print(f"Audio loaded: {waveform_tensor.shape[1]/sample_rate:.1f} seconds, {sample_rate}Hz")

# Use the token from the first cell. An unset token is passed as None rather
# than "" so that the library default applies (presumably letting locally
# cached Hugging Face credentials be used -- TODO confirm).
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    token=HUGGINGFACE_TOKEN or None
)

# from_pretrained may return None instead of raising when the model cannot be
# fetched (observed with older pyannote.audio releases); surface that clearly
# instead of failing later with "'NoneType' object is not callable".
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization-3.1'. Check that HF_TOKEN is set "
        "and that the model's user conditions have been accepted on Hugging Face."
    )

# Run the pipeline on the preloaded audio
print("Running speaker diarization...")
diarization = pipeline(audio_dict)

# Display the results
print("\nSpeaker Diarization Results:")
print("=" * 50)

# pyannote.audio 4.0 returns a DiarizeOutput whose speaker_diarization
# attribute holds the Annotation; if that attribute is absent, treat the
# result itself as the Annotation (as returned by earlier releases).
annotation = getattr(diarization, "speaker_diarization", diarization)

if hasattr(annotation, "itertracks"):
    # Iterate through the annotation to get speaker segments
    for turn, _, speaker in annotation.itertracks(yield_label=True):
        print(f"Speaker {speaker}: {turn.start:.1f}s - {turn.end:.1f}s")

    # Save to RTTM format
    output_rttm = audio_path.parent / "diarization_output.rttm"
    with open(output_rttm, "w", encoding="utf-8") as rttm:
        annotation.write_rttm(rttm)
    print(f"\nDiarization results saved to: {output_rttm}")
else:
    print("Unexpected diarization format")
    print(diarization)

Loading audio from: /var/folders/jy/t1q3dvcx5yz9lzcnh7drbpzm0000gn/T/debate_audio_0vzsvher.wav
Audio loaded: 311.8 seconds, 16000Hz


  available_backends = torchaudio.list_audio_backends()


Running speaker diarization...


  std = sequences.std(dim=-1, correction=1)



Speaker Diarization Results:
Type: <class 'pyannote.audio.pipelines.speaker_diarization.DiarizeOutput'>
Available methods: ['exclusive_speaker_diarization', 'serialize', 'speaker_diarization', 'speaker_embeddings']

Diarization output:
DiarizeOutput(speaker_diarization=<pyannote.core.annotation.Annotation object at 0x312a04b30>, exclusive_speaker_diarization=<pyannote.core.annotation.Annotation object at 0x3121b6900>, speaker_embeddings=array([[-1.18493713e-01, -7.93337822e-02,  2.23689243e-01,
        -2.75747895e-01, -1.64491206e-01,  3.41394484e-01,
         6.06364459e-02, -1.59959316e-01, -3.12254936e-01,
        -3.35725814e-01,  5.99350519e-02, -1.15796290e-01,
        -5.38994512e-03, -3.68898273e-01, -1.22059092e-01,
         7.95321481e-04,  2.12684553e-02,  3.84198874e-01,
         9.97848213e-02, -2.94653270e-02, -2.23885641e-01,
         2.32218411e-02,  6.14525005e-02,  1.61823705e-01,
         5.61357811e-02,  4.24667001e-01, -1.51001230e-01,
         3.13226460e-03, -3