In [None]:
import json
from pathlib import Path
from pyannote.audio.pipelines import SpeakerDiarization
from lib.diarization import DiarizationPostProcessor
from lib.audio import AudioPreProcessor

class SimpleDiarizer:
    """Speaker diarization driver built on a pyannote ``SpeakerDiarization`` pipeline.

    Combines the pipeline with the project-local ``AudioPreProcessor`` and
    ``DiarizationPostProcessor`` helpers. Call :meth:`setup` once, then
    :meth:`process_audio` for each file.
    """

    def setup(self):
        """Load the model into memory"""
        # Configurable: model locations, clustering method and batch sizes
        self.diarization = SpeakerDiarization(
                segmentation="/data/pyannote/segmentation/pytorch_model.bin",
                embedding="/data/speechbrain/spkrec-ecapa-voxceleb",
                clustering="AgglomerativeClustering",  # You can also try other clustering methods
                segmentation_batch_size=32,
                embedding_batch_size=32,
                embedding_exclude_overlap=False,  # Keep this false for overlapping speech
                segmentation_step=0.05  # Try a smaller step for finer granularity (default is 0.1)
        )
        # Configurable: hyper-parameters of the clustering and segmentation stages
        self.diarization.instantiate({
            "clustering": {
                "method": "centroid",
                "min_cluster_size": 15,
                "threshold": 0.65,
            },
            "segmentation": {
                "min_duration_off": 0.3,
                "threshold": 0.35,
            },
        })
        self.diarization_post = DiarizationPostProcessor()
        self.audio_pre = AudioPreProcessor()

    def _pipeline_attr(self, names, default):
        """Return the first non-None pipeline attribute among ``names``, else ``default``."""
        for name in names:
            value = getattr(self.diarization, name, None)
            if value is not None:
                return value
        return default

    def run_diarization(self):
        """Diarize the pre-processed audio and return the post-processed result.

        Runs the pipeline on ``self.audio_pre.output_path``, captures the
        artifact the pipeline reports through its hook under the name
        ``"embeddings"``, and hands both to ``self.diarization_post.process``.

        Returns:
            Whatever ``self.diarization_post.process`` returns.
        """
        captured = {'embeddings': None}

        def hook(name, *args, **kwargs):
            # Keep only the artifact of the "embeddings" step; ignore all others
            if name == "embeddings" and len(args) > 0:
                captured['embeddings'] = args[0]

        print('diarizing audio file...')
        # Configurable: bounds on the number of speakers
        diarization = self.diarization(
            self.audio_pre.output_path,
            min_speakers=2,     # Ensure at least 2 speakers are detected
            max_speakers=8,     # Allow up to 8 speakers to be detected
            hook=hook
        )

        # setup() builds the pipeline with segmentation_step=0.05, so prefer the
        # `segmentation_*` attributes; reading only `duration`/`step` would fall
        # back to the hard-coded 0.5 step and disagree with that configuration.
        # NOTE(review): confirm the attribute names against the installed
        # pyannote.audio version.
        duration = self._pipeline_attr(('segmentation_duration', 'duration'), 2.0)
        step = self._pipeline_attr(('segmentation_step', 'step'), 0.5)

        embeddings = {
            'data': captured['embeddings'],
            'chunk_duration': duration,
            # `step` is used as a fraction of the chunk duration
            'chunk_offset': step * duration
        }

        # Print some debug information
        print(f"Using duration: {duration}s, step: {step}s")
        print(f"Embeddings shape: {captured['embeddings'].shape if captured['embeddings'] is not None else 'None'}")

        return self.diarization_post.process(diarization, embeddings)

    def process_audio(self, audio_path: str) -> dict:
        """Process a single audio file.

        Args:
            audio_path: Path of the media file passed to ``self.audio_pre.process``.

        Returns:
            The result of :meth:`run_diarization`, or
            ``self.diarization_post.empty_result()`` when the pre-processor
            reported an error (the error is printed).

        ``self.audio_pre.cleanup()`` runs once pre-processing has returned,
        even if diarization raises; the exception still propagates.
        """
        self.audio_pre.process(audio_path)

        try:
            if self.audio_pre.error:
                print(self.audio_pre.error)
                return self.diarization_post.empty_result()
            return self.run_diarization()
        finally:
            # Always release what the pre-processor created, success or failure
            self.audio_pre.cleanup()

# Usage example: diarize one media file and store the result as JSON
if __name__ == "__main__":
    # Build the diarizer and load its models once
    diarizer = SimpleDiarizer()
    diarizer.setup()

    # Run the full pre-process -> diarize -> post-process chain on one file
    audio_path = "output.mp4"
    result = diarizer.process_audio(audio_path)

    # Persist the result pretty-printed; a later cell reads this file back
    with open("output.json", mode="w") as out_file:
        json.dump(result, fp=out_file, indent=2)

: 

In [3]:
import subprocess
from collections import defaultdict
import json

def _timestamp_to_seconds(timestamp):
    """Convert a ``[[HH:]MM:]SS[.fff]`` timestamp string to seconds.

    Only the last three ``:``-separated fields are used. Missing leading
    fields (``"MM:SS.fff"`` or ``"SS.fff"``) count as zero instead of raising
    ``IndexError``.

    Raises:
        ValueError: if a field is not numeric.
    """
    fields = timestamp.split(':')[-3:]
    # Left-pad with zero fields so shorter timestamps are accepted
    hours, minutes, seconds = ['0'] * (3 - len(fields)) + fields
    return float(seconds) + int(minutes) * 60 + int(hours) * 3600


# Load the diarization result saved as output.json
with open('output.json', 'r') as f:
    json_data = json.load(f)

# Group segments by speaker: {speaker label: [{'start': sec, 'end': sec}, ...]}
speaker_segments = defaultdict(list)
for segment in json_data['segments']:
    speaker_segments[segment['speaker']].append({
        'start': _timestamp_to_seconds(segment['start']),
        'end': _timestamp_to_seconds(segment['stop'])
    })

# Media file used as the ffmpeg input for every per-speaker export
input_file = "output.mp4"

# Output name for each speaker label found in the diarization result
# (customize freely; labels missing here get a generated fallback name)
speaker_names = dict(
    A='first_speaker_v3',
    B='second_speaker_v3',
    C='third_speaker_v3',
    D='fourth_speaker_v3',
    E='fifth_speaker_v3',
    F='sixth_speaker_v3',
)

# Speakers whose ffmpeg run failed, so the final message stays truthful
failed_speakers = []

for speaker, segments in speaker_segments.items():
    # One between(t,start,end) term per segment, combined with OR operator (+)
    active_expr = '+'.join(
        f"between(t,{segment['start']},{segment['end']})" for segment in segments
    )

    # Keep the volume inside this speaker's segments, mute everything else
    filter_expression = (
        f"volume=enable='{active_expr}':volume=1,"
        f"volume=enable='not({active_expr})':volume=0"
    )

    # Create a more descriptive output filename
    speaker_name = speaker_names.get(speaker, f"unknown_speaker_{speaker}")
    output_file = f"{speaker_name}_speaker_{speaker}_output.mp4"

    cmd = [
        "ffmpeg",
        "-nostdin",  # Never wait for keyboard input when run from a notebook
        "-y",        # Overwrite existing outputs so the cell can be re-run
        "-i", input_file,
        "-af", filter_expression,
        "-c:a", "aac",
        "-vn",  # Remove video stream
        output_file
    ]

    print(f"\nProcessing Speaker {speaker} ({speaker_name})...")
    print(f"Using filter: {filter_expression}")
    try:
        subprocess.run(cmd, check=True)
        print(f"Created file: {output_file}")
    except subprocess.CalledProcessError as e:
        # Best effort: report the failure and continue with the next speaker
        failed_speakers.append(speaker)
        print(f"Error processing Speaker {speaker} ({speaker_name}): {e}")

if failed_speakers:
    print(f"\nFinished with errors; no file was created for speaker(s): {', '.join(map(str, failed_speakers))}")
else:
    print("\nAll speaker files have been created!")


Processing Speaker A (first_speaker_v3)...
Using filter: volume=enable='between(t,0.030969,2.832219)+between(t,22.474719,28.094094)':volume=1,volume=enable='not(between(t,0.030969,2.832219)+between(t,22.474719,28.094094))':volume=0


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

Created file: first_speaker_v3_speaker_A_output.mp4

Processing Speaker B (second_speaker_v3)...
Using filter: volume=enable='between(t,3.085344,4.806594)':volume=1,volume=enable='not(between(t,3.085344,4.806594))':volume=0


Input #0, mov,mp4,m4a,3gp,3g2,mj2, from 'output.mp4':
  Metadata:
    major_brand     : isom
    minor_version   : 512
    compatible_brands: isomiso2avc1mp41
    encoder         : Lavf61.7.100
  Duration: 00:00:28.03, start: 0.000000, bitrate: 2488 kb/s
  Stream #0:0(und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(tv, bt709), 1920x1080 [SAR 1:1 DAR 16:9], 2353 kb/s, 23.98 fps, 23.98 tbr, 24k tbn, 47.95 tbc (default)
    Metadata:
      handler_name    : VideoHandler
      vendor_id       : [0][0][0][0]
      encoder         : Lavc61.19.100 libx264
  Stream #0:1(und): Audio: aac (LC) (mp4a / 0x6134706D), 44100 Hz, stereo, fltp, 128 kb/s (default)
    Metadata:
      handler_name    : SoundHandler
      vendor_id       : [0][0][0][0]
Stream mapping:
  Stream #0:1 -> #0:0 (aac (native) -> aac (native))
Press [q] to stop, [?] for help
Output #0, mp4, to 'second_speaker_v3_speaker_B_output.mp4':
  Metadata:
    major_brand     : isom
    minor_version   : 512
    compatible_brands: i

Created file: second_speaker_v3_speaker_B_output.mp4

Processing Speaker C (third_speaker_v3)...
Using filter: volume=enable='between(t,4.806594,9.345969)':volume=1,volume=enable='not(between(t,4.806594,9.345969))':volume=0


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

Created file: third_speaker_v3_speaker_C_output.mp4

Processing Speaker D (fourth_speaker_v3)...
Using filter: volume=enable='between(t,9.345969,12.974094)':volume=1,volume=enable='not(between(t,9.345969,12.974094))':volume=0


Input #0, mov,mp4,m4a,3gp,3g2,mj2, from 'output.mp4':
  Metadata:
    major_brand     : isom
    minor_version   : 512
    compatible_brands: isomiso2avc1mp41
    encoder         : Lavf61.7.100
  Duration: 00:00:28.03, start: 0.000000, bitrate: 2488 kb/s
  Stream #0:0(und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(tv, bt709), 1920x1080 [SAR 1:1 DAR 16:9], 2353 kb/s, 23.98 fps, 23.98 tbr, 24k tbn, 47.95 tbc (default)
    Metadata:
      handler_name    : VideoHandler
      vendor_id       : [0][0][0][0]
      encoder         : Lavc61.19.100 libx264
  Stream #0:1(und): Audio: aac (LC) (mp4a / 0x6134706D), 44100 Hz, stereo, fltp, 128 kb/s (default)
    Metadata:
      handler_name    : SoundHandler
      vendor_id       : [0][0][0][0]
Stream mapping:
  Stream #0:1 -> #0:0 (aac (native) -> aac (native))
Press [q] to stop, [?] for help
Output #0, mp4, to 'fourth_speaker_v3_speaker_D_output.mp4':
  Metadata:
    major_brand     : isom
    minor_version   : 512
    compatible_brands: i

Created file: fourth_speaker_v3_speaker_D_output.mp4

Processing Speaker E (fifth_speaker_v3)...
Using filter: volume=enable='between(t,12.974094,17.530344)':volume=1,volume=enable='not(between(t,12.974094,17.530344))':volume=0


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

Created file: fifth_speaker_v3_speaker_E_output.mp4

Processing Speaker F (sixth_speaker_v3)...
Using filter: volume=enable='between(t,17.530344,22.474719)':volume=1,volume=enable='not(between(t,17.530344,22.474719))':volume=0


Input #0, mov,mp4,m4a,3gp,3g2,mj2, from 'output.mp4':
  Metadata:
    major_brand     : isom
    minor_version   : 512
    compatible_brands: isomiso2avc1mp41
    encoder         : Lavf61.7.100
  Duration: 00:00:28.03, start: 0.000000, bitrate: 2488 kb/s
  Stream #0:0(und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(tv, bt709), 1920x1080 [SAR 1:1 DAR 16:9], 2353 kb/s, 23.98 fps, 23.98 tbr, 24k tbn, 47.95 tbc (default)
    Metadata:
      handler_name    : VideoHandler
      vendor_id       : [0][0][0][0]
      encoder         : Lavc61.19.100 libx264
  Stream #0:1(und): Audio: aac (LC) (mp4a / 0x6134706D), 44100 Hz, stereo, fltp, 128 kb/s (default)
    Metadata:
      handler_name    : SoundHandler
      vendor_id       : [0][0][0][0]
Stream mapping:
  Stream #0:1 -> #0:0 (aac (native) -> aac (native))
Press [q] to stop, [?] for help
Output #0, mp4, to 'sixth_speaker_v3_speaker_F_output.mp4':
  Metadata:
    major_brand     : isom
    minor_version   : 512
    compatible_brands: is

Created file: sixth_speaker_v3_speaker_F_output.mp4

All speaker files have been created!


size=      92kB time=00:00:27.98 bitrate=  27.1kbits/s speed= 141x    
video:0kB audio:87kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 6.347725%
[aac @ 0x5ecb3c954880] Qavg: 53967.988


In [None]:
# OK, how can we now improve the results so that two different speakers are not detected as a single speaker?