In [None]:
from pyannote.audio import Pipeline
import torch
import json
import soundfile as sf
import os
from pathlib import Path

# Create output directory for audio segments
output_dir = "speaker_segments"
os.makedirs(output_dir, exist_ok=True)

# Get the segments first
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token="")

pipeline.to(torch.device("cpu"))

# Load the full audio file
audio, sample_rate = sf.read("audio.wav")

# Get diarization results
diarization = pipeline("audio.wav")

# Convert segments to a simple format and save audio segments
segments = []
for i, (turn, _, speaker) in enumerate(diarization.itertracks(yield_label=True)):
    # Calculate start and end samples
    start_sample = int(turn.start * sample_rate)
    end_sample = int(turn.end * sample_rate)
    
    # Extract audio segment
    segment_audio = audio[start_sample:end_sample]
    
    # Create filename for this segment
    segment_filename = f"segment_{i:03d}_{turn.start:.2f}_{turn.end:.2f}.wav"
    segment_path = os.path.join(output_dir, segment_filename)
    
    # Save audio segment
    sf.write(segment_path, segment_audio, sample_rate)
    
    # Add to segments list
    segments.append({
        'start': float(turn.start),
        'end': float(turn.end),
        'speaker': speaker,
        'audio_file': segment_path,
        'segment_id': i,
        'is_overlapping': False
    })

# Save to JSON file
with open('segments.json', 'w') as f:
    json.dump(segments, f, indent=2)

print(f"Segments saved to segments.json")
print(f"Audio segments saved in {output_dir}/")
print("\nExample of how to review and assign speakers:")
print("1. Open segments.json to see all segments")
print("2. Listen to each segment using the audio_file path")
print("3. Edit the 'speaker' field in segments.json to assign the correct speaker")

  std = sequences.std(dim=-1, correction=1)


Segments saved to segments.json
Audio segments saved in speaker_segments/

Example of how to review and assign speakers:
1. Open segments.json to see all segments
2. Listen to each segment using the audio_file path
3. Edit the 'speaker' field in segments.json to assign the correct speaker


In [18]:
import os
import librosa
import soundfile as sf

# Directory containing the audio segments
input_dir = "speaker_segments"
target_sr = 16000

print(f"Starting conversion for files in '{input_dir}'...")

# Iterate through all files in the directory
for filename in os.listdir(input_dir):
    # Check if the file is a WAV file
    if filename.endswith(".wav"):
        file_path = os.path.join(input_dir, filename)
        
        try:
            # Load the audio file, preserving the original sample rate
            audio, sr = librosa.load(file_path, sr=None)
            
            # Check if resampling is needed
            if sr != target_sr:
                print(f"Converting '{filename}' from {sr} Hz to {target_sr} Hz...")
                # Resample the audio to 16 kHz
                audio_16k = librosa.resample(y=audio, orig_sr=sr, target_sr=target_sr)
                
                # Overwrite the original file with the resampled audio
                sf.write(file_path, audio_16k, target_sr)
                print(f"Successfully converted and saved '{filename}'.")
            else:
                print(f"'{filename}' is already at {target_sr} Hz. Skipping.")
                
        except Exception as e:
            print(f"Error processing '{filename}': {e}")

print(f"\nConversion process finished for folder '{input_dir}'.")

Starting conversion for files in 'speaker_segments'...
Converting 'segment_009_9.67_12.40.wav' from 44100 Hz to 16000 Hz...
Successfully converted and saved 'segment_009_9.67_12.40.wav'.
Converting 'segment_013_17.68_17.70.wav' from 44100 Hz to 16000 Hz...
Successfully converted and saved 'segment_013_17.68_17.70.wav'.
Converting 'segment_014_19.03_19.07.wav' from 44100 Hz to 16000 Hz...
Successfully converted and saved 'segment_014_19.03_19.07.wav'.
Converting 'segment_010_13.06_19.03.wav' from 44100 Hz to 16000 Hz...
Successfully converted and saved 'segment_010_13.06_19.03.wav'.
Converting 'segment_007_6.61_7.84.wav' from 44100 Hz to 16000 Hz...
Successfully converted and saved 'segment_007_6.61_7.84.wav'.
Converting 'segment_012_17.40_17.68.wav' from 44100 Hz to 16000 Hz...
Successfully converted and saved 'segment_012_17.40_17.68.wav'.
Converting 'segment_006_6.36_9.19.wav' from 44100 Hz to 16000 Hz...
Successfully converted and saved 'segment_006_6.36_9.19.wav'.
Converting 'segme

In [None]:
#todo need to make speaker separation programmatically possible for now will just use https://modelscope.cn/studios/iic/ClearerVoice-Studio

In [None]:
"""
now before assigning the labels to speakers i need to make sure that we dont have audio files where two people speak at the same time

so what i need to do is to go to each audio file and check if two people are speaking at the same time

then i need to mark this files again - i need to iterate over all of the files and then make the 

1. generate segments.json
2. iterate over each single audio file and check if overlapping speech and set is_overlapping to true
3. run each file through speech separation model where overlap is true
3. the result needs to be saved in a folder "separated-audio" 
4. then go again to each segment in the file and duplicate for each segment where overlap is true the entry and set speaker and path correctly


"""

In [10]:
from pyannote.audio import Pipeline
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token="")

# send pipeline to CPU instead of GPU
import torch
pipeline.to(torch.device("cpu"))

# apply pretrained pipeline
diarization = pipeline("audio.wav")

# print the result  
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}")

  std = sequences.std(dim=-1, correction=1)


start=0.4s stop=0.5s speaker_SPEAKER_02
start=0.5s stop=3.0s speaker_SPEAKER_01
start=3.1s stop=3.4s speaker_SPEAKER_02
start=3.9s stop=6.6s speaker_SPEAKER_02
start=6.3s stop=9.2s speaker_SPEAKER_01
start=6.6s stop=7.8s speaker_SPEAKER_03
start=9.6s stop=9.7s speaker_SPEAKER_02
start=9.7s stop=12.4s speaker_SPEAKER_03
start=13.1s stop=19.0s speaker_SPEAKER_03
start=17.4s stop=17.7s speaker_SPEAKER_00
start=19.0s stop=28.0s speaker_SPEAKER_00


In [11]:
import subprocess
from collections import defaultdict
import os

# Specify your output directory
output_dir = "separated_speakers"

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Group segments by speaker
speaker_segments = defaultdict(list)
for turn, _, speaker in diarization.itertracks(yield_label=True):
    speaker_segments[speaker].append({
        'start': turn.start,
        'end': turn.end
    })

input_file = "audio.wav"  # Your input audio file

# Process each speaker
for speaker, segments in speaker_segments.items():
    # Create the volume filter expression
    volume_expr = []
    for segment in segments:
        volume_expr.append(f"between(t,{segment['start']},{segment['end']})")

    # Combine all segments with OR operator (+)
    filter_expression = f"volume=enable='{'+'.join(volume_expr)}':volume=1,volume=enable='not({'+'.join(volume_expr)})':volume=0"

    # Create output file path in the specified directory
    output_file = os.path.join(output_dir, f"speaker_{speaker}.wav")

    cmd = [
        "ffmpeg", "-i", input_file,
        "-af", filter_expression,
        "-c:a", "pcm_s16le",  # Use WAV format
        "-vn",  # Remove video stream
        output_file
    ]

    print(f"\nProcessing {speaker}...")
    print(f"Using filter: {filter_expression}")
    try:
        subprocess.run(cmd, check=True)
        print(f"Created file: {output_file}")
    except subprocess.CalledProcessError as e:
        print(f"Error processing {speaker}: {e}")

print(f"\nAll speaker files have been created in {output_dir}!")


Processing SPEAKER_02...
Using filter: volume=enable='between(t,0.41909375000000004,0.4697187500000001)+between(t,3.10221875,3.4397187500000004)+between(t,3.87846875,6.61221875)+between(t,9.649718750000002,9.66659375)':volume=1,volume=enable='not(between(t,0.41909375000000004,0.4697187500000001)+between(t,3.10221875,3.4397187500000004)+between(t,3.87846875,6.61221875)+between(t,9.649718750000002,9.66659375))':volume=0
Created file: separated_speakers/speaker_SPEAKER_02.wav

Processing SPEAKER_01...
Using filter: volume=enable='between(t,0.4697187500000001,2.95034375)+between(t,6.257843750000001,9.19409375)':volume=1,volume=enable='not(between(t,0.4697187500000001,2.95034375)+between(t,6.257843750000001,9.19409375))':volume=0


ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.6)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1.1_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex

Created file: separated_speakers/speaker_SPEAKER_01.wav

Processing SPEAKER_03...
Using filter: volume=enable='between(t,6.61221875,7.844093750000001)+between(t,9.66659375,12.400343750000001)+between(t,13.058468750000003,18.99846875)':volume=1,volume=enable='not(between(t,6.61221875,7.844093750000001)+between(t,9.66659375,12.400343750000001)+between(t,13.058468750000003,18.99846875))':volume=0
Created file: separated_speakers/speaker_SPEAKER_03.wav

Processing SPEAKER_00...
Using filter: volume=enable='between(t,17.36159375,17.69909375)+between(t,18.99846875,27.992843750000002)':volume=1,volume=enable='not(between(t,17.36159375,17.69909375)+between(t,18.99846875,27.992843750000002))':volume=0
Created file: separated_speakers/speaker_SPEAKER_00.wav

All speaker files have been created in separated_speakers!


ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.6)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1.1_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex