# Installing Dependencies

In [None]:
!pip install pyAudioAnalysis
!pip install hmmlearn
!pip install eyed3
!pip install pydub
!pip install simplejson

Collecting hmmlearn
  Downloading hmmlearn-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.1/161.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: hmmlearn
Successfully installed hmmlearn-0.3.2


# Speaker Diarization using pyAudioAnalysis

### Imports

In [None]:
from pyAudioAnalysis import audioSegmentation as aS
import matplotlib.pyplot as plt
import numpy as np
import wave
import json
import bisect


### Utility Functions

In [None]:
def segments_to_json(speaker_array, output_file, segment_duration=0.1):
    """Collapse per-frame speaker labels into segments and write them as JSON.

    Consecutive frames carrying the same label are merged into one segment.
    Frame ``i`` is taken to cover
    ``[i * segment_duration, (i + 1) * segment_duration)``, so a segment ends
    exactly where the first frame of the next speaker begins.

    Args:
        speaker_array: Sequence of per-frame speaker labels; each label must
            be convertible with ``int``.
        output_file: Path of the JSON file to write.
        segment_duration: Duration of one frame, in seconds.

    Returns:
        A list of ``{"start", "end", "speaker"}`` dicts with times rounded to
        one decimal place. The list is empty when ``speaker_array`` is empty
        (an empty JSON list is still written).
    """
    def make_segment(first_frame, stop_frame, speaker):
        # Times are derived from frame indices instead of being accumulated,
        # so floating-point error does not build up over long inputs.
        return {
            "start": round(first_frame * segment_duration, 1),
            "end": round(stop_frame * segment_duration, 1),
            "speaker": int(speaker)
        }

    segments = []
    n_frames = len(speaker_array)

    if n_frames > 0:
        run_start = 0
        current_speaker = speaker_array[0]

        for i, speaker in enumerate(speaker_array):
            if speaker != current_speaker:
                # Frame i is the first frame of the new speaker, so the
                # previous run ends at i * segment_duration (not i + 1).
                segments.append(make_segment(run_start, i, current_speaker))
                run_start = i
                current_speaker = speaker

        # Close the final run at the end of the array.
        segments.append(make_segment(run_start, n_frames, current_speaker))

    with open(output_file, 'w') as f:
        json.dump(segments, f, indent=4)

    return segments

def load_segments(json_file):
    """Read ``json_file`` and return its parsed JSON content."""
    with open(json_file, 'r') as handle:
        return json.load(handle)

def segments_to_intervals(segments, duration=0.1):
    """Expand segments into fixed-step ``(time, speaker)`` samples.

    Each segment contributes one sample every ``duration`` seconds, starting
    at its ``start`` and stopping before its ``end``. Sample times are
    computed as ``start + k * duration`` and rounded, rather than accumulated
    with repeated addition, so that floating-point drift neither adds a
    spurious extra sample nor produces times such as 0.30000000000000004 that
    fail to match the same instant computed from another segment list.

    Args:
        segments: Iterable of dicts with ``'start'``, ``'end'`` and
            ``'speaker'`` keys.
        duration: Sampling step in seconds; must be positive.

    Returns:
        A list of ``(time, speaker)`` tuples in segment order.

    Raises:
        ValueError: If ``duration`` is not positive (the sampling loop could
            never terminate).
    """
    if duration <= 0:
        raise ValueError("duration must be positive")

    intervals = []
    for segment in segments:
        start = segment['start']
        end = segment['end']
        speaker = segment['speaker']

        # Number of samples = ceil((end - start) / duration), with a small
        # tolerance so float noise just above an integer does not add a step.
        steps = (end - start) / duration
        count = int(steps)
        if steps - count > 1e-9:
            count += 1

        for k in range(count):
            # Rounding strips representation noise without affecting any
            # realistic time resolution.
            intervals.append((round(start + k * duration, 10), speaker))
    return intervals

def get_nearest_smaller_or_equal_time(times, time):
    """Return the last entry of ``times`` that is less than or equal to ``time``.

    ``times`` is expected to be sorted in ascending order, as required by
    ``bisect``. Returns ``None`` when no entry is <= ``time``.
    """
    insertion_point = bisect.bisect_right(times, time)
    # An insertion point of 0 means nothing lies at or before `time`.
    return times[insertion_point - 1] if insertion_point else None

def align_intervals(ref_intervals, hyp_intervals, verbose=False):
    """Align reference and hypothesis samples on the union of their times.

    For every distinct time found in either input, the label taken from each
    side is the one attached to its nearest sample at or before that time.
    When a side has no sample at or before the time, the placeholder string
    ``'none'`` is used.

    Args:
        ref_intervals: List of ``(time, speaker)`` tuples for the reference.
        hyp_intervals: List of ``(time, speaker)`` tuples for the hypothesis.
        verbose: When true, print the aligned label lists and the fraction of
            matching positions. Off by default because the lists hold one
            entry per sample and flood the cell output.

    Returns:
        A pair ``(aligned_ref, aligned_hyp)`` of NumPy arrays of equal length.
        If any placeholder was inserted, both arrays use ``dtype=object`` so
        the original labels are not coerced to strings and an element-wise
        comparison of the two arrays stays meaningful.
    """
    ref_times = sorted(t for t, _ in ref_intervals)
    hyp_times = sorted(t for t, _ in hyp_intervals)
    ref_dict = {t: s for t, s in ref_intervals}
    hyp_dict = {t: s for t, s in hyp_intervals}

    all_times = sorted(set(ref_times) | set(hyp_times))

    aligned_ref = []
    aligned_hyp = []
    placeholder_used = False

    for time in all_times:
        nearest_ref_time = get_nearest_smaller_or_equal_time(ref_times, time)
        nearest_hyp_time = get_nearest_smaller_or_equal_time(hyp_times, time)

        if nearest_ref_time is None or nearest_hyp_time is None:
            placeholder_used = True

        aligned_ref.append(ref_dict.get(nearest_ref_time, 'none'))
        aligned_hyp.append(hyp_dict.get(nearest_hyp_time, 'none'))

    if verbose:
        print(aligned_ref)
        print(aligned_hyp)
        # Skip the ratio on empty input instead of dividing by zero.
        if aligned_ref:
            matches = sum(1 for r, h in zip(aligned_ref, aligned_hyp) if r == h)
            print(matches / len(aligned_ref))

    # Mixing 'none' with numeric labels would make np.array coerce everything
    # to strings; object dtype keeps each label as-is.
    dtype = object if placeholder_used else None
    return np.array(aligned_ref, dtype=dtype), np.array(aligned_hyp, dtype=dtype)

def calculate_der(ref_file, hyp_file, duration=0.1):
    """Compute the fraction of aligned samples whose speaker labels differ.

    Both JSON files are loaded, expanded into ``(time, speaker)`` samples at
    step ``duration``, aligned on the union of their sample times, and
    compared position by position.

    NOTE(review): labels are compared verbatim. No mapping between reference
    speaker ids and hypothesis cluster ids is searched for, so the result
    depends on how the hypothesis happens to number its speakers. Confirm
    this is intended before reporting the value as a diarization error rate.

    Args:
        ref_file: Path to the reference segments JSON file.
        hyp_file: Path to the hypothesis segments JSON file.
        duration: Sampling step in seconds used to expand the segments.

    Returns:
        The number of mismatching positions divided by the number of aligned
        positions.

    Raises:
        ValueError: If neither file yields any sample, in which case the
            ratio is undefined.
    """
    ref_segments = load_segments(ref_file)
    hyp_segments = load_segments(hyp_file)

    # Force the first reference segment to begin at time 0 (presumably so the
    # reference covers the start of the recording; verify against the data).
    # Guarded so an empty reference file does not raise IndexError.
    if ref_segments:
        ref_segments[0]["start"] = 0

    ref_intervals = segments_to_intervals(ref_segments, duration)
    hyp_intervals = segments_to_intervals(hyp_segments, duration)

    if not ref_intervals and not hyp_intervals:
        raise ValueError(
            "no intervals to compare: both reference and hypothesis are empty"
        )

    ref_labels, hyp_labels = align_intervals(ref_intervals, hyp_intervals)

    speaker_errors = np.sum(ref_labels != hyp_labels)
    total_intervals = len(ref_labels)

    der = speaker_errors / total_intervals
    return der

### Driver Code

In [None]:
# Input audio, the file passed to calculate_der as the reference, and the
# path where the hypothesis segments are written.
wav_file = 'audio_sample_20.wav'
ref_file = 'audio_sample_20.json'
hyp_file = 'diarization.json'
# Only the first of the three returned values is used; it is handed to
# segments_to_json as the per-frame speaker array.
# NOTE(review): unpacking three values assumes a pyAudioAnalysis version whose
# speaker_diarization returns a 3-tuple — confirm against the installed one.
segments, _, _ = aS.speaker_diarization(wav_file, n_speakers=2)
# Writes the hypothesis segments to hyp_file; the returned list is not used.
segments_to_json(segments, hyp_file)
der = calculate_der(ref_file, hyp_file)

print(f'Diarization Error Rate (DER): {der:.2%}')

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 