## PIP Install (Only Run if not installed)

In [1]:
# %pip install editdistance
# %pip install datasets
# %pip install lightning
# %pip install webdataset
# %pip install jiwer
# %pip install einops
# %pip install lhotse
# %pip install transformers
# %pip install sentencepiece
# %pip install librosa
# %pip install pyannote.audio
# %pip install braceexpand

## Load Libreries

In [1]:
import numpy as np
import pandas as pd
import torch
import soundfile as sf
import nemo 

import soundfile as sf
from tqdm import tqdm
from IPython.display import Audio
from pyannote.core import Annotation, Segment
from pyannote.metrics.diarization import DiarizationErrorRate
import re




print("NumPy version:", np.__version__)
print("Torch version:", torch.__version__)
print("NeMo version:", nemo.__version__)


NumPy version: 1.26.3
Torch version: 2.3.1+cu118
NeMo version: 2.3.0


## Read Input CSV File

In [2]:
# # Read csv file
df = pd.read_csv('/home/kelechi/Dialect-Classification/Diarization results/assemblyai_diarization_der_0.1272_30.csv')

input_csv = pd.read_csv('/home/kelechi/Dialect-Classification/data/dir_dataset/afrispeech_dialog_v1_47.csv')

# Modify 'path' column to include the full path to remove data/ and change to data/dir_dataset/
input_csv['path'] = input_csv['path'].apply(lambda x: x.replace('data/', 'data/dir_dataset/'))


#Add a column 'keep' if audio_id in df['audio_id'] exists in input_csv['audio_id']
input_csv['keep'] = input_csv['audio_id'].isin(df['audio_id'])

# Select only rows where 'keep' is True
input_csv = input_csv[input_csv['keep']]


# # select only row index 0 and 16, 17 18 and 19
# input_csv = input_csv.iloc[[0, 1, 16, 17, 18, 19]]



## Load dirarization model

In [3]:
from nemo.collections.asr.models import SortformerEncLabelModel

# load model from Hugging Face model card directly (You need a Hugging Face token)
diar_model = SortformerEncLabelModel.from_pretrained("nvidia/diar_sortformer_4spk-v1")


# switch to inference mode
diar_model.eval()

# Clear CUDA cache
torch.cuda.empty_cache()


  from .autonotebook import tqdm as notebook_tqdm
[NeMo W 2025-05-27 16:01:59 modelPT:180] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    sample_rate: 16000
    num_spks: 4
    session_len_sec: 90
    soft_label_thres: 0.5
    soft_targets: false
    labels: null
    batch_size: 4
    shuffle: true
    num_workers: 18
    validation_mode: false
    use_lhotse: false
    use_bucketing: false
    num_buckets: 10
    bucket_duration_bins:
    - 10
    - 20
    - 30
    - 40
    - 50
    - 60
    - 70
    - 80
    - 90
    pin_memory: true
    min_duration: 80
    max_duration: 90
    batch_duration: 400
    quadratic_duration: 1200
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    window_stride: 0.01
    subsampling_factor: 8
    
[NeMo W 2025-05-27 16:01:59 modelPT:187] If you intend to do validation, 

[NeMo I 2025-05-27 16:01:59 features:305] PADDING: 16
[NeMo I 2025-05-27 16:02:00 save_restore_connector:275] Model SortformerEncLabelModel was successfully restored from /home/kelechi/.cache/huggingface/hub/models--nvidia--diar_sortformer_4spk-v1/snapshots/4cb5954e59a1a6527e6ec061a0568b61efa8babd/diar_sortformer_4spk-v1.nemo.


## Load Audio Files for Processing

In [4]:
# Loop through all rows in input_csv and read each audio file
audio_signals = []
sample_rates = []

for path in tqdm(input_csv['path'], desc="Reading audio files"):
    audio_signal, sample_rate = sf.read(path)
    audio_signals.append(audio_signal)
    sample_rates.append(sample_rate)


Reading audio files: 100%|██████████| 30/30 [00:03<00:00,  9.26it/s]


## View/Play Audio

In [None]:
for path in input_csv['path_']:
    display(Audio(path))


## Process Audio as Mono Audio File

In [5]:
# Loop through each audio signal in audio_signals and save as mono WAV
for i, audio_signal in enumerate(audio_signals):
    # Convert to mono if stereo
    if len(audio_signal.shape) == 2:
        mono_audio = np.mean(audio_signal, axis=1)
    else:
        mono_audio = audio_signal
    # Save as mono WAV with unique filename
    out_path = f"mono_audio_{i}.wav"
    sf.write(out_path, mono_audio, sample_rates[i])


## Run Dirazation on Model

In [6]:
# Ensure the column exists before assignment
input_csv['pred_segments'] = None

for i in range(len(audio_signals)):
    audio_path = f"mono_audio_{i}.wav"
    try:
        predicted_segments = diar_model.diarize(audio=audio_path, batch_size=1)
    except RuntimeError as e:
        if "CUDA out of memory" in str(e):
            print(f"CUDA OOM on file {audio_path}, retrying on CPU...")
            diar_model = diar_model.to('cpu')
            predicted_segments = diar_model.diarize(audio=audio_path, batch_size=1)
            diar_model = diar_model.to('cuda')
        else:
            raise
    # If predicted_segments is a list of lists, extract the first element
    if isinstance(predicted_segments, list) and len(predicted_segments) > 0 and isinstance(predicted_segments[0], list):
        input_csv.at[input_csv.index[i], 'pred_segments'] = predicted_segments[0]
    else:
        input_csv.at[input_csv.index[i], 'pred_segments'] = predicted_segments
    torch.cuda.empty_cache()  # Optional: clear CUDA cache after each file


[NeMo I 2025-05-27 16:02:20 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.


Diarizing:   0%|          | 0/1 [00:00<?, ?it/s]

Diarizing: 100%|██████████| 1/1 [00:01<00:00,  1.04s/it]

[NeMo I 2025-05-27 16:02:21 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:00<00:00,  2.23it/s]

[NeMo I 2025-05-27 16:02:22 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:00<00:00,  2.03it/s]

[NeMo I 2025-05-27 16:02:22 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:00<00:00,  1.24it/s]

[NeMo I 2025-05-27 16:02:23 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:00<00:00,  1.15it/s]

[NeMo I 2025-05-27 16:02:24 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:00<00:00,  2.53it/s]

[NeMo I 2025-05-27 16:02:24 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:00<00:00,  1.23it/s]

[NeMo I 2025-05-27 16:02:25 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:01<00:00,  1.21s/it]

[NeMo I 2025-05-27 16:02:26 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing:   0%|          | 0/1 [00:00<?, ?it/s]


CUDA OOM on file mono_audio_8.wav, retrying on CPU...
[NeMo I 2025-05-27 16:02:27 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.


Diarizing: 100%|██████████| 1/1 [01:40<00:00, 100.12s/it]

[NeMo I 2025-05-27 16:04:07 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing:   0%|          | 0/1 [00:00<?, ?it/s]


CUDA OOM on file mono_audio_9.wav, retrying on CPU...
[NeMo I 2025-05-27 16:04:08 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.


Diarizing: 100%|██████████| 1/1 [02:57<00:00, 177.55s/it]

[NeMo I 2025-05-27 16:07:06 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing:   0%|          | 0/1 [00:00<?, ?it/s]


CUDA OOM on file mono_audio_10.wav, retrying on CPU...
[NeMo I 2025-05-27 16:07:06 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.


Diarizing: 100%|██████████| 1/1 [01:53<00:00, 113.12s/it]

[NeMo I 2025-05-27 16:09:00 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:01<00:00,  1.82s/it]

[NeMo I 2025-05-27 16:09:01 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:00<00:00,  1.05it/s]

[NeMo I 2025-05-27 16:09:02 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing:   0%|          | 0/1 [00:00<?, ?it/s]


CUDA OOM on file mono_audio_13.wav, retrying on CPU...
[NeMo I 2025-05-27 16:09:03 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.


Diarizing: 100%|██████████| 1/1 [01:52<00:00, 112.83s/it]

[NeMo I 2025-05-27 16:10:56 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:01<00:00,  1.03s/it]

[NeMo I 2025-05-27 16:10:57 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:01<00:00,  1.46s/it]

[NeMo I 2025-05-27 16:10:59 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:02<00:00,  2.07s/it]

[NeMo I 2025-05-27 16:11:01 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:01<00:00,  1.93s/it]

[NeMo I 2025-05-27 16:11:03 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:01<00:00,  1.38s/it]

[NeMo I 2025-05-27 16:11:04 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:01<00:00,  1.30s/it]

[NeMo I 2025-05-27 16:11:05 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:00<00:00,  1.12it/s]

[NeMo I 2025-05-27 16:11:06 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing:   0%|          | 0/1 [00:00<?, ?it/s]


CUDA OOM on file mono_audio_21.wav, retrying on CPU...
[NeMo I 2025-05-27 16:11:07 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.


Diarizing: 100%|██████████| 1/1 [01:37<00:00, 97.66s/it]

[NeMo I 2025-05-27 16:12:44 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:02<00:00,  2.24s/it]

[NeMo I 2025-05-27 16:12:47 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:00<00:00,  1.13it/s]

[NeMo I 2025-05-27 16:12:48 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:01<00:00,  1.05s/it]

[NeMo I 2025-05-27 16:12:49 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing:   0%|          | 0/1 [00:00<?, ?it/s]


CUDA OOM on file mono_audio_25.wav, retrying on CPU...
[NeMo I 2025-05-27 16:12:49 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.


Diarizing: 100%|██████████| 1/1 [01:26<00:00, 86.25s/it]

[NeMo I 2025-05-27 16:14:16 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:00<00:00,  1.32it/s]

[NeMo I 2025-05-27 16:14:16 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s]

[NeMo I 2025-05-27 16:14:17 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing: 100%|██████████| 1/1 [00:01<00:00,  1.17s/it]

[NeMo I 2025-05-27 16:14:19 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.



Diarizing:   0%|          | 0/1 [00:00<?, ?it/s]


CUDA OOM on file mono_audio_29.wav, retrying on CPU...
[NeMo I 2025-05-27 16:14:19 vad_utils:81] No postprocessing YAML file has been provided. Default postprocessing configurations will be applied.


Diarizing: 100%|██████████| 1/1 [01:26<00:00, 86.28s/it]


## Reformart Result

In [10]:
for i, row in input_csv.iterrows():
    segments_list = row['pred_segments'] if isinstance(row['pred_segments'], list) and len(row['pred_segments']) > 0 else []
    formatted_segments = []
    for seg in segments_list:
        parts = seg.split()
        if len(parts) >= 3:
            formatted_segments.append((float(parts[0]), float(parts[1]), parts[2]))
    formatted_segments.sort(key=lambda x: x[0])
    input_csv.at[i, 'pred_segments'] = formatted_segments

In [11]:
# input_csv['pred_segments'] = ''
# for i, path_ in tqdm(enumerate(input_csv['path']), total=len(input_csv['path']), desc='Processing'):
#   # res = transcribe(path_)
#   # pred_segments = extract_segments(input_csv['results']['channels'][0]['alternatives'][0]['paragraphs']['paragraphs'])
#   input_csv.at[i, 'pred_segments'] = pred_segments

## Convert Time to Seconds

In [12]:
def convert_time_to_seconds(timestamp):
    # Split the timestamp into minutes, seconds, and milliseconds
    minutes, seconds, milliseconds = map(float, timestamp.split(':'))
    # Convert the time to seconds (including fractional part from milliseconds)
    total_seconds = minutes * 60 + seconds + milliseconds / 1000
    return total_seconds


def extract_segments(transcript):
    # Regular expression to match the timestamp and speaker tag
    timestamp_pattern = r'(\d{2}:\d{2}:\d{2})'
    speaker_pattern = r'\[([^\]]+)\]'

    lines = transcript.strip().splitlines()
    segments = []

    start_time = None
    speaker_tag = None

    for i in range(len(lines)):
        if re.match(timestamp_pattern, lines[i]):  # Line is a timestamp
            if start_time and speaker_tag:
                # If we have both start and speaker, the current timestamp is the end time
                end_time = convert_time_to_seconds(lines[i])
                segments.append((start_time, end_time, speaker_tag))
                start_time = None
                speaker_tag = None
            # Set the new start time, converting to seconds
            start_time = convert_time_to_seconds(lines[i])
        elif re.match(speaker_pattern, lines[i]):  # Line contains a speaker tag
            speaker_tag = re.findall(speaker_pattern, lines[i])[0]

    return segments

## Obtain Ref_segment from Transcript

In [14]:
#ensure new line before speaker tags
input_csv['transcript'] = input_csv['transcript'].apply(lambda x: str(x).replace('[', '\r\n['))
input_csv['ref_segments'] = input_csv['transcript'].apply(lambda x: extract_segments(x))

# Save the DataFrame to a CSV file
input_csv.to_csv('/home/kelechi/Dialect-Classification/Diarization results/nemo_diarization.csv', index=False)

## DER Matrics

In [15]:
def create_pyannote_annotation(segments_list):
    annotation = Annotation()
    for start, end, speaker_tag in segments_list:
        segment = Segment(start, end)
        annotation[segment] = speaker_tag
    return annotation

der_metric = DiarizationErrorRate()

## Display Result

In [16]:
for i, text in tqdm(enumerate(input_csv['transcript']), total=len(input_csv['transcript']), desc="Processing"):
    ref_annotation = create_pyannote_annotation(input_csv.iloc[i]['ref_segments'])
    pred_annotation = create_pyannote_annotation(input_csv.iloc[i]['pred_segments'])
    der = der_metric(ref_annotation, pred_annotation)
    print(f"DER: {100 * der:.2f}%")
#get abs value for whole dataset
ds_der = abs(der_metric)
print(f"Absolute DER for dataset: {100 * ds_der:.2f}%")

    
Processing:  13%|█▎        | 4/30 [00:00<00:00, 35.54it/s]

DER: 38.83%
DER: 34.85%
DER: 39.28%
DER: 48.34%
DER: 30.83%
DER: 42.99%
DER: 27.56%
DER: 35.62%


Processing:  43%|████▎     | 13/30 [00:00<00:00, 26.87it/s]

DER: 48.36%
DER: 25.83%
DER: 29.42%
DER: 54.14%
DER: 11.00%


Processing:  53%|█████▎    | 16/30 [00:00<00:00, 21.56it/s]

DER: 13.42%
DER: 22.93%
DER: 38.98%
DER: 12.79%


Processing:  73%|███████▎  | 22/30 [00:00<00:00, 28.10it/s]

DER: 21.74%
DER: 17.12%
DER: 20.03%
DER: 7.44%
DER: 38.34%
DER: 15.51%
DER: 13.46%
DER: 33.68%
DER: 4.30%
DER: 21.15%
DER: 38.24%
DER: 11.16%


Processing: 100%|██████████| 30/30 [00:00<00:00, 33.24it/s]

DER: 51.83%
Absolute DER for dataset: 26.82%



