In [7]:
import torch
import whisperx

In [2]:
# Run configuration shared by every cell below.
# `device` resolves to "cuda" when torch can see a GPU and to "cpu" otherwise.
params = dict(
    vocal_target="/Users/lucasjackson/Workspace/tensorlake/test-files/podcast-short.mp3",
    language=None,
    batch_size=0,
    model="distil-medium.en",
    supress_numerals=False,
    device=("cuda" if torch.cuda.is_available() else "cpu"),
)

# Get whisper results and language

In [4]:
# Per-device value forwarded to the transcription helper, keyed by params["device"]
# (presumably the model compute/precision type -- confirm against transcription_helpers).
mtypes = {"cpu": "int8", "cuda": "float16"}

if params.get("batch_size") != 0:
    # Non-zero batch size: use the batched helper, which additionally receives
    # batch_size as its third positional argument.
    from transcription_helpers import transcribe_batched

    whisper_results, language = transcribe_batched(
        params.get("vocal_target"),
        params.get("language"),
        params.get("batch_size"),
        params.get("model"),
        mtypes[params.get("device")],
        params.get("supress_numerals"),
        params.get("device"),
    )
else:
    # batch_size == 0: plain (non-batched) transcription.
    from transcription_helpers import transcribe

    whisper_results, language = transcribe(
        params.get("vocal_target"),
        params.get("language"),
        params.get("model"),
        mtypes[params.get("device")],
        params.get("supress_numerals"),
        params.get("device"),
    )

torchvision is not available - cannot save figures
[NeMo W 2024-03-22 13:34:20 nemo_logging:393] Could not import NeMo NLP collection which is required for speech translation model.


# word alignment to timestamps

In [12]:
from helpers import wav2vec2_langs, filter_missing_timestamps

In [13]:
if language not in wav2vec2_langs:
    # No alignment model for this language: fall back to the word timings
    # already present on each segment of whisper_results.
    # TODO: add a better check for word timestamps existence
    assert params.get("batch_size") == 0, (
        f"Unsupported language: {language}, use --batch_size to 0"
        " to generate word timestamps using whisper directly and fix this error."
    )
    # Each entry of segment["words"] is indexed positionally:
    # [0] -> start, [1] -> end, [2] -> word text.
    word_timestamps = [
        {"word": word[2], "start": word[0], "end": word[1]}
        for segment in whisper_results
        for word in segment["words"]
    ]
else:
    # Forced alignment through whisperx for languages listed in wav2vec2_langs.
    target_device = params.get("device")
    alignment_model, metadata = whisperx.load_align_model(
        language_code=language,
        device=target_device,
    )
    result_aligned = whisperx.align(
        whisper_results,
        alignment_model,
        metadata,
        params.get("vocal_target"),
        target_device,
    )
    # The first segment's start and the last segment's end are handed to the
    # helper as the outer bounds for words that came back without timestamps.
    word_timestamps = filter_missing_timestamps(
        result_aligned["word_segments"],
        initial_timestamp=whisper_results[0].get("start"),
        final_timestamp=whisper_results[-1].get("end"),
    )
    # Drop the alignment model and release cached GPU memory.
    del alignment_model
    torch.cuda.empty_cache()


# NeMo process for speaker diarization

In [20]:
from pydub import AudioSegment
import os
from nemo.collections.asr.models.msdd_models import NeuralDiarizer
from helpers import create_config

# Downmix the input to a single channel for NeMo compatibility.
sound = AudioSegment.from_file(
    params.get("vocal_target")
).set_channels(1)

# All intermediate artifacts live under ./temp_outputs of the current working directory.
ROOT = os.getcwd()
temp_path = os.path.join(ROOT, "temp_outputs")
os.makedirs(temp_path, exist_ok=True)

mono_wav_path = os.path.join(temp_path, "mono_file.wav")
sound.export(mono_wav_path, format="wav")

# Build the NeMo MSDD diarizer from the helper-generated config, move it to the
# configured device and run diarization.
diarizer_cfg = create_config(temp_path)
msdd_model = NeuralDiarizer(cfg=diarizer_cfg).to(params.get("device"))
msdd_model.diarize()

# Release the model and any cached GPU memory.
del msdd_model
torch.cuda.empty_cache()

[NeMo I 2024-03-22 13:39:34 nemo_logging:381] Loading pretrained diar_msdd_telephonic model from NGC
[NeMo I 2024-03-22 13:39:34 nemo_logging:381] Found existing object /Users/lucasjackson/.cache/torch/NeMo/NeMo_1.21.0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2024-03-22 13:39:34 nemo_logging:381] Re-using file from: /Users/lucasjackson/.cache/torch/NeMo/NeMo_1.21.0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo
[NeMo I 2024-03-22 13:39:34 nemo_logging:381] Instantiating model from pre-trained checkpoint


[NeMo W 2024-03-22 13:39:35 nemo_logging:393] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: true
    
[NeMo W 2024-03-22 13:39:35 nemo_logging:393] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: false
    
[NeMo W 2024-03-22 13:39:35 nemo_logging:393] Please call the ModelPT.setup_test_data() or ModelPT

[NeMo I 2024-03-22 13:39:35 nemo_logging:381] PADDING: 16
[NeMo I 2024-03-22 13:39:35 nemo_logging:381] PADDING: 16
[NeMo I 2024-03-22 13:39:35 nemo_logging:381] Model EncDecDiarLabelModel was successfully restored from /Users/lucasjackson/.cache/torch/NeMo/NeMo_1.21.0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2024-03-22 13:39:35 nemo_logging:381] PADDING: 16
[NeMo I 2024-03-22 13:39:35 nemo_logging:381] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2024-03-22 13:39:35 nemo_logging:381] Found existing object /Users/lucasjackson/.cache/torch/NeMo/NeMo_1.21.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2024-03-22 13:39:35 nemo_logging:381] Re-using file from: /Users/lucasjackson/.cache/torch/NeMo/NeMo_1.21.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo
[NeMo I 2024-03-22 13:39:35 nemo_logging:381] Instantiating model

[NeMo W 2024-03-22 13:39:35 nemo_logging:393] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy

[NeMo I 2024-03-22 13:39:35 nemo_logging:381] PADDING: 16
[NeMo I 2024-03-22 13:39:35 nemo_logging:381] Model EncDecClassificationModel was successfully restored from /Users/lucasjackson/.cache/torch/NeMo/NeMo_1.21.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2024-03-22 13:39:35 nemo_logging:381] Multiscale Weights: [1, 1, 1, 1, 1]
[NeMo I 2024-03-22 13:39:35 nemo_logging:381] Clustering Parameters: {
        "oracle_num_speakers": false,
        "max_num_speakers": 8,
        "enhanced_count_thres": 80,
        "max_rp_threshold": 0.25,
        "sparse_search_volume": 30,
        "maj_vote_spk_count": false
    }
[NeMo I 2024-03-22 13:39:35 nemo_logging:381] Number of files to diarize: 1
[NeMo I 2024-03-22 13:39:35 nemo_logging:381] Split long audio file to avoid CUDA memory issue


splitting manifest: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.87it/s]

[NeMo I 2024-03-22 13:39:36 nemo_logging:381] Perform streaming frame-level VAD
[NeMo I 2024-03-22 13:39:36 nemo_logging:381] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-03-22 13:39:36 nemo_logging:381] Dataset loaded with 1 items, total duration of  0.01 hours.
[NeMo I 2024-03-22 13:39:36 nemo_logging:381] # 1 files loaded accounting to # 1 labels



vad: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.59s/it]

[NeMo I 2024-03-22 13:39:46 nemo_logging:381] Generating predictions with overlapping input segments



                                                                                                                                                                                                                                                          

[NeMo I 2024-03-22 13:39:46 nemo_logging:381] Converting frame level prediction to speech/no-speech segment in start and end times format.


creating speech segments: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 51.03it/s]

[NeMo I 2024-03-22 13:39:46 nemo_logging:381] Subsegmentation for embedding extraction: scale0, /Users/lucasjackson/Workspace/tensorlake/indexify-extractors/audio/whisper-diarization/temp_outputs/speaker_outputs/subsegments_scale0.json
[NeMo I 2024-03-22 13:39:46 nemo_logging:381] Extracting embeddings for Diarization
[NeMo I 2024-03-22 13:39:46 nemo_logging:381] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-03-22 13:39:46 nemo_logging:381] Dataset loaded with 23 items, total duration of  0.00 hours.
[NeMo I 2024-03-22 13:39:46 nemo_logging:381] # 23 files loaded accounting to # 1 labels



[1/5] extract embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:23<00:00, 23.12s/it]

[NeMo I 2024-03-22 13:40:09 nemo_logging:381] Saved embedding files to /Users/lucasjackson/Workspace/tensorlake/indexify-extractors/audio/whisper-diarization/temp_outputs/speaker_outputs/embeddings
[NeMo I 2024-03-22 13:40:09 nemo_logging:381] Subsegmentation for embedding extraction: scale1, /Users/lucasjackson/Workspace/tensorlake/indexify-extractors/audio/whisper-diarization/temp_outputs/speaker_outputs/subsegments_scale1.json
[NeMo I 2024-03-22 13:40:09 nemo_logging:381] Extracting embeddings for Diarization
[NeMo I 2024-03-22 13:40:09 nemo_logging:381] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-03-22 13:40:09 nemo_logging:381] Dataset loaded with 25 items, total duration of  0.00 hours.
[NeMo I 2024-03-22 13:40:09 nemo_logging:381] # 25 files loaded accounting to # 1 labels



[2/5] extract embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:19<00:00, 19.58s/it]

[NeMo I 2024-03-22 13:40:28 nemo_logging:381] Saved embedding files to /Users/lucasjackson/Workspace/tensorlake/indexify-extractors/audio/whisper-diarization/temp_outputs/speaker_outputs/embeddings
[NeMo I 2024-03-22 13:40:28 nemo_logging:381] Subsegmentation for embedding extraction: scale2, /Users/lucasjackson/Workspace/tensorlake/indexify-extractors/audio/whisper-diarization/temp_outputs/speaker_outputs/subsegments_scale2.json
[NeMo I 2024-03-22 13:40:28 nemo_logging:381] Extracting embeddings for Diarization
[NeMo I 2024-03-22 13:40:28 nemo_logging:381] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-03-22 13:40:28 nemo_logging:381] Dataset loaded with 28 items, total duration of  0.00 hours.
[NeMo I 2024-03-22 13:40:28 nemo_logging:381] # 28 files loaded accounting to # 1 labels



[3/5] extract embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:17<00:00, 17.97s/it]

[NeMo I 2024-03-22 13:40:46 nemo_logging:381] Saved embedding files to /Users/lucasjackson/Workspace/tensorlake/indexify-extractors/audio/whisper-diarization/temp_outputs/speaker_outputs/embeddings
[NeMo I 2024-03-22 13:40:46 nemo_logging:381] Subsegmentation for embedding extraction: scale3, /Users/lucasjackson/Workspace/tensorlake/indexify-extractors/audio/whisper-diarization/temp_outputs/speaker_outputs/subsegments_scale3.json
[NeMo I 2024-03-22 13:40:46 nemo_logging:381] Extracting embeddings for Diarization
[NeMo I 2024-03-22 13:40:46 nemo_logging:381] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-03-22 13:40:46 nemo_logging:381] Dataset loaded with 33 items, total duration of  0.00 hours.
[NeMo I 2024-03-22 13:40:46 nemo_logging:381] # 33 files loaded accounting to # 1 labels



[4/5] extract embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.94s/it]

[NeMo I 2024-03-22 13:41:00 nemo_logging:381] Saved embedding files to /Users/lucasjackson/Workspace/tensorlake/indexify-extractors/audio/whisper-diarization/temp_outputs/speaker_outputs/embeddings
[NeMo I 2024-03-22 13:41:00 nemo_logging:381] Subsegmentation for embedding extraction: scale4, /Users/lucasjackson/Workspace/tensorlake/indexify-extractors/audio/whisper-diarization/temp_outputs/speaker_outputs/subsegments_scale4.json
[NeMo I 2024-03-22 13:41:00 nemo_logging:381] Extracting embeddings for Diarization
[NeMo I 2024-03-22 13:41:00 nemo_logging:381] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-03-22 13:41:00 nemo_logging:381] Dataset loaded with 46 items, total duration of  0.01 hours.
[NeMo I 2024-03-22 13:41:00 nemo_logging:381] # 46 files loaded accounting to # 1 labels



[5/5] extract embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.40s/it]

[NeMo I 2024-03-22 13:41:14 nemo_logging:381] Saved embedding files to /Users/lucasjackson/Workspace/tensorlake/indexify-extractors/audio/whisper-diarization/temp_outputs/speaker_outputs/embeddings



[NeMo W 2024-03-22 13:41:14 nemo_logging:393] cuda=False, using CPU for eigen decomposition. This might slow down the clustering process.
clustering: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.30it/s]

[NeMo I 2024-03-22 13:41:14 nemo_logging:381] Outputs are saved in /Users/lucasjackson/Workspace/tensorlake/indexify-extractors/audio/whisper-diarization/temp_outputs directory



[NeMo W 2024-03-22 13:41:14 nemo_logging:393] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-03-22 13:41:14 nemo_logging:381] Loading embedding pickle file of scale:0 at /Users/lucasjackson/Workspace/tensorlake/indexify-extractors/audio/whisper-diarization/temp_outputs/speaker_outputs/embeddings/subsegments_scale0_embeddings.pkl
[NeMo I 2024-03-22 13:41:14 nemo_logging:381] Loading embedding pickle file of scale:1 at /Users/lucasjackson/Workspace/tensorlake/indexify-extractors/audio/whisper-diarization/temp_outputs/speaker_outputs/embeddings/subsegments_scale1_embeddings.pkl
[NeMo I 2024-03-22 13:41:14 nemo_logging:381] Loading embedding pickle file of scale:2 at /Users/lucasjackson/Workspace/tensorlake/indexify-extractors/audio/whisper-diarization/temp_outputs/speaker_outputs/embeddings/subsegments_scale2_embeddings.pkl
[NeMo I 2024-03-22 13:41:14 nemo_logging:381] Loading embedding pickle file of scale:3 at /Users/lucasjackson/Workspace/tensorlake/indexify-extractors/audio/whisper-diarization/temp_outputs/speaker_outputs/embeddings/subsegments_scale3_embeddings.

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 71.40it/s]

[NeMo I 2024-03-22 13:41:14 nemo_logging:381]      [Threshold: 0.7000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2024-03-22 13:41:14 nemo_logging:381] Number of files to diarize: 1
[NeMo I 2024-03-22 13:41:14 nemo_logging:381] Number of files to diarize: 1



[NeMo W 2024-03-22 13:41:14 nemo_logging:393] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-03-22 13:41:14 nemo_logging:381] Number of files to diarize: 1


[NeMo W 2024-03-22 13:41:14 nemo_logging:393] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-03-22 13:41:14 nemo_logging:381] Number of files to diarize: 1


[NeMo W 2024-03-22 13:41:14 nemo_logging:393] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-03-22 13:41:14 nemo_logging:381]   
    


# reading timestamps <> speaker label mappings

In [27]:
from helpers import get_words_speaker_mapping, punct_model_langs

# Parse the predicted RTTM file into [start_ms, end_ms, speaker_index] triples.
#
# RTTM is a whitespace-delimited format whose fields are, in order:
#   type, file id, channel, turn onset (s), turn duration (s),
#   orthography, speaker type, speaker name, confidence, lookahead.
# Splitting on arbitrary whitespace keeps those positions stable regardless of
# how many spaces the writer used to pad the columns.
RTTM_ONSET_FIELD = 3
RTTM_DURATION_FIELD = 4
RTTM_SPEAKER_FIELD = 7

speaker_ts = []
with open(os.path.join(temp_path, "pred_rttms", "mono_file.rttm"), "r") as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        start_ms = int(float(fields[RTTM_ONSET_FIELD]) * 1000)
        end_ms = start_ms + int(float(fields[RTTM_DURATION_FIELD]) * 1000)
        # The speaker name is expected to end in "_<int>"; only the integer
        # suffix is kept -- TODO confirm against the diarizer's label format.
        speaker_index = int(fields[RTTM_SPEAKER_FIELD].split("_")[-1])
        speaker_ts.append([start_ms, end_ms, speaker_index])

# Map every timestamped word to a speaker, anchored on the word's "start" time.
wsm = get_words_speaker_mapping(word_timestamps, speaker_ts, "start")

In [None]:
from deepmultilingualpunctuation import PunctuationModel
import re

# `logging` is needed by the fallback branch below and is not imported by any
# of the visible cells, so bring it into scope here.
import logging

if language in punct_model_langs:
    # restoring punctuation in the transcript to help realign the sentences
    punct_model = PunctuationModel(model="kredor/punctuate-all")

    words_list = [entry["word"] for entry in wsm]

    # One prediction per input word; element [1] of each prediction is compared
    # against the punctuation sets below.
    labled_words = punct_model.predict(words_list)

    ending_puncts = ".?!"
    model_puncts = ".,;:!?"

    # We don't want to punctuate U.S.A. with a period. Right?
    def is_acronym(x):
        """Return a match object if `x` is entirely two or more letter+dot pairs, else None."""
        return re.fullmatch(r"\b(?:[a-zA-Z]\.){2,}", x)

    for word_dict, labeled_tuple in zip(wsm, labled_words):
        word = word_dict["word"]
        if (
            word
            and labeled_tuple[1] in ending_puncts
            and (word[-1] not in model_puncts or is_acronym(word))
        ):
            word += labeled_tuple[1]
            # NOTE(review): rstrip(".") removes *every* trailing dot, so an
            # acronym such as "U.S.A." labeled with "." ends up as "U.S.A".
            # Behaviour kept as-is; confirm whether that is intended.
            if word.endswith(".."):
                word = word.rstrip(".")
            word_dict["word"] = word

else:
    logging.warning(
        f"Punctuation restoration is not available for {language} language. Using the original punctuation."
    )