In [1]:
import os

folder_path = "../sample_audio"  # Replace with the actual path to your folder

# Collect the path of every regular file that sits directly inside the folder
# (sub-directories are skipped).
file_list = []
for entry_name in os.listdir(folder_path):
    entry_path = os.path.join(folder_path, entry_name)
    if os.path.isfile(entry_path):
        file_list.append(entry_path)

# Show each collected file by its name relative to the folder
for file in file_list:
    print(os.path.relpath(file, folder_path))


sg_parliament_4h.mp3
cna_sg_23min.mp3
sg_parliament_20min.mp3
sg_documentary_45min.mp3


In [9]:
import whisperx
import gc 

# Read the Hugging Face access token from the key file.
# Whitespace is stripped and blank lines are ignored so that the trailing
# newline most editors append does not end up inside the token.
with open(r'../keys/hugging_face.txt', 'r') as fp:
    token_lines = [line.strip() for line in fp if line.strip()]
if not token_lines:
    # Fail here with a clear message instead of a NameError on HF_TOKEN later.
    raise ValueError("No Hugging Face token found in ../keys/hugging_face.txt")
HF_TOKEN = token_lines[-1]  # as before, the last line of the file wins

language = 'en'
device = "cuda"
audio_file = "../sample_audio/sg_parliament_20min.mp3"
batch_size = 16 # reduce if low on GPU mem
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)

def whisperx_transcribe(audio_file):
    """Transcribe, word-align and speaker-label one audio file with WhisperX.

    Uses the notebook-level settings ``device``, ``compute_type``,
    ``language``, ``batch_size`` and ``HF_TOKEN``.

    Args:
        audio_file: Path of the audio file to process.

    Returns:
        The value returned by ``whisperx.assign_word_speakers``; its
        ``"segments"`` entry is printed at the end with speaker IDs attached.
    """
    # 1. Transcribe with original whisper (batched)
    model = whisperx.load_model("small", device, compute_type=compute_type, language=language)

    # save model to local path (optional)
    # model_dir = "/path/"
    # model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)

    audio = whisperx.load_audio(audio_file)
    result = model.transcribe(audio, batch_size=batch_size, language=language)
    print(result["segments"]) # before alignment

    # delete model if low on GPU resources
    # import gc; gc.collect(); torch.cuda.empty_cache(); del model

    # 2. Align whisper output
    # Use the same `language` setting as the transcription step so the
    # alignment model cannot drift from it when the setting is changed.
    model_a, metadata = whisperx.load_align_model(language_code=language, device=device)
    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

    print(result["segments"]) # after alignment

    # delete model if low on GPU resources
    # import gc; gc.collect(); torch.cuda.empty_cache(); del model_a

    # 3. Assign speaker labels
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=HF_TOKEN, device=device)

    # add min/max number of speakers if known
    diarize_segments = diarize_model(audio)
    # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

    result = whisperx.assign_word_speakers(diarize_segments, result)
    print(diarize_segments)
    print(result["segments"]) # segments are now assigned speaker IDs
    return result

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.0. Bad things might happen unless you revert torch to 1.x.
[{'text': " I have two buckets of clarification. One, with regard to the standards of the PAP that Prime Minister spoke of. And second, with regard to the circumstances involving the departure of former Speaker Tan Chuang Jin and Cheng Li Hui. So there has been much public disquiet about the transmission of information surrounding Minister Yiswan's arrest by the CPIB, and separately on the affair between Speaker Tan Chuang Jin and MP Cheng Li Hui.", 'start': 0.009, 'end': 27.449}, {'text': ' This is particularly with regard to what the Prime Minister knew and what the Prime Minister did about it over a period of almost three years before coming clean on the matter. So in this term of government, the government has either been slow to clear the air o

In [13]:
# Flatten the segments into one stream of word-level records and show each
# record on its own line.
# NOTE(review): `result` is not assigned by any visible cell — presumably it
# comes from an earlier call to whisperx_transcribe(audio_file); confirm.
word_stream = (entry for seg in result['segments'] for entry in seg['words'])
for word in word_stream:
    print(word)

{'word': 'I', 'start': 0.81, 'end': 0.83, 'score': 0.372, 'speaker': 'SPEAKER_02'}
{'word': 'have', 'start': 0.89, 'end': 1.03, 'score': 0.882, 'speaker': 'SPEAKER_02'}
{'word': 'two', 'start': 1.05, 'end': 1.19, 'score': 0.779, 'speaker': 'SPEAKER_02'}
{'word': 'buckets', 'start': 1.23, 'end': 1.55, 'score': 0.938, 'speaker': 'SPEAKER_02'}
{'word': 'of', 'start': 1.59, 'end': 1.65, 'score': 0.836, 'speaker': 'SPEAKER_02'}
{'word': 'clarification.', 'start': 1.69, 'end': 2.451, 'score': 0.874, 'speaker': 'SPEAKER_02'}
{'word': 'One,', 'start': 2.651, 'end': 2.771, 'score': 0.856, 'speaker': 'SPEAKER_02'}
{'word': 'with', 'start': 3.612, 'end': 3.732, 'score': 0.881, 'speaker': 'SPEAKER_02'}
{'word': 'regard', 'start': 3.792, 'end': 4.072, 'score': 0.867, 'speaker': 'SPEAKER_02'}
{'word': 'to', 'start': 4.112, 'end': 4.232, 'score': 0.722, 'speaker': 'SPEAKER_02'}
{'word': 'the', 'start': 4.292, 'end': 4.372, 'score': 0.931, 'speaker': 'SPEAKER_02'}
{'word': 'standards', 'start': 4.432,

In [25]:
import os

# Define the directory to store transcripts
transcripts_dir = "../transcripts"

# Create the directory on first use; without this, opening the transcript
# file below fails with FileNotFoundError when the folder does not exist yet.
os.makedirs(transcripts_dir, exist_ok=True)

# Find the next available transcript file number (001, 002, ...)
transcript_num = 1
while True:
    transcript_filename = f"whisperx_transcript{transcript_num:03d}.txt"
    if not os.path.exists(os.path.join(transcripts_dir, transcript_filename)):
        break
    transcript_num += 1

# Write the transcript: a header line (speaker label and start time) whenever
# the speaker changes, followed by that speaker's words separated by spaces.
# UTF-8 is requested explicitly so non-ASCII words are written the same way
# regardless of the platform's default encoding.
with open(os.path.join(transcripts_dir, transcript_filename), "w", encoding="utf-8") as transcript_file:
    prev_speaker = None
    for segment in result['segments']:
        for word in segment['words']:
            # Words without a 'speaker' key stay with the current speaker.
            if 'speaker' in word and word['speaker'] != prev_speaker:
                transcript_file.write(f"\n\n{word['speaker']}  {word['start']}\n")
                prev_speaker = word['speaker']
            transcript_file.write(word['word'] + " ")

# Print the filename of the created transcript file
print(f"Transcript saved as: {transcript_filename}")

Transcript saved as: whisperx_transcript001.txt


In [22]:
# Create (or truncate) an empty 'myfile.txt' in the current working directory;
# the handle is closed as soon as the block exits.
with open('myfile.txt', 'w') as file:
    pass


In [23]:
# Shell escape: print the current working directory, i.e. the location that
# the relative paths used in this notebook are resolved against.
!pwd

/media/mldadmin/home/s123mdg310_03/Convo2Calendar/scripts
