In [1]:
import copy
import json
import os
import time

import dotenv
import whisperx
# --- Environment -----------------------------------------------------------
# Load variables from the located .env file into the process environment,
# then read the Hugging Face token that the diarization cell checks for.
dotenv.load_dotenv(dotenv.find_dotenv())
HF_TOKEN = os.getenv('HF_TOKEN')

# --- Run configuration -----------------------------------------------------
device = 'cuda'
# One flag per pipeline stage: when True, that stage's result is written as
# JSON to a file whose name is derived from `wav_file`.
save_raw_transcript = save_align_transcript = save_diarize_transcript = True
wav_file = '/home/user/_meeting-notes-data/Finance-Meeting_test.mp4_files/Finance-Meeting_test-15m.wav'

# --- Load the 'large-v2' model (float16) on the configured device, timed ---
start = time.time()
model = whisperx.load_model('large-v2', device=device, compute_type='float16')
elapsed = time.time() - start
print('Model loaded in {:.2f} seconds'.format(elapsed))

# --- Load the audio file, timed --------------------------------------------
start = time.time()
audio = whisperx.load_audio(wav_file)
elapsed = time.time() - start
print('Audio loaded in {:.2f} seconds'.format(elapsed))

  from .autonotebook import tqdm as notebook_tqdm
  torchaudio.set_audio_backend("soundfile")
  torchaudio.set_audio_backend("soundfile")
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.1.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.1.2+cu121. Bad things might happen unless you revert torch to 1.x.
Model loaded in 1.78 seconds
Audio loaded in 0.12 seconds


In [2]:
# Transcribe the loaded audio with the model from the first cell and report
# how long the call took.
start = time.time()
# adjust batch_size to fit your GPU's memory
raw_transcript = model.transcribe(audio, batch_size=20, language='en')
elapsed = time.time() - start
print('Transcribed in {:.2f} seconds'.format(elapsed))

# Optionally write the transcription result as JSON; the output file name is
# the audio path with a '--raw_transcript.json' suffix appended.
if save_raw_transcript:
	raw_transcript_path = wav_file + '--raw_transcript.json'
	with open(raw_transcript_path, 'w', encoding='utf-8') as out:
		out.write(json.dumps(raw_transcript))

# Last expression: display the transcription result as the cell output.
raw_transcript

Transcribed in 22.99 seconds


{'segments': [{'text': " everybody. So, we'll start with the apologies, and I guess we have Hazel. Everyone else seems to be here, I think. So, could I have a mover, please? Andrew, just before you do, I have to leave the meeting at 1.30 to attend a future-proof meeting. So, sorry about that. So, my apologies from them. Good as gold. Thanks, Jim. So, should we include that in the meeting?",
   'start': 0.009,
   'end': 29.411},
  {'text': " in the motion as well. If someone would like to move, please. Thanks, Lou. Thanks, Susan. All in favour? Aye. Contrary, no. It's carried. Disclosure of members' interests. Do we have any today? No. I'll move on to item four, late items. Any late items?",
   'start': 29.753,
   'end': 54.104},
  {'text': " No late items, so confirmation of order of meeting. No changes, Ken? No, I don't see any need for change. We do have a number of guests joining us at the meeting today, but they are well aware of the time. So, yes, I'm hoping they'll be here at the

In [3]:
# Align the raw transcript segments against the audio and time the whole step
# (loading the alignment model is included in the measurement).
start = time.time()

# The alignment model is chosen from the language recorded in the raw transcript.
model_a, metadata = whisperx.load_align_model(
	device=device,
	language_code=raw_transcript["language"],
)
align_transcript = whisperx.align(
	raw_transcript["segments"], model_a, metadata, audio,
	device=device,
	return_char_alignments=False,
)

elapsed = time.time() - start
print('Aligned in {:.2f} seconds'.format(elapsed))

# Optionally write the aligned result as JSON; the output file name is the
# audio path with an '--align_transcript.json' suffix appended.
if save_align_transcript:
	align_transcript_path = wav_file + '--align_transcript.json'
	with open(align_transcript_path, 'w', encoding='utf-8') as out:
		out.write(json.dumps(align_transcript))

# Last expression: display the aligned transcript as the cell output.
align_transcript

Aligned in 7.56 seconds


{'segments': [{'start': 1.37,
   'end': 2.071,
   'text': ' everybody.',
   'words': [{'word': 'everybody.',
     'start': 1.37,
     'end': 2.071,
     'score': 0.788}]},
  {'start': 4.733,
   'end': 8.896,
   'text': "So, we'll start with the apologies, and I guess we have Hazel.",
   'words': [{'word': 'So,', 'start': 4.733, 'end': 5.053, 'score': 0.916},
    {'word': "we'll", 'start': 5.753, 'end': 5.933, 'score': 0.774},
    {'word': 'start', 'start': 5.953, 'end': 6.174, 'score': 0.97},
    {'word': 'with', 'start': 6.214, 'end': 6.314, 'score': 0.752},
    {'word': 'the', 'start': 6.334, 'end': 6.394, 'score': 0.738},
    {'word': 'apologies,', 'start': 6.474, 'end': 7.054, 'score': 0.895},
    {'word': 'and', 'start': 7.114, 'end': 7.214, 'score': 0.863},
    {'word': 'I', 'start': 7.294, 'end': 7.415, 'score': 0.596},
    {'word': 'guess', 'start': 7.535, 'end': 7.755, 'score': 0.615},
    {'word': 'we', 'start': 7.795, 'end': 7.935, 'score': 0.967},
    {'word': 'have', 'star

In [5]:
# Speaker diarization: run the diarization pipeline on the audio and attach
# speaker labels to the aligned transcript. The step only runs when HF_TOKEN
# is set; otherwise `diarize_transcript` stays None and a notice is printed.
diarize_transcript = None
if HF_TOKEN:
	start = time.time()
	diarize_model = whisperx.DiarizationPipeline(
		use_auth_token=HF_TOKEN, device=device
	)
	diarize_segments = diarize_model(audio)
	# NOTE(review): per the whisperx diarize source, assign_word_speakers adds
	# 'speaker' keys to the transcript object it is handed and returns that same
	# object (verify against the installed version). Passing a deep copy keeps
	# `align_transcript` exactly as the alignment cell produced it, so re-running
	# or re-displaying earlier cells does not show stale/mutated state.
	diarize_transcript = whisperx.assign_word_speakers(
		diarize_segments, copy.deepcopy(align_transcript)
	)
	print(
		'Diarized in {:.2f} seconds'.format(time.time() - start)
	)
	# Optionally write the diarized result as JSON; the output file name is the
	# audio path with a '--diarize_transcript.json' suffix appended.
	if save_diarize_transcript:
		with open(
			wav_file + '--diarize_transcript.json', 'w', encoding='utf-8'
		) as f:
			f.write(json.dumps(diarize_transcript))
else:
	# Make the skip visible instead of silently displaying None.
	print('HF_TOKEN is not set; skipping diarization.')
# Last expression: display the diarized transcript (or None when skipped).
diarize_transcript

Diarized in 27.08 seconds


{'segments': [{'start': 1.37,
   'end': 2.071,
   'text': ' everybody.',
   'words': [{'word': 'everybody.',
     'start': 1.37,
     'end': 2.071,
     'score': 0.788,
     'speaker': 'SPEAKER_02'}],
   'speaker': 'SPEAKER_02'},
  {'start': 4.733,
   'end': 8.896,
   'text': "So, we'll start with the apologies, and I guess we have Hazel.",
   'words': [{'word': 'So,',
     'start': 4.733,
     'end': 5.053,
     'score': 0.916,
     'speaker': 'SPEAKER_02'},
    {'word': "we'll",
     'start': 5.753,
     'end': 5.933,
     'score': 0.774,
     'speaker': 'SPEAKER_02'},
    {'word': 'start',
     'start': 5.953,
     'end': 6.174,
     'score': 0.97,
     'speaker': 'SPEAKER_02'},
    {'word': 'with',
     'start': 6.214,
     'end': 6.314,
     'score': 0.752,
     'speaker': 'SPEAKER_02'},
    {'word': 'the',
     'start': 6.334,
     'end': 6.394,
     'score': 0.738,
     'speaker': 'SPEAKER_02'},
    {'word': 'apologies,',
     'start': 6.474,
     'end': 7.054,
     'score': 0.8

In [None]:
# TODO: How can we reduce the granularity of the transcript?
# There are many spots where the same speaker has several consecutive segments of text
#   without any other speaker interjecting.
# These could be merged together, perhaps taking the time gaps between them into account.