In [3]:
import numpy as np
import ffmpeg

SAMPLE_RATE = 16000  # sample rate (Hz) requested from ffmpeg via `ar`
input_file = "store/audio.mp3"

# Decode `input_file` with ffmpeg and capture the raw samples from its stdout:
# mono (ac=1), signed 16-bit little-endian PCM (s16le), resampled to SAMPLE_RATE.
try:
    out, _ = (
        ffmpeg.input(input_file)
        .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=str(SAMPLE_RATE))
        .run(capture_stdout=True, capture_stderr=True)
    )
except ffmpeg.Error as e:
    # The captured streams are raw bytes and are not guaranteed to be valid
    # UTF-8 (stdout may even hold partial PCM data). Decode leniently so a
    # UnicodeDecodeError raised here cannot mask the real ffmpeg failure.
    print("stdout:", e.stdout.decode("utf8", errors="replace"))
    print("stderr:", e.stderr.decode("utf8", errors="replace"))
    # Bare `raise` re-raises the active exception with its original traceback.
    raise

# Reinterpret the bytes as int16 samples and scale by 1/32768 into float32.
# np.frombuffer already yields a 1-D array, so no flatten() copy is needed;
# astype() produces the new writable float array.
buffer = np.frombuffer(out, np.int16).astype(np.float32) / 32768.0
length_in_seconds = len(buffer) / SAMPLE_RATE

In [7]:
from whisperx import DiarizationPipeline, assign_word_speakers
import yaml
# Read the Hugging Face token from secrets.yml; the context manager closes the
# file handle, which the bare open(...) call left dangling.
with open("secrets.yml") as secrets_file:
    secrets = yaml.safe_load(secrets_file)

diarize_model = DiarizationPipeline(
    use_auth_token=secrets["HF"], device="cuda"
)

# Run diarization exactly once. The cell previously called the model twice on
# the same buffer and immediately overwrote the first result, paying for a
# second full inference pass for nothing. The call kept here is the one whose
# result was actually used; `min_speakers` / `max_speakers` keywords can be
# passed to it if the speaker count needs constraining.
diarize_segments = diarize_model(buffer)

# NOTE(review): `result` is not defined in any visible cell, so this line fails
# on a fresh kernel. It presumably should be the aligned transcription output
# (the notebook stores that as `transcription`) — confirm and rename. Because
# `result` is both input and output, re-running this cell feeds the previous
# output back in.
result = assign_word_speakers(diarize_segments, result)

In [1]:
# instantiate the pipeline
from pyannote.audio.pipelines.utils.hook import ProgressHook
from pyannote.audio import Pipeline
import yaml
import torch

# Read the Hugging Face token from secrets.yml; the context manager closes the
# file handle, which the bare open(...) call left dangling.
with open('secrets.yml') as secrets_file:
    secrets = yaml.safe_load(secrets_file)

# Load the pretrained diarization pipeline and move it to the GPU.
pipeline = Pipeline.from_pretrained(
  "pyannote/speaker-diarization-3.1",
  use_auth_token=secrets["HF"])
pipeline.to(torch.device("cuda"))

# run the pipeline on an audio file, passing a ProgressHook so progress is
# reported while it runs
with ProgressHook() as hook:
    diarization = pipeline("store/audio.mp3", hook=hook)

  torchaudio.set_audio_backend("soundfile")
  torchaudio.set_audio_backend("soundfile")


Output()

In [6]:
from utils.whisper_util import transcribe
# Transcribe the clip with the project helper. With align=True the log below
# shows a "2. Aligning..." stage and the result carries per-word timings.
transcription = transcribe(
    audio_path="store/audio.mp3",
    align=True,
)

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.1.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.1.1+cu118. Bad things might happen unless you revert torch to 1.x.
1. Transcribing...
Detected language: en (1.00) in first 30s of audio...
2. Aligning...
Detected audio with length 232.5710625 seconds


In [8]:
# Inspect the aligned segments. In the output below each segment carries
# 'start', 'end', 'text' and a 'words' list. Note that some word entries
# (e.g. '100') come back with only the 'word' key and no 'start'/'end'/'score',
# so downstream code should not assume those keys are always present.
transcription["segments"]

[{'start': 0.249,
  'end': 5.173,
  'text': ' Just one year after launch, Chat GPT has well over 100 million daily active users.',
  'words': [{'word': 'Just', 'start': 0.249, 'end': 0.449, 'score': 0.698},
   {'word': 'one', 'start': 0.609, 'end': 0.709, 'score': 0.685},
   {'word': 'year', 'start': 0.75, 'end': 0.91, 'score': 0.804},
   {'word': 'after', 'start': 0.99, 'end': 1.21, 'score': 0.728},
   {'word': 'launch,', 'start': 1.27, 'end': 1.59, 'score': 0.942},
   {'word': 'Chat', 'start': 1.67, 'end': 1.85, 'score': 0.891},
   {'word': 'GPT', 'start': 1.87, 'end': 2.371, 'score': 0.761},
   {'word': 'has', 'start': 2.411, 'end': 2.571, 'score': 0.822},
   {'word': 'well', 'start': 2.631, 'end': 2.811, 'score': 0.875},
   {'word': 'over', 'start': 2.911, 'end': 3.111, 'score': 0.838},
   {'word': '100'},
   {'word': 'million', 'start': 3.712, 'end': 4.052, 'score': 0.879},
   {'word': 'daily', 'start': 4.092, 'end': 4.372, 'score': 0.863},
   {'word': 'active', 'start': 4.472, 'e

In [7]:
from utils.whisper_util import segments_to_srt
# Turn the aligned segments into a list of SRT-style caption strings
# (index line, "start --> end" line, text), as shown in the output below.
segments = transcription['segments']
srt = segments_to_srt(segments)
# NOTE(review): the output below renders the first segment (0.249 s - 5.173 s)
# as `0:00:00:000 --> 0:00:05:000`: the fractional seconds look dropped and the
# layout is not the `HH:MM:SS,mmm` form that SRT uses. This appears to originate
# in segments_to_srt (utils.whisper_util) — TODO confirm and fix there.
srt

['1\n0:00:00:000 --> 0:00:05:000\nJust one year after launch, Chat GPT has well over 100 million daily active users.\n\n',
 '2\n0:00:05:000 --> 0:00:11:000\nAnd next week, OpenAI is opening the floodgates, allowing developers to profit on their platform by selling custom GPT agents.\n\n',
 "3\n0:00:11:000 --> 0:00:18:000\nAll you have to do is convince 1% of the user base to pay you $1 per month for your custom agent, and you'll be a deck of millionaire by the end of the year.\n\n",
 "4\n0:00:18:000 --> 0:00:23:000\nBy the end of today's video, you'll know how to get rich from this once-in-a-lifetime shovelware opportunity.\n\n",
 '5\n0:00:23:000 --> 0:00:26:000\nIt is January 5th, 2024, and you are watching the code report.\n\n',
 '6\n0:00:26:000 --> 0:00:29:000\nBuilding a custom GPT agent is way too easy.\n\n',
 '7\n0:00:29:000 --> 0:00:43:000\nLiterally all you do is go down to myGPTs, create a new agent, give it some custom instructions and conversation starters, and maybe upload 