In [1]:
!pip install git+https://github.com/m-bain/whisperx.git
!pip install ipywidgets

Collecting git+https://github.com/m-bain/whisperx.git
  Cloning https://github.com/m-bain/whisperx.git to /tmp/pip-req-build-z8_ll1s_
  Running command git clone --filter=blob:none --quiet https://github.com/m-bain/whisperx.git /tmp/pip-req-build-z8_ll1s_
  Resolved https://github.com/m-bain/whisperx.git to commit f2da2f858e99e4211fe4f64b5f2938b007827e17
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [2]:
import os

import ipywidgets as widgets
import torch
import whisperx
from IPython.display import display

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
# The compute type follows the device: float16 on GPU, int8 on CPU.
if torch.cuda.is_available():
    device = "cuda"
    compute_type = "float16"
else:
    device = "cpu"
    compute_type = "int8"

# Batch size handed to the transcription call
batch_size = 4

# Upload control restricted to a single .wav or .mp3 file
file_upload = widgets.FileUpload(
    accept='.wav,.mp3',
    multiple=False,
)

# Dropdown of (label, language code) pairs; English is preselected
language_dropdown = widgets.Dropdown(
    options=[
        ('English', 'en'),
        ('Spanish', 'es'),
        ('French', 'fr'),
        ('German', 'de'),
        ('Italian', 'it'),
        ('Portuguese', 'pt'),
        ('Chinese', 'zh'),
    ],
    value='en',
    description='Language:',
)

# Button that triggers the transcription handler
transcribe_button = widgets.Button(description="Transcribe")

# Output area that the handler prints into
output = widgets.Output()

# Diarization model
# The Hugging Face access token is read from the HF_TOKEN environment
# variable so that no credential is stored in the notebook. Any token that
# was previously committed in this cell should be revoked.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access token "
        "before loading the diarization model."
    )
diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)

def on_transcribe_button_clicked(b):
    """Transcribe the uploaded audio file and print it with speaker labels.

    Click handler for ``transcribe_button``. Everything printed here is
    captured by the ``output`` widget. The handler saves the upload to disk,
    transcribes it in the language chosen in ``language_dropdown``, runs
    speaker diarization, aligns the transcript, and prints every aligned
    segment together with the speaker assigned to it.

    Args:
        b: The button instance passed by ipywidgets; not used.
    """
    with output:
        output.clear_output()

        uploads = file_upload.value
        if not uploads:
            print("Please upload an audio file.")
            return

        # ipywidgets 7 exposes the uploads as a dict keyed by file name (the
        # name lives under 'metadata'); ipywidgets 8 exposes a tuple of dicts
        # with a top-level 'name'. Support both shapes.
        if isinstance(uploads, dict):
            audio_file = next(iter(uploads.values()))
            file_name = audio_file['metadata']['name']
        else:
            audio_file = uploads[0]
            file_name = audio_file['name']

        # NOTE(review): /content looks like the Colab working directory;
        # confirm before running this notebook elsewhere.
        audio_file_path = f"/content/{file_name}"
        with open(audio_file_path, "wb") as f:
            f.write(audio_file['content'])

        language_code = language_dropdown.value

        # Bound up front so the cleanup in `finally` is always valid, even if
        # loading a model fails part-way through.
        model = None
        model_a = None
        try:
            # Load and transcribe audio. The selected language is passed to
            # the transcription call so it matches the alignment model that
            # is loaded below for the same language code.
            audio = whisperx.load_audio(audio_file_path)
            model = whisperx.load_model("large-v2", device, compute_type=compute_type)
            result = model.transcribe(audio, batch_size=batch_size, language=language_code)

            # Print transcription without diarization
            print("Transcription without diarization:")
            for segment in result["segments"]:
                start = segment["start"]
                end = segment["end"]
                text = segment["text"]
                print(f"[{start:.2f} - {end:.2f}] {text}")

            # Diarization. The pipeline returns a table of speaker turns, not
            # a dict with a "segments" key, so it is handed to
            # assign_word_speakers below instead of being indexed by hand.
            diarize_segments = diarize_model(audio_file_path)
            print("\nDiarization Result:")
            print(diarize_segments)

            # Align whisper output
            model_a, metadata = whisperx.load_align_model(language_code=language_code, device=device)
            aligned_result = whisperx.align(
                result["segments"], model_a, metadata, audio, device,
                return_char_alignments=False,
            )

            # Attach speaker labels to the aligned segments
            aligned_result = whisperx.assign_word_speakers(diarize_segments, aligned_result)

            # Display result with diarization
            print("\nTranscription with diarization:")
            for segment in aligned_result["segments"]:
                start = segment["start"]
                end = segment["end"]
                text = segment["text"]
                # Segments without an assigned speaker print as "Speaker None"
                speaker = segment.get("speaker")
                print(f"[{start:.2f} - {end:.2f}] Speaker {speaker}: {text}")
        finally:
            # Cleanup runs on every exit path, including errors: drop both
            # model references, then release cached GPU memory.
            del model, model_a
            torch.cuda.empty_cache()

# Register the click handler, then render the controls followed by the
# output area, in that order.
transcribe_button.on_click(on_transcribe_button_clicked)

display(
    file_upload,
    language_dropdown,
    transcribe_button,
    output,
)


  torchaudio.set_audio_backend("soundfile")


config.yaml:   0%|          | 0.00/469 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.91M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/399 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/221 [00:00<?, ?B/s]

FileUpload(value={}, accept='.wav,.mp3', description='Upload')

Dropdown(description='Language:', options=(('English', 'en'), ('Spanish', 'es'), ('French', 'fr'), ('German', …

Button(description='Transcribe', style=ButtonStyle())

Output()