In [None]:
# Install Whisper (built from the tip of its GitHub repository) and the core
# audio/ML libraries used by this notebook.
# NOTE(review): none of these installs are version-pinned, so a fresh runtime
# may resolve different releases than the ones that produced the saved output.
!pip install git+https://github.com/openai/whisper.git
!pip install torch torchvision torchaudio pydub
!pip install pyannote.audio

# Ensure the ffmpeg binary is installed (usually already available in Colab).
# NOTE(review): presumably needed by pydub/Whisper for audio decoding — confirm.
!apt-get install -y ffmpeg


Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-4b8b1s2x
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-4b8b1s2x
  Resolved https://github.com/openai/whisper.git to commit dd985ac4b90cafeef8712f2998d62c59c3e62d22
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_c

In [2]:
# =========================
# 📦 STEP 1: INSTALL DEPENDENCIES
# =========================

!pip install git+https://github.com/openai/whisper.git
!pip install pyannote-audio pydub torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!sudo apt-get install -y ffmpeg

# =========================
# ⚙️ STEP 2: IMPORTS AND DEVICE SETUP
# =========================

import os
from getpass import getpass

import torch
from pydub import AudioSegment
from pyannote.audio import Pipeline
import whisper
from pyannote.core import Segment

# Probe the runtime once and report what we found before choosing a device.
_cuda_ok = torch.cuda.is_available()
print("✅ CUDA available:", _cuda_ok)
print("Torch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)

# Both models below are placed on this device: the GPU when one is visible,
# otherwise the CPU.
device = torch.device("cuda") if _cuda_ok else torch.device("cpu")
print("🖥️ Using device:", device)

# =========================
# 🔁 STEP 3: UPLOAD AUDIO FILE
# =========================

from google.colab import files

# Opens Colab's upload dialog; the returned mapping is keyed by uploaded file name.
uploaded = files.upload()

# A cancelled or empty upload yields an empty mapping. Fail here with a clear
# message instead of an opaque IndexError when picking the first file name.
if not uploaded:
    raise RuntimeError("No audio file was uploaded; re-run this cell and choose a file.")

# Use the first uploaded file as the audio source (no manual renaming needed).
AUDIO_PATH = next(iter(uploaded))
# Length of each processing chunk, in minutes.
CHUNK_DURATION_MIN = 5

# =========================
# 🔊 STEP 4: SPLIT AUDIO INTO CHUNKS
# =========================

def split_audio(input_path, chunk_length_min=5, output_dir="chunks"):
    """Split an audio file into consecutive fixed-length WAV chunks.

    Args:
        input_path: Path of the audio file to load with ``AudioSegment.from_file``.
        chunk_length_min: Chunk length in minutes. May be fractional (e.g. ``2.5``);
            it is converted to a whole number of milliseconds.
        output_dir: Directory the chunk files are written to. It is created if
            missing. Defaults to ``"chunks"``.

    Returns:
        List of chunk file paths in playback order, named ``chunk_000.wav``,
        ``chunk_001.wav``, ... The last chunk may be shorter than the others.
        Empty input audio yields an empty list.

    Raises:
        ValueError: If ``chunk_length_min`` does not amount to at least one
            millisecond.
    """
    # range() needs an integer step; a fractional minute count would otherwise
    # produce a float and raise TypeError.
    chunk_length_ms = int(chunk_length_min * 60 * 1000)
    if chunk_length_ms <= 0:
        raise ValueError(
            f"chunk_length_min must be positive, got {chunk_length_min!r}"
        )

    audio = AudioSegment.from_file(input_path)
    total_ms = len(audio)  # len() is used as the audio length, sliced in the same unit below
    os.makedirs(output_dir, exist_ok=True)

    chunks = []
    for i, start in enumerate(range(0, total_ms, chunk_length_ms)):
        # Clamp the final chunk to the end of the audio.
        end = min(start + chunk_length_ms, total_ms)
        chunk_path = os.path.join(output_dir, f"chunk_{i:03d}.wav")
        audio[start:end].export(chunk_path, format="wav")
        chunks.append(chunk_path)
    return chunks

# =========================
# 🤖 STEP 5: LOAD MODELS TO GPU
# =========================

# SECURITY: never hard-code access tokens in a notebook. They end up in version
# control and shared copies. The token is read from the HF_TOKEN environment
# variable, falling back to a hidden prompt so it is never echoed into output.
# Any token that was previously committed in this cell should be revoked.
HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# Speaker-diarization pipeline (gated model: requires a valid token).
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization@2.1",
    use_auth_token=HUGGINGFACE_TOKEN
)
# NOTE(review): from_pretrained may return None rather than raise when the model
# cannot be downloaded (e.g. invalid token or model terms not accepted). Without
# this check the `.to(device)` call fails with an unhelpful AttributeError.
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization@2.1'. Check that the token is "
        "valid and that the model's user conditions were accepted on Hugging Face."
    )
pipeline = pipeline.to(device)

# Whisper speech model, loaded directly onto the selected device.
model = whisper.load_model("large", device=device)

# =========================
# 🧠 STEP 6: DIARIZATION & TRANSCRIPTION
# =========================

chunks = split_audio(AUDIO_PATH, CHUNK_DURATION_MIN)
all_results = []

# Each chunk is processed on its own, so Whisper and the diarizer both report
# times relative to the start of that chunk. This offset step converts them back
# to positions in the original recording.
chunk_seconds = CHUNK_DURATION_MIN * 60

# NOTE(review): the diarization pipeline runs independently per chunk, so speaker
# labels are presumably not comparable across chunks (the same label in two
# chunks may be different people) — confirm before relying on them globally.
for chunk_index, chunk_path in enumerate(chunks):
    print(f"\n📦 Processing {chunk_path} ...")
    diarization = pipeline(chunk_path)
    result = model.transcribe(chunk_path, language="ur", task="translate")

    offset = chunk_index * chunk_seconds
    # Materialise the speaker turns once instead of re-iterating the annotation
    # for every Whisper segment.
    turns = [
        (spk_seg, speaker)
        for spk_seg, _, speaker in diarization.itertracks(yield_label=True)
    ]

    for ws in result["segments"]:
        whisper_seg = Segment(ws["start"], ws["end"])

        # Attribute the segment to the speaker whose turn overlaps it the most,
        # rather than to whichever intersecting turn happens to come first.
        best_speaker, best_overlap = "UNKNOWN", 0.0
        for spk_seg, speaker in turns:
            overlap = (whisper_seg & spk_seg).duration
            if overlap > best_overlap:
                best_speaker, best_overlap = speaker, overlap

        # Segments with no overlapping turn are kept (labelled UNKNOWN) so that
        # transcribed text is never silently dropped from the transcript.
        all_results.append({
            "speaker": best_speaker,
            "start": ws["start"] + offset,
            "end": ws["end"] + offset,
            "text": ws["text"],
        })

# =========================
# 💾 STEP 7: SAVE OUTPUT
# =========================

# Write one line per aligned entry: "<speaker> [<start>-<end>]: <text>",
# with start/end formatted to two decimals.
with open("final_aligned_transcript.txt", "w", encoding="utf-8") as f:
    f.writelines(
        f"{entry['speaker']} [{entry['start']:.2f}-{entry['end']:.2f}]: {entry['text']}\n"
        for entry in all_results
    )

print("\n✅ Done. Output saved to final_aligned_transcript.txt")

# =========================
# 📤 OPTIONAL: DOWNLOAD THE TRANSCRIPT
# =========================

# Hand the transcript file to the browser via Colab's download helper.
files.download("final_aligned_transcript.txt")


Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-la2btkw6
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-la2btkw6
  Resolved https://github.com/openai/whisper.git to commit dd985ac4b90cafeef8712f2998d62c59c3e62d22
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Looking in indexes: https://download.pytorch.org/whl/cu118
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
✅ CUDA available: True
Torch version: 2.6.0+cu124
CUDA version: 12.4
🖥️ Using device: cuda


Saving audio.wav to audio.wav


config.yaml:   0%|          | 0.00/500 [00:00<?, ?B/s]

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover


pytorch_model.bin:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/318 [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.6.0+cu124. Bad things might happen unless you revert torch to 1.x.


hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/hyperparams.yaml' -> '/root/.cache/torch/pyannote/speechbrain/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _load
DEBUG:speechbrain.utils.checkpoints:Registered parameter transfer hook for _load
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load_if_possible
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in /root/.cache/torch/pyann

embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/embedding_model.ckpt' -> '/root/.cache/torch/pyannote/speechbrain/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /root/.cache/torch/pyannote/speechbrain/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


mean_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/mean_var_norm_emb.ckpt' -> '/root/.cache/torch/pyannote/speechbrain/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /root/.cache/torch/pyannote/speechbrain/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/classifier.ckpt' -> '/root/.cache/torch/pyannote/speechbrain/classifier.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["classifier"] = /root/.cache/torch/pyannote/speechbrain/classifier.ckpt
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/label_encoder.txt' -> '/root/.cache/torch/pyannote/speechbrain/label_encoder.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["label_encoder"] = /root/.cache/torch/pyannote/speechbrain/label_encoder.ckpt
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classifier, label_encoder
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): embedding_model -> /root/.cache/torch/pyannote/speechbrain/embedding_model.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): mean_var_norm_emb -> /root/.cache/torch/pyannote/speechbrain/mean_var_norm_emb.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): classifier -> /root/.cac


📦 Processing chunks/chunk_000.wav ...


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.




📦 Processing chunks/chunk_001.wav ...

📦 Processing chunks/chunk_002.wav ...

📦 Processing chunks/chunk_003.wav ...

📦 Processing chunks/chunk_004.wav ...

📦 Processing chunks/chunk_005.wav ...

📦 Processing chunks/chunk_006.wav ...

📦 Processing chunks/chunk_007.wav ...

📦 Processing chunks/chunk_008.wav ...

📦 Processing chunks/chunk_009.wav ...

📦 Processing chunks/chunk_010.wav ...

📦 Processing chunks/chunk_011.wav ...

📦 Processing chunks/chunk_012.wav ...

📦 Processing chunks/chunk_013.wav ...

📦 Processing chunks/chunk_014.wav ...

📦 Processing chunks/chunk_015.wav ...

📦 Processing chunks/chunk_016.wav ...

📦 Processing chunks/chunk_017.wav ...

📦 Processing chunks/chunk_018.wav ...

📦 Processing chunks/chunk_019.wav ...

📦 Processing chunks/chunk_020.wav ...

📦 Processing chunks/chunk_021.wav ...

📦 Processing chunks/chunk_022.wav ...

✅ Done. Output saved to final_aligned_transcript.txt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>