In [None]:
pip install speechbrain pyannote.audio torch torchaudio


Collecting speechbrain
  Downloading speechbrain-1.0.3-py3-none-any.whl.metadata (24 kB)
Collecting pyannote.audio
  Downloading pyannote.audio-3.3.2-py2.py3-none-any.whl.metadata (11 kB)
Collecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting asteroid-filterbanks>=0.4 (from pyannote.audio)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.0.1 (from pyannote.audio)
  Downloading lightning-2.5.2-py3-none-any.whl.metadata (38 kB)
Collecting pyannote.core>=5.0.0 (from pyannote.audio)
  Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)
Collecting pyannote.database>=5.0.1 (from pyannote.audio)
  Downloading pyannote.database-5.1.3-py3-none-any.whl.metadata (1.1 kB)
Collecting pyannote.metrics>=3.2 (from pyannote.audio)
  Downloading pyannote.metrics-3.2.1-py3-none-any.whl.metadata (1.3 kB)
Collecting pyannote.pipeline>=3.0.1 (from pyannote.audio)
  Downloading p

In [None]:
"""
INSTALL DEPENDENCIES FIRST:
    pip install speechbrain pyannote.audio torch torchaudio

ALSO:
    1) Get your Hugging Face access token: https://huggingface.co/settings/tokens
    2) Replace 'YOUR_HF_TOKEN' below.
"""

import torch
from speechbrain.pretrained import SepformerSeparation as separator
from pyannote.audio import Pipeline
import torchaudio
import os

# === SETTINGS ===
INPUT_FILE = "MUSK.wav"  # <- your audio file with overlapping speakers
HUGGINGFACE_TOKEN = "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"

# === STEP 0: Select device ===
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

# === STEP 1: Load SpeechBrain SepFormer ===
print("Loading SpeechBrain SepFormer model...")
sepformer = separator.from_hparams(
    source="speechbrain/sepformer-wsj02mix",
    savedir="pretrained_models/sepformer-wsj02mix",
    run_opts={"device": str(DEVICE)}
)

# === STEP 2: Separate overlapping voices ===
print(f"Separating sources in {INPUT_FILE} ...")
# Use device explicitly (SpeechBrain uses run_opts for device)
est_sources = sepformer.separate_file(path=INPUT_FILE)

# === STEP 3: Save each separated speaker ===
# === STEP 3: Save each separated speaker ===
output_dir = "separated_speakers"
os.makedirs(output_dir, exist_ok=True)

print(f"Saving separated audio to ./{output_dir}/ ...")
for i, source in enumerate(est_sources):
    output_path = os.path.join(output_dir, f"speaker_{i+1}.wav")
    # Explicitly set channels_first to False as the source tensor is likely [frames, channels]
    torchaudio.save(output_path, source.cpu(), 8000, channels_first=False)
    print(f"  -> {output_path}")

print("\n✅ Separation done.")

# === STEP 4: Load pyannote-audio diarization pipeline ===
print("\nLoading pyannote-audio diarization pipeline...")
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=HUGGINGFACE_TOKEN,
    device=DEVICE  # <<< use your GPU here
)

# === STEP 5: Run diarization for each separated speaker ===
for i in range(len(est_sources)):
    wav_file = os.path.join(output_dir, f"speaker_{i+1}.wav")
    print(f"\n=== Diarization for {wav_file} ===")
    diarization = pipeline(wav_file)
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        print(f"{turn.start:.1f}s - {turn.end:.1f}s: Speaker {speaker}")

print("\n✅ ALL DONE: Separated overlapping speech and diarized speakers using CUDA.")


INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/pretrained_models/sepformer-wsj02mix/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/sepformer-wsj02mix' if not cached


Using device: cuda
Loading SpeechBrain SepFormer model...


DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained_models/sepformer-wsj02mix.
INFO:speechbrain.utils.fetching:Fetch masknet.ckpt: Using symlink found at '/content/pretrained_models/sepformer-wsj02mix/masknet.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["masknet"] = /content/pretrained_models/sepformer-wsj02mix/masknet.ckpt
INFO:speechbrain.utils.fetching:Fetch encoder.ckpt: Using symlink found at '/content/pretrained_models/sepformer-wsj02mix/encoder.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["encoder"] = /content/pretrained_models/sepformer-wsj02mix/encoder.ckpt
INFO:speechbrain.utils.fetching:Fetch decoder.ckpt: Using symlink found at '/content/pretrained_models/sepformer-wsj02mix/decoder.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["decoder"] = /content/pretrained_models/sepformer-wsj02mix/decoder.ckpt
INFO:speechbrain.utils.parame

Separating sources in MUSK.wav ...
Resampling the audio from 44100 Hz to 8000 Hz
Saving separated audio to ./separated_speakers/ ...
  -> separated_speakers/speaker_1.wav

✅ Separation done.

Loading pyannote-audio diarization pipeline...


TypeError: Pipeline.from_pretrained() got an unexpected keyword argument 'device'

In [None]:
"""
INSTALL DEPENDENCIES FIRST:
    pip install speechbrain pyannote.audio torch torchaudio

ALSO:
    1) Get your Hugging Face access token: https://huggingface.co/settings/tokens
    2) Replace 'YOUR_HF_TOKEN' below.
"""

import torch
from speechbrain.pretrained import SepformerSeparation as separator
from pyannote.audio import Pipeline
import torchaudio
import os

# === SETTINGS ===
INPUT_FILE = "MUSK.wav"  # <- your audio file with overlapping speakers
HUGGINGFACE_TOKEN = "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"

# === STEP 0: Select device ===
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

# === STEP 1: Load SpeechBrain SepFormer ===
print("Loading SpeechBrain SepFormer model...")
sepformer = separator.from_hparams(
    source="speechbrain/sepformer-wsj02mix",
    savedir="pretrained_models/sepformer-wsj02mix",
    run_opts={"device": str(DEVICE)}
)

# === STEP 2: Separate overlapping voices ===
print(f"Separating sources in {INPUT_FILE} ...")
est_sources = sepformer.separate_file(path=INPUT_FILE)

# === STEP 3: Save each separated speaker ===
output_dir = "separated_speakers"
os.makedirs(output_dir, exist_ok=True)

print(f"Saving separated audio to ./{output_dir}/ ...")
for i, source in enumerate(est_sources):
    output_path = os.path.join(output_dir, f"speaker_{i+1}.wav")
    # SpeechBrain outputs [channels, frames] → torchaudio expects [frames, channels] by default
    source = source.transpose(0, 1)  # [frames, channels]
    torchaudio.save(output_path, source.cpu(), 8000)
    print(f"  -> {output_path}")

print("\n✅ Separation done.")

# === STEP 4: Load pyannote-audio diarization pipeline ===
print("\nLoading pyannote-audio diarization pipeline...")
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=HUGGINGFACE_TOKEN
)
pipeline.to(DEVICE)  # ✅ set device here

# === STEP 5: Run diarization for each separated speaker ===
for i in range(len(est_sources)):
    wav_file = os.path.join(output_dir, f"speaker_{i+1}.wav")
    print(f"\n=== Diarization for {wav_file} ===")
    diarization = pipeline(wav_file)
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        print(f"{turn.start:.1f}s - {turn.end:.1f}s: Speaker {speaker}")

print("\n✅ ALL DONE: Separated overlapping speech and diarized speakers using CUDA.")


INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/pretrained_models/sepformer-wsj02mix/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/sepformer-wsj02mix' if not cached


Using device: cuda
Loading SpeechBrain SepFormer model...


DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained_models/sepformer-wsj02mix.
INFO:speechbrain.utils.fetching:Fetch masknet.ckpt: Using symlink found at '/content/pretrained_models/sepformer-wsj02mix/masknet.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["masknet"] = /content/pretrained_models/sepformer-wsj02mix/masknet.ckpt
INFO:speechbrain.utils.fetching:Fetch encoder.ckpt: Using symlink found at '/content/pretrained_models/sepformer-wsj02mix/encoder.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["encoder"] = /content/pretrained_models/sepformer-wsj02mix/encoder.ckpt
INFO:speechbrain.utils.fetching:Fetch decoder.ckpt: Using symlink found at '/content/pretrained_models/sepformer-wsj02mix/decoder.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["decoder"] = /content/pretrained_models/sepformer-wsj02mix/decoder.ckpt
INFO:speechbrain.utils.parame

Separating sources in MUSK.wav ...
Resampling the audio from 44100 Hz to 8000 Hz
Saving separated audio to ./separated_speakers/ ...
  -> separated_speakers/speaker_1.wav

✅ Separation done.

Loading pyannote-audio diarization pipeline...


pytorch_model.bin:   0%|          | 0.00/5.91M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/399 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/221 [00:00<?, ?B/s]


=== Diarization for separated_speakers/speaker_1.wav ===


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

  std = sequences.std(dim=-1, correction=1)


0.4s - 0.6s: Speaker SPEAKER_00
1.7s - 4.9s: Speaker SPEAKER_00
7.1s - 11.5s: Speaker SPEAKER_00
7.7s - 7.9s: Speaker SPEAKER_01
8.7s - 9.1s: Speaker SPEAKER_01
10.1s - 10.6s: Speaker SPEAKER_01
12.0s - 18.8s: Speaker SPEAKER_00
12.2s - 12.6s: Speaker SPEAKER_01
22.2s - 24.1s: Speaker SPEAKER_00
26.7s - 29.3s: Speaker SPEAKER_00
29.5s - 31.7s: Speaker SPEAKER_00
33.1s - 36.1s: Speaker SPEAKER_00
36.3s - 38.2s: Speaker SPEAKER_00
39.4s - 41.6s: Speaker SPEAKER_00
42.4s - 45.3s: Speaker SPEAKER_00
46.8s - 50.9s: Speaker SPEAKER_00
47.5s - 47.7s: Speaker SPEAKER_01
52.1s - 55.8s: Speaker SPEAKER_00
55.5s - 55.7s: Speaker SPEAKER_01
58.6s - 59.6s: Speaker SPEAKER_00
60.5s - 60.9s: Speaker SPEAKER_00

✅ ALL DONE: Separated overlapping speech and diarized speakers using CUDA.


In [None]:
"""
INSTALL DEPENDENCIES FIRST:
    pip install speechbrain pyannote.audio torch torchaudio transformers soundfile

ALSO:
    1) Get your Hugging Face access token: https://huggingface.co/settings/tokens
    2) Replace 'YOUR_HF_TOKEN' below.
"""

import torch
from speechbrain.pretrained import SepformerSeparation as separator
from pyannote.audio import Pipeline
import torchaudio
import os
from transformers import pipeline as transformers_pipeline
from datetime import timedelta
torch.cuda.empty_cache()
# === SETTINGS ===
INPUT_FILE = "MUSK.wav"  # Your audio file with overlapping speakers
HUGGINGFACE_TOKEN = "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"
LANGUAGE = "ur"  # Language code: "ur" for Urdu, "en" for English, etc.
OUTPUT_DIR = "output_results"
SAMPLE_RATE = 16000  # Standard sample rate for most ASR models

# Predefined speaker mappings (voice fingerprints)
# Format: {"speaker_id": {"name": "Your Name", "reference_audio": "path/to/audio.wav"}}
SPEAKER_MAPPINGS = {
    # Example - add your reference audio files here
    "SPEAKER_01": {"name": "muskan", "reference_audio": "muskan_voice_sample.wav"},
    "SPEAKER_02": {"name": "Elon", "reference_audio": "elon_voice_sample.wav"},
}

# === INITIALIZATION ===
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

# Create output directories
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(os.path.join(OUTPUT_DIR, "separated_speakers"), exist_ok=True)

# === STEP 1: Load SpeechBrain SepFormer ===
print("Loading SpeechBrain SepFormer model...")
sepformer = separator.from_hparams(
    source="speechbrain/sepformer-wsj02mix",
    savedir="pretrained_models/sepformer-wsj02mix",
    run_opts={"device": str(DEVICE)}
)

# === STEP 2: Separate overlapping voices ===
print(f"Separating sources in {INPUT_FILE}...")
est_sources = sepformer.separate_file(path=INPUT_FILE)

# === STEP 3: Save each separated speaker ===
print("Saving separated audio files...")
separated_files = []
for i, source in enumerate(est_sources):
    output_path = os.path.join(OUTPUT_DIR, "separated_speakers", f"speaker_{i+1}.wav")
    source = source.squeeze(0).transpose(0, 1)  # [frames, channels]
    torchaudio.save(output_path, source.cpu(), SAMPLE_RATE)
    separated_files.append(output_path)
    print(f"  -> {output_path}")

print("\n✅ Separation done.")

# === STEP 4: Load pyannote-audio diarization pipeline ===
print("\nLoading pyannote-audio diarization pipeline...")
diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=HUGGINGFACE_TOKEN
).to(DEVICE)

# === STEP 5: Load ASR pipeline ===
print("Loading automatic speech recognition (ASR) pipeline...")

# Select appropriate model based on language
if LANGUAGE == "ur":
    # Whisper model for Urdu
    asr_pipeline = transformers_pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-medium",
        device=DEVICE
    )
else:
    # Default to multilingual model
    asr_pipeline = transformers_pipeline(
        "automatic-speech-recognition",
        model="facebook/wav2vec2-large-xlsr-53",
        device=DEVICE
    )

# === STEP 6: Process each separated speaker ===
print("\nProcessing each speaker...")

# Dictionary to store final results
final_results = []

for i, audio_path in enumerate(separated_files):
    print(f"\nProcessing speaker {i+1}...")

    # === Diarization ===
    print("  Running diarization...")
    diarization = diarization_pipeline(audio_path)

    # === Speaker Identification ===
    speaker_id = f"SPEAKER_{i+1:02d}"
    speaker_name = speaker_id  # Default to ID if no mapping found

    # Check if we have a predefined mapping for this speaker
    for mapped_id, info in SPEAKER_MAPPINGS.items():
        # In a real application, you'd compare voice fingerprints here
        # For this example, we'll just match by order if files exist
        if os.path.exists(info["reference_audio"]):
            speaker_name = info["name"]
            break

    # === Transcription ===
    print("  Transcribing audio...")

    # Load audio file
    waveform, sample_rate = torchaudio.load(audio_path)

    # Resample if needed
    if sample_rate != SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(sample_rate, SAMPLE_RATE)
        waveform = resampler(waveform)

    # Convert to numpy array and normalize
    audio_np = waveform.numpy().squeeze()
    audio_np = audio_np / max(abs(audio_np.max()), abs(audio_np.min()))

    # Run ASR
    transcription = asr_pipeline(
        audio_np,
        generate_kwargs={"language": LANGUAGE},
        chunk_length_s=30,
        stride_length_s=5,
    )["text"]

    # Format the results
    result_entry = {
        "speaker_id": speaker_id,
        "speaker_name": speaker_name,
        "transcription": transcription,
        "timestamps": []
    }

    # Add timestamps from diarization
    for turn, _, _ in diarization.itertracks(yield_label=True):
        result_entry["timestamps"].append({
            "start": str(timedelta(seconds=round(turn.start, 2))),
            "end": str(timedelta(seconds=round(turn.end, 2))),
            "duration": round(turn.end - turn.start, 2)
        })

    final_results.append(result_entry)

# === STEP 7: Save and display results ===
print("\n=== FINAL RESULTS ===")
for result in final_results:
    print(f"\n{result['speaker_name']} said:")
    print(result['transcription'])
    print("\nSpeaking segments:")
    for segment in result['timestamps']:
        print(f"  {segment['start']} - {segment['end']}")

# Save results to text file
output_file = os.path.join(OUTPUT_DIR, "transcription_results.txt")
with open(output_file, "w", encoding="utf-8") as f:
    for result in final_results:
        f.write(f"\n{result['speaker_name']} said:\n")
        f.write(result['transcription'] + "\n")
        f.write("\nSpeaking segments:\n")
        for segment in result['timestamps']:
            f.write(f"  {segment['start']} - {segment['end']}\n")

print(f"\n✅ ALL DONE! Results saved to {output_file}")


INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/pretrained_models/sepformer-wsj02mix/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/sepformer-wsj02mix' if not cached


Using device: cuda
Loading SpeechBrain SepFormer model...


DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained_models/sepformer-wsj02mix.
INFO:speechbrain.utils.fetching:Fetch masknet.ckpt: Using symlink found at '/content/pretrained_models/sepformer-wsj02mix/masknet.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["masknet"] = /content/pretrained_models/sepformer-wsj02mix/masknet.ckpt
INFO:speechbrain.utils.fetching:Fetch encoder.ckpt: Using symlink found at '/content/pretrained_models/sepformer-wsj02mix/encoder.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["encoder"] = /content/pretrained_models/sepformer-wsj02mix/encoder.ckpt
INFO:speechbrain.utils.fetching:Fetch decoder.ckpt: Using symlink found at '/content/pretrained_models/sepformer-wsj02mix/decoder.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["decoder"] = /content/pretrained_models/sepformer-wsj02mix/decoder.ckpt
INFO:speechbrain.utils.parame

Separating sources in MUSK.wav...
Resampling the audio from 44100 Hz to 8000 Hz
Saving separated audio files...
  -> output_results/separated_speakers/speaker_1.wav

✅ Separation done.

Loading pyannote-audio diarization pipeline...
Loading automatic speech recognition (ASR) pipeline...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Device set to use cuda



Processing each speaker...

Processing speaker 1...
  Running diarization...


  std = sequences.std(dim=-1, correction=1)


  Transcribing audio...


ValueError: We expect a single channel audio input for AutomaticSpeechRecognitionPipeline

In [2]:
pip install fastapi uvicorn pyngrok speechbrain pyannote.audio openai-whisper torchaudio

Collecting pyngrok
  Downloading pyngrok-7.2.11-py3-none-any.whl.metadata (9.4 kB)
Collecting speechbrain
  Downloading speechbrain-1.0.3-py3-none-any.whl.metadata (24 kB)
Collecting pyannote.audio
  Downloading pyannote.audio-3.3.2-py2.py3-none-any.whl.metadata (11 kB)
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting asteroid-filterbanks>=0.4 (from pyannote.audio)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.0.1 (from pyannote.audio)
  Downloading lightning-2.5.2-py3-none-any.whl.metadata (

In [None]:
"""
Install requirements first:
  pip install fastapi uvicorn pyngrok speechbrain pyannote.audio openai-whisper torchaudio

Put your Hugging Face token below.
"""

import os
from pathlib import Path
from fastapi import FastAPI, UploadFile, Form
from fastapi.responses import JSONResponse
import torch
from pyngrok import ngrok
from pyannote.audio import Pipeline, Model, Audio
from speechbrain.pretrained import SepformerSeparation as separator
import whisper
from pyannote.audio.core.io import Audio
from pyannote.core import Segment
import torchaudio
from pyannote.core import Segment
import numpy as np
from pyannote.audio import Inference
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    print("✅ CUDA cache cleared.")
# Add your authtoken here
ngrok.set_auth_token("2s7GMbTPpGQYKgIVi7BGjc8tJl9_93xb2SdHD7LhVS1ibvTr")
import nest_asyncio
nest_asyncio.apply()

# === CONFIG ===
HUGGINGFACE_TOKEN = "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"
KNOWN_SPEAKERS_DIR = "known_speakers"
os.makedirs(KNOWN_SPEAKERS_DIR, exist_ok=True)
DB_PATH = "speaker_db.pt"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# === Load models ===
print("Loading models...")
sepformer = separator.from_hparams(
    source="speechbrain/sepformer-wsj02mix",
    savedir="pretrained_models/sepformer-wsj02mix",
    run_opts={"device": str(DEVICE)}
)
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=HUGGINGFACE_TOKEN
)
pipeline.to(DEVICE)
embedding_model = Model.from_pretrained(
    "pyannote/embedding",
    use_auth_token=HUGGINGFACE_TOKEN
)
embedding_model.to(DEVICE)
whisper_model = whisper.load_model("medium", device=str(DEVICE))
audio_util = Audio()

# === Load or init Speaker DB ===
if os.path.exists(DB_PATH):
    speaker_db = torch.load(DB_PATH)
    print("✅ Loaded existing Speaker DB.")
else:
    speaker_db = {}
    print("✅ Created new Speaker DB.")

# === FastAPI ===
app = FastAPI()



@app.post("/add_speaker")
async def add_speaker(
    file: UploadFile,
    name: str = Form(...)
):
    # Create speakers directory if it doesn't exist
    os.makedirs("speakers", exist_ok=True)

    # Save the uploaded file with original extension
    temp_path = f"./speakers/temp_{name}{Path(file.filename).suffix}"
    with open(temp_path, "wb") as f:
        f.write(await file.read())

    try:
        # Load audio file (supports both WAV and MP3)
        waveform, sr = torchaudio.load(temp_path)

        # Convert stereo to mono if needed
        if waveform.dim() == 2 and waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        elif waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)

        # Resample to 16kHz if needed (pyannote expects 16kHz)
        if sr != 16000:
            resampler = torchaudio.transforms.Resample(sr, 16000)
            waveform = resampler(waveform)
            sr = 16000

        # Save as WAV for consistency
        wav_path = f"./speakers/{name}.wav"
        torchaudio.save(wav_path, waveform, sr)

        # Move to device
        waveform = waveform.to(DEVICE)

        # Create embedding using Inference class (correct way for pyannote)
        from pyannote.audio import Inference
        embedding_fn = Inference(
            embedding_model,
            window="whole",
            device=DEVICE
        )

        # Get embedding - this will return a numpy array directly
        embedding = embedding_fn({
            "waveform": waveform,
            "sample_rate": sr
        })

        # Save embedding
        embedding_path = f"./speakers/{name}.npy"
        np.save(embedding_path, embedding)

        # Update speaker database
        speaker_db[name] = torch.from_numpy(embedding)
        torch.save(speaker_db, DB_PATH)

        return JSONResponse(content={
            "status": "success",
            "message": f"Speaker {name} added successfully",
            "embedding_shape": embedding.shape
        })

    except Exception as e:
        return JSONResponse(
            status_code=500,
            content={"status": "error", "message": str(e)}
        )
    finally:
        # Clean up temporary file
        if os.path.exists(temp_path):
            os.remove(temp_path)






MIN_SEGMENT_LENGTH = 0.5  # Minimum segment length in seconds

@app.post("/upload_audio")
async def upload_audio(file: UploadFile):
    """Upload multi-speaker audio, separate, diarize, transcribe, and match names."""
    try:
        # Save uploaded file
        input_file = "temp_input.wav"
        with open(input_file, "wb") as f:
            f.write(await file.read())

        # Separate overlapping speech
        est_sources = sepformer.separate_file(path=input_file)
        output_dir = "separated_speakers"
        os.makedirs(output_dir, exist_ok=True)

        # Save separated sources
        for i, source in enumerate(est_sources):
            source = source.transpose(0, 1)  # [channels, time]
            torchaudio.save(
                os.path.join(output_dir, f"speaker_{i+1}.wav"),
                source.cpu(),
                8000
            )

        results = []

        # Initialize embedding inference
        embedding_fn = Inference(
            "pyannote/embedding",
            window="whole",
            device=DEVICE,
            use_auth_token=HUGGINGFACE_TOKEN
        )

        # Process each separated file
        for spk_file in sorted(Path(output_dir).glob("speaker_*.wav")):
            spk_file_str = str(spk_file)

            # Diarize
            diarization = pipeline(spk_file_str)

            for turn, _, spk_label in diarization.itertracks(yield_label=True):
                segment_length = turn.end - turn.start
                if segment_length < MIN_SEGMENT_LENGTH:
                    continue  # Skip segments that are too short

                seg_wav = f"temp_seg_{spk_label}_{int(turn.start*100):06}.wav"

                try:
                    # Create segment and crop
                    segment = Segment(float(turn.start), float(turn.end))
                    cropped_waveform, sample_rate = audio_util.crop(spk_file_str, segment)

                    # Skip if we got empty audio
                    if cropped_waveform.numel() == 0:
                        continue

                    # Ensure proper tensor dimensions [channels, time]
                    if cropped_waveform.dim() == 1:
                        cropped_waveform = cropped_waveform.unsqueeze(0)
                    elif cropped_waveform.size(0) > 1:  # multi-channel to mono
                        cropped_waveform = cropped_waveform.mean(dim=0, keepdim=True)

                    # Pad short segments
                    min_samples = 16000 * MIN_SEGMENT_LENGTH  # 16kHz * 0.5s
                    if cropped_waveform.size(1) < min_samples:
                        padding = min_samples - cropped_waveform.size(1)
                        cropped_waveform = torch.nn.functional.pad(
                            cropped_waveform,
                            (0, padding),
                            mode='constant'
                        )

                    # Save with proper dimensions
                    torchaudio.save(
                        seg_wav,
                        cropped_waveform.cpu(),
                        sample_rate
                    )

                    # Transcribe
                    result = whisper_model.transcribe(seg_wav, language="en")
                    text = result["text"].strip()

                    # Get embedding
                    waveform, sr = torchaudio.load(seg_wav)
                    if waveform.dim() == 2 and waveform.size(0) > 1:
                        waveform = waveform.mean(dim=0, keepdim=True)

                    embedding = embedding_fn({
                        "waveform": waveform.to(DEVICE),
                        "sample_rate": sr
                    })
                    seg_emb = torch.from_numpy(embedding).cpu()

                    # Match to known speakers
                    best_score = -1
                    best_name = "Unknown"
                    for name, ref_emb in speaker_db.items():
                        score = torch.nn.functional.cosine_similarity(
                            seg_emb.unsqueeze(0),
                            ref_emb.unsqueeze(0),
                            dim=1
                        ).item()
                        if score > best_score:
                            best_score = score
                            best_name = name

                    final_name = best_name if best_score > 0.75 else "Unknown"
                    results.append({
                        "speaker": final_name,
                        "start": float(turn.start),
                        "end": float(turn.end),
                        "text": text,
                        "similarity": float(best_score)
                    })

                except Exception as e:
                    print(f"Error processing segment {turn.start}-{turn.end}: {str(e)}")
                    continue
                finally:
                    if os.path.exists(seg_wav):
                        os.remove(seg_wav)

        return JSONResponse(content={"results": results})

    except Exception as e:
        return JSONResponse(
            status_code=500,
            content={"status": "error", "message": str(e)}
        )
    finally:
        # Clean up files
        if os.path.exists(input_file):
            os.remove(input_file)
        if os.path.exists(output_dir):
            for f in Path(output_dir).glob("speaker_*.wav"):
                os.remove(f)



# === Run in Colab ===
public_url = ngrok.connect(8001)
print(f"🚀 PUBLIC URL: {public_url}")

import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8001)
import uvicorn
from threading import Thread

def run():
    uvicorn.run(app, host="0.0.0.0", port=8001)

Thread(target=run).start()

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover
  from speechbrain.pretrained import SepformerSeparation as separator
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/pretrained_models/sepformer-wsj02mix/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/sepformer-wsj02mix' if not cached


✅ CUDA cache cleared.
Using device: cuda
Loading models...


DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained_models/sepformer-wsj02mix.
INFO:speechbrain.utils.fetching:Fetch masknet.ckpt: Using symlink found at '/content/pretrained_models/sepformer-wsj02mix/masknet.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["masknet"] = /content/pretrained_models/sepformer-wsj02mix/masknet.ckpt
INFO:speechbrain.utils.fetching:Fetch encoder.ckpt: Using symlink found at '/content/pretrained_models/sepformer-wsj02mix/encoder.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["encoder"] = /content/pretrained_models/sepformer-wsj02mix/encoder.ckpt
INFO:speechbrain.utils.fetching:Fetch decoder.ckpt: Using symlink found at '/content/pretrained_models/sepformer-wsj02mix/decoder.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["decoder"] = /content/pretrained_models/sepformer-wsj02mix/decoder.ckpt
INFO:speechbrain.utils.parame

Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.6.0+cu124. Bad things might happen unless you revert torch to 1.x.


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--embedding/snapshots/4db4899737a38b2d618bbd74350915aa10293cb2/pytorch_model.bin`
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['loss_func.W']


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.6.0+cu124. Bad things might happen unless you revert torch to 1.x.
✅ Loaded existing Speaker DB.


INFO:     Started server process [36019]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit)


🚀 PUBLIC URL: NgrokTunnel: "https://c14e-34-16-158-97.ngrok-free.app" -> "http://localhost:8001"
Resampling the audio from 44100 Hz to 8000 Hz


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--embedding/snapshots/4db4899737a38b2d618bbd74350915aa10293cb2/pytorch_model.bin`
It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.



Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.6.0+cu124. Bad things might happen unless you revert torch to 1.x.


  std = sequences.std(dim=-1, correction=1)


Error processing segment 10.10534375-10.61159375: pad(): argument 'pad' failed to unpack the object at pos 2 with error "type must be tuple of ints,but got float"
Error processing segment 58.587218750000005-59.56596875: pad(): argument 'pad' failed to unpack the object at pos 2 with error "type must be tuple of ints,but got float"
INFO:     2400:adc1:41d:600:9803:8812:e7ce:9c82:0 - "POST /upload_audio HTTP/1.1" 200 OK


In [3]:
!pip install -q nest_asyncio