# ***STAGE 1: AUDIO PREPROCESSING***

In [12]:
pip install faster-whisper sentence-transformers librosa soundfile scikit-learn

Collecting faster-whisper
  Using cached faster_whisper-1.2.1-py3-none-any.whl.metadata (16 kB)
Collecting onnxruntime<2,>=1.14 (from faster-whisper)
  Using cached onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting av>=11 (from faster-whisper)
  Using cached av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting coloredlogs (from onnxruntime<2,>=1.14->faster-whisper)
  Using cached coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Using cached faster_whisper-1.2.1-py3-none-any.whl (1.1 MB)
Using cached av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl (40.5 MB)
Using cached onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (17.4 MB)
Using cached coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
Installing collected packages: coloredlogs, av, onnxruntime, faster-whisper
Successfully installed av-16.0.1 coloredlogs-15.0.1 faster-whisper-1.2.1 onnxruntime-1.23.2


In [13]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

Looking in indexes: https://download.pytorch.org/whl/cu126


In [14]:
pip install tensorflow[and-cuda]



In [15]:
!ffmpeg -err_detect ignore_err -i content/audio_data/input2.wav -ac 1 -ar 16000 -acodec pcm_s16le outputs/fixed_audio.wav

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [16]:
import torch

# Report whether PyTorch can see a CUDA device, and name device 0 when it can.
has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)
if has_cuda:
    print("GPU:", torch.cuda.get_device_name(0))

CUDA available: False
GPU: CPU


In [18]:
import json
import logging
from collections import Counter
from pathlib import Path

import librosa
import numpy as np
import soundfile as sf
import torch
from faster_whisper import WhisperModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# ================= CONFIG =================
# Output location: created up front so later steps can write into it.
OUT_DIR = Path("outputs")
OUT_DIR.mkdir(exist_ok=True)

# Input audio consumed by the pipeline.
AUDIO_FILE = "outputs/fixed_audio.wav"

# Audio sample rate (Hz) and the cosine-similarity cut-off below which a
# segment is treated as the start of a new topic.
SR = 16000
TOPIC_THRESHOLD = 0.35

# Whisper model selection; device and numeric precision follow CUDA availability.
MODEL_SIZE = "medium"  # small / medium / large-v2
if torch.cuda.is_available():
    DEVICE, COMPUTE_TYPE = "cuda", "float16"
else:
    DEVICE, COMPUTE_TYPE = "cpu", "int8"

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")

# ================= PIPELINE =================
class PodcastPipelineCUDA:
    """Transcribe an audio file and group the transcript into topic chapters.

    The pipeline:
      1. loads, resamples (to ``SR``) and peak-normalises the audio,
      2. transcribes it with a faster-whisper ``WhisperModel``,
      3. embeds each transcribed segment and starts a new chapter whenever the
         cosine similarity to the running topic embedding drops below
         ``TOPIC_THRESHOLD``,
      4. writes ``transcript.txt``, ``segments.json`` and ``chapters.json``
         into ``OUT_DIR``.

    Despite the class name, the models run on whatever ``DEVICE`` is set to.
    """

    def __init__(self):
        """Load the ASR and sentence-embedding models and initialise run state."""
        logging.info(f"Using device: {DEVICE}")

        self.asr = WhisperModel(
            MODEL_SIZE,
            device=DEVICE,
            compute_type=COMPUTE_TYPE
        )

        self.embedder = SentenceTransformer(
            "all-MiniLM-L6-v2",
            device=DEVICE
        )

        self._reset_state()

    def _reset_state(self):
        """Clear all per-run results so every ``run`` starts from scratch."""
        self.current_topic_emb = None  # running embedding of the active topic
        self.topic_id = 0              # 0 until the first speech segment is seen

        self.segments = []
        self.chapters = []

    def preprocess_audio(self, path):
        """Load ``path`` as mono, resample to ``SR`` and peak-normalise.

        The processed waveform is also written to
        ``OUT_DIR / "processed_audio.wav"``.

        Returns:
            The processed waveform as a float32 NumPy array sampled at ``SR``.
        """
        y, sr = librosa.load(path, sr=None, mono=True)
        if sr != SR:
            # The sample rates are keyword-only in librosa >= 0.10.
            y = librosa.resample(y, orig_sr=sr, target_sr=SR)
        # Guard against an empty file: np.max raises on a zero-size array.
        peak = float(np.max(np.abs(y))) if y.size else 0.0
        # The small epsilon avoids division by zero on all-silent audio.
        y = (y / (peak + 1e-9)).astype(np.float32)
        sf.write(OUT_DIR / "processed_audio.wav", y, SR)
        return y

    def run(self, audio_path):
        """Run the full pipeline on ``audio_path`` and write the output files.

        State left over from a previous call is discarded first, so the method
        can be invoked repeatedly on the same instance.
        """
        self._reset_state()

        # Transcribe the preprocessed waveform so the resampling and
        # normalisation above actually take effect.
        audio = self.preprocess_audio(audio_path)

        segments, _ = self.asr.transcribe(
            audio,
            vad_filter=True,
            beam_size=5
        )

        for seg in segments:
            text = seg.text.strip()

            # Segments with almost no text are recorded as a "[Music]" effect
            # and do not take part in topic detection.
            if len(text) < 3:
                self._store_effect(seg)
                continue

            speaker = self._detect_speaker(seg)
            topic_change = self._detect_topic(text)

            if topic_change:
                self._start_new_chapter(seg.start)

            self._store_segment(seg, speaker)

        self._finalize()

    # ---------- Speaker (placeholder, extendable) ----------
    def _detect_speaker(self, seg):
        """Return a placeholder speaker label.

        This is not real diarisation: the label simply alternates with the
        parity of the segment's start time in whole seconds.
        """
        return "Speaker 1" if int(seg.start) % 2 == 0 else "Speaker 2"

    # ---------- Topic Detection ----------
    def _detect_topic(self, text):
        """Update the running topic embedding and report a topic change.

        The first call initialises the topic, opens the first chapter at
        ``0.0`` and returns ``False``. Later calls return ``True`` when the
        similarity to the running topic falls below ``TOPIC_THRESHOLD``; the
        caller is then responsible for opening the new chapter.
        """
        emb = self.embedder.encode(text, convert_to_numpy=True).reshape(1, -1)

        if self.current_topic_emb is None:
            self.current_topic_emb = emb
            self.topic_id = 1
            self._start_new_chapter(0.0)
            return False

        sim = cosine_similarity(emb, self.current_topic_emb)[0][0]

        if sim < TOPIC_THRESHOLD:
            self.current_topic_emb = emb
            self.topic_id += 1
            return True

        # Same topic: move the running embedding towards the new segment.
        self.current_topic_emb = 0.7 * self.current_topic_emb + 0.3 * emb
        return False

    # ---------- Storage ----------
    def _store_segment(self, seg, speaker):
        """Record a speech segment and extend the current chapter with it.

        Requires that a chapter has already been opened.
        """
        text = seg.text.strip()
        self.segments.append({
            "start": seg.start,
            "end": seg.end,
            "speaker": speaker,
            "topic": self.topic_id,
            "text": text
        })

        current = self.chapters[-1]
        current["end"] = seg.end
        current["texts"].append(text)

    def _store_effect(self, seg):
        """Record a near-empty segment as a ``[Music]`` entry with no topic."""
        self.segments.append({
            "start": seg.start,
            "end": seg.end,
            "speaker": "[Music]",
            "topic": None,
            "text": "[Music]"
        })

    def _start_new_chapter(self, start):
        """Open a new, empty chapter for the current ``topic_id`` at ``start``."""
        self.chapters.append({
            "topic_id": self.topic_id,
            "start": start,
            "end": start,
            "texts": []
        })

    @staticmethod
    def _chapter_title(texts, max_words=6):
        """Build a title from the most frequent words in ``texts``.

        Words are lower-cased and split on whitespace. Ties keep
        first-appearance order, so the title is deterministic across runs.
        """
        words = " ".join(texts).lower().split()
        top_words = [word for word, _ in Counter(words).most_common(max_words)]
        return " ".join(top_words).title()

    def _finalize(self):
        """Title each chapter and write the transcript and JSON outputs."""
        transcript = [
            f"[{int(s['start']//60):02}:{int(s['start']%60):02}] "
            f"{s['speaker']}: {s['text']}"
            for s in self.segments
        ]

        for ch in self.chapters:
            # pop() replaces the raw texts with the title; a chapter that was
            # already finalised simply yields an empty title source.
            ch["title"] = self._chapter_title(ch.pop("texts", []))

        # Explicit UTF-8 so non-ASCII transcripts do not depend on the
        # platform's default encoding.
        (OUT_DIR / "transcript.txt").write_text("\n".join(transcript), encoding="utf-8")
        (OUT_DIR / "segments.json").write_text(json.dumps(self.segments, indent=2), encoding="utf-8")
        (OUT_DIR / "chapters.json").write_text(json.dumps(self.chapters, indent=2), encoding="utf-8")

        logging.info("CUDA pipeline completed")

# ================= RUN =================
if __name__ == "__main__":
    # Constructing the pipeline loads the Whisper and sentence-embedding models.
    pipeline = PodcastPipelineCUDA()
    # Processes AUDIO_FILE end to end; the pipeline's finalize step writes
    # transcript.txt, segments.json and chapters.json into OUT_DIR.
    pipeline.run(AUDIO_FILE)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]