# Audio Transcription with Vertector

This notebook demonstrates:
- Whisper model configuration
- MLX vs Standard backend
- Timestamped transcriptions
- Multi-language support
- Batch processing
- SRT generation

## Setup

In [None]:
from pathlib import Path
from vertector_data_ingestion import (
    create_audio_transcriber,
    AudioConfig,
    WhisperModelSize,
    AudioBackend,
    HardwareDetector,
    setup_logging,
)

# Set up the package's logging with log_level="INFO" for the cells that follow.
setup_logging(log_level="INFO")

## Hardware Detection

In [None]:
# Ask the hardware detector for its device info and print the fields this
# notebook cares about, followed by a backend recommendation.
hw_info = HardwareDetector.get_device_info()
device_type = hw_info.get('device_type')

print("Hardware:")
print(f"  Device: {device_type}")
print(f"  Chip: {hw_info.get('chip', 'Unknown')}")
print(f"  Use MLX: {hw_info.get('use_mlx', False)}")
print(f"  Batch Size: {hw_info.get('batch_size', 1)}")

# Recommendation text keyed by device type; anything not listed here falls
# back to the CPU recommendation.
recommendation_by_device = {
    'mps': "\n✓ Recommend: MLX backend (10-20x faster on Apple Silicon)",
    'cuda': "\n✓ Recommend: Standard with CUDA",
}
print(recommendation_by_device.get(device_type, "\n✓ Recommend: Standard (CPU)"))

## Basic Transcription

In [None]:
# Build a transcriber for English audio with the BASE model and word-level
# timestamps requested.
# NOTE(review): AudioBackend.AUTO presumably lets the library choose the
# backend for the current hardware — confirm against the AudioConfig docs.
config = AudioConfig(
    backend=AudioBackend.AUTO,
    model_size=WhisperModelSize.BASE,
    word_timestamps=True,
    language="en",
)
transcriber = create_audio_transcriber(config)

# Sample file; `audio_path` and `result` are reused by later cells.
audio_path = Path("../test_documents/harvard.wav")

if not audio_path.exists():
    print(f"File not found: {audio_path}")
else:
    result = transcriber.transcribe(audio_path)

    print("Result:")
    print(f"  Text: {result.text}")
    print(f"  Language: {result.language}")
    print(f"  Duration: {result.duration:.2f}s")
    print(f"  Segments: {len(result.segments)}")

## Timestamped Segments

In [None]:
# Print the time range and text of the first five segments of `result`
# (the transcription produced by an earlier cell).
if audio_path.exists():
    leading_segments = result.segments[:5]
    for number, seg in enumerate(leading_segments, start=1):
        print(f"\nSegment {number}:")
        print(f"  Time: [{seg.start:.1f}s - {seg.end:.1f}s]")
        print(f"  Text: {seg.text}")

## Model Size Comparison

In [None]:
import time

# Transcribe the same file with three model sizes and report how long each
# transcribe() call takes, plus the first 100 characters of the text.
if audio_path.exists():
    models = [WhisperModelSize.TINY, WhisperModelSize.BASE, WhisperModelSize.SMALL]

    print("Model Comparison:")
    for model_size in models:
        config = AudioConfig(model_size=model_size, backend=AudioBackend.AUTO)
        transcriber = create_audio_transcriber(config)

        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring short durations; time.time() can jump if the system clock
        # is adjusted mid-run and would skew the comparison.
        # NOTE(review): only transcribe() is timed; whether that includes lazy
        # model loading depends on the transcriber implementation — confirm.
        start = time.perf_counter()
        result = transcriber.transcribe(audio_path)
        elapsed = time.perf_counter() - start

        print(f"\n{model_size.value.upper()}: {elapsed:.2f}s")
        print(f"  Text: {result.text[:100]}...")

## Multi-Language

In [None]:
# Transcribe without forcing a language and report the language found on the
# result.
if audio_path.exists():
    # language=None is this notebook's "auto-detect" setting.
    auto_config = AudioConfig(
        language=None,
        model_size=WhisperModelSize.BASE,
    )
    transcriber = create_audio_transcriber(auto_config)
    result = transcriber.transcribe(audio_path)

    print(f"Detected language: {result.language}")
    text_preview = result.text[:200]
    print(f"Text: {text_preview}...")

## Batch Processing

In [None]:
# Transcribe up to five .wav/.mp3 files from the test directory with a single
# shared transcriber, printing each file's duration and a 100-character preview.
audio_dir = Path("../test_documents/")

if audio_dir.exists():
    # Directory listing order depends on the filesystem, so sort each glob
    # result: the five files picked (and the order they print in) are then
    # the same on every run. WAV files still come before MP3 files.
    audio_files = sorted(audio_dir.glob("*.wav")) + sorted(audio_dir.glob("*.mp3"))

    if audio_files:
        # One transcriber is created up front and reused for every file.
        config = AudioConfig(model_size=WhisperModelSize.BASE)
        transcriber = create_audio_transcriber(config)

        for audio_file in audio_files[:5]:
            result = transcriber.transcribe(audio_file)
            print(f"\n{audio_file.name}: {result.duration:.1f}s")
            print(f"  {result.text[:100]}...")
    else:
        # Previously an existing-but-empty directory produced no output at all.
        print(f"No .wav or .mp3 files found in {audio_dir}")
else:
    print("Create '../test_documents/' directory with audio files")

## Generate SRT Subtitles

In [None]:
def format_srt_timestamp(seconds: float) -> str:
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The offset is rounded once to the nearest whole millisecond and then split
    with integer arithmetic. Extracting the fractional part with ``% 1`` and
    truncating it loses a millisecond to floating-point error (for example
    1.001 s would render as ``00:00:01,000``).

    Args:
        seconds: Offset from the start of the audio, in seconds. Negative
            values are clamped to zero because the ``HH:MM:SS,mmm`` layout
            has no representation for them.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for ``3661.5``.
    """
    # Work in integer milliseconds from here on so every field is exact.
    total_millis = max(0, round(seconds * 1000))
    hours, remainder = divmod(total_millis, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

# Transcribe the sample file and write its segments out as an SRT subtitle file.
if audio_path.exists():
    config = AudioConfig(
        word_timestamps=True,
        model_size=WhisperModelSize.BASE,
    )
    transcriber = create_audio_transcriber(config)
    result = transcriber.transcribe(audio_path)

    # One cue per segment: 1-based index line, "start --> end" line, then the
    # stripped segment text, each terminated by a newline.
    srt_output = [
        f"{cue_number}\n"
        f"{format_srt_timestamp(seg.start)} --> {format_srt_timestamp(seg.end)}\n"
        f"{seg.text.strip()}\n"
        for cue_number, seg in enumerate(result.segments, start=1)
    ]

    # Joining with "\n" leaves one blank line between consecutive cues.
    srt_content = "\n".join(srt_output)

    # Write the file into the output directory configured on a default
    # UniversalConverter.
    from vertector_data_ingestion import UniversalConverter
    converter = UniversalConverter()
    output_dir = converter.config.output_dir
    srt_path = output_dir / "transcript.srt"
    # The output directory may not exist yet; create it before writing.
    srt_path.parent.mkdir(parents=True, exist_ok=True)
    srt_path.write_text(srt_content, encoding="utf-8")

    print(f"Saved to: {srt_path}")

## Summary

This notebook demonstrated:
- Hardware detection
- Basic transcription
- Timestamped segments
- Model comparison
- Multi-language support
- Batch processing
- SRT generation

Next: `03_rag_pipeline.ipynb`