
# 🎧 Transcrever WAV no Google Colab — com **Quem Falou** e **Horário**
Transcreve **WAV** com Whisper (`faster-whisper`) e gera **TXT / SRT / VTT** com:
- Rótulo de quem fala (se estéreo, separa L/R e rotula cada lado)
- Timestamp relativo e, opcionalmente, horário real se você informar `START_CLOCK_ISO`.


In [None]:

# ⬇️ Install dependencies
# Pinned Python packages: faster-whisper (imported below as `faster_whisper`)
# and ffmpeg-python (imported below as `ffmpeg`).
!pip -q install faster-whisper==1.0.0 ffmpeg-python==0.2.0
# System ffmpeg package, installed through apt.
!apt -qq install -y ffmpeg


In [None]:

# ✅ Checagens rápidas
import sys, os, platform, importlib
def ver(name):
    """Return the version string of the importable module *name*.

    Returns the module's ``__version__`` attribute, ``"?.?.?"`` when the
    module has no such attribute, or ``"(erro: ...)"`` carrying the exception
    text when the import (or attribute lookup) fails.
    """
    try:
        module = importlib.import_module(name)
        version = getattr(module, "__version__", "?.?.?")
    except Exception as exc:
        return f"(erro: {exc})"
    return version
print("Python:", sys.version.split()[0], "| OS:", platform.system(), platform.release())
print("faster-whisper:", ver("faster_whisper"))
print("ffmpeg-python:", ver("ffmpeg"))
!ffmpeg -version | head -n 1 || echo "ffmpeg não encontrado"
gpu = os.popen("nvidia-smi -L 2>/dev/null").read().strip()
print("\nGPU:", gpu if gpu else "Sem GPU dedicada detectada")


In [None]:

# ⚙️ Settings
from pathlib import Path
# Folder scanned for input .wav files and folder that receives the outputs;
# both are created here if missing.
INPUT_DIR = Path("/content/entrada")
OUTPUT_DIR = Path("/content/saida")
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# MODEL_SIZE is handed to WhisperModel; LANG_HINT is passed as `language`
# to model.transcribe (an empty value is sent as None).
MODEL_SIZE = "large-v3"
LANG_HINT = "pt"

# Options forwarded to model.transcribe by transcrever_um.
# USE_VAD -> vad_filter; the two VAD_* values are sent as
# vad_parameters (min_silence_duration_ms / speech_pad_ms) only when USE_VAD is on.
USE_VAD = True
VAD_MIN_SILENCE_MS = 600
VAD_SPEECH_PAD_MS  = 200
# -> beam_size, best_of, temperature
BEAM_SIZE = 8
BEST_OF = 5
TEMPERATURES = [0.0, 0.2, 0.4, 0.6]
# -> compression_ratio_threshold, log_prob_threshold, no_speech_threshold
COMPR_THR = 2.4
LOGPROB_THR = -1.2
NOSPEECH_THR = 0.45
# -> condition_on_previous_text
CONDITION_ON_PREV = False

# NORMALIZE: run loudnorm() on each file before transcription.
# SPLIT_STEREO: when a file is detected as stereo, split it and transcribe each side separately.
NORMALIZE = False
SPLIT_STEREO = True

# Speaker labels written for the left and right channel transcripts.
LABEL_LEFT  = "Agente"
LABEL_RIGHT = "Cliente"

# Optional wall-clock start of the recording, parsed with datetime.fromisoformat;
# leave empty to emit relative timestamps only.
START_CLOCK_ISO = ""  # e.g. "2025-09-02T09:30:00-03:00"

print("Entrada:", INPUT_DIR)
print("Saída:  ", OUTPUT_DIR)
print("Modelo:", MODEL_SIZE, "| Idioma:", (LANG_HINT or "(autodetect)"))


In [None]:

# 📁 (Optional) Mount Google Drive
# When enabled, the input/output folders are redirected to a
# "transcricoes" folder inside the mounted Drive and created if missing.
USE_DRIVE = False
if USE_DRIVE:
    from google.colab import drive
    drive.mount('/content/drive')
    _drive_root = Path("/content/drive/MyDrive/transcricoes")
    INPUT_DIR = _drive_root / "entrada"
    OUTPUT_DIR = _drive_root / "saida"
    for _folder in (INPUT_DIR, OUTPUT_DIR):
        _folder.mkdir(parents=True, exist_ok=True)
    print("Drive montado.")
    print("Entrada:", INPUT_DIR)
    print("Saída:  ", OUTPUT_DIR)


In [None]:

# ⬆️ Upload de WAVs
from google.colab import files
import shutil
from pathlib import Path

# Toggle for the interactive upload; when off, files already present in
# INPUT_DIR are used as-is.
DO_UPLOAD = True
if not DO_UPLOAD:
    print("Upload desativado; usando arquivos de:", INPUT_DIR)
else:
    print("Selecione um ou mais arquivos .wav...")
    up = files.upload()
    for name in up:
        # Each uploaded file is looked up under /content and moved into INPUT_DIR.
        src = Path("/content") / name
        dst = INPUT_DIR / name
        if not src.exists():
            print("⚠️ Não encontrei após upload:", src)
            continue
        shutil.move(str(src), str(dst))
        print("✔️ Movido para:", dst)


In [None]:

# 🧠 Funções
from datetime import datetime, timedelta
import ffmpeg
from faster_whisper import WhisperModel

def fmt_rel_ts(t: float) -> str:
    """Format an offset in seconds as an SRT-style ``HH:MM:SS,mmm`` timestamp.

    The offset is converted to whole milliseconds once, by rounding, and the
    fields are derived from that integer. Taking the fractional part of the
    float and truncating it loses a millisecond to representation error
    (2.3 s would render as ``00:00:02,299``). Negative offsets are clamped
    to zero.

    Args:
        t: Offset from the start of the audio, in seconds.

    Returns:
        The timestamp with a comma before the milliseconds.
    """
    total_ms = max(0, int(round(float(t) * 1000)))
    total_s, ms = divmod(total_ms, 1000)
    total_m, s = divmod(total_s, 60)
    h, m = divmod(total_m, 60)
    return f"{h:02}:{m:02}:{s:02},{ms:03}"

def fmt_wall_ts(start_clock, offset_s: float):
    """Return the wall-clock time *offset_s* seconds after *start_clock*.

    Args:
        start_clock: ``datetime`` for the start of the recording, or a falsy
            value when no wall clock is known.
        offset_s: Seconds elapsed since ``start_clock``.

    Returns:
        ``None`` when ``start_clock`` is falsy; otherwise ``"HH:MM:SS"``
        followed by the UTC offset as ``+HH:MM`` when the datetime carries one.
    """
    if not start_clock:
        return None
    moment = start_clock + timedelta(seconds=offset_s)
    clock = moment.strftime("%H:%M:%S")
    raw_offset = moment.strftime("%z")  # "" for naive datetimes, e.g. "-0300" otherwise
    if raw_offset:
        pretty_offset = f"{raw_offset[:3]}:{raw_offset[3:]}"
    else:
        pretty_offset = ""
    return f"{clock} {pretty_offset}".strip()

def write_outputs(base_out: Path, segments, speaker: str | None = None, start_clock=None):
    """Write the transcript as ``.txt``, ``.srt`` and ``.vtt`` files.

    Args:
        base_out: Output path whose suffix is swapped for each format. Note
            that ``Path.with_suffix`` replaces the last dotted component of
            the name, so a base named ``call.L.tmp`` yields ``call.L.txt``.
        segments: Iterable of objects exposing ``text``, ``start`` and ``end``
            (seconds). Segments whose text is empty after stripping are skipped.
        speaker: Label prefixed to every line; ``"Speaker"`` when not given.
        start_clock: Optional ``datetime`` of the recording start; when set,
            each line also carries the wall-clock time of the segment start.
    """
    label = (speaker or "Speaker").strip()
    plain_rows = []
    srt_rows = []
    vtt_rows = ["WEBVTT\n"]
    cue_number = 1

    for segment in segments:
        spoken = (segment.text or "").strip()
        if not spoken:
            continue

        begin = fmt_rel_ts(segment.start)
        finish = fmt_rel_ts(segment.end)
        wall_clock = fmt_wall_ts(start_clock, segment.start)

        # TXT shows HH:MM:SS only (no milliseconds); the cue text is shared by SRT and VTT.
        if wall_clock:
            txt_stamp = f"{begin[:8]} | {wall_clock}"
            cue_text = f"{label} [{wall_clock}]: {spoken}"
        else:
            txt_stamp = begin[:8]
            cue_text = f"{label}: {spoken}"

        plain_rows.append(f"[{txt_stamp}] {label}: {spoken}")
        srt_rows.extend([str(cue_number), f"{begin} --> {finish}", cue_text, ""])
        # VTT uses a dot, not a comma, before the milliseconds.
        vtt_rows.extend([f"{begin.replace(',', '.')} --> {finish.replace(',', '.')}", cue_text, ""])
        cue_number += 1

    for suffix, rows in ((".txt", plain_rows), (".srt", srt_rows), (".vtt", vtt_rows)):
        base_out.with_suffix(suffix).write_text("\n".join(rows), encoding="utf-8")

def is_stereo(wav_path: Path) -> bool:
    """Return True when the first audio stream of *wav_path* has exactly two channels.

    Any probing error (or a file with no audio stream) is reported as
    ``False`` so the caller falls back to treating the file as mono.
    """
    try:
        streams = ffmpeg.probe(str(wav_path)).get("streams", [])
        first_audio = next((s for s in streams if s.get("codec_type") == "audio"), None)
        if first_audio is not None:
            return int(first_audio.get("channels", 1)) == 2
    except Exception:
        pass
    return False

def split_stereo(in_wav: Path, left_out: Path, right_out: Path):
    """Extract the left and right channels of a stereo file into two mono files.

    Channels are selected with ffmpeg's ``pan`` filter (``c0`` = first/left
    input channel, ``c1`` = second/right). The ``-map 0:a:N`` option is not
    suitable here: it selects the N-th audio *stream*, and a stereo WAV has a
    single audio stream, so ``0:a:0`` with ``-ac 1`` produces a downmix of
    both sides and ``0:a:1`` matches nothing and fails.

    Args:
        in_wav: Stereo input file.
        left_out: Destination for the first (left) channel.
        right_out: Destination for the second (right) channel.

    Raises:
        ffmpeg.Error: If an ffmpeg invocation exits with an error.
    """
    for channel_index, out_path in enumerate((left_out, right_out)):
        (
            ffmpeg.input(str(in_wav))
            .output(str(out_path), ac=1, af=f"pan=mono|c0=c{channel_index}")
            .overwrite_output()
            .run(quiet=True)
        )

def loudnorm(in_wav: Path, out_wav: Path, i_lufs="-16", tp_db="-1.5", lra="11"):
    """Write a copy of *in_wav* to *out_wav* processed by ffmpeg's ``loudnorm`` filter.

    Args:
        in_wav: Source audio file.
        out_wav: Destination file; overwritten if it exists.
        i_lufs: Value for the filter's ``I`` option.
        tp_db: Value for the filter's ``TP`` option.
        lra: Value for the filter's ``LRA`` option.
    """
    filter_spec = f"loudnorm=I={i_lufs}:TP={tp_db}:LRA={lra}"
    job = ffmpeg.input(str(in_wav)).output(str(out_wav), af=filter_spec)
    job.overwrite_output().run(quiet=True)

def build_model(model_size: str):
    """Create a ``WhisperModel`` on CUDA/float16 when available, else CPU/int8.

    CUDA availability is checked through ``torch``; if torch cannot be
    imported or the check fails, the CPU configuration is used.
    """
    has_cuda = False
    try:
        import torch
        has_cuda = bool(torch.cuda.is_available())
    except Exception:
        pass

    if has_cuda:
        device, compute_type = "cuda", "float16"
    else:
        device, compute_type = "cpu", "int8"

    print(f"Dispositivo: {device} | compute_type: {compute_type}")
    return WhisperModel(model_size, device=device, compute_type=compute_type)

def transcrever_um(
    wav_path: Path, out_dir: Path, model: WhisperModel, lang_hint: str | None,
    use_vad: bool, vad_min_silence_ms: int, vad_speech_pad_ms: int,
    beam_size: int, best_of: int, temperatures, compression_ratio_threshold: float,
    log_prob_threshold: float, no_speech_threshold: float, condition_on_previous_text: bool,
    speaker_label: str | None, start_clock=None,
):
    """Transcribe one audio file and write its TXT/SRT/VTT outputs.

    The output base name is ``out_dir / wav_path.stem``. All decoding
    arguments are forwarded to ``model.transcribe``; the VAD parameters are
    only sent when ``use_vad`` is true. An empty ``lang_hint`` is passed as
    ``language=None``.

    Args:
        wav_path: Audio file to transcribe.
        out_dir: Directory that receives the output files.
        model: Loaded ``WhisperModel``.
        speaker_label: Label forwarded to ``write_outputs``.
        start_clock: Optional recording start forwarded to ``write_outputs``.
    """
    options = {
        "language": lang_hint or None,
        "vad_filter": use_vad,
        "beam_size": beam_size,
        "best_of": best_of,
        "temperature": temperatures,
        "compression_ratio_threshold": compression_ratio_threshold,
        "log_prob_threshold": log_prob_threshold,
        "no_speech_threshold": no_speech_threshold,
        "condition_on_previous_text": condition_on_previous_text,
        "word_timestamps": False,
    }
    if use_vad:
        options["vad_parameters"] = {
            "min_silence_duration_ms": vad_min_silence_ms,
            "speech_pad_ms": vad_speech_pad_ms,
        }

    segments, _info = model.transcribe(str(wav_path), **options)
    destination = out_dir / wav_path.stem
    write_outputs(destination, segments, speaker=speaker_label, start_clock=start_clock)


In [None]:

# ▶️ Processar todos os WAVs
from pathlib import Path
from datetime import datetime

# Parse the optional wall-clock start; an invalid value is reported and ignored.
start_clock = None
if START_CLOCK_ISO.strip():
    try:
        start_clock = datetime.fromisoformat(START_CLOCK_ISO.strip())
    except ValueError as e:
        print(f"[!] START_CLOCK_ISO inválido, ignorando ({e})")

model = build_model(MODEL_SIZE)


def _transcrever(path, label):
    """Transcribe *path* into OUTPUT_DIR with the notebook settings and speaker *label*."""
    transcrever_um(path, OUTPUT_DIR, model, LANG_HINT, USE_VAD, VAD_MIN_SILENCE_MS, VAD_SPEECH_PAD_MS,
                   BEAM_SIZE, BEST_OF, TEMPERATURES, COMPR_THR, LOGPROB_THR, NOSPEECH_THR,
                   CONDITION_ON_PREV, label, start_clock)


wav_files = sorted(p for p in INPUT_DIR.iterdir() if p.suffix.lower() == ".wav")
if not wav_files:
    print("Nenhum .wav encontrado em:", INPUT_DIR)
else:
    for src in wav_files:
        print(f"\n[→] Processando: {src.name}")
        work = src
        tmp = []

        try:
            if NORMALIZE:
                norm = OUTPUT_DIR / f"{src.stem}.norm.tmp.wav"
                # Registered before use so a partially written file is still removed.
                tmp.append(norm)
                try:
                    loudnorm(src, norm)
                    work = norm
                    print("   • Normalizado (loudnorm)")
                except Exception as e:
                    print(f"   ! Falha normalização: {e}")

            if SPLIT_STEREO and is_stereo(work):
                L = OUTPUT_DIR / f"{src.stem}.L.tmp.wav"
                R = OUTPUT_DIR / f"{src.stem}.R.tmp.wav"
                # Registered up front: if the split or a transcription fails,
                # the temp WAVs must not be left behind in OUTPUT_DIR.
                tmp += [L, R]
                try:
                    split_stereo(work, L, R)
                    print("   • Estéreo → separando L/R")
                    _transcrever(L, LABEL_LEFT)
                    print("   • Canal L transcrito")
                    _transcrever(R, LABEL_RIGHT)
                    print("   • Canal R transcrito")
                except Exception as e:
                    # Best-effort fallback: transcribe the unsplit file without a speaker label.
                    print(f"   ! Falha split estéreo, transcrevendo mix: {e}")
                    _transcrever(work, None)
            else:
                _transcrever(work, None)
        finally:
            # Runs even when a transcription error propagates.
            for p in tmp:
                try:
                    p.unlink(missing_ok=True)
                except OSError as e:
                    print(f"   ! Não foi possível remover temporário {p.name}: {e}")

        print(f"[✓] Finalizado: {src.name}")

print("\nConcluído. Resultados em:", OUTPUT_DIR)


In [None]:

# ⬇️ Baixar resultados
from google.colab import files
# Offer every regular file in OUTPUT_DIR for download. Directories matched by
# the glob are skipped because they cannot be downloaded as a file.
out_files = sorted(f for f in OUTPUT_DIR.glob("*") if f.is_file())
if not out_files:
    print("Nada para baixar. Rode o processamento primeiro.")
else:
    for f in out_files:
        print("Baixando:", f.name)
        files.download(str(f))
