In [None]:
from faster_whisper import WhisperModel
import time
import datetime
from tqdm.notebook import tqdm


def format_srt_time(seconds):
    """Convert an offset in seconds to an SRT timestamp (HH:MM:SS,mmm).

    Args:
        seconds: Offset from the start of the media, in seconds (int or
            float). Negative values are clamped to zero because SRT has no
            representation for negative timestamps.

    Returns:
        The timestamp as a string, e.g. ``"01:01:01,500"``. Precision below
        one millisecond is truncated, and the hours field is not wrapped at
        24 (90000 seconds yields ``"25:00:00,000"``).
    """
    # timedelta rounds the input to whole microseconds, which absorbs float
    # noise such as 6.76 being stored as 6.7599999...
    delta = datetime.timedelta(seconds=max(seconds, 0))

    # Work in integer milliseconds from here on so every field is derived
    # from the same non-negative quantity.
    total_ms = delta // datetime.timedelta(milliseconds=1)

    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    secs, millis = divmod(remainder_ms, 1000)

    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"

# Whisper checkpoint to load; inference is requested on CUDA in float16.
model_size = "large-v3"
model = WhisperModel(model_size, device="cuda", compute_type="float16")

# Base name shared by the input video and the subtitle file written below.
file_name = "Entrevista (2025-08-21 19_18 GMT-3)"
audio_file = file_name + ".mp4"

print("Iniciando a transcrição...")
# perf_counter() is monotonic, so the elapsed time reported at the end is not
# affected by system clock adjustments during a long run.
start_time = time.perf_counter()

segments, info = model.transcribe(audio_file, beam_size=20)

print(f"Idioma detectado: {info.language} com probabilidade de {info.language_probability:.2f}")


# Collect one SRT cue per segment and join once at the end instead of growing
# a string with += on every iteration. SRT cue numbers start at 1.
# NOTE: faster-whisper documents `segments` as a lazy generator, so the actual
# decoding happens while this loop runs; that is why the timer stops after it.
srt_blocks = []
for segment_count, segment in enumerate(segments, start=1):
    start_str = format_srt_time(segment.start)
    end_str = format_srt_time(segment.end)
    text = segment.text.strip()

    srt_blocks.append(f"{segment_count}\n{start_str} --> {end_str}\n{text}\n\n")

    # Echo each cue as it is produced so progress is visible in the notebook.
    print(f"[{start_str} --> {end_str}] {text}")

srt_content = "".join(srt_blocks)

end_time = time.perf_counter()
print("\nTranscrição finalizada.")
print(f"Tempo total: {end_time - start_time:.2f} segundos.")

# Write the subtitles next to the input, using the same base name.
srt_file_name = file_name + ".srt"
with open(srt_file_name, "w", encoding="utf-8") as f:
    f.write(srt_content)

print(f"\nLegenda salva em '{srt_file_name}'")

  import pkg_resources


Iniciando a transcrição...
Idioma detectado: pt com probabilidade de 1.00
[00:00:00,000 --> 00:00:06,760] acho que agora foi
[00:00:06,760 --> 00:00:14,540] vou confirmar aqui
[00:00:24,260 --> 00:00:25,160] tá bom
[00:00:25,160 --> 00:00:26,440] vamos lá
[00:00:26,440 --> 00:00:28,860] eu tenho
[00:00:28,860 --> 00:00:31,140] quatro blocos de perguntas
[00:00:31,140 --> 00:00:32,000] aqui
[00:00:32,000 --> 00:00:34,620] vou passar aqui
[00:00:34,620 --> 00:00:36,540] de um por um, começo de uma forma
[00:00:36,540 --> 00:00:38,800] mais geral, digamos assim, depois a gente vai
[00:00:38,800 --> 00:00:40,540] sendo mais específico
[00:00:43,420 --> 00:00:44,600] eu vou usar
[00:00:44,600 --> 00:00:46,720] essas perguntas aqui
[00:00:46,720 --> 00:00:48,660] vou pegar
[00:00:48,660 --> 00:00:50,300] coisas usadas com várias pessoas
[00:00:50,300 --> 00:00:52,200] de vários perfis, para depois comprolar
[00:00:52,200 --> 00:00:54,260] um resultado geral
[00:00:54,260 --> 00:00:56,260] be