In [1]:
# Install the third-party packages this notebook uses (IPython shell escapes).
!pip install openai-whisper --no-cache-dir
!pip install pydub
!pip install psutil
# NOTE(review): this installs the PyPI distribution named "ffmpeg"; presumably the
# goal is the ffmpeg command-line tool, which a pip package of this name may not
# provide -- confirm this line does what is intended.
!pip install ffmpeg


import os
import whisper
import gc
from pydub import AudioSegment
import tempfile
import psutil
import time
from google.colab import drive
import shutil



In [None]:
def transcribe_audio(model, file_name, output_file, max_attempts=3, delay_between_attempts=5):
    """Transcribe one audio file to a UTF-8 text file, retrying on failure.

    Each attempt calls ``model.transcribe(file_name, language="Portuguese")``
    and writes the ``"text"`` entry of the result to ``output_file``.

    Args:
        model: Object exposing ``transcribe(path, language=...)`` that returns
            a mapping containing a ``"text"`` key.
        file_name: Path of the audio file to transcribe.
        output_file: Path of the text file to create or overwrite.
        max_attempts: Maximum number of attempts before giving up.
        delay_between_attempts: Seconds to sleep between failed attempts.

    Returns:
        ``(output_file, size_in_bytes_of_file_name)`` on success, or
        ``(None, 0)`` when every attempt failed or ``max_attempts`` allows
        no attempt at all.
    """
    attempt = 0
    while attempt < max_attempts:
        try:
            result = model.transcribe(file_name, language="Portuguese")
            with open(output_file, 'w', encoding='utf-8') as f_out:
                f_out.write(result["text"])
            return output_file, os.path.getsize(file_name)
        except Exception as e:
            # Deliberately broad: any failure (model, I/O) is reported and
            # retried so one bad file does not abort a whole batch.
            attempt += 1
            print(f"Erro ao transcrever o arquivo {file_name} na tentativa {attempt}. Erro: {e}")
            if attempt < max_attempts:
                print(f"Aguardando {delay_between_attempts} segundos antes da próxima tentativa...")
                time.sleep(delay_between_attempts)
            else:
                print(f"Erro ao transcrever o arquivo após {max_attempts} tentativas. Pulando...")
                return None, 0

    # Reached only when max_attempts <= 0: still hand back a 2-tuple so
    # callers that unpack the result do not fail.
    return None, 0

def main():
    """Transcribe every ``.mp3`` one level below the sections folder and move the text to Drive.

    Mounts Google Drive, loads the Whisper ``"large"`` model, then walks the
    immediate sub-directories of ``./sample_data/sections``. Each ``.mp3``
    found there is transcribed with ``transcribe_audio``; on success the
    resulting ``.txt`` is handed to ``upload_file_to_drive``, progress (by
    audio bytes and by file count) is printed, and a line is appended to
    ``transcription_log.txt``. After each file the loop waits while less
    than 1 GiB of memory is available.
    """
    drive.mount('/content/drive')
    drive_base_folder = "/content/drive/My Drive/Transcriptions"
    model = whisper.load_model("large")
    root_dir = "./sample_data/sections"

    os.makedirs(root_dir, exist_ok=True)

    total_files = count_total_files(root_dir)
    audio_weight_total = calculate_total_audio_weight(root_dir)
    completed_files = 0
    audio_weight_transcribed = 0
    # Keyed on the full input path: base names can repeat across section
    # folders, and a name-only key would silently skip those files.
    processed_files = set()

    for subdir in os.listdir(root_dir):
        subdir_path = os.path.join(root_dir, subdir)
        if not os.path.isdir(subdir_path):
            continue

        for file_name in os.listdir(subdir_path):
            if not file_name.endswith(".mp3"):
                continue

            input_file = os.path.join(subdir_path, file_name)
            if input_file in processed_files:
                continue

            base_name = os.path.splitext(file_name)[0]
            output_file = os.path.join(subdir_path, f"{base_name}.txt")

            transcription_path, new_weight = transcribe_audio(model, input_file, output_file)
            if transcription_path:
                upload_file_to_drive(transcription_path, root_dir, drive_base_folder)
                audio_weight_transcribed += new_weight
                completed_files += 1
                processed_files.add(input_file)

                # A zero total (e.g. only empty files) would otherwise raise
                # ZeroDivisionError; by weight there is nothing left to do.
                if audio_weight_total:
                    progress = (audio_weight_transcribed / audio_weight_total) * 100
                else:
                    progress = 100.0
                print(f"Progresso: {progress:.2f}%, {completed_files}/{total_files} arquivos transcritos")

                # The log line contains non-ASCII text, so pin the encoding.
                with open("transcription_log.txt", "a", encoding="utf-8") as log_file:
                    log_file.write(f"Transcrição completa para o arquivo: {input_file}\n")

            # Pause before the next file while available memory is under 1 GiB.
            mem = psutil.virtual_memory()
            while mem.available < 1 * (1024 ** 3):
                print(f"Memória baixa ({mem.available / (1024.0 ** 3):.2f} GB). Esperando...")
                time.sleep(10)
                mem = psutil.virtual_memory()

def upload_file_to_drive(filepath, root_dir, drive_base_folder):
    """Move ``filepath`` under ``drive_base_folder``, mirroring its location relative to ``root_dir``.

    The destination's parent directory is created when it does not exist.
    The file is copied to the destination and the source is then deleted.
    """
    destination = os.path.join(
        drive_base_folder,
        os.path.relpath(filepath, root_dir),
    )

    parent_dir = os.path.dirname(destination)
    parent_missing = not os.path.exists(parent_dir)
    if parent_missing:
        os.makedirs(parent_dir)

    # Copy first, delete second: the source only disappears once the copy succeeded.
    shutil.copy(filepath, destination)
    os.remove(filepath)

def count_total_files(directory):
    """Return how many files ending in ``.mp3`` exist anywhere under ``directory``.

    The whole tree is traversed with ``os.walk``, so files at any depth count.
    """
    return sum(
        1
        for _dirpath, _dirnames, filenames in os.walk(directory)
        for filename in filenames
        if filename.endswith('.mp3')
    )

def calculate_total_audio_weight(root_dir):
    """Return the combined size in bytes of the ``.mp3`` files one level below ``root_dir``.

    Only files sitting directly inside an immediate sub-directory of
    ``root_dir`` are counted; files in ``root_dir`` itself or nested deeper
    are ignored.
    """
    candidate_dirs = (os.path.join(root_dir, entry) for entry in os.listdir(root_dir))
    return sum(
        os.path.getsize(os.path.join(section_dir, audio_name))
        for section_dir in candidate_dirs
        if os.path.isdir(section_dir)
        for audio_name in os.listdir(section_dir)
        if audio_name.endswith(".mp3")
    )

# Run the transcription pipeline only when this code is executed as the
# top-level module, not when it is imported.
if __name__ == "__main__":
    main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


