In [None]:
!pip install openai-whisper --no-cache-dir
!pip install pydub
!pip install psutil
!pip install ffmpeg


import os
import whisper
import gc
from pydub import AudioSegment
import tempfile
import psutil
import time
from google.colab import drive
import shutil



In [None]:
def split_audio(file_name, chunk_length_ms=300000):
    """Load an MP3 file and cut it into consecutive fixed-length pieces.

    Args:
        file_name: Path of the audio file; it is always decoded as MP3.
        chunk_length_ms: Step and slice length used to cut the audio
            (presumably milliseconds, as the name suggests -- pydub measures
            ``len()`` and slices in ms; confirm against the pydub docs).

    Returns:
        A list of audio slices in playback order. The last slice may be
        shorter than ``chunk_length_ms``.
    """
    recording = AudioSegment.from_file(file_name, format="mp3")
    total_length = len(recording)

    pieces = []
    for start in range(0, total_length, chunk_length_ms):
        pieces.append(recording[start:start + chunk_length_ms])
    return pieces


def split_and_transcribe_audio(model, file_name, output_file, chunk_length_ms=300000):
    """Transcribe an audio file chunk by chunk and write the text to ``output_file``.

    The audio is cut with ``split_audio``. Each chunk is exported to a
    temporary WAV file, passed to ``model.transcribe`` with
    ``language="Portuguese"``, and the returned ``"text"`` is appended to
    ``output_file`` in chunk order. A chunk that cannot be transcribed is
    reported on stdout and skipped; processing continues with the next chunk.

    Args:
        model: Object exposing ``transcribe(path, language=...)`` that returns
            a mapping with a ``"text"`` entry (``main`` passes a Whisper model).
        file_name: Path of the audio file to transcribe.
        output_file: Path of the UTF-8 text file to create or overwrite.
        chunk_length_ms: Chunk length forwarded to ``split_audio``.

    Returns:
        A ``(output_file, audio_weight_transcribed)`` tuple. The second item
        is the size in bytes of ``file_name`` when at least one chunk was
        transcribed, otherwise 0.
    """
    chunks = split_audio(file_name, chunk_length_ms)
    # The source file size is the same for every chunk, so read it once here
    # instead of inside the retry loop, where a stat failure would have been
    # misreported as a transcription error after the text was already written.
    source_size = os.path.getsize(file_name)
    audio_weight_transcribed = 0
    max_attempts = 3

    with open(output_file, 'w', encoding='utf-8') as f_out:
        for chunk_num, audio_chunk in enumerate(chunks):
            # Reserve a unique path and release our own handle immediately:
            # the exporter and the model both open the file by name, and an
            # extra open handle is wasted (and blocks reopening on Windows).
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                temp_path = temp_file.name
            try:
                audio_chunk.export(temp_path, format="wav")
                delay = 2
                success = False
                for attempt in range(1, max_attempts + 1):
                    try:
                        result = model.transcribe(temp_path, language="Portuguese")
                        f_out.write(result["text"])
                        audio_weight_transcribed = source_size
                        success = True
                        break
                    except MemoryError:
                        if attempt == max_attempts:
                            # No retry follows, so do not announce one or
                            # sleep; the failure message below covers it.
                            break
                        print(f"Erro de memória no chunk {chunk_num}. Tentando novamente em {delay} segundos...")
                        # Release whatever can be released before retrying,
                        # otherwise the retry runs under the same pressure.
                        gc.collect()
                        time.sleep(delay)
                        delay *= 2  # exponential back-off between attempts
                    except Exception as e:
                        # Non-memory errors are not retried: report and skip.
                        print(f"Erro no chunk {chunk_num}: {e}")
                        break

                if not success:
                    print(f"Não foi possível transcrever o chunk {chunk_num} após várias tentativas.")

                # Every 20 chunks, collect garbage and wait until at least
                # 1 GiB of RAM is available before continuing.
                if chunk_num % 20 == 0 and chunk_num > 0:
                    gc.collect()
                    mem = psutil.virtual_memory()
                    while mem.available < 1 * (1024 ** 3):
                        print(f"Memória baixa ({mem.available / (1024.0 ** 3):.2f} GB). Esperando...")
                        time.sleep(10)
                        mem = psutil.virtual_memory()

            finally:
                # delete=False means the temporary WAV must be removed by hand.
                os.unlink(temp_path)

    # Return the transcription path together with the processed audio weight.
    return output_file, audio_weight_transcribed

def main():
    """Transcribe every MP3 in the section folders and move the text to Google Drive.

    Mounts Google Drive, loads the Whisper ``large`` model, then visits each
    immediate subfolder of ``./sample_data/sections``. Every ``.mp3`` found
    there is transcribed to a sibling ``.txt`` file, which is then moved under
    the Drive ``Transcriptions`` folder. Progress is printed after each file
    and each completed file is appended to ``transcription_log.txt``.
    """
    drive.mount('/content/drive')
    drive_base_folder = "/content/drive/My Drive/Transcriptions"
    model = whisper.load_model("large")
    root_dir = "./sample_data/sections"

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(root_dir, exist_ok=True)

    total_files = count_total_files(root_dir)  # number of MP3 files found
    completed_files = 0
    audio_weight_total = calculate_total_audio_weight(root_dir)  # total MP3 bytes
    audio_weight_transcribed = 0  # bytes of audio transcribed so far
    processed_files = set()

    # A single log handle for the whole run, closed by the context manager.
    # Previously one handle was opened here and never closed, and a second one
    # was reopened (shadowing the first) for every transcribed file.
    with open("transcription_log.txt", "a", encoding="utf-8") as log_file:
        for subdir in os.listdir(root_dir):
            subdir_path = os.path.join(root_dir, subdir)
            if not os.path.isdir(subdir_path):
                continue

            for file_name in os.listdir(subdir_path):
                if not file_name.endswith(".mp3"):
                    continue

                input_file = os.path.join(subdir_path, file_name)
                base_name = os.path.splitext(file_name)[0]
                output_file = os.path.join(subdir_path, f"{base_name}.txt")

                # Track by full path: keying on the bare base name silently
                # skipped same-named recordings that live in other subfolders.
                if input_file in processed_files:
                    continue

                transcription_path, new_weight = split_and_transcribe_audio(model, input_file, output_file)
                upload_file_to_drive(transcription_path, root_dir, drive_base_folder)
                audio_weight_transcribed += new_weight

                # Progress is weighted by audio size; guard against a zero
                # total (e.g. only empty files) to avoid ZeroDivisionError.
                if audio_weight_total:
                    progress = audio_weight_transcribed / (audio_weight_total / 100)
                else:
                    progress = 0.0
                print(f"Progresso: {progress:.2f}%, {completed_files + 1}/{total_files} arquivos transcritos")

                completed_files += 1
                processed_files.add(input_file)

                # Flush after each entry so the log survives an interrupted run.
                log_file.write(f"Transcrição completa para o arquivo: {input_file}\n")
                log_file.flush()


def upload_file_to_drive(filepath, root_dir, drive_base_folder):
    """Move ``filepath`` under ``drive_base_folder``, keeping its path relative to ``root_dir``.

    Args:
        filepath: Path of the local file to transfer.
        root_dir: Directory that ``filepath`` is made relative to; that
            relative path is re-created below ``drive_base_folder``.
        drive_base_folder: Destination base directory (``main`` passes a
            folder on the mounted Google Drive).

    The file is copied first and the local original is removed only after the
    copy has succeeded.
    """
    # Relative location of the file with respect to the root directory.
    relative_path = os.path.relpath(filepath, root_dir)
    drive_dest_path = os.path.join(drive_base_folder, relative_path)

    # Create the destination directories; exist_ok avoids the race between a
    # separate existence check and the creation.
    os.makedirs(os.path.dirname(drive_dest_path), exist_ok=True)

    # Copy to the destination, then drop the local original.
    shutil.copy(filepath, drive_dest_path)
    os.remove(filepath)

def count_total_files(directory):
    """Count the files ending in ``.mp3`` anywhere below ``directory``.

    The whole tree is walked recursively and the suffix match is
    case-sensitive.

    Args:
        directory: Root of the tree to scan.

    Returns:
        The number of matching files; 0 when nothing is found.
    """
    mp3_count = 0
    for _dirpath, _dirnames, filenames in os.walk(directory):
        for entry in filenames:
            if entry.endswith('.mp3'):
                mp3_count += 1
    return mp3_count

def calculate_total_audio_weight(root_dir):
    """Sum the sizes in bytes of the ``.mp3`` entries one level below ``root_dir``.

    Only the immediate subdirectories of ``root_dir`` are inspected; entries
    directly inside ``root_dir`` and anything nested deeper are ignored.

    Args:
        root_dir: Directory whose subdirectories hold the audio files.

    Returns:
        The combined size in bytes; 0 when no matching entry exists.
    """
    section_dirs = (os.path.join(root_dir, entry) for entry in os.listdir(root_dir))
    return sum(
        os.path.getsize(os.path.join(section, track))
        for section in section_dirs
        if os.path.isdir(section)
        for track in os.listdir(section)
        if track.endswith(".mp3")
    )

# Start the transcription pipeline only when this code is executed directly
# rather than imported as a module.
if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
