In [29]:
from pydub import AudioSegment
import os
import shutil
from pyannote.audio import Pipeline
from dotenv import load_dotenv
import torch
from pydub import AudioSegment
import math
from openai import OpenAI


In [2]:
def convert_audio_to_wav(input_file, output_file=None):
    """
    Converts an audio file of any supported format to WAV format using pydub, with error checking.
    
    :param input_file: Path to the input audio file (e.g., m4a, mp3, wav, ogg).
    :param output_file: Optional, path to save the converted wav file. 
                        If not provided, the output file will have the same name as the input but with a .wav extension.
    :return: Path to the converted WAV file or None if conversion fails.
    """
    # Check if ffmpeg is installed and available
    if not shutil.which("ffmpeg"):
        raise EnvironmentError("ffmpeg is not installed or not found in system PATH. Please install ffmpeg.")

    # Detect file format based on extension
    file_format = input_file.split('.')[-1].lower()

    # Supported formats by pydub + ffmpeg
    supported_formats = ['m4a', 'mp3', 'ogg', 'flv', 'aac', 'wma', 'flac', 'wav']

    # Check if the file format is supported
    if file_format not in supported_formats:
        raise ValueError(f"Unsupported file format: {file_format}. Supported formats are: {supported_formats}")

    try:
        # Try to load the audio file
        audio = AudioSegment.from_file(input_file, format=file_format)

        # If output file is not provided, generate the output file name
        if output_file is None:
            output_file = input_file.rsplit('.', 1)[0] + '.wav'

        # Export the audio file as wav
        audio.export(output_file, format="wav")
        return output_file

    except FileNotFoundError:
        print(f"Error: The input file '{input_file}' does not exist.")
        return None
    except Exception as e:
        print(f"An error occurred while converting the audio file: {e}")
        return None


In [12]:
def perform_diarization(wav_file, hf_token):
    """
    Perform speaker diarization on a WAV audio file using pyannote.audio, with Hugging Face token for authentication.
    Utilizes MPS backend on Mac M1/M2 if available.
    
    :param wav_file: Path to the WAV file.
    :param hf_token: Hugging Face token for accessing the pre-trained model.
    :return: Diarization object with speaker segments and timestamps.
    """
    try:
        # Check if MPS is available, otherwise default to CPU
        device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
        
        # Load the speaker diarization pipeline using the provided Hugging Face token
        pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token)

        # Perform speaker diarization on the specified device (MPS or CPU)
        diarization = pipeline(wav_file)

        # Collect the results in a structured format
        results = []
        for segment, _, speaker in diarization.itertracks(yield_label=True):
            result = {
                "speaker": speaker,
                "start_time": segment.start,
                "end_time": segment.end
            }
            results.append(result)
            print(f"Speaker {speaker} speaks from {segment.start:.1f}s to {segment.end:.1f}s")

        return results

    except Exception as e:
        print(f"An error occurred during diarization: {e}")
        return None




In [24]:
def split_audio_by_size(input_file, max_size_mb=25, overlap_ms=1000):
    """
    Splits an audio file into chunks small enough to send to the Whisper API, with some overlap to avoid cutting words.
    
    :param input_file: Path to the input WAV file.
    :param max_size_mb: Maximum size of each audio chunk in MB (25 MB for Whisper API).
    :param overlap_ms: Overlap duration between consecutive chunks in milliseconds.
    :return: List of file paths for each audio chunk.
    """
    # Load the audio file using pydub
    audio = AudioSegment.from_file(input_file)
    
    # Estimate how many bytes are in one second of the audio
    bytes_per_second = audio.frame_rate * audio.frame_width * audio.channels
    
    # Convert MB to bytes
    max_size_bytes = max_size_mb * 1024 * 1024
    
    # Calculate the chunk duration in milliseconds (chunk_duration = max_size / bytes_per_second)
    chunk_duration_ms = math.floor((max_size_bytes / bytes_per_second) * 1000)  # Convert seconds to milliseconds
    
    # Split the audio into chunks with overlap
    chunks = []
    start = 0
    while start < len(audio):
        end = start + chunk_duration_ms
        chunks.append(audio[start:end])
        start = end - overlap_ms  # Move to the next chunk with overlap
    
    # Save each chunk to a separate WAV file
    chunk_files = []
    for i, chunk in enumerate(chunks):
        chunk_filename = f"{os.path.splitext(input_file)[0]}_chunk_{i}.wav"
        chunk.export(chunk_filename, format="wav")
        chunk_files.append(chunk_filename)

    return chunk_files


In [42]:
def transcribe_audio_chunk(chunk_file, whisper_api_key):
    """
    Transcribe a single audio chunk using Whisper API.
    
    :param chunk_file: Path to the audio chunk (WAV file).
    :param whisper_api_key: OpenAI API key for Whisper API.
    :return: Transcription text of the audio chunk.
    """
    
    # Initialize the OpenAI client
    client = OpenAI(api_key=whisper_api_key)

    try:
        # Open the chunk file
        with open(chunk_file, "rb") as audio_file:
            # Send the audio file to Whisper API for transcription
            response = client.audio.transcriptions.create(
                file=audio_file,
                model="whisper-1",
            )
            return response.text

    
    except Exception as e:
        print(f"An error occurred during transcription: {e}")
        return None


In [22]:
def remove_overlap_from_transcriptions(transcriptions, overlap_word_count=5):
    """
    Remove overlapping parts of transcriptions by detecting repeated words at the boundary of consecutive chunks.
    
    :param transcriptions: List of transcribed text for each chunk.
    :param overlap_word_count: Number of words to compare at the end of one chunk and the start of the next.
    :return: Cleaned and combined transcription.
    """
    final_transcription = []
    
    for i in range(len(transcriptions)):
        # Add the first chunk directly
        if i == 0:
            final_transcription.append(transcriptions[i])
        else:
            # Compare the last few words of the previous chunk with the start of the current chunk
            previous_chunk = final_transcription[-1].split()
            current_chunk = transcriptions[i].split()
            
            # Find overlap by comparing the last few words of the previous chunk and the first few words of the current chunk
            overlap_start = 0
            for j in range(min(overlap_word_count, len(previous_chunk), len(current_chunk))):
                if previous_chunk[-(j+1):] == current_chunk[:(j+1)]:
                    overlap_start = j + 1
            
            # Add the non-overlapping part of the current chunk
            final_transcription.append(" ".join(current_chunk[overlap_start:]))
    
    return " ".join(final_transcription)


In [41]:
def transcribe_large_audio(input_file, whisper_api_key, max_size_mb=20, overlap_ms=1500, cleanup = False):
    """
    Transcribes a large audio file by splitting it into smaller chunks with overlap, sending each to Whisper API,
    and combining the results, removing overlap at chunk boundaries.
    
    :param input_file: Path to the input audio file (WAV).
    :param whisper_api_key: OpenAI API key for Whisper API.
    :param max_size_mb: Maximum size of each audio chunk in MB (25 MB for Whisper API).
    :param overlap_ms: Overlap duration between consecutive chunks in milliseconds.
    :return: Combined transcription text for the entire audio file.
    """
    # Step 1: Split the audio into chunks with overlap
    chunk_files = split_audio_by_size(input_file, max_size_mb, overlap_ms)

    # Step 2: Transcribe each chunk using Whisper API
    transcriptions = []
    for chunk_file in chunk_files:
        chunk_transcription = transcribe_audio_chunk(chunk_file, whisper_api_key)
        if chunk_transcription:
            transcriptions.append(chunk_transcription)
    
    # Step 3: Combine all transcriptions and remove overlap
    full_transcription = remove_overlap_from_transcriptions(transcriptions)
    
    # Optionally, clean up chunk files
    if cleanup:
        for chunk_file in chunk_files:
            os.remove(chunk_file)

    return full_transcription

Let's test with some test data

In [4]:
wave = convert_audio_to_wav("/Users/tim/Downloads/test.mp3")


'/Users/tim/Downloads/test.wav'

In [13]:
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
test=perform_diarization("/Users/tim/Downloads/test.wav", hf_token)

  std = sequences.std(dim=-1, correction=1)


Speaker SPEAKER_01 speaks from 0.0s to 47.3s
Speaker SPEAKER_01 speaks from 47.8s to 57.3s
Speaker SPEAKER_01 speaks from 59.2s to 84.9s
Speaker SPEAKER_01 speaks from 86.0s to 86.4s
Speaker SPEAKER_01 speaks from 87.7s to 96.6s
Speaker SPEAKER_01 speaks from 98.3s to 112.2s
Speaker SPEAKER_01 speaks from 112.4s to 117.0s
Speaker SPEAKER_01 speaks from 117.5s to 120.8s
Speaker SPEAKER_01 speaks from 121.0s to 121.5s
Speaker SPEAKER_00 speaks from 122.3s to 123.0s
Speaker SPEAKER_01 speaks from 123.0s to 123.0s
Speaker SPEAKER_00 speaks from 123.0s to 126.4s
Speaker SPEAKER_00 speaks from 128.0s to 130.7s
Speaker SPEAKER_00 speaks from 132.0s to 133.2s
Speaker SPEAKER_00 speaks from 134.8s to 135.0s
Speaker SPEAKER_01 speaks from 135.0s to 135.3s
Speaker SPEAKER_01 speaks from 137.7s to 141.6s
Speaker SPEAKER_01 speaks from 143.9s to 144.4s
Speaker SPEAKER_01 speaks from 146.0s to 147.4s
Speaker SPEAKER_01 speaks from 148.6s to 150.8s
Speaker SPEAKER_01 speaks from 151.9s to 156.4s
Spea

In [43]:
whisper_api_key = os.getenv("OPEN_API_TAELGAR")

# Path to your WAV file
input_file = "/Users/tim/Downloads/test.wav"

# Transcribe the large audio file with overlap handling
transcription = transcribe_large_audio(input_file, whisper_api_key, max_size_mb=15, overlap_ms=2000)

# Print the full transcription
print("Transcription result:\n", transcription)

Transcription(text="and holds it for a minute, concentrating. And as he does, it looks like he is expending a lot of energy. And the black, like little tendrils of black smoke begin to creep out of the crown and start to touch the mask and start to sort of billow into the mask. And then he turns and he shoves it on this person's face. And they briefly struggle for a minute, but not really. It's more like reflexes, not that there's some sort of conscious person who is fighting against it, but just the reflex of a body. If you were tied up and somebody was pouring water down your throat and you were choking, even if you didn't care, your reflexes would make you cough. And over the next three, four minutes, the mask sort of fuses into this person's face. And the shadow, the darkness envelops them. They begin to grow slightly. Out of their hand, a spear begins to grow and form. A cloak begins to form over their head and arms and draping down. They begin to become taller. The bonds fall awa

AttributeError: 'Transcription' object has no attribute 'split'