In [1]:
import os
import torch
import torchaudio
from pydub import AudioSegment
from spleeter.separator import Separator

In [2]:
# Ensure FFmpeg is accessible
os.environ["PATH"] += os.pathsep + "C:\\ffmpeg\\bin"  # Adjust if needed

def load_audio(file_path):
    """Load and preprocess audio: convert to mono and 16kHz WAV."""
    try:
        audio = AudioSegment.from_file(file_path)
        audio = audio.set_channels(1).set_frame_rate(16000)
        processed_audio = "temp_audio.wav"
        audio.export(processed_audio, format="wav")
        return processed_audio
    except Exception as e:
        print(f"Error processing audio: {e}")
        return None

In [3]:
def separate_voices(audio_file):
    """Separate vocals and accompaniment using Spleeter."""
    try:
        separator = Separator('spleeter:2stems')
        output_dir = "output_directory"
        os.makedirs(output_dir, exist_ok=True)
        separator.separate_to_file(audio_file, output_dir)
        print(f"Voices separated and saved in '{output_dir}'.")
        return os.path.join(output_dir, "vocals.wav"), os.path.join(output_dir, "accompaniment.wav")
    except Exception as e:
        print(f"Error in Spleeter: {e}")
        return None, None

In [4]:
def transcribe_audio(audio_file):
    """Transcribe audio using Wav2Vec 2.0."""
    try:
        bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
        model = bundle.get_model()
        waveform, sample_rate = torchaudio.load(audio_file)

        # Resample if necessary
        if sample_rate != bundle.sample_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=bundle.sample_rate)
            waveform = resampler(waveform)

        # Transcription
        with torch.inference_mode():
            emissions, _ = model(waveform)
            tokens = torch.argmax(emissions, dim=-1)
            transcription = bundle.decode(tokens[0])

        return transcription
    except Exception as e:
        print(f"Error transcribing audio: {e}")
        return None

In [5]:
# Processing pipeline
input_audio = "segment_1.wav"  # Replace with actual file path
processed_audio = load_audio(input_audio)

if processed_audio:
    vocals, accompaniment = separate_voices(processed_audio)
    if vocals and accompaniment:
        speaker1_text = transcribe_audio(vocals)
        speaker2_text = transcribe_audio(accompaniment)

        print("\nTranscription Results:")
        print(f"Speaker 1 (Vocals): {speaker1_text}")
        print(f"Speaker 2 (Accompaniment): {speaker2_text}")

INFO:tensorflow:Using config: {'_model_dir': 'pretrained_models\\2stems', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': gpu_options {
  per_process_gpu_memory_fraction: 0.7
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Could not find trained model in model_dir: pretrained_models\2stems, running initialization to predict.
Instructions for updating:
Use output_s

In [1]:
import os
import torch
import torchaudio
import librosa
import soundfile as sf
from pydub import AudioSegment
from spleeter.separator import Separator

# Ensure FFmpeg is accessible
os.environ["PATH"] += os.pathsep + "C:\\ffmpeg\\bin"  # Adjust if needed

def load_audio(file_path):
    """Load and preprocess audio: convert to mono and 16kHz WAV."""
    try:
        audio = AudioSegment.from_file(file_path)
        audio = audio.set_channels(1).set_frame_rate(16000)
        processed_audio = "temp_audio.wav"
        audio.export(processed_audio, format="wav")
        return processed_audio
    except Exception as e:
        print(f"Error processing audio: {e}")
        return None

def separate_voices(audio_file):
    """Separate vocals and accompaniment using Spleeter."""
    try:
        separator = Separator('spleeter:2stems')
        output_dir = "output_directory"
        os.makedirs(output_dir, exist_ok=True)
        separator.separate_to_file(audio_file, output_dir)

        # Use absolute paths to ensure correct file locations
        vocals_path = os.path.abspath(os.path.join(output_dir, "vocals.wav"))
        accompaniment_path = os.path.abspath(os.path.join(output_dir, "accompaniment.wav"))

        print(f"Voices separated and saved in '{output_dir}'.")
        return vocals_path, accompaniment_path
    except Exception as e:
        print(f"Error in Spleeter: {e}")
        return None, None

def transcribe_audio(audio_file):
    """Transcribe audio using Wav2Vec 2.0, using librosa for compatibility."""
    try:
        # Load Wav2Vec 2.0 ASR model
        bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
        model = bundle.get_model()

        # Load audio using librosa (instead of torchaudio)
        waveform, sample_rate = librosa.load(audio_file, sr=bundle.sample_rate)

        # Convert to torch tensor
        waveform = torch.tensor(waveform).unsqueeze(0)

        # Transcription
        with torch.inference_mode():
            emissions, _ = model(waveform)
            tokens = torch.argmax(emissions, dim=-1)
            transcription = bundle.decode(tokens[0])

        return transcription
    except Exception as e:
        print(f"Error transcribing audio {audio_file}: {e}")
        return None


In [2]:
# Processing pipeline
input_audio = "segment_1.wav"  # Replace with actual file path
processed_audio = load_audio(input_audio)

if processed_audio:
    vocals, accompaniment = separate_voices(processed_audio)
    if vocals and accompaniment:
        print("\nStarting transcription...")
        speaker1_text = transcribe_audio(vocals)
        speaker2_text = transcribe_audio(accompaniment)

        print("\nTranscription Results:")
        print(f"Speaker 1 (Vocals): {speaker1_text}")
        print(f"Speaker 2 (Accompaniment): {speaker2_text}")


INFO:tensorflow:Using config: {'_model_dir': 'pretrained_models\\2stems', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': gpu_options {
  per_process_gpu_memory_fraction: 0.7
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Could not find trained model in model_dir: pretrained_models\2stems, running initialization to predict.
Instructions for updating:
Use output_s

  waveform, sample_rate = librosa.load(audio_file, sr=bundle.sample_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error transcribing audio d:\Data Analysis\AliceBlue\output_directory\vocals.wav: [Errno 2] No such file or directory: 'd:\\Data Analysis\\AliceBlue\\output_directory\\vocals.wav'
Error transcribing audio d:\Data Analysis\AliceBlue\output_directory\accompaniment.wav: [Errno 2] No such file or directory: 'd:\\Data Analysis\\AliceBlue\\output_directory\\accompaniment.wav'

Transcription Results:
Speaker 1 (Vocals): None
Speaker 2 (Accompaniment): None


In [1]:
import os
import torch
import torchaudio
import librosa
import soundfile as sf
from pydub import AudioSegment
from spleeter.separator import Separator

# Ensure FFmpeg is accessible
os.environ["PATH"] += os.pathsep + "C:\\ffmpeg\\bin"  # Adjust if needed

def load_audio(file_path):
    """Load and preprocess audio: convert to mono and 16kHz WAV."""
    try:
        audio = AudioSegment.from_file(file_path)
        audio = audio.set_channels(1).set_frame_rate(16000)
        processed_audio = "temp_audio.wav"
        audio.export(processed_audio, format="wav")
        return processed_audio
    except Exception as e:
        print(f"Error processing audio: {e}")
        return None

def separate_voices(audio_file):
    """Separate vocals and accompaniment using Spleeter."""
    try:
        separator = Separator('spleeter:2stems')
        output_dir = "output_directory"
        os.makedirs(output_dir, exist_ok=True)
        separator.separate_to_file(audio_file, output_dir)

        # Check actual files created by Spleeter
        subdirs = [d for d in os.listdir(output_dir) if os.path.isdir(os.path.join(output_dir, d))]
        if subdirs:
            output_dir = os.path.join(output_dir, subdirs[0])  # Update path to subdirectory

        vocals_path = os.path.abspath(os.path.join(output_dir, "vocals.wav"))
        accompaniment_path = os.path.abspath(os.path.join(output_dir, "accompaniment.wav"))

        print(f"\n✅ Voices separated successfully. Check: {output_dir}")
        print(f"🔹 Vocals Path: {vocals_path}")
        print(f"🔹 Accompaniment Path: {accompaniment_path}")

        # Verify files exist
        if not os.path.exists(vocals_path) or not os.path.exists(accompaniment_path):
            print("❌ Error: Separated audio files were not found. Check Spleeter output.")
            return None, None

        return vocals_path, accompaniment_path
    except Exception as e:
        print(f"Error in Spleeter: {e}")
        return None, None

def transcribe_audio(audio_file):
    """Transcribe audio using Wav2Vec 2.0, using librosa for compatibility."""
    try:
        # Load Wav2Vec 2.0 ASR model
        bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
        model = bundle.get_model()

        # Load audio using librosa
        if not os.path.exists(audio_file):
            print(f"❌ Error: File not found {audio_file}")
            return None

        waveform, sample_rate = librosa.load(audio_file, sr=bundle.sample_rate)

        # Convert to torch tensor
        waveform = torch.tensor(waveform).unsqueeze(0)

        # Transcription
        with torch.inference_mode():
            emissions, _ = model(waveform)
            tokens = torch.argmax(emissions, dim=-1)
            transcription = bundle.decode(tokens[0])

        return transcription
    except Exception as e:
        print(f"Error transcribing audio {audio_file}: {e}")
        return None


In [3]:
# Processing pipeline
input_audio = "segment_1.wav"  # Replace with actual file path
processed_audio = load_audio(input_audio)

if processed_audio:
    vocals, accompaniment = separate_voices(processed_audio)
    if vocals and accompaniment:
        print("\nStarting transcription...")
        speaker1_text = transcribe_audio(vocals)
        speaker2_text = transcribe_audio(accompaniment)

        print("\nTranscription Results:")
        print(f"Speaker 1 (Vocals): {speaker1_text}")
        print(f"Speaker 2 (Accompaniment): {speaker2_text}")


INFO:tensorflow:Using config: {'_model_dir': 'pretrained_models\\2stems', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': gpu_options {
  per_process_gpu_memory_fraction: 0.7
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Apply unet for vocals_spectrogram
INFO:tensorflow:Apply unet for accompaniment_spectrogram
INFO:tensorflow:Do

In [1]:
import os
import torch
import torchaudio
import librosa
import soundfile as sf
from pydub import AudioSegment
from spleeter.separator import Separator

# Ensure FFmpeg is accessible
os.environ["PATH"] += os.pathsep + "C:\\ffmpeg\\bin"  # Adjust if needed

def load_audio(file_path):
    """Load and preprocess audio: convert to mono and 16kHz WAV."""
    try:
        audio = AudioSegment.from_file(file_path)
        audio = audio.set_channels(1).set_frame_rate(16000)
        processed_audio = "temp_audio.wav"
        audio.export(processed_audio, format="wav")
        return processed_audio
    except Exception as e:
        print(f"Error processing audio: {e}")
        return None

def separate_voices(audio_file):
    """Separate vocals and accompaniment using Spleeter."""
    try:
        separator = Separator('spleeter:2stems')
        output_dir = "D:/Data Analysis/AliceBlue/output_directory/temp_audio"
        os.makedirs(output_dir, exist_ok=True)
        
        # Run Spleeter without creating subdirectories
        separator.separate_to_file(audio_file, output_dir)

        # Spleeter saves files as 'vocals.wav' and 'accompaniment.wav'
        vocals_path = os.path.join(output_dir, "vocals.wav")
        accompaniment_path = os.path.join(output_dir, "accompaniment.wav")

        # Check if files exist
        if os.path.exists(vocals_path) and os.path.exists(accompaniment_path):
            print(f"\n✅ Voices separated successfully. Check: {output_dir}")
            print(f"🔹 Vocals Path: {vocals_path}")
            print(f"🔹 Accompaniment Path: {accompaniment_path}")
        else:
            print(f"❌ Error: Separated audio files not found or named differently in {output_dir}")

        return vocals_path, accompaniment_path
    except Exception as e:
        print(f"Error in Spleeter: {e}")
        return None, None

def transcribe_audio(audio_file):
    """Transcribe audio using Wav2Vec 2.0, using librosa for compatibility."""
    try:
        # Load Wav2Vec 2.0 ASR model
        bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
        model = bundle.get_model()

        # Load audio using librosa
        if not os.path.exists(audio_file):
            print(f"❌ Error: File not found {audio_file}")
            return None

        waveform, sample_rate = librosa.load(audio_file, sr=bundle.sample_rate)

        # Convert to torch tensor
        waveform = torch.tensor(waveform).unsqueeze(0)

        # Transcription
        with torch.inference_mode():
            emissions, _ = model(waveform)
            tokens = torch.argmax(emissions, dim=-1)
            transcription = bundle.decode(tokens[0])

        return transcription
    except Exception as e:
        print(f"Error transcribing audio {audio_file}: {e}")
        return None




In [2]:
# Processing pipeline
input_audio = "REC30AirportTravel.wav"  # Replace with actual file path
processed_audio = load_audio(input_audio)

if processed_audio:
    vocals, accompaniment = separate_voices(processed_audio)
    if vocals and accompaniment:
        print("\nStarting transcription...")
        speaker1_text = transcribe_audio(vocals)
        speaker2_text = transcribe_audio(accompaniment)

        print("\nTranscription Results:")
        print(f"Speaker 1 (Vocals): {speaker1_text}")
        print(f"Speaker 2 (Accompaniment): {speaker2_text}")

INFO:tensorflow:Using config: {'_model_dir': 'pretrained_models\\2stems', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': gpu_options {
  per_process_gpu_memory_fraction: 0.7
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
Use output_signature instead
Instructions for updating:
Use output_signature instead
INFO:tensorflow:Calling model_fn.
INFO:tensorfl

: 