In [6]:
import os
import numpy as np
import librosa
from tqdm import tqdm
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

def extract_mel_spectrogram(audio_path, n_mels=128, n_fft=2048, hop_length=128):
    """
    Extract mel spectrogram from audio file with specific parameters.
    
    Args:
        audio_path (str): Path to audio file
        n_mels (int): Number of mel bands
        n_fft (int): FFT window size
        hop_length (int): Number of samples between successive frames
        
    Returns:
        np.ndarray: Normalized mel spectrogram of shape (128, 32)
    """
    # Load audio file
    y, sr = librosa.load(audio_path, sr=None)
    
    # Extract mel spectrogram
    mel_spec = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length,
        fmin=20,
        fmax=sr/2,
        power=2.0
    )
    
    # Convert to log scale (dB)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    
    # Normalize to [0,1] range
    mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min())
    
    # Resize to fixed dimensions (128, 32)
    mel_spec_norm = tf.image.resize(mel_spec_norm[..., np.newaxis], (128, 32))
    mel_spec_norm = mel_spec_norm.numpy()
    mel_spec_norm = mel_spec_norm[..., 0]
    
    return mel_spec_norm

def process_audio_files(input_dir, output_dir):
    """
    Process all audio files in input directory and save features to output directory.
    
    Args:
        input_dir (str): Directory containing audio files
        output_dir (str): Directory to save extracted features
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Get all audio files
    audio_files = []
    for root, dirs, files in os.walk(input_dir):
        for file in files:
            if file.endswith('.wav'):
                audio_files.append(os.path.join(root, file))
    
    # Initialize metadata list
    metadata = []
    
    # Process each audio file
    for audio_path in tqdm(audio_files):
        # Get relative path from input directory
        rel_path = os.path.relpath(audio_path, input_dir)
        
        # Create corresponding directory in output
        feature_dir = os.path.join(output_dir, os.path.dirname(rel_path))
        os.makedirs(feature_dir, exist_ok=True)
        
        # Extract mel spectrogram
        mel_features = extract_mel_spectrogram(audio_path)
        
        # Save features with same name but .npy extension
        feature_path = os.path.join(feature_dir, os.path.splitext(os.path.basename(audio_path))[0] + '.npy')
        np.save(feature_path, mel_features)
        
        # Add to metadata
        metadata.append({
            'audio_path': rel_path,
            'feature_path': os.path.relpath(feature_path, output_dir),
            'mel_shape': mel_features.shape,
            'duration': librosa.get_duration(path=audio_path),
            'sr': librosa.get_samplerate(audio_path)
        })
    
    # Convert metadata to DataFrame and save
    metadata_df = pd.DataFrame(metadata)
    metadata_df.to_csv(os.path.join(output_dir, 'metadata.csv'), index=False)
    
    # Print summary
    print("\nMetadata Summary:")
    print(f"Total files processed: {len(metadata_df)}")
    print("\nFeature shapes:")
    print(metadata_df['mel_shape'].value_counts())
    print("\nFirst few entries:")
    print(metadata_df.head())

if __name__ == "__main__":
    # Example usage with MCU-specific directories
    input_dir = "../data/processed_mcu"
    output_dir = "../data/features_mcu"
    process_audio_files(input_dir, output_dir)

100%|██████████| 701/701 [00:06<00:00, 110.42it/s]


Metadata Summary:
Total files processed: 701

Feature shapes:
mel_shape
(128, 32)    701
Name: count, dtype: int64

First few entries:
                      audio_path                   feature_path  mel_shape  \
0  do_am/do_am_speaker01_022.wav  do_am/do_am_speaker01_022.npy  (128, 32)   
1  do_am/do_am_speaker03_005.wav  do_am/do_am_speaker03_005.npy  (128, 32)   
2  do_am/do_am_speaker02_016.wav  do_am/do_am_speaker02_016.npy  (128, 32)   
3  do_am/do_am_speaker01_009.wav  do_am/do_am_speaker01_009.npy  (128, 32)   
4  do_am/do_am_speaker01_018.wav  do_am/do_am_speaker01_018.npy  (128, 32)   

   duration     sr  
0       1.0  16000  
1       1.0  16000  
2       1.0  16000  
3       1.0  16000  
4       1.0  16000  



