### WAV to Mel spectrogram

In [2]:
import os
import numpy as np
import scipy.io.wavfile
import librosa
import librosa.display
import matplotlib.pyplot as plt

# Shared preprocessing settings.
# Weight of the previous sample subtracted by pre_emphasis().
kPRE_EMPHASIS_COEFF = 0.97
# Sampling rate handed to librosa.
# NOTE(review): the rate stored in each WAV file is discarded by the reader
# below, so this value is assumed to match the data - confirm.
sample_rate = 100_000
# Longest clip length; multiplied by sample_rate to obtain a sample count.
max_duration = 0.9999583333333333
# Every signal shorter than this many samples is zero-padded up to it.
max_samples = int(sample_rate * max_duration)

def pad_signal(signal, target_len):
    """Zero-pad ``signal`` to ``target_len`` samples at a random offset.

    The missing samples are split between the front and the back; the front
    share is drawn with ``np.random.randint`` so the clip lands at a random
    position inside the padded window.

    Args:
        signal: Sequence of samples (assumed 1-D - TODO confirm for
            multi-channel input).
        target_len: Desired number of samples.

    Returns:
        The padded array, or ``signal`` itself (unchanged, not truncated)
        when it already has at least ``target_len`` samples.
    """
    deficit = target_len - len(signal)
    if deficit <= 0:
        return signal

    lead = np.random.randint(deficit)
    return np.pad(signal, (lead, deficit - lead), mode='constant')

def pre_emphasis(signal):
    """Apply a first-order pre-emphasis filter to ``signal``.

    The first sample is kept as is; every later sample has
    ``kPRE_EMPHASIS_COEFF`` times its predecessor subtracted from it.

    Returns:
        A new array with the same number of samples as ``signal``.
    """
    scaled_previous = kPRE_EMPHASIS_COEFF * signal[:-1]
    return np.append(signal[0], signal[1:] - scaled_previous)

def save_mel_spectrogram(signal, sr, save_filename):
    """Render the dB-scaled Mel spectrogram of ``signal`` to an image file.

    Args:
        signal: Audio samples; cast to float before analysis.
        sr: Sampling rate passed to librosa.
        save_filename: Destination path handed to matplotlib's ``savefig``.
    """
    mel_power = librosa.feature.melspectrogram(y=signal.astype(float), sr=sr, n_mels=128)
    # Decibels are taken relative to the spectrogram's own maximum.
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    fig = plt.figure(figsize=(10, 4))
    librosa.display.specshow(mel_db, sr=sr)
    # Hide the axes and trim the margins so only the spectrogram is saved.
    plt.axis('off')
    fig.savefig(save_filename, bbox_inches='tight', pad_inches=0)
    plt.close(fig)

def preprocess_and_save_as_image(wav_file_path, save_directory):
    """Convert one WAV file into a Mel-spectrogram JPEG.

    The samples are padded to ``max_samples``, pre-emphasised, and written to
    ``<save_directory>/<wav stem>_mel_spectrogram.jpg``.

    Args:
        wav_file_path: Path of the WAV file to read.
        save_directory: Directory that receives the image.
    """
    stem, _ = os.path.splitext(os.path.basename(wav_file_path))
    save_filename = os.path.join(save_directory, stem + '_mel_spectrogram.jpg')

    # The sample rate stored in the file is ignored; the module-level
    # sample_rate is used instead.
    _, samples = scipy.io.wavfile.read(wav_file_path)
    padded = pad_signal(samples, target_len=max_samples)

    save_mel_spectrogram(pre_emphasis(padded), sample_rate, save_filename)

# Batch run: turn every .wav file in the input directory into a Mel image.
input_directory = '../../Data/3666'
output_directory = 'mel'
os.makedirs(output_directory, exist_ok=True)

for filename in os.listdir(input_directory):
    # Skip anything that is not a WAV file.
    if not filename.endswith('.wav'):
        continue
    wav_file_path = os.path.join(input_directory, filename)
    preprocess_and_save_as_image(wav_file_path, output_directory)


### WAV to raw MFCC

In [7]:
import os
import numpy as np
import scipy.io.wavfile
import librosa
import matplotlib.pyplot as plt
from skimage.transform import resize

# Preprocessing settings for this cell.
kPRE_EMPHASIS_COEFF = 0.97  # multiplier of the previous sample in pre_emphasis()
# Sampling rate given to librosa; also sizes the analysis window and hop.
# NOTE(review): assumed to match the WAV files, whose own rate is ignored - confirm.
sample_rate = 100_000
max_duration = 0.9999583333333333  # scaled by sample_rate to get max_samples
max_samples = int(sample_rate * max_duration)  # padding target, in samples

def pad_signal(signal, target_len):
    """Pad ``signal`` with zeros, at a random offset, up to ``target_len``.

    Args:
        signal: Sequence of samples (assumed 1-D - TODO confirm).
        target_len: Number of samples the result should have.

    Returns:
        A zero-padded copy when ``signal`` is shorter than ``target_len``;
        otherwise ``signal`` itself, untouched (longer input is not cut).
    """
    missing = target_len - len(signal)
    if not missing > 0:
        return signal

    # randint(missing) picks the number of leading zeros from [0, missing).
    before = np.random.randint(missing)
    after = missing - before
    pad_width = (before, after)
    return np.pad(signal, pad_width, mode='constant')

def pre_emphasis(signal):
    """Return ``signal`` after first-order pre-emphasis.

    Output sample 0 equals input sample 0; output sample n (n >= 1) equals
    ``signal[n] - kPRE_EMPHASIS_COEFF * signal[n - 1]``.
    """
    current = signal[1:]
    previous = signal[:-1]
    filtered_tail = current - kPRE_EMPHASIS_COEFF * previous
    return np.append(signal[0], filtered_tail)

def pipeline(signal):
    """Compute liftered MFCCs for ``signal``.

    The signal is pre-emphasised first, then analysed with a Hamming window
    of 0.025 * sample_rate samples, a hop of 0.01 * sample_rate samples,
    40 Mel bands and no centre padding.

    Args:
        signal: Raw (not yet pre-emphasised) audio samples.

    Returns:
        The array returned by ``librosa.feature.mfcc`` with 12 coefficients.
    """
    window_samples = int(sample_rate * 0.025)
    hop_samples = int(sample_rate * 0.01)
    waveform = pre_emphasis(signal).astype(float)

    return librosa.feature.mfcc(
        y=waveform,
        sr=sample_rate,
        n_mfcc=12,
        dct_type=2,
        norm='ortho',
        lifter=22,
        n_fft=window_samples,
        hop_length=hop_samples,
        power=2,
        center=False,
        window='hamming',
        n_mels=40,
    )

def save_mfcc_image(mfcc, save_filename):
    """Save the MFCC matrix as a borderless viridis-coloured image.

    Args:
        mfcc: 2-D array to draw with ``imshow``.
        save_filename: Destination path handed to matplotlib's ``savefig``.
    """
    fig = plt.figure(figsize=(10, 4))
    # aspect='auto' stretches the matrix to fill the figure.
    plt.imshow(mfcc, aspect='auto', cmap='viridis')
    plt.axis('off')
    fig.savefig(save_filename, bbox_inches='tight', pad_inches=0)
    plt.close(fig)

def preprocess_and_save_as_image(wav_file_path, save_directory):
    """Convert one WAV file into an MFCC JPEG.

    The samples are padded to ``max_samples`` and handed to ``pipeline``,
    which performs the pre-emphasis itself. The image is written to
    ``<save_directory>/<wav stem>_mfcc.jpg``.

    Args:
        wav_file_path: Path of the WAV file to read.
        save_directory: Directory that receives the image.
    """
    # The sample rate stored in the file is ignored; pipeline() uses the
    # module-level sample_rate.
    _, signal = scipy.io.wavfile.read(wav_file_path)
    signal = pad_signal(signal, target_len=max_samples)

    # pipeline() already applies pre_emphasis(); emphasising here as well
    # would filter the signal twice, so pass the padded signal directly.
    mfcc = pipeline(signal)

    # Save MFCC as an image
    stem = os.path.splitext(os.path.basename(wav_file_path))[0]
    save_filename = os.path.join(save_directory, stem + '_mfcc.jpg')
    save_mfcc_image(mfcc, save_filename)

# Batch run: write an MFCC image for every .wav file in the input directory.
input_directory = '../../Data/3666'
output_directory = 'mfcc'
os.makedirs(output_directory, exist_ok=True)

for filename in os.listdir(input_directory):
    # Non-WAV entries are ignored.
    if not filename.endswith('.wav'):
        continue
    wav_file_path = os.path.join(input_directory, filename)
    preprocess_and_save_as_image(wav_file_path, output_directory)


### WAV to MFCC 3D resized

In [2]:
import os
import numpy as np
import scipy.io.wavfile
import librosa
from skimage.transform import resize
import matplotlib.pyplot as plt

# Settings shared by the helpers in this cell.
# Pre-emphasis weight applied to the previous sample.
kPRE_EMPHASIS_COEFF = 0.97
# Sampling rate passed to librosa (the rate stored in the WAV file is not
# used - NOTE(review): confirm the two agree).
sample_rate = 100_000
# Clip length that, times sample_rate, gives the padding target below.
max_duration = 0.9999583333333333
max_samples = int(sample_rate * max_duration)

def pad_signal(signal, target_len):
    """Extend ``signal`` to ``target_len`` samples using zeros.

    A random number of the zeros goes in front (drawn with
    ``np.random.randint``); the remainder goes behind.

    Args:
        signal: Sequence of samples (assumed 1-D - TODO confirm).
        target_len: Required length in samples.

    Returns:
        The padded array. Input that is already ``target_len`` samples or
        longer is returned as the same object, without truncation.
    """
    shortfall = target_len - len(signal)
    needs_padding = shortfall > 0
    if needs_padding:
        zeros_before = np.random.randint(shortfall)
        zeros_after = shortfall - zeros_before
        signal = np.pad(signal, (zeros_before, zeros_after), mode='constant')
    return signal

def pre_emphasis(signal):
    """Pre-emphasise ``signal`` with coefficient ``kPRE_EMPHASIS_COEFF``.

    Keeps the first sample and replaces each following sample by itself
    minus the coefficient times the sample before it.
    """
    head = signal[0]
    tail = signal[1:] - kPRE_EMPHASIS_COEFF * signal[:-1]
    return np.append(head, tail)

def pipeline(signal):
    """Pre-emphasise ``signal`` and return its liftered MFCCs.

    Args:
        signal: Raw audio samples; pre-emphasis is applied here.

    Returns:
        Output of ``librosa.feature.mfcc`` configured for 12 coefficients,
        40 Mel bands, a Hamming window and no centre padding.
    """
    # Framing: window and hop are 0.025 and 0.01 times the sampling rate.
    framing = {
        'n_fft': int(sample_rate * 0.025),
        'hop_length': int(sample_rate * 0.01),
        'center': False,
        'window': 'hamming',
    }
    # Cepstral settings: orthonormal type-2 DCT followed by liftering.
    cepstrum = {'n_mfcc': 12, 'dct_type': 2, 'norm': 'ortho', 'lifter': 22}

    audio = pre_emphasis(signal).astype(float)
    return librosa.feature.mfcc(
        y=audio, sr=sample_rate, power=2, n_mels=40, **cepstrum, **framing
    )

def resize_mfcc(mfcc, target_size=(224, 224, 3)):
    """Resize ``mfcc`` to the first two dimensions of ``target_size``.

    Only ``target_size[:2]`` is used; the third entry does not affect the
    result, so the output shape follows those two values only.
    """
    output_shape = target_size[:2]
    return resize(mfcc, output_shape)

def preprocess_and_save_as_image(wav_file_path, output_directory):
    """Convert one WAV file into a resized MFCC JPEG.

    Args:
        wav_file_path: Path of the WAV file to read.
        output_directory: Directory that receives ``<name>_mfcc.jpg``.
    """
    # The sample rate stored in the file is ignored.
    _, samples = scipy.io.wavfile.read(wav_file_path)
    padded = pad_signal(samples, target_len=max_samples)

    # MFCCs (pre-emphasis happens inside pipeline), then resize.
    image = resize_mfcc(pipeline(padded))

    # Drop the last four characters of the file name (the ".wav" suffix of
    # the files selected by the caller) to form the output name.
    base_name = os.path.basename(wav_file_path)[:-4]
    save_path_mfcc = os.path.join(output_directory, f'{base_name}_mfcc.jpg')

    # Draw on the current figure, save without axes or margins, then close
    # the figure so figures do not pile up across files.
    plt.imshow(image)
    plt.axis('off')
    plt.savefig(save_path_mfcc, bbox_inches='tight', pad_inches=0, transparent=True)
    plt.close()

# Batch run: write a resized MFCC image for every .wav file in the input
# directory.
input_directory = '../../Data/3666'
output_directory = 'mfcc_handled'

# exist_ok=True creates the directory only when needed, without a separate
# existence check that could race with another process.
os.makedirs(output_directory, exist_ok=True)

for filename in os.listdir(input_directory):
    if filename.endswith('.wav'):
        wav_file_path = os.path.join(input_directory, filename)
        preprocess_and_save_as_image(wav_file_path, output_directory)


### WAV to MFC final

In [1]:
import os
import numpy as np
import scipy.io.wavfile
import librosa
from skimage.transform import resize
import imageio

# Preprocessing configuration.
kPRE_EMPHASIS_COEFF = 0.97  # weight of the previous sample in pre_emphasis()
# Rate given to librosa and used to size the analysis window and hop.
# NOTE(review): the WAV files' own rate is discarded on read - confirm it
# matches this value.
sample_rate = 100_000
max_duration = 0.9999583333333333  # multiplied by sample_rate below
max_samples = int(sample_rate * max_duration)  # length signals are padded to

def pad_signal(signal, target_len):
    """Zero-pad ``signal`` on both sides until it has ``target_len`` samples.

    The split between leading and trailing zeros is random: the leading
    count comes from ``np.random.randint`` and the rest trails.

    Args:
        signal: Sequence of samples (assumed 1-D - TODO confirm).
        target_len: Length to pad to, in samples.

    Returns:
        The padded array, or the original ``signal`` object when no padding
        is needed (longer signals are not shortened).
    """
    gap = target_len - len(signal)
    if gap < 1:
        return signal

    front = np.random.randint(gap)
    back = gap - front
    return np.pad(signal, (front, back), mode='constant')

def pre_emphasis(signal):
    """Filter ``signal`` with a first-order pre-emphasis difference.

    Returns an array whose first element is ``signal[0]`` and whose element
    n is ``signal[n] - kPRE_EMPHASIS_COEFF * signal[n - 1]`` for n >= 1.
    """
    return np.append(
        signal[0],
        signal[1:] - kPRE_EMPHASIS_COEFF * signal[:-1],
    )

def pipeline(signal):
    """Return liftered MFCCs of the pre-emphasised ``signal``.

    Args:
        signal: Raw audio samples; this function applies the pre-emphasis.

    Returns:
        Output of ``librosa.feature.mfcc`` with 12 coefficients computed
        from 40 Mel bands.
    """
    samples = pre_emphasis(signal).astype(float)

    # Window and hop are 0.025 and 0.01 times the sampling rate, in samples.
    frame_length = int(sample_rate * 0.025)
    frame_step = int(sample_rate * 0.01)

    mfcc_options = dict(
        sr=sample_rate,
        n_mfcc=12,
        dct_type=2,
        norm='ortho',
        lifter=22,
        n_fft=frame_length,
        hop_length=frame_step,
        power=2,
        center=False,
        window='hamming',
        n_mels=40,
    )
    return librosa.feature.mfcc(y=samples, **mfcc_options)

def preprocess_and_save_as_image(wav_file_path, save_directory):
    """Convert one WAV file into a 224x224 three-channel MFCC JPEG.

    The samples are padded to ``max_samples``, turned into MFCCs by
    ``pipeline``, replicated into three identical channels, resized, scaled
    to the 0-255 range and written to ``<save_directory>/<wav stem>.jpg``.

    Args:
        wav_file_path: Path of the WAV file to read.
        save_directory: Existing directory that receives the image.
    """
    # The sample rate stored in the file is ignored; pipeline() uses the
    # module-level sample_rate.
    _, signal = scipy.io.wavfile.read(wav_file_path)
    signal = pad_signal(signal, target_len=max_samples)

    mfcc = pipeline(signal)

    # Resize for compatibility with a pre-trained ResNet-34: stack three
    # copies as channels, move the channel axis last, then resize.
    stacked = np.rollaxis(np.array([mfcc] * 3), 0, 3)
    mfc_3d = resize(stacked, (224, 224, 3))

    # MFCC values are unbounded floats (typically including negatives), so a
    # direct cast to uint8 would wrap or be undefined. Min-max scale to
    # 0-255 first; a constant image becomes all zeros.
    lowest = mfc_3d.min()
    value_range = mfc_3d.max() - lowest
    if value_range > 0:
        scaled = (mfc_3d - lowest) / value_range * 255.0
    else:
        scaled = np.zeros_like(mfc_3d)
    pixels = np.round(scaled).astype(np.uint8)

    # Save the final image using imageio
    stem = os.path.splitext(os.path.basename(wav_file_path))[0]
    save_filename = os.path.join(save_directory, stem + '.jpg')
    imageio.imwrite(save_filename, pixels)

# Batch run: write the final MFC image for every .wav file in the input
# directory.
input_directory = '../../Data/AudioMNIST_Final'
output_directory = '../../Data_mfc/AudioMNIST'

# The image writer does not create missing directories, so make sure the
# (nested) output directory exists before the first file is saved.
os.makedirs(output_directory, exist_ok=True)

for filename in os.listdir(input_directory):
    if filename.endswith('.wav'):
        wav_file_path = os.path.join(input_directory, filename)
        preprocess_and_save_as_image(wav_file_path, output_directory)
