# Heidelberg Digits Encoding with Mel-Spectrogram

In [None]:
import os
import matplotlib.pyplot as plt
import librosa
import librosa.display
import IPython.display as ipd
import numpy as np
import random
import pandas as pd
import re
from tqdm import trange
import copy

In [None]:
# Change into the Heidelberg Digits (HD) audio directory.

# Navigate to the folder where https://zenkelab.org/datasets/hd_audio.tar.gz was downloaded and extracted.
# The following cells call os.listdir() without an argument, so they operate on this directory.
os.chdir("/its/home/ts468/data/rawHD/hd_audio/audio")

print("current cwd", os.getcwd())


In [None]:
# Pick one entry of the current directory at random and play it.
cd = os.listdir()

# random.choice always yields a valid element. The previous
# random.randint(0, len(...)) includes its upper bound, so it could index one
# past the end of the list and raise IndexError.
sample_file = random.choice(cd)

print(sample_file)

# load audio file to be played
ipd.Audio(sample_file)

In [None]:
def to_mel_spectrogram(file_name, display = False):
    """Encode one audio file as a fixed-size log mel filter-bank array.

    Processing steps: load the file with ``librosa.load`` (default
    arguments), apply a pre-emphasis filter, cut the signal into 25 ms frames
    with a 10 ms stride, apply a Hamming window, compute a 512-point power
    spectrum, project it onto 40 triangular mel filters, convert to dB,
    crop or pad to 80 frames and finally rotate the array by 90 degrees.

    Parameters
    ----------
    file_name : str or path-like
        Audio file handed to ``librosa.load``.
    display : bool, optional
        When true, the array is plotted and its shape printed, and the
        function returns ``None``. When false (default), the array is
        returned and nothing is plotted.

    Returns
    -------
    numpy.ndarray or None
        Array of shape ``(40, 80)`` (filters x frames) when ``display`` is
        false. Because of ``np.rot90``, row 0 holds the last (highest) mel
        filter. ``None`` when ``display`` is true.
    """
    # load audio files with librosa
    signal, sr = librosa.load(file_name)

    # pre-emphasis filter: y[t] = x[t] - alpha * x[t - 1]
    alpha = .95
    emphasized_signal = np.append(signal[0], signal[1:] - alpha * signal[:-1])

    # framing: convert the frame size/stride from seconds to samples
    frame_size = .025
    frame_stride = .01
    frame_length = int(round(frame_size * sr))
    frame_step = int(round(frame_stride * sr))
    signal_length = len(emphasized_signal)

    # Always keep at least one frame. Without the max(), a signal exactly one
    # frame long gives zero frames and the padding step below fails on
    # filter_banks[-1, :].
    num_frames = max(
        1,
        int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step)),
    )

    # Zero-pad so that every frame has the same number of samples.
    pad_signal_length = num_frames * frame_step + frame_length
    z = np.zeros((pad_signal_length - signal_length))
    pad_signal = np.append(emphasized_signal, z)

    # indices[f, s] is the sample index of sample s inside frame f.
    sample_offsets = np.tile(np.arange(0, frame_length), (num_frames, 1))
    frame_starts = np.tile(
        np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)
    ).T
    indices = sample_offsets + frame_starts
    frames = pad_signal[indices.astype(np.int32, copy=False)]

    # Hamming window
    frames *= np.hamming(frame_length)

    # fast Fourier-Transform and Power Spectrum
    NFFT = 512
    mag_frames = np.absolute(np.fft.rfft(frames, NFFT))  # Magnitude of the FFT
    pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2))  # Power Spectrum

    # filter banks: nfilt triangular filters equally spaced on the mel scale
    nfilt = 40
    low_freq_mel = 0
    high_freq_mel = (2595 * np.log10(1 + (sr / 2) / 700))  # Convert Hz to Mel
    mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)  # Equally spaced in Mel scale
    hz_points = (700 * (10**(mel_points / 2595) - 1))  # Convert Mel to Hz
    # FFT bin of every filter edge (named bin_edges to avoid shadowing the
    # builtin ``bin``).
    bin_edges = np.floor((NFFT + 1) * hz_points / sr)

    fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))
    for m in range(1, nfilt + 1):
        f_m_minus = int(bin_edges[m - 1])   # left
        f_m = int(bin_edges[m])             # center
        f_m_plus = int(bin_edges[m + 1])    # right

        # rising slope of the triangle
        for k in range(f_m_minus, f_m):
            fbank[m - 1, k] = (k - bin_edges[m - 1]) / (bin_edges[m] - bin_edges[m - 1])
        # falling slope of the triangle
        for k in range(f_m, f_m_plus):
            fbank[m - 1, k] = (bin_edges[m + 1] - k) / (bin_edges[m + 1] - bin_edges[m])
    filter_banks = np.dot(pow_frames, fbank.T)
    # Replace exact zeros so that log10 below never sees 0.
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)  # Numerical Stability
    filter_banks = 20 * np.log10(filter_banks)  # dB

    # Crop or pad to 80 steps by repeating the last frame
    target_steps = 80
    current_steps = filter_banks.shape[0]
    if current_steps < target_steps:
        padding = np.tile(filter_banks[-1, :], (target_steps - current_steps, 1))
        filter_banks = np.vstack((filter_banks, padding))
    elif current_steps > target_steps:
        filter_banks = filter_banks[:target_steps, :]

    # (frames, filters) -> (filters, frames)
    filter_banks = np.rot90(filter_banks)

    if display:
        # Display the filter banks with the 'viridis' colormap
        plt.figure(figsize=(10, 4))
        librosa.display.specshow(filter_banks, x_axis='time', y_axis='mel', sr=sr, cmap='viridis')
        plt.colorbar(format='%+2.0f dB')
        plt.title('Mel Filter Banks with Pre-Emphasis Filter (Cropped/Padded to 80 Steps)')
        plt.tight_layout()
        plt.show()

        print(filter_banks.shape)

    else:
        return filter_banks

In [None]:
# Plot the encoding of a randomly chosen file. random.choice avoids the
# out-of-range index that random.randint(0, len(...)) can produce, because
# randint includes its upper bound.
to_mel_spectrogram(random.choice(cd), display=True)

In [None]:
# Read the train/test split lists (one filename per line).
# rstrip("\r\n") removes only the line terminator. The previous line[:-1]
# cut off the last character of a final line without a trailing newline and
# left a stray "\r" on files with Windows line endings.

# load a list of training audio files
with open("/its/home/ts468/data/rawHD/hd_audio/train_filenames.txt", "r") as file:
    train_files = [line.rstrip("\r\n") for line in file]

# load a list of testing audio files
with open("/its/home/ts468/data/rawHD/hd_audio/test_filenames.txt", "r") as file:
    test_files = [line.rstrip("\r\n") for line in file]

In [None]:
# Encode every file of the current directory and sort it into the training or
# the testing split. A file listed in train_files goes to training; every
# other file goes to testing.
training_x_data = []
training_y_data = []
testing_x_data = []
testing_y_data = []

training_details = pd.DataFrame({'Language': [], 
                                 'Speaker': [], 
                                 'Trial': [], 
                                 'Label': [],
                                 'classification label': []})

testing_details = pd.DataFrame({'Language': [], 
                                 'Speaker': [], 
                                 'Trial': [], 
                                 'Label': [],
                                 'classification label': []})

# List the directory once: repeated os.listdir() calls inside the loop are
# redundant I/O and are not guaranteed to return the same order every time.
file_names = os.listdir()
# Set membership is O(1); testing against the list is O(len(train_files)).
train_file_set = set(train_files)

# save all to a list
for i in trange(len(file_names)):
    file_name = file_names[i]
    # Fields 1, 3, 5 and 7 of the split name are read as language, speaker,
    # trial and digit.
    split_values = re.split("[. _ -]", file_name)

    language = split_values[1]
    digit = int(split_values[7])
    # English digits keep their value as class label, every other language
    # is shifted by 10.
    class_label = digit if language == "english" else digit + 10
    details_row = {'Language': language,
                   'Speaker': int(split_values[3]),
                   'Trial': int(split_values[5]),
                   'Label': digit,
                   'classification label': class_label}

    spectrogram = copy.deepcopy(to_mel_spectrogram(file_name))

    if file_name in train_file_set:
        training_x_data.append(spectrogram)
        training_y_data.append(class_label)
        training_details.loc[len(training_details)] = details_row
    else:
        testing_x_data.append(spectrogram)
        testing_y_data.append(class_label)
        testing_details.loc[len(testing_details)] = details_row

# Save dataset


In [None]:
# Navigate to the location where the encoded data is saved.
# The np.save / to_csv calls in the next cell use relative file names, so the
# files are written into this directory.
os.chdir("/its/home/ts468/data/rawHD/experimental_1")
print("current cwd", os.getcwd())

In [None]:
# Write the encoded inputs and labels of both splits as .npy files into the
# current working directory.
for npy_name, split_data in (
    ("training_x_data.npy", training_x_data),
    ("training_y_data.npy", training_y_data),
    ("testing_x_data.npy", testing_x_data),
    ("testing_y_data.npy", testing_y_data),
):
    np.save(npy_name, split_data)

# Write the per-sample metadata tables next to the arrays.
for csv_name, details_table in (
    ("training_details.csv", training_details),
    ("testing_details.csv", testing_details),
):
    details_table.to_csv(csv_name)

In [None]:
# List the entries of the current working directory; as the last expression
# of the cell, its value is shown as the cell output.
os.listdir()