audio filename format modality: channel-emotion-intensity-statement-repetition-actor.wav
- Modality -> 01=audio-video, 02=video, 03-audio
- Channel -> 01=speech, 02=song
- Emotion -> 01=neutral, 02=calm, 03=happy, 04=sad, 05=Angry, 06=Fearful, 07=Disgust, 08=Surprised
- Intensity -> 01=Normal, 02=Strong
- Statement -> 01=Kids are talking by the door, 02=Dogs are sitting by the door
- Repetition -> 01=first repetition, 02=second repetition
- Actor -> 01=first actor, ..., 24=twenty-fourth actor

The only information needed is emotion label as other data can be disregarded.

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt

import pywt
import librosa
import librosa.display

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

import random
from random import seed, random, randint, sample
from scipy.signal import hilbert, chirp
from scipy.io import wavfile
from tqdm import tqdm

%matplotlib inline

In [2]:
# using speech data
speech_folder_name = './Audio_Speech_Actors_01-24/'
actors_folder_name = [os.path.join(speech_folder_name, actor) for actor in os.listdir(speech_folder_name)]
audio_files_path = [os.path.join(actor_num, file) for actor_num in actors_folder_name for file in os.listdir(actor_num)]
data = np.array([[file_path, int(file_path.split('\\')[-1].split('-')[2])-1] for file_path in audio_files_path])

labels = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
print(f"{len(audio_files_path)} Audio files fetched...\n")
labels_idx, count = np.unique(data[:, -1], return_counts=True)
for i in range(len(count)):
    print(f"{labels[int(labels_idx[i])]} -> {count[i]} samples.")


1440 Audio files fetched...

neutral -> 96 samples.
calm -> 192 samples.
happy -> 192 samples.
sad -> 192 samples.
angry -> 192 samples.
fearful -> 192 samples.
disgust -> 192 samples.
surprised -> 192 samples.


In [3]:
def add_awgn(audio):
    snr_db = np.random.uniform(15, 30)
    noise_std = np.sqrt(np.var(audio) / (10 ** (snr_db / 10)))
    gaussian_noise = np.random.normal(0, noise_std, len(audio))
    return audio + gaussian_noise

In [4]:
def preprocess_audio(audio):
    trimmed, idx = librosa.effects.trim(audio)
    norm_seq = (trimmed - np.mean(trimmed)) / np.std(trimmed)
    noisy = add_awgn(norm_seq)

    return norm_seq, noisy

In [5]:
def compute_wavelet_features(audio, label):
    wavelet = 'morl'
    sr = 16000
    widths = np.arange(1, 256)
    #print(f"Scales using: {widths}")

    dt = 1/sr
    frequencies = pywt.scale2frequency(wavelet=wavelet, scale=widths) / dt
    #print(f"Frequencies associated with the scales: {frequencies}")

    #creating filter to select frequencies between 20Hz and 5Khz - this is where most speech lies
    upper = [x for x in range(len(widths)) if frequencies[x] > 2000][-1]
    lower = [x for x in range(len(widths)) if frequencies[x] < 100][0]

    widths = widths[upper:lower]

    #computing wavelet transform 
    wavelet_coefs, freqs = pywt.cwt(audio, widths, wavelet=wavelet, sampling_period=dt)
    #print(f"shape of wavelet transform: {wavelet_coefs.shape}")

    # Fixed Segment Generation
    start = 0
    end = wavelet_coefs.shape[1]
    frames = []
    frame_size = 4000
    count = 0

    while start + frame_size <= end -1:
        f = (wavelet_coefs)[:, start:start+frame_size]
        assert f.shape[1] == frame_size
        frames.append(np.abs(f))
        start += frame_size

    frames = np.array(frames)
    frames = frames.reshape((len(frames), frame_size, wavelet_coefs.shape[0]))
    labels = np.ones(shape=(len(frames), 1))* int(label)

    return frames, labels


In [6]:
data = np.array([[file, int(file.split('\\')[-1].split('-')[2])-1] for file in audio_files_path])

In [7]:
x_train, x_, y_train, y_ = train_test_split(data[:, 0], data[:, -1], test_size=0.3, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_, y_, test_size=0.25, random_state=42)
labels = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

print(f"Training: {x_train.shape}, labels: {y_train.shape}")
print(f"Validation: {x_val.shape}, labels: {y_val.shape}")
print(f"Testing: {x_test.shape}, labels: {y_test.shape}")

Training: (1008,), labels: (1008,)
Validation: (324,), labels: (324,)
Testing: (108,), labels: (108,)


In [8]:
print(np.unique(y_train, return_counts=True))
print(np.unique(y_val, return_counts=True))
print(np.unique(y_test, return_counts=True))


(array(['0', '1', '2', '3', '4', '5', '6', '7'], dtype='<U61'), array([ 71, 124, 137, 134, 136, 146, 130, 130], dtype=int64))
(array(['0', '1', '2', '3', '4', '5', '6', '7'], dtype='<U61'), array([18, 47, 42, 45, 41, 35, 45, 51], dtype=int64))
(array(['0', '1', '2', '3', '4', '5', '6', '7'], dtype='<U61'), array([ 7, 21, 13, 13, 15, 11, 17, 11], dtype=int64))


In [9]:
test = x_train[0]
test_label = y_train[0]
seq, sr = librosa.load(test, sr=16000)
norm, noisy = preprocess_audio(seq)
segs, label = compute_wavelet_features(norm, test_label)
print(len(segs))

11


In [10]:
label.shape

(11, 1)

In [11]:
segs.shape

(11, 125, 4000)

In [17]:
# Set a seed for reproducibility
seed(42)

# Initialize lists to store data
x_train_wavelet = []
y_train_wavelet = []
uniq_id = []

# Iterate over individual labels
count = 0
num_rand_samp = 10

for label_index in range(len(labels)):
    label_indices = np.where(y_train == str(label_index))[0]
    selected_indices = sample(label_indices.tolist(), min(num_rand_samp, len(label_indices)))

    for audio_index in tqdm(selected_indices, desc=f"Label {label_index}"):
        current_sample = x_train[audio_index]
        seq, _ = librosa.load(current_sample, sr=16000)
        normalised_audio, noisy_audio = preprocess_audio(audio=seq)

        for audio_type, audio_data in enumerate([normalised_audio, noisy_audio]):
            features, labelss = compute_wavelet_features(audio=audio_data, label=label_index)

            # Randomly sample from features
            indices = np.arange(len(features))
            selected_indices = sample(indices.tolist(), min(num_rand_samp, len(indices)))
            selected_features = features[selected_indices]

            # Update lists
            uniq_id += [count] * len(selected_features)
            y_train_wavelet.extend(y_train[audio_index])

            if count == 0:
                x_train_wavelet = selected_features
            else:
                x_train_wavelet = np.concatenate((x_train_wavelet, selected_features), axis=0)

            count += 1

print(f"X: {x_train_wavelet.shape}")

Label 0: 100%|██████████| 10/10 [00:19<00:00,  1.97s/it]
Label 1: 100%|██████████| 10/10 [00:24<00:00,  2.43s/it]
Label 2: 100%|██████████| 10/10 [00:24<00:00,  2.45s/it]
Label 3: 100%|██████████| 10/10 [00:31<00:00,  3.12s/it]
Label 4: 100%|██████████| 10/10 [00:31<00:00,  3.16s/it]
Label 5: 100%|██████████| 10/10 [00:38<00:00,  3.83s/it]
Label 6: 100%|██████████| 10/10 [00:44<00:00,  4.47s/it]
Label 7: 100%|██████████| 10/10 [00:50<00:00,  5.06s/it]

X: (1510, 125, 4000)





In [18]:
x_train_wavelet.shape

(1510, 125, 4000)

In [19]:
y_train_wavelet = np.array(y_train_wavelet)

In [20]:
np.unique(y_train_wavelet, return_counts=True)

(array(['0', '1', '2', '3', '4', '5', '6', '7'], dtype='<U1'),
 array([20, 20, 20, 20, 20, 20, 20, 20], dtype=int64))

In [None]:
# saving training data locally
x_train_wavelet = []
y_train_wavelet = []

for i in range(len(labels)): #iterate over individual labels
    ind, = np.where(y_train == str(i))
    seed(i)
    ind = ind.tolist()

    for j in tqdm(range(len(x_train))):
        current_sample = x_train[j]
        seq, _ = librosa.load(current_sample, sr=16000)
        normalised_audio, noisy_audio = preprocess_audio(audio=seq)
        for i in range(2):
            if i == 0:
                F = compute_wavelet_features(audio=normalised_audio)
                F = F.astype(np.float16)
            else:
                F = compute_wavelet_features(audio=noisy_audio)
                F = F.astype(np.float16)
            
            y_train_wavelet.append(i)
            x_train_wavelet.append(F)
            

x_train_wavelet = np.array(x_train_wavelet)
print(f"X: {x_train_wavelet.shape}")

In [9]:
# saving training data locally
indices = []
x_train_wavelet = []
y_train_wavelet = []
uniq_id = []
count = 0

for i in range(len(labels)): #iterate over individual labels
    ind, = np.where(y_train == str(i))
    seed(i)
    ind = ind.tolist()
    ind = sample(ind, 50)
    audio_samples = x_train[ind]
    num_rand_samp = 50

    for j in tqdm(range(len(audio_samples))):
        current_sample = audio_samples[j]
        seq, _ = librosa.load(current_sample, sr=16000)
        normalised_audio, noisy_audio = preprocess_audio(audio=seq)
        for i in range(2):
            if i == 0:
                F = compute_wavelet_features(audio=normalised_audio)
                F = F.astype(np.float16)
            else:
                F = compute_wavelet_features(audio=noisy_audio)
                F = F.astype(np.float16)
            
            indices = np.arange(0, len(F), 1)
            indices = indices.tolist()
            indices = sample(indices, min(num_rand_samp, len(indices)))
            F = F[indices]
            uniq_id += [count] * len(F)
            y_train_wavelet.append(i)

            if count == 0 :
                x_train_wavelet = F
            else :
                x_train_wavelet = np.concatenate((x_train_wavelet, F), axis=0) 
            
            count += 1

print(f"X: {x_train_wavelet.shape}")

100%|██████████| 50/50 [02:14<00:00,  2.68s/it]
100%|██████████| 50/50 [02:44<00:00,  3.29s/it]
100%|██████████| 50/50 [02:36<00:00,  3.13s/it]
100%|██████████| 50/50 [03:03<00:00,  3.68s/it]
100%|██████████| 50/50 [03:11<00:00,  3.83s/it]
100%|██████████| 50/50 [03:38<00:00,  4.38s/it]
100%|██████████| 50/50 [04:08<00:00,  4.97s/it]
100%|██████████| 50/50 [03:56<00:00,  4.74s/it]

X: (40000, 157, 400)





In [10]:
x_train_wavelet = np.array(x_train_wavelet)

In [11]:
x_train_wavelet.shape

(40000, 157, 400)

In [12]:
y_train_wavelet = np.array(y_train_wavelet)
print("Y: ", y_train_wavelet.shape, " unique: ", np.unique(y_train_wavelet, return_counts=True))
# Write all features to a .npz file
np.savez_compressed(os.getcwd()+"/training_features", a=x_train_wavelet, b=y_train_wavelet)


Y:  (800,)  unique:  (array([0, 1]), array([400, 400], dtype=int64))


In [None]:
# saving training data locally
x_val_wavelet = []
y_val_wavelet = []

for i in range(len(labels)): #iterate over individual labels
    ind, = np.where(y_val == str(i))
    seed(i)
    ind = ind.tolist()

    for j in tqdm(range(len(x_val))):
        current_sample = x_val[j]
        seq, _ = librosa.load(current_sample, sr=16000)
        normalised_audio, noisy_audio = preprocess_audio(audio=seq)
        for i in range(2):
            if i == 0:
                F = compute_wavelet_features(audio=normalised_audio)
                F = F.astype(np.float16)
            else:
                F = compute_wavelet_features(audio=noisy_audio)
                F = F.astype(np.float16)
            
            y_val_wavelet.append(i)
            x_val_wavelet.append(F)
            
x_val_wavelet = np.array(x_val_wavelet)
print(f"X: {x_val_wavelet.shape}")

In [None]:
y_val_wavelet = np.array(y_val_wavelet)
print("Y: ", y_val_wavelet.shape, " unique: ", np.unique(y_val_wavelet, return_counts=True))
# Write all features to a .npz file
np.savez_compressed(os.getcwd()+"/val_features", a=x_val_wavelet, b=y_val_wavelet)


In [None]:
# saving training data locally
x_test_wavelet = []
y_test_wavelet = []

for i in range(len(labels)): #iterate over individual labels
    ind, = np.where(y_test == str(i))
    seed(i)
    ind = ind.tolist()

    for j in tqdm(range(len(x_test))):
        current_sample = x_test[j]
        seq, _ = librosa.load(current_sample, sr=16000)
        normalised_audio, noisy_audio = preprocess_audio(audio=seq)
        for i in range(2):
            if i == 0:
                F = compute_wavelet_features(audio=normalised_audio)
                F = F.astype(np.float16)
            else:
                F = compute_wavelet_features(audio=noisy_audio)
                F = F.astype(np.float16)
            
            y_test_wavelet.append(i)
            x_test_wavelet.append(F)
            
x_test_wavelet = np.array(x_test_wavelet)
print(f"X: {x_test_wavelet.shape}")

In [None]:
y_test_wavelet = np.array(y_test_wavelet)
print("Y: ", y_test_wavelet.shape, " unique: ", np.unique(y_test_wavelet, return_counts=True))
# Write all features to a .npz file
np.savez_compressed(os.getcwd()+"/test_features", a=x_test_wavelet, b=y_test_wavelet)


In [13]:
indices = []
x_val_wavelet = []
y_val_wavelet = []
uniq_id = []
count = 0

for i in range(len(labels)): #iterate over individual labels
    ind, = np.where(y_val == str(i))
    seed(i)
    ind = ind.tolist()
    ind = sample(ind, 50)
    audio_samples = x_val[ind]
    num_rand_samp = 50

    for j in tqdm(range(len(audio_samples))):
        current_sample = audio_samples[j]
        seq, _ = librosa.load(current_sample, sr=16000)
        normalised_audio, noisy_audio = preprocess_audio(audio=seq)
        for i in range(2):
            if i == 0:
                F = compute_wavelet_features(audio=normalised_audio)
                F = F.astype(np.float16)
            else:
                F = compute_wavelet_features(audio=noisy_audio)
                F = F.astype(np.float16)
            
            indices = np.arange(0, len(F), 1)
            indices = indices.tolist()
            indices = sample(indices, min(num_rand_samp, len(indices)))
            F = F[indices]
            uniq_id += [count] * len(F)
            y_val_wavelet.append(i)

            if count == 0 :
                x_val_wavelet = F
            else :
                x_val_wavelet = np.concatenate((x_val_wavelet, F), axis=0) 
            
            count += 1

print(f"X: {x_val_wavelet.shape}")

ValueError: Sample larger than population or is negative

In [None]:
# Write all features to a .npz file
np.savez_compressed(os.getcwd()+"/validation_features", a=x_val_wavelet, b=y_val_wavelet)

In [None]:
x_val_wavelet = [] # Store wavelet features. We have each sample into frames of length 400
y_val_wavelet = [] # Store class labels corresponding to wavelet features from an audio sample
uniq_id = []

for i in tqdm(range(len(x_val))) :

    curr_sample = x_val[i]
    seq, _ = librosa.load(curr_sample) 
    curr_target = y_val[i]
    F = compute_wavelet_features(seq)

    # Generate target labels corresponding to the frames of each sample
    y_val_wavelet += [curr_target] * len(F)
    uniq_id += [i] * len(F)

    if i == 0 :
        x_val_wavelet = F
    else :
        x_val_wavelet = np.concatenate((x_val_wavelet, F), axis=0) 

y_val_wavelet = np.array(y_val_wavelet) # Convert to numpy array
uniq_id = np.array(uniq_id)
print("X: ", x_val_wavelet.shape, "  y: ", y_val_wavelet.shape)

x_val_wavelet = x_val_wavelet.astype(np.float16)

# Write all features to a .npz file
np.savez_compressed(os.getcwd()+"/validation_features", a=x_val_wavelet, b=y_val_wavelet, c=uniq_id)


In [None]:
indices = []
x_test_wavelet = []
y_test_wavelet = []
uniq_id = []
count = 0

for i in range(len(labels)): #iterate over individual labels
    ind, = np.where(y_test == str(i))
    seed(i)
    ind = ind.tolist()
    ind = sample(ind, 50)
    audio_samples = x_test[ind]
    num_rand_samp = 50

    for j in tqdm(range(len(audio_samples))):
        current_sample = audio_samples[j]
        seq, _ = librosa.load(current_sample, sr=16000)
        normalised_audio, noisy_audio = preprocess_audio(audio=seq)
        for i in range(2):
            if i == 0:
                F = compute_wavelet_features(audio=normalised_audio)
                F = F.astype(np.float16)
            else:
                F = compute_wavelet_features(audio=noisy_audio)
                F = F.astype(np.float16)
            
            indices = np.arange(0, len(F), 1)
            indices = indices.tolist()
            indices = sample(indices, min(num_rand_samp, len(indices)))
            F = F[indices]
            uniq_id += [count] * len(F)
            y_test_wavelet.append(i)

            if count == 0 :
                x_test_wavelet = F
            else :
                x_test_wavelet = np.concatenate((x_test_wavelet, F), axis=0) 
            
            count += 1

print(f"X: {x_test_wavelet.shape}")

In [None]:
np.savez_compressed(os.getcwd()+"/testing_features", a=x_test_wavelet, b=y_test_wavelet)


In [None]:
"""x_test_wavelet = [] # Store wavelet features. We have each sample into frames of length 400
y_test_wavelet = [] # Store class labels corresponding to wavelet features from an audio sample
uniq_id = []

for i in tqdm(range(len(x_test))) :

    curr_sample = x_test[i]
    seq, _ = librosa.load(curr_sample) 
    curr_target = y_test[i]
    F = compute_wavelet_features(seq)

    # Generate target labels corresponding to the frames of each sample
    y_test_wavelet += [curr_target] * len(F)
    uniq_id += [i] * len(F)

    if i == 0 :
        x_test_wavelet = F
    else :
        x_test_wavelet = np.concatenate((x_test_wavelet, F), axis=0) 

y_test_wavelet = np.array(y_test_wavelet) # Convert to numpy array
uniq_id = np.array(uniq_id)
print("X: ", x_test_wavelet.shape, "  y: ", y_test_wavelet.shape)

x_test_wavelet = x_test_wavelet.astype(np.float16)

# Write all features to a .npz file
np.savez_compressed(os.getcwd()+"/testing_features", a=x_test_wavelet, b=y_test_wavelet, c=uniq_id)"""


In [3]:
data = np.load('./training_features.npz')

In [4]:
data['a'].shape

(40000, 157, 400)

In [6]:
np.unique(data['b'], return_counts=True)

(array([0, 1]), array([400, 400], dtype=int64))