# Useful references:
- Audio feature extraction tutorial: http://practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/#deltas-and-delta-deltas
- Python feature extraction package documentation: https://github.com/jameslyons/python_speech_features

In [3]:
# Initial setup
import os
import numpy as np
from google.colab import drive
import wave
import pandas as pd
!pip install python_speech_features
import python_speech_features as ps
import os
import pickle

# Seed NumPy's global RNG so the np.random.shuffle calls later in the notebook are repeatable
np.random.seed(1234)
# Mount Google Drive inside the Colab VM (asks for an authorization code)
drive.mount('/content/drive')
# Switch to the project folder so that relative file names resolve against it
%cd "drive/My Drive/ptsa"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
/content/drive/My Drive/ptsa


In [4]:
# Load meta data
# The "train" flag and "emotion" label columns of this table are used by later cells
meta_df = pd.read_csv("meta_data.csv")
# Show the first rows as a quick sanity check
meta_df.head()

Unnamed: 0,file_name,emotion,intensity,statement,repetition,actor,emotion_str,train
0,03-01-01-01-01-02-06.wav,0,1,1,2,6,neutral,False
1,03-01-01-01-02-02-06.wav,0,1,2,2,6,neutral,True
2,03-01-01-01-02-01-06.wav,0,1,2,1,6,neutral,True
3,03-01-01-01-01-01-06.wav,0,1,1,1,6,neutral,False
4,03-01-02-01-01-01-06.wav,1,1,1,1,6,calm,True


# Step 1: Extract raw features
- Log Mel-filterbank energy
- Deltas
- Delta-deltas

In [0]:
def read_file(filename):
    """
    Read a .wav file and return its samples, time axis and frame rate

    Parameters
    ----------
    filename : path of the .wav file to read

    Returns
    -------
    wavedata : 1-D array obtained by reinterpreting the raw frame bytes as
        np.short values (NOTE(review): this presumes 16-bit samples; for
        multi-channel files the channels stay interleaved - confirm the
        inputs are 16-bit mono)
    time : 1-D array with the time stamp in seconds of every frame,
        i.e. arange(nframes) / framerate
    framerate : sampling frequency reported by the file header
    """
    # The context manager closes the file even when reading raises
    with wave.open(filename, 'rb') as wav_file:
        framerate = wav_file.getframerate()
        nframes = wav_file.getnframes()
        # Read all frames as one bytes object
        raw_bytes = wav_file.readframes(nframes)
    # np.fromstring is deprecated for binary input; np.frombuffer yields a
    # read-only view of the bytes, so copy to hand back a writable array
    wavedata = np.frombuffer(raw_bytes, dtype = np.short).copy()
    time = np.arange(0, nframes) * (1.0 / framerate)
    return wavedata, time, framerate

def get_fixed_length(data, time, framerate, 
                     start = 0.5, end = 3.5, pad_value = 0.0):
    """
    Generate data with fixed duration

    Parameters
    ----------
    data : 1-D array of samples
    time : 1-D array with the time stamp (seconds) of each sample
    framerate : number of samples per second
    start, end : bounds in seconds; samples with start < time <= end are kept
    pad_value : value appended when the selection is shorter than the window

    Returns
    -------
    Array of exactly round((end - start) * framerate) samples: the selected
    samples, right-padded with pad_value or trimmed at the end as needed
    (empty when end <= start).
    """
    # Filter data to the specified range
    data_new = data[np.where((time > start) & (time <= end))]
    # Round instead of truncating: products such as 0.29 * 100 evaluate to
    # 28.999999999999996, and int() would make the output one sample short
    target_len = max(int(round((end - start) * framerate)), 0)
    missing = target_len - len(data_new)
    if missing > 0:
        # Add padding when needed
        data_new = np.pad(data_new, (0, missing), "constant", constant_values=(pad_value))
    elif missing < 0:
        # Trim surplus samples so every output has the same length
        data_new = data_new[:target_len]
    return data_new

In [14]:
path = "/content/drive/My Drive/ptsa/raw_data"
# Common time window (seconds) applied to every recording to cut away silence
start = 0.5
end = 3.51
# One entry per audio file, appended in directory-traversal order
log_fbank_raw = []
delta_raw = []
delta_delta_raw = []

# NOTE(review): later cells index these lists with meta_df row labels, so the
# os.walk order presumably has to match the row order of meta_data.csv - confirm
for subdir, dirs, files in os.walk(path):
    for file_name in files:
        # Ignore anything whose name does not contain ".wav"
        if ".wav" not in file_name:
            continue
        # Samples, time axis and frame rate of this recording
        data, time, framerate = read_file(os.path.join(subdir, file_name))
        # Restrict (and pad if needed) to the common time window
        data = get_fixed_length(data, time, framerate, start, end)
        # Log Mel-filterbank energies, then deltas and delta-deltas
        log_fbank = ps.logfbank(data, framerate, nfilt = 40, nfft = 1200)
        delta = ps.delta(log_fbank, 2)
        delta_delta = ps.delta(delta, 2)
        # Collect the three feature matrices for this file
        log_fbank_raw.append(log_fbank)
        delta_raw.append(delta)
        delta_delta_raw.append(delta_delta)

  # This is added back by InteractiveShellApp.init_path()


In [0]:
# Turn each per-file feature list into a single NumPy array
log_fbank_raw, delta_raw, delta_delta_raw = (
    np.array(feature_list)
    for feature_list in (log_fbank_raw, delta_raw, delta_delta_raw)
)

# np.save("log_fbank_raw.npy", log_fbank_raw)
# np.save("delta_raw.npy", delta_raw)
# np.save("delta_delta_raw.npy", delta_delta_raw)

# Step 2: Compute mean and standard deviation of features using training data only

In [0]:
# log_fbank_raw = np.load("log_fbank_raw.npy")
# delta_raw = np.load("delta_raw.npy")
# delta_delta_raw = np.load("delta_delta_raw.npy")

In [0]:
# Row labels of the files flagged as training data in the meta data
train_ind = meta_df[meta_df["train"]].index.tolist()

# Mean and standard deviation along the first axis, training rows only
log_fbank_mean = log_fbank_raw[train_ind].mean(axis = 0)
log_fbank_std = log_fbank_raw[train_ind].std(axis = 0)

delta_mean = delta_raw[train_ind].mean(axis = 0)
delta_std = delta_raw[train_ind].std(axis = 0)

delta_delta_mean = delta_delta_raw[train_ind].mean(axis = 0)
delta_delta_std = delta_delta_raw[train_ind].std(axis = 0)

# Step 3: Normalize features

In [0]:
# Normalize features
def _safe_std(std):
    """
    Return std with exact zeros replaced by 1.0

    A bin that is constant over the training files has zero standard
    deviation; dividing by it would put NaN/inf into the saved features.
    """
    return np.where(std == 0, 1.0, std)

# Standardize each feature with the training-set mean and standard deviation
log_fbank = (log_fbank_raw - log_fbank_mean)/_safe_std(log_fbank_std)
delta = (delta_raw - delta_mean)/_safe_std(delta_std)
delta_delta = (delta_delta_raw - delta_delta_mean)/_safe_std(delta_delta_std)
# Combine features: the three feature types are stacked along a new axis 3
X = np.stack((log_fbank, delta, delta_delta), axis = 3)

In [0]:
# Partition the row labels by the "train" flag of the meta data
is_train = meta_df["train"]
train_ind = meta_df[is_train].index.tolist()
test_ind = meta_df[~is_train].index.tolist()
# Shuffle both label lists in place (train first, then test) with NumPy's global RNG
np.random.shuffle(train_ind)
np.random.shuffle(test_ind)
# Features in shuffled order
X_train = X[train_ind]
X_test = X[test_ind]
# Emotion labels in the same shuffled order
y_train = meta_df.loc[train_ind, "emotion"].values
y_test = meta_df.loc[test_ind, "emotion"].values

In [0]:
# Write the four split arrays to .npy files in the current working directory
for out_name, out_array in (
    ("X_train.npy", X_train),
    ("X_test.npy", X_test),
    ("y_train.npy", y_train),
    ("y_test.npy", y_test),
):
    np.save(out_name, out_array)