In [125]:
import librosa
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import scipy.signal

from scipy.signal import lfilter

In [126]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# A) Data Labelling and Pre-processing

In [127]:
import numpy as np
import pandas as pd
import os


'''
LABELS :-
 0 - ANGER
 1 - BORED
 2 - DISGUST
 3 - ANXIETY
 4 - HAPPY
 5 - SAD
 6 - Neutral

GENDER :-
  1 - MALE
  0 - FEMALE
'''

labels_encoded = {'W':0, 'L':1, 'E':2, 'A':3, 'F':4, 'T':5, 'N':6}
gender_encoded = {'03':1 , '10' : 1, '11':1 , '12':1 , '15':1, '08':0, '09':0, '13':0 , '14':0, '16':0}


In [128]:
labels = []
file_paths = []

In [129]:
wavefilePath="./wav"
for file_name in os.listdir(wavefilePath):
  if file_name.endswith('.wav'):
    file_paths.append(file_name)
    labels.append(labels_encoded[file_name[5]])
df = pd.DataFrame(
    {
        'file_path':file_paths,
        'label':labels,
    }
)

 ---

# B) Feature Extraction:

## Functions:

### Audio load function

In [130]:
def load_wav(wav):
    y, sr = sf.read(wav)
    return y,sr

### Zero-padded signal

In [131]:
def pad_signal(win_len, win_hop, y, size):
        
    num_frames = (size - win_len) // win_hop + 1   # Calculate the number of frames without padding
    
    padding_samples = max(0, (num_frames - 1) * win_hop + win_len - size)   # Calculate the number of samples needed for padding
    padded_signal = np.pad(y, (0, padding_samples), mode='constant')  # Zero pad the signal     
    num_frames_padded = (size + padding_samples - win_len) // win_hop + 1  # Recalculate the number of frames after padding
    
    return padded_signal, num_frames_padded


### N-th frame of signal

In [132]:
def nth_frame(n, win_len, win_hop, y): # n >= 0
    x = np.zeros(win_len)

    for i in range(0,win_len):
        x[i] = y[(win_hop * n) + i] # y is the zero-padded signal
    
    return x


### Computing &beta; parameter

#### a) For one frame

In [133]:
def beta(y, sr, fl, fh, fm):
    fft_fr = np.abs(np.fft.fft(y))
    # fft_fr = 20 * np.log10(fft_fr)
    freq = np.fft.fftfreq(len(fft_fr), 1/sr)

    n = len(freq)
    sum_l = 0
    sum_h = 0

    for i in range(0,n):
        if 0 <= freq[i] <= fl:
            sum_l = sum_l + (fft_fr[i]*fft_fr[i])

    for i in range(0,n):
        if fh <= freq[i] <= fm:
            sum_h = sum_h + (fft_fr[i]*fft_fr[i])

    return sum_h/sum_l

#### b) For entire signal

In [134]:
def beta_arr(y, sr, fl, fh, fm, win_len, win_hop, frames):
    
    b = np.zeros(frames)

    for i in range(0,frames):
        x = nth_frame(i, win_len, win_hop, y)
        b[i] = beta(x, sr, fl, fh, fm)

    # b_mean = sum(b)/len(b)
    b_mean = np.mean(b)
    b_std_dev = np.std(b)
    
    return b, b_mean, b_std_dev


### ZFF signal

In [135]:
def zff(y, sr, fr_len):
    size = len(y)
    x = np.zeros(size)


    for i in range(0,size):
        x[i] = y[i] - y[i-1]
    

    b = [1]
    a = [1, -2, 1]

    y1 = lfilter(b, a, x)
    y2 = lfilter(b, a, y1)

    N = int(((int)((sr*fr_len)/1000) - 1)/2)
    n = len(y2)
    y = np.zeros(n)

    x_nm = np.sum(y2[:2 * N + 1])

    for i in range(N+1, n):
        if i + N+1 > n-1:
            y[i] = y[i - 1]
        else:
            y[i] = y2[i] - (x_nm / (2*N + 1))
            x_nm = x_nm - y2[i - N] + y2[i + (N+1)]


    x_nm = np.sum(y[:2 * N + 1])

    y0 = np.zeros(n)

    for i in range(N+1, n):
        if i + N+1 > n-1:
            y0[i] = y0[i - 1]
        else:
            y0[i] = y[i] - (x_nm / (2*N + 1))
            x_nm = x_nm - y[i - N] + y[i + (N+1)]


    y0 /= np.max(y0)

    return y0

    

### $F_0$ calculation

In [136]:
def pitch(y, sr, fr_len):

    z = zff(y,sr,fr_len)
    t = np.arange(0,len(z))/sr
    n = len(z)

    A = []

    for i in range(0,n):

        if z[i] == 0 and 0 <= i <= n-2:
            if z[i-1] < z[i] < z[i+1]:
                A.append(t[i])
        
        else:
            if i <= n-2 and z[i+1] > 0 and z[i] < 0:
                x = (t[i] + t[i+1])/2
                A.append(x)

    n = len(A) - 1

    B = np.zeros(n)

    for i in range(0,n):
        B[i] = A[i+1] - A[i]
        B[i] = 1/B[i]

    b_mean = np.mean(B)
    b_std_dev = np.std(B)
    
    return B, b_mean, b_std_dev

### SoE calculation

In [137]:
def slope(x1, x2, y1, y2):
    return (y2-y1)/(x2-x1)

def SoE(y, sr, fr_len):

    z = zff(y,sr,fr_len)
    t = np.arange(0,len(z))/sr
    n = len(z)

    A = []

    for i in range(0,n):

        if z[i] == 0 and 0 <= i <= n-2:
            if z[i-1] < z[i] < z[i+1]:
                s = slope(t[i-1],t[i+1],z[i-1],z[i+1])
                A.append(s)
        
        else:
            if i <= n-2 and z[i+1] > 0 and z[i] < 0:
                s = slope(t[i],t[i+1],z[i],z[i+1])
                A.append(s)

    a_mean = np.mean(A)
    a_std_dev = np.std(A)

    
    return A, a_mean, a_std_dev

### GCI extraction

In [161]:
def gci(x):

    # Compute the first derivative of the speech signal
    speech_derivative = np.diff(x)

    # Identify zero-crossings in the derivative signal
    zero_crossings = np.where(np.diff(np.sign(speech_derivative)))[0]

    # Filter zero-crossings based on amplitude and spacing criteria
    gci_candidates = [zc for zc in zero_crossings if np.abs(speech_derivative[zc]) > 0.01]

    return gci_candidates

### Hilbert envelope extraction

In [165]:
def hilbert_env(x,sr,frame_length,frame_shift,order_lp):
    # LP analysis
    frames = librosa.effects.split(x, top_db=30, frame_length=int(frame_length * 0.001 * sr), hop_length=int(frame_shift * 0.001 * sr))
    lp_residuals = []

    for frame in frames:
        if(frame[1] > frame[0]):
            # LP analysis
            coefficients = librosa.lpc(x[frame[0]:frame[1]], order = order_lp)
            
            # LP residual
            lp_residual = np.convolve(coefficients, x[frame[0]:frame[1]], mode='full')[:len(x[frame[0]:frame[1]])]
            lp_residuals.append(lp_residual)

    if(len(lp_residuals) != 0):
        lp_residuals = np.concatenate(lp_residuals)

        # Hilbert envelope
        hilbert_envelope = np.abs(scipy.signal.hilbert(lp_residuals))

        return hilbert_envelope
    
    else:
        return []

### EoE extraction

In [166]:
def EoE(x,sr,frame_length,frame_shift,order_lp):
    gci_x = gci(x)
    gci_len = len(gci_x)

    EoE_arr = []

    for i in range(1,gci_len):
        y = x[gci_x[i]-22 : gci_x[i]+22]
        y1 = hilbert_env(y,sr,frame_length,frame_shift,order_lp)
        
        if(len(y1) != 0):
            EoE_arr.append(sum(y1**2))

    a_mean = np.mean(EoE_arr)
    a_std_dev = np.std(EoE_arr)
    
    return EoE_arr, a_mean, a_std_dev


---
# C) Training and Testing:

In [141]:
win_len = 512
win_hop = 256

fl = 550
fh = 800
fm = 4000


In [142]:
fr_len = 10 # in milliseconds

In [203]:
#Pitch feature extraction

pitch_list = []
pitch_label = []

x=1
for file in df['file_path']:
    y1,sr=librosa.load(f'./wav/{file}')
    size1 = np.size(y1)
    x1, frames1 = pad_signal(win_len, win_hop, y1, size1)
    b1, m1, sd1 = pitch(x1, sr, fr_len)
    pitch_list.append(b1)
    pitch_label.append(labels_encoded[file[5]])

# audio_label

In [204]:
# Padding the arrays:

len_pitch_list = len(pitch_list)
max_len_pl = -1
for i in range(0,len_pitch_list):
    max_len_pl = max(max_len_pl, len(pitch_list[i]))


max_length = max(len(arr) for arr in pitch_list)


for i in range(len(pitch_list)):
    current_length = len(pitch_list[i])
    if current_length < max_length:
        pitch_list[i] = np.concatenate((pitch_list[i], np.zeros(max_length - current_length)))
    pitch_list[i] = np.nan_to_num(pitch_list[i], nan=0)
    

In [342]:

X1 = pitch_list
Y1 = pitch_label
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X1,Y1,test_size=0.2)

np.random.seed(10)
model=RandomForestClassifier()
model.fit(X_train1,Y_train1)
model.score(X_test1,Y_test1)
# X_train

0.5462962962962963

In [212]:
#Loudness feature extraction

beta_list = []
beta_label = []

x=1
for file in df['file_path']:
    y2,sr=librosa.load(f'./wav/{file}')
    size2 = np.size(y2)
    x2, frames2 = pad_signal(win_len, win_hop, y2, size2)
    b2, m2, sd2 = beta_arr(x2, sr, fl, fh, fm, win_len, win_hop, int(frames2))
    beta_list.append(b2)
    beta_label.append(labels_encoded[file[5]])


  return sum_h/sum_l


In [213]:
# Padding the arrays:

len_beta_list = len(beta_list)
max_len_bl = -1
for i in range(0,len_beta_list):
    max_len_bl = max(max_len_bl, len(beta_list[i]))


max_length = max(len(arr) for arr in beta_list)


for i in range(len(beta_list)):
    current_length = len(beta_list[i])
    if current_length < max_length:
        beta_list[i] = np.concatenate((beta_list[i], np.zeros(max_length - current_length)))
    beta_list[i] = np.nan_to_num(beta_list[i], nan=0)

In [344]:
X2 = beta_list
Y2 = beta_label
X_train2, X_test2, Y_train2, Y_test2=train_test_split(X2,Y2,test_size=0.2)

np.random.seed(10)
model=RandomForestClassifier()
model.fit(X_train2,Y_train2)
model.score(X_test2,Y_test2)
# X_train

0.46296296296296297

In [216]:
#SoE feature extraction

SoE_list = []
SoE_label = []

x=1
for file in df['file_path']:
    y3,sr=librosa.load(f'./wav/{file}')
    size3 = np.size(y3)
    x3, frames3 = pad_signal(win_len, win_hop, y3, size3)
    b3, m3, sd3 = SoE(x3, sr, fr_len)
    
    SoE_list.append(b3)
    SoE_label.append(labels_encoded[file[5]])


In [217]:
# Padding the arrays:

len_SoE_list = len(SoE_list)
max_len_sl = -1
for i in range(0,len_SoE_list):
    max_len_sl = max(max_len_sl, len(SoE_list[i]))


max_length = max(len(arr) for arr in SoE_list)


for i in range(len(SoE_list)):
    current_length = len(SoE_list[i])
    if current_length < max_length:
        SoE_list[i] = np.concatenate((SoE_list[i], np.zeros(max_length - current_length)))
    SoE_list[i] = np.nan_to_num(SoE_list[i], nan=0)

In [347]:
X3 = SoE_list
Y3 = SoE_label
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(X3,Y3,test_size=0.2)

np.random.seed(10)
model=RandomForestClassifier()
model.fit(X_train3,Y_train3)
model.score(X_test3,Y_test3)

0.42592592592592593

In [172]:
#EoE feature extraction

order_lp = 10
frame_length = 16  # in milliseconds
frame_shift = 2    # in milliseconds
region_around_gci = 2  # in milliseconds

EoE_list = []
EoE_label = []

x=1
cnt = 0
for file in df['file_path']:
        
        y4,sr=librosa.load(f'./wav/{file}')
        size4 = np.size(y4)
        x4, frames4 = pad_signal(win_len, win_hop, y4, size4)
        b4, m4, sd4 = EoE(x4,sr,frame_length,frame_shift,order_lp)
        
        EoE_list.append(b4)
        EoE_label.append(labels_encoded[file[5]])

# len(EoE_list[0])

In [173]:
# Padding the arrays:

len_EoE_list = len(EoE_list)
max_len_el = -1
for i in range(0,len_EoE_list):
    max_len_el = max(max_len_el, len(EoE_list[i]))


max_length = max(len(arr) for arr in EoE_list)


for i in range(len(EoE_list)):
    current_length = len(EoE_list[i])
    if current_length < max_length:
        EoE_list[i] = np.concatenate((EoE_list[i], np.zeros(max_length - current_length)))
    EoE_list[i] = np.nan_to_num(EoE_list[i], nan=0)

In [353]:
X4 = EoE_list
Y4 = EoE_label
X_train4, X_test4, Y_train4, Y_test4 = train_test_split(X4,Y4,test_size=0.2)

np.random.seed(10)
model=RandomForestClassifier()
model.fit(X_train4,Y_train4)
model.score(X_test4,Y_test4)

0.3333333333333333

In [305]:
# Training for pitch and loudness together

pitch_beta_list = []

for i in range(len_pitch_list):
    pitch_beta_list.append(np.concatenate((pitch_list[i], beta_list[i])))

X12 = pitch_beta_list
Y12 = SoE_label
X_train12, X_test12, Y_train12, Y_test12 = train_test_split(X12,Y12,test_size=0.2)

np.random.seed(10)
model=RandomForestClassifier()
model.fit(X_train12,Y_train12)
model.score(X_test12,Y_test12)

0.44755244755244755

In [356]:
# Training for pitch and SoE together

pitch_SoE_list = []

for i in range(len_pitch_list):
    pitch_SoE_list.append(np.concatenate((pitch_list[i], SoE_list[i])))

X13 = pitch_SoE_list
Y13 = SoE_label
X_train13, X_test13, Y_train13, Y_test13 = train_test_split(X13,Y13,test_size=0.2)

np.random.seed(10)
model=RandomForestClassifier()
model.fit(X_train13,Y_train13)
model.score(X_test13,Y_test13)

0.4537037037037037

In [358]:
# Training for loudness and SoE together

beta_SoE_list = []

for i in range(len_pitch_list):
    beta_SoE_list.append(np.concatenate((beta_list[i], SoE_list[i])))

X23 = beta_SoE_list
Y23 = SoE_label
X_train23, X_test23, Y_train23, Y_test23 = train_test_split(X23,Y23,test_size=0.2)

np.random.seed(10)
model=RandomForestClassifier()
model.fit(X_train23,Y_train23)
model.score(X_test23,Y_test23)

0.49074074074074076

In [372]:
# Training for loudness and EoE together

beta_EoE_list = []

for i in range(len_pitch_list):
    beta_EoE_list.append(np.concatenate((beta_list[i], EoE_list[i])))

X24 = beta_EoE_list
Y24 = SoE_label
X_train24, X_test24, Y_train24, Y_test24 = train_test_split(X24,Y24,test_size=0.2)

np.random.seed(15)
model=RandomForestClassifier()
model.fit(X_train24,Y_train24)
model.score(X_test24,Y_test24)

0.4166666666666667

In [379]:
# Training for three features: pitch, beta and SoE together

full_list1 = []


for i in range(len_pitch_list):
    full_list1.append(np.concatenate((pitch_list[i], np.concatenate((beta_list[i], SoE_list[i])))))

X123 = full_list1
Y123 = SoE_label
X_train123, X_test123, Y_train123, Y_test123 = train_test_split(X123,Y123,test_size=0.2)

np.random.seed(10)

model=RandomForestClassifier()

model.fit(X_train123,Y_train123)
model.score(X_test123,Y_test123)

0.5370370370370371

In [361]:
# Training for three features: pitch, beta and EoE together

full_list2 = []


for i in range(len_pitch_list):
    full_list2.append(np.concatenate((pitch_list[i], np.concatenate((beta_list[i], EoE_list[i])))))

X124 = full_list2
Y124 = SoE_label
X_train124, X_test124, Y_train124, Y_test124 = train_test_split(X124,Y124,test_size=0.2)

np.random.seed(10)

model=RandomForestClassifier()

model.fit(X_train124,Y_train124)
model.score(X_test124,Y_test124)

0.4537037037037037

In [381]:
# Training for three features: pitch, SoE and EoE together

full_list3 = []


for i in range(len_pitch_list):
    full_list3.append(np.concatenate((pitch_list[i], np.concatenate((SoE_list[i], EoE_list[i])))))

X134 = full_list3
Y134 = SoE_label
X_train134, X_test134, Y_train134, Y_test134 = train_test_split(X134,Y134,test_size=0.2)

np.random.seed(15)

model=RandomForestClassifier()

model.fit(X_train134,Y_train134)
model.score(X_test134,Y_test134)

0.4351851851851852

In [363]:
# Training for all features

full_list4 = []


for i in range(len_pitch_list):
    full_list4.append(np.concatenate((pitch_list[i], np.concatenate((beta_list[i], np.concatenate((SoE_list[i], EoE_list[i])))))))

X1234 = full_list4
Y1234 = SoE_label
X_train1234, X_test1234, Y_train1234, Y_test1234 = train_test_split(X1234,Y1234,test_size=0.2)

np.random.seed(10)

model=RandomForestClassifier()

model.fit(X_train1234,Y_train1234)
model.score(X_test1234,Y_test1234)

0.5185185185185185

In [331]:
# # Parameters
# order_lp = 10
# frame_length = 16  # in milliseconds
# frame_shift = 2    # in milliseconds
# region_around_gci = 2  # in milliseconds

# x_load, sr = sf.read("03a01Fa.wav")
# ans, b, m = EoE(x_load,sr,frame_length,frame_shift,order_lp)
# np.max(ans)
# m

1.070886329595765

In [368]:
# z = model.predict(X_test1234)
# # print(z)

# # len(X_test1234)

# print(accuracy_score(Y_test1234, z))

0.5185185185185185
