In [1]:
# import required libraries

# utilities
import os
import random

from IPython.display import Markdown as md
import librosa
import numpy as np
import pandas as pd

# stats
from scipy.stats import norm

#Clustering models
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from kneed import KneeLocator

# plotting
import matplotlib.pyplot as plt


In [3]:
# Initialize common variables
num_trials = 20
seed_value = 123467

# Seed both random number generators used in this notebook so that stochastic
# steps give the same results on every Restart & Run All
random.seed(seed_value)
np.random.seed(seed_value)

############################## Raw Data ##########################################################
# Read the dataset - based on the outcome of data processing & basic feature engineering.
# The data directory defaults to the original location but can be overridden with the
# MODEL_DATA_DIR environment variable so the notebook is not tied to one machine.
# os.path.join(..., "") guarantees a trailing path separator, which is required because
# file names are appended to root_path by plain string concatenation.
root_path = os.path.join(
    os.environ.get(
        "MODEL_DATA_DIR",
        "D:/PhD Program/Final Research/Dissertation/Data/Model Datasets/",
    ),
    "",
)
data_file_path = root_path + "DS_1_Feature_MFCC_no_hilbert_trans.csv"
os.chdir(root_path)

feat_df = pd.read_csv(data_file_path)

# Initial class variable: integer code per row plus the sorted unique class labels
y_char, unique_vals = pd.factorize(feat_df['sound_file_class'], sort=True)

In [4]:
# EDA 1: Extract key features and their corresponding value ranges, per class
#
# For every class and every base feature, the "<min> - <max>" range of each summary
# statistic column (e.g. f0_mean, f0_median, ...) is collected into feat_summary_df and
# written to feature_summary.csv. Columns whose rounded min and max coincide within a
# class (or that cannot be read) are recorded in cols_to_ignore.

cols_format_to_extract = ['f0','plp','power','rms','spect_centroid','spect_rolloff','spect_flat','spect_bw','spect_contrast',
                          'tempo','zcr']
measure_names = ['mean','median','max','min']
class_names = [1,2,3,4]
class_name_dict = {'Q1':1,
                  'Q2':2,
                  'Q3':3,
                  'Q4':4}
# Reverse lookup (class number -> class label) so each label is resolved explicitly
# instead of scanning the dictionary and possibly reusing a stale label.
class_label_by_num = {class_num: class_label for class_label, class_num in class_name_dict.items()}

summary_rows = []
cols_to_ignore = []
for class_num in class_names:
    class_val = class_label_by_num[class_num]
    eda_df = feat_df[feat_df.sound_file_class == class_val]

    for col_name in cols_format_to_extract:
        val_ranges = []
        for measure_val in measure_names:
            final_col_name = str(col_name) + "_" + measure_val
            try:
                min_val = round(np.min(eda_df[final_col_name]), 3)
                max_val = round(np.max(eda_df[final_col_name]), 3)
            except (KeyError, TypeError, ValueError):
                # Best effort: a missing or non-numeric column is reported as "0 - 0"
                # and therefore ends up in cols_to_ignore below.
                min_val = max_val = 0

            if min_val == max_val:
                cols_to_ignore.append(final_col_name)
            val_ranges.append(str(min_val) + " - " + str(max_val))

        # NOTE(review): the 'min' range (val_ranges[3]) is computed and used for
        # cols_to_ignore but is not written to the summary - confirm this is intended.
        summary_rows.append({'feature': col_name,
                             'class': class_val,
                             'mean': val_ranges[0],
                             'median': val_ranges[1],
                             'max': val_ranges[2]})

# Build the frame once from the collected rows (DataFrame.append was removed in pandas 2.0
# and growing a frame row by row is quadratic).
feat_summary_df = pd.DataFrame(summary_rows, columns=['class','feature','mean','median','max'])

# Identify range of values for each feature in each class
feat_summary_df.to_csv(root_path+"feature_summary.csv",index=False)

# De-duplicate; sorted so the list order is the same on every run
cols_to_ignore = sorted(set(cols_to_ignore))

In [None]:
# EDA 2: plot the fitted normal distribution curve for the mean value of the core features
#
# One subplot per feature; within each subplot one N(mean, sd) curve per class, so that
# overlap between classes can be judged visually.

figure, axis = plt.subplots(6, 2, figsize= (20,30))
full_feat_cols = [['f0_mean','plp_mean','power_mean','rms_mean','spect_centroid_mean','spect_rolloff_mean'],
                 ['spect_flat_mean','spect_bw_mean','spect_contrast_mean','tempo_mean','zcr_mean']]

# class - validate class overlap
color_vals = ['blue','purple','green','gold']
num_sd_shown = 4        # each curve is drawn over mean +/- this many standard deviations
num_curve_points = 500  # fixed resolution, independent of the feature's scale

for feat_col_cnt, feat_cols in enumerate(full_feat_cols):
    for row_cnt, col_val in enumerate(feat_cols):
        ax = axis[row_cnt, feat_col_cnt]
        norm_df = feat_df[['sound_file_class', col_val]]

        for cntr, class_val in enumerate(unique_vals):
            class_df = norm_df[norm_df.sound_file_class == class_val]
            mean_val = np.mean(class_df[col_val])
            sd_val = np.std(class_df[col_val])
            # A normal curve is undefined for an empty class or a constant feature
            if not (np.isfinite(mean_val) and np.isfinite(sd_val)) or sd_val <= 0:
                continue
            # Scale-aware grid: a fixed 0.001 step over [-max, max] collapses to a single
            # point for small-valued features and explodes for Hz-scale ones.
            x_val = np.linspace(mean_val - num_sd_shown * sd_val,
                                mean_val + num_sd_shown * sd_val,
                                num_curve_points)
            label_val = str(class_val)+": N("+str(round(mean_val,1))+","+str(round(sd_val,1))+")"
            ax.plot(x_val, norm.pdf(x_val, mean_val, sd_val), label=label_val,
                    color=color_vals[cntr % len(color_vals)])

        ax.legend()
        ax.set_ylabel("Density")
        ax.set_xlabel(str(col_val))
        ax.set_title("Normal Distributions - "+str(col_val))

    # Hide grid cells that have no feature assigned (the grid has more cells than features)
    for unused_row in range(len(feat_cols), axis.shape[0]):
        axis[unused_row, feat_col_cnt].axis("off")
plt.show()

Comments:
1 Normal distribution plots indicate that there is significant class overlap between the classes Q3 and Q4
2 This class overlap can lead to lower accuracy in the model since the model may not be able to determine class boundaries
3 Kernel methods can be deployed to perform boundary detection but may not elevate the accuracy.
4 From an emotion detection standpoint, this indicates one of 2 aspects:
    a Either the annotation of the data was highly subjective OR
    b This overlap clearly indicates the bias involved, where the detection of emotion boundaries varies with 
    the characteristics of each human brain. In addition, the classes Q3 and Q4 can be interpreted to be 
    inclined towards the "Sad" emotion as per Russell's Circumplex Model: (https://www.researchgate.net/profile/Jukka-Haekkinen/publication/262981399/figure/fig3/AS:392492835983380@1470588992866/Russells-circumplex-model-of-emotion.png).

Based on this perception, the class labels were re-classified into an ordinal variable with 
    a. "1" representing class "Q1"
    b. "2" representing class "Q2" and 
    c. "3" representing class "Q3" and "Q4".

In order to validate the 3-level re-classification of the dependent variable, clustering (KMeans) is performed on the data to determine the optimal number of clusters in the dataset using the elbow method. MeanShift is also used as an additional method.

In [None]:
# EDA 3: KMeans clustering to validate classes
#
# 1. Scatter plot of f0_mean vs rms_mean, coloured by the annotated class.
# 2. Elbow analysis: KMeans is fitted for k = 1..10 and the within-cluster sum of
#    squares (inertia) is plotted against k; KneeLocator picks the elbow.

X = feat_df[['f0_mean','rms_mean','sound_file_class_num']]
# Only the acoustic features are clustered. The class label column is kept in X for
# colouring the scatter plot but must not be a clustering input, otherwise the labels
# being validated leak into the clusters.
cluster_feature_cols = ['f0_mean', 'rms_mean']

# (class number, marker colour, legend label)
class_scatter_styles = [(1, "gray", "Q1"),
                        (2, "green", "Q2"),
                        (3, "blue", "Q3"),
                        (4, "yellow", "Q4")]
for class_num, color_val, label_val in class_scatter_styles:
    X_class = X[X.sound_file_class_num == class_num]
    plt.scatter(X_class.f0_mean, X_class.rms_mean, color=color_val, label=label_val)

plt.ylabel("rms")
plt.xlabel("F0 - Fundamental Fequency")
plt.title("F0 vs rms")
plt.legend()
plt.show()

cluster_counts = list(range(1, 11))
sse_cluster = []
for k in cluster_counts:
    # random_state and n_init are set explicitly so the result does not depend on the
    # global RNG state or on the installed scikit-learn version's defaults
    k_means_model = KMeans(n_clusters=k, n_init=10, random_state=seed_value).fit(X[cluster_feature_cols])
    sse_cluster.append(k_means_model.inertia_)
plt.plot(cluster_counts, sse_cluster)
plt.ylabel("sse")
plt.xlabel("Number of clusters (k)")
plt.title("Elbow plot - KMeans SSE")
plt.show()

# Find elbow locator
l1 = KneeLocator(cluster_counts, sse_cluster, curve='convex', direction='decreasing')
l1.elbow

In [None]:
# Render the elbow result as a Markdown sentence in the cell output
md(f"Number of optimal clusters within the dataset are: {l1.elbow} clusters")

In [None]:
# EDA 4: MeanShift clustering to validate clusters
#
# MeanShift does not need the number of clusters up front, so the number of labels it
# finds is an independent check on the KMeans elbow result.

# Cluster on the acoustic features only; the class label column in X must not be a
# clustering input, otherwise the labels being validated leak into the clusters.
mean_shift_features = X[['f0_mean', 'rms_mean']]

# NOTE(review): bandwidth=65 is a hand-picked value - confirm how it was chosen
clust_model = MeanShift(bandwidth=65).fit(mean_shift_features)
print(clust_model)

labels = clust_model.labels_
print(set(labels))

# Scatter of the two clustered features, coloured by the MeanShift cluster label
x = X.f0_mean
y_class = X.rms_mean
plt.scatter(x, y_class, c=labels)
plt.xlabel("f0_mean")
plt.ylabel("rms_mean")
plt.title("MeanShift clusters - f0_mean vs rms_mean")
plt.show()

-- Start the modelling
Start with AdaBoost
    - Determine optimal parameters using Grid Search
    - Once the optimal parameters are found, determine the model robustness using 
      multiple samples drawn from the train dataset and validating them against a test
      dataset which is extracted beforehand and held out from the training process
    - Repeat the above steps for 6 different datasets viz. 
       1. raw dataset with full features, 
       2. hilbert transformed dataset with full features
       3. raw dataset with only "mean" features
       4. hilbert transformed data with only "mean" features
       5. raw dataset with a subset of "mean" features and 
       6. hilbert transformed data with a subset of "mean" features