In [3]:
# !pip install librosa

import itertools
import os
import pickle
import sys

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from IPython.display import Audio
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [4]:
# Folder holding the CREMA-D wav files. The CREMA_DIR environment variable, when
# set, overrides the original hard-coded default so the notebook can run elsewhere.
crema = os.environ.get("CREMA_DIR", r"D:/University/Courses/Capstone/CREMA-D/AudioWAV/")

# Third underscore-separated field of a file name -> emotion label.
emotion_codes = {
    'SAD': 'sad',
    'ANG': 'angry',
    'DIS': 'disgust',
    'FEA': 'fear',
    'HAP': 'happy',
    'NEU': 'neutral',
}

# os.listdir makes no ordering guarantee; sorting keeps row order (and every
# positional lookup / random split built on it) reproducible.
directory_list = sorted(os.listdir(crema))

emotion = []
path = []

for file in directory_list:
    # storing the paths
    path.append(os.path.join(crema, file))
    # storing file emotions; names with fewer than three fields fall back to
    # 'Unknown' instead of raising IndexError
    part = file.split('_')
    code = part[2] if len(part) > 2 else None
    emotion.append(emotion_codes.get(code, 'Unknown'))

# dataframe for emotion of files
emotion_df = pd.DataFrame(emotion, columns=['Emotions'])

# dataframe for path of files.
path_df = pd.DataFrame(path, columns=['Path'])
crema_df = pd.concat([emotion_df, path_df], axis=1)
crema_df.head()

Unnamed: 0,Emotions,Path
0,angry,D:/University/Courses/Capstone/CREMA-D/AudioWA...
1,disgust,D:/University/Courses/Capstone/CREMA-D/AudioWA...
2,fear,D:/University/Courses/Capstone/CREMA-D/AudioWA...
3,happy,D:/University/Courses/Capstone/CREMA-D/AudioWA...
4,neutral,D:/University/Courses/Capstone/CREMA-D/AudioWA...


In [None]:
#Count of different emotions in the CREMA file

plt.title('Count of Emotions', size=16)
# Pass the column by keyword: in seaborn >= 0.12 the first positional argument is
# `data`, so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=crema_df.Emotions)
plt.ylabel('Count', size=12)
plt.xlabel('Emotions', size=12)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()

In [None]:
#Function for creating a waveplot

def create_waveplot(data, sr, emo):
    """Draw the waveform of one audio clip in a new 10x3 figure and show it.

    Parameters
    ----------
    data : audio samples handed to ``librosa.display.waveshow``.
    sr : sampling rate forwarded to ``waveshow``.
    emo : emotion name interpolated into the figure title.
    """
    heading = 'Waveplot for audio with {} emotion'.format(emo)
    plt.figure(figsize=(10, 3))
    plt.title(heading, size=15)
    librosa.display.waveshow(data, sr=sr)
    plt.show()
    
#Function for creating Spectrogram    

def create_spectrogram(data, sr, emo):
    """Plot the dB-scaled STFT magnitude of one audio clip in a new 12x3 figure.

    Parameters
    ----------
    data : audio samples passed to ``librosa.stft``.
    sr : sampling rate forwarded to ``librosa.display.specshow``.
    emo : emotion name interpolated into the figure title.
    """
    stft_matrix = librosa.stft(data)
    # take the magnitude of the STFT before converting to decibels
    spec_db = librosa.amplitude_to_db(abs(stft_matrix))
    plt.figure(figsize=(12, 3))
    plt.title('Spectrogram for audio with {} emotion'.format(emo), size=15)
    librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='hz')
    plt.colorbar()
    # Render immediately, matching create_waveplot, so the figure also appears
    # when this is called outside the last line of a notebook cell.
    plt.show()

def plot_wp_s(emo):
    """Show the waveplot and spectrogram of one clip labelled ``emo``.

    The clip is the second row of ``crema_df`` whose ``Emotions`` value equals
    ``emo`` (index 1, as in the original code), loaded with librosa's defaults.

    Returns
    -------
    IPython.display.Audio
        Player for the chosen file. It is returned rather than just constructed:
        an expression statement inside a function is discarded, so the player was
        never displayed. Returning it lets a cell ending in ``plot_wp_s(...)``
        render it.

    Raises
    ------
    IndexError
        If fewer than two rows carry the requested emotion.
    """
    matching_paths = crema_df.loc[crema_df.Emotions == emo, 'Path']
    path_temp = matching_paths.iloc[1]
    data, sampling_rate = librosa.load(path_temp)
    create_waveplot(data, sampling_rate, emo)
    create_spectrogram(data, sampling_rate, emo)
    return Audio(path_temp)

In [None]:
# Waveplot and spectrogram for the second 'fear' clip in crema_df.
plot_wp_s('fear')

In [None]:
# Waveplot and spectrogram for the second 'angry' clip in crema_df.
plot_wp_s('angry')

In [None]:
# Waveplot and spectrogram for the second 'sad' clip in crema_df.
plot_wp_s('sad')

In [None]:
# Waveplot and spectrogram for the second 'happy' clip in crema_df.
plot_wp_s('happy')

In [None]:
# Waveplot and spectrogram for the second 'disgust' clip in crema_df.
plot_wp_s('disgust')

In [None]:
# Waveplot and spectrogram for the second 'neutral' clip in crema_df.
plot_wp_s('neutral')

## Feature Engineering

In [None]:
#Different augmentation techniques

def noise(data):
    """Return ``data`` plus Gaussian noise; the input array is not modified.

    The noise amplitude is ``0.035 * U * max(data)`` with ``U`` drawn from
    ``np.random.uniform()``; one standard-normal sample is drawn per element
    along the first axis.
    """
    amplitude = 0.035 * np.random.uniform() * np.amax(data)
    jitter = np.random.normal(size=data.shape[0])
    return data + amplitude * jitter

def stretch(data, rate=0.8):
    """Time-stretch ``data`` by ``rate`` via ``librosa.effects.time_stretch``.

    ``rate`` is passed by keyword: librosa >= 0.10 declares it keyword-only, so
    the previous positional call raises TypeError there.
    """
    return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
    """Circularly shift ``data`` by a random offset.

    The offset is ``int(U * 1000)`` with ``U`` drawn uniformly from [-5, 5), so
    it lies between -5000 and 5000 samples; ``np.roll`` wraps samples around
    the ends. The leftover debug print of the offset has been removed.
    """
    shift_range = int(np.random.uniform(low=-5, high = 5)*1000)
    return np.roll(data, shift_range)

def pitch(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift ``data`` via ``librosa.effects.pitch_shift``.

    ``sampling_rate`` is forwarded as ``sr`` and ``pitch_factor`` as
    ``n_steps``. Both are passed by keyword because librosa >= 0.10 declares
    them keyword-only; the previous positional call raises TypeError there.
    """
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)

# Use the second file listed in crema_df as the demo clip for the augmentation checks.
path_ex = crema_df['Path'].to_numpy()[1]
data, sample_rate = librosa.load(path_ex)

In [None]:
# Baseline view of the unmodified demo clip; titled so the figure stands on its own.
plt.figure(figsize=(14,4))
plt.title('Original audio', size=15)
librosa.display.waveshow(y=data, sr=sample_rate)
Audio(path_ex)

2. Noise Injection

In [None]:
# Demo clip with random noise added; titled so the figure stands on its own.
x = noise(data)
plt.figure(figsize=(14,4))
plt.title('Noise injection', size=15)
librosa.display.waveshow(y=x, sr=sample_rate)
Audio(x, rate=sample_rate)

3. Stretching

In [None]:
# Demo clip after time-stretching with the default rate; titled so the figure stands on its own.
x = stretch(data)
plt.figure(figsize=(14,4))
plt.title('Time stretching', size=15)
librosa.display.waveshow(y=x, sr=sample_rate)
Audio(x, rate=sample_rate)

4. Shifting

In [None]:
# Demo clip after a random circular shift; titled so the figure stands on its own.
x = shift(data)
plt.figure(figsize=(14,4))
plt.title('Time shifting', size=15)
librosa.display.waveshow(y=x, sr=sample_rate)
Audio(x, rate=sample_rate)

5. Pitch

In [None]:
# Demo clip after pitch-shifting with the default factor; titled so the figure stands on its own.
x = pitch(data, sample_rate)
plt.figure(figsize=(14,4))
plt.title('Pitch shifting', size=15)
librosa.display.waveshow(y=x, sr=sample_rate)
Audio(x, rate=sample_rate)

In [None]:
# Integer class id for each emotion name.
labels = {'disgust':0,'happy':1,'sad':2,'neutral':3,'fear':4,'angry':5}

# `Crema_df` was never defined (the frame built earlier is `crema_df`), so the
# original in-place replace raised NameError. Build the encoded table as a new
# frame under the name the feature-extraction loop reads; `crema_df` keeps its
# string labels, so the plotting cells above still work when re-run.
Crema_df = crema_df.replace({'Emotions':labels})

In [None]:
# Framing parameters shared by every feature so all of them produce one row per
# analysis frame of the same clip.
num_mfcc=13

n_fft=2048

hop_length=512

SAMPLE_RATE = 22050

# One list entry per clip; each feature entry is a (frames, feature_dim) array.
data = {
        "labels": [],
        "mfcc": [],
        "zcr": [],
        "chroma_stft": [],
        "rms": [],
        "melspectrogram": []
    }

# Iterate over however many rows the table has instead of a hard-coded 7442.
for i in range(len(Crema_df)):
    data['labels'].append(Crema_df.iloc[i,0])
    signal, sample_rate = librosa.load(Crema_df.iloc[i,1], sr=SAMPLE_RATE)

    # All librosa.feature calls use keyword arguments: librosa >= 0.10 makes
    # them keyword-only, and the old positional form silently put the sample
    # rate into the wrong slot for zero_crossing_rate (frame_length) and rms (S).
    mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length)
    mfcc = mfcc.T
    data["mfcc"].append(np.asarray(mfcc))
    zcr = librosa.feature.zero_crossing_rate(y=signal, frame_length=n_fft, hop_length=hop_length)
    zcr = zcr.T
    data["zcr"].append(np.asarray(zcr))
    chroma_stft = librosa.feature.chroma_stft(y=signal, sr=sample_rate, n_fft=n_fft, hop_length=hop_length)
    chroma_stft = chroma_stft.T
    data["chroma_stft"].append(np.asarray(chroma_stft))
    rms = librosa.feature.rms(y=signal, frame_length=n_fft, hop_length=hop_length)
    rms = rms.T
    data["rms"].append(np.asarray(rms))
    melspectrogram = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_fft=n_fft, hop_length=hop_length)
    melspectrogram = melspectrogram.T
    data["melspectrogram"].append(np.asarray(melspectrogram))

    # progress indicator
    if i%500==0:
        print(i)

## Saving Feature Dictionary

In [None]:
import pickle
# Write the extracted feature dictionary to disk so the extraction loop does not
# have to be repeated in a later session.
with open('data_dict_all.pkl', 'wb') as f:
    pickle.dump(data, f)

## Load Feature Dictionary

In [None]:
import pickle
# Read the feature dictionary back from disk.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load a
# data_dict_all.pkl that this notebook (or another trusted source) produced.
with open('data_dict_all.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

In [None]:
# Show which keys the loaded dictionary holds.
loaded_data.keys()

In [None]:
# The per-clip mel-spectrograms have different numbers of frames, so the
# collection is ragged. np.asarray on a ragged list raises ValueError on
# NumPy >= 1.24; keep it as a plain list, which pad_sequences accepts.
X = loaded_data["melspectrogram"]
y = np.asarray(loaded_data["labels"])

In [None]:
# Pad every clip to the length of the longest one. pad_sequences defaults to
# dtype='int32', which would truncate the floating-point spectrogram values to
# integers, so float32 is requested explicitly.
X = tf.keras.preprocessing.sequence.pad_sequences(X, dtype='float32')
X.shape

## Train-Test Split

In [None]:
# First hold out 10% of all samples for testing, then take 20% of the remaining
# samples as the validation set. Both splits use the same fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.1,
    random_state=123,
)
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Shapes of the train / validation / test inputs and targets, in that order.
print(*(part.shape for part in (X_train, y_train, X_validation, y_validation, X_test, y_test)))

## Modeling

In [None]:
def build_model(input_shape, num_classes=6):
    """Build the stacked-LSTM classifier (uncompiled).

    Architecture: LSTM(128) -> LSTM(128) -> LSTM(64), each returning full
    sequences, then LSTM(64) returning its last state, Dense(64, relu),
    Dropout(0.3) and a softmax output layer.

    Parameters
    ----------
    input_shape : shape of one sample, passed to the first LSTM layer.
    num_classes : width of the softmax output; defaults to the 6 classes the
        model was originally hard-coded for.

    Returns
    -------
    The ``tf.keras.Sequential`` model.
    """
    # Layers are taken from tf.keras.layers: the bare LSTM/Dense/Dropout names
    # used before were not imported anywhere ahead of this cell.
    layers = tf.keras.layers
    model = tf.keras.Sequential()
    model.add(layers.LSTM(128, input_shape=input_shape, return_sequences=True))
    model.add(layers.LSTM(128, return_sequences=True))
    model.add(layers.LSTM(64, return_sequences=True))
    model.add(layers.LSTM(64))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dropout(0.3))
    model.add(layers.Dense(num_classes, activation='softmax'))
    return model

In [None]:
# create network

# The feature width is read from the training data. The previous hard-coded 12
# does not match the mel-spectrogram features selected for X, which would make
# model.fit fail with a shape mismatch.
input_shape = (None, X_train.shape[-1])
model = build_model(input_shape)

# compile model

# Labels are integer class ids, hence the sparse categorical loss.
optimiser = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimiser,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
model.summary()

In [None]:
# Stop once validation accuracy has not improved for 10 consecutive epochs.
# EarlyStopping is referenced through tf.keras.callbacks because the bare name
# was not imported anywhere ahead of this cell.
es = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience = 10)
history = model.fit(X_train, y_train, validation_data=(X_validation, y_validation), batch_size=512, epochs=150, callbacks=[es])

In [None]:
# Loss and accuracy of the trained model on the held-out test split.
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print("Test Accuracy: ",test_acc)

## Saving Model

In [None]:
# The model is trained on the mel-spectrogram features and is reloaded below as
# "model_melspec.*"; the previous "model_chroma_stft.*" names meant the loading
# cell could not find (or picked up a stale copy of) what was saved here.

# serialize model to JSON
model_json = model.to_json()

with open("model_melspec.json", "w") as json_file:
    json_file.write(model_json)

# serialize weights to HDF5
model.save_weights("model_melspec.h5")
print("Saved model to disk")

In [None]:
print("Accuracy of our model on test data : " , model.evaluate(X_test,y_test)[1]*100 , "%")

# Per-epoch curves recorded by model.fit. The val_* series come from the
# validation split, so they are named and labelled as validation and no longer
# overwrite the test_loss / test_acc computed by model.evaluate.
train_acc = history.history['accuracy']
train_loss = history.history['loss']
val_acc = history.history['val_accuracy']
val_loss = history.history['val_loss']

# Early stopping decides how many epochs actually ran, so derive the x axis from
# the history instead of assuming 35 (a mismatch makes ax.plot raise).
epochs = list(range(len(train_loss)))
fig , ax = plt.subplots(1,2)

fig.set_size_inches(20,6)
ax[0].plot(epochs , train_loss , label = 'Training Loss')
ax[0].plot(epochs , val_loss , label = 'Validation Loss')
ax[0].set_title('Training & Validation Loss')
ax[0].legend()
ax[0].set_xlabel("Epochs")

ax[1].plot(epochs , train_acc , label = 'Training Accuracy')
ax[1].plot(epochs , val_acc , label = 'Validation Accuracy')
ax[1].set_title('Training & Validation Accuracy')
ax[1].legend()
ax[1].set_xlabel("Epochs")
plt.show()

## Loading Model

In [None]:
from keras.models import model_from_json

# load json and create model
# `with` closes the file even if reading fails (the manual open/close did not).
with open('model_melspec.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model_mel_spec = model_from_json(loaded_model_json)

# load weights into new model
loaded_model_mel_spec.load_weights("model_melspec.h5")
print("Loaded model from disk")

# evaluate loaded model on test data
# A model rebuilt from JSON is uncompiled, so compile before calling evaluate.
loaded_model_mel_spec.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print("Accuracy of our model on test data : " , loaded_model_mel_spec.evaluate(X_test,y_test)[1]*100 , "%")

In [None]:
# Model outputs for every test sample (one score per output unit).
predict_prob_melspec=loaded_model_mel_spec.predict(X_test)

In [None]:
# Predicted class = index of the largest output per sample.
predict_classes_melspec = np.argmax(predict_prob_melspec,axis=1)

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test samples whose predicted class matches the true label.
accuracy = accuracy_score(y_test, predict_classes_melspec)

In [None]:
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : square confusion matrix; rows are true labels, columns predictions.
    classes : tick labels, one per row/column.
    normalize : if True, each row is divided by its sum before plotting.
    title : figure title.
    cmap : matplotlib colormap for the heat map.

    The matrix is normalized *before* it is drawn, so the colours and the
    numbers written in the cells describe the same values (previously the image
    showed raw counts while the text showed normalized ones).
    """
    # Imported here so the function does not depend on a later cell having
    # imported itertools before it is called.
    import itertools

    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Rows with no samples stay at 0 instead of producing NaN from 0/0.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")

    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is; fractions are rounded to two decimals.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.

    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
from sklearn import metrics
import itertools

# Test accuracy as a percentage.
score = 100 * metrics.accuracy_score(y_test, predict_classes_melspec)
print("accuracy:   %0.3f" % score)

# Raw-count confusion matrix, with the six class ids as tick labels.
cm = metrics.confusion_matrix(y_test, predict_classes_melspec)
class_ids = [str(class_id) for class_id in range(6)]
plot_confusion_matrix(cm, classes=class_ids)

In [None]:
# Per-class precision, recall and F1 on the test split for class ids 0-5.
print(classification_report(y_test, predict_classes_melspec, labels=[0, 1, 2, 3, 4, 5]))

## Appendix - CNN Model [ALTERNATIVE]

In [None]:
import keras
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential, model_from_json
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

In [None]:
# Imported locally because the pooling layer is missing from the keras.layers
# import list; the original took it from tf.keras, mixing two Keras namespaces
# in one model and relying on `tf`, which this notebook never imported.
from keras.layers import GlobalMaxPooling1D

# NOTE: this rebinds `model`, replacing the LSTM model built earlier.
model=Sequential()

# Channel count comes from the training data instead of a hard-coded 1, so the
# network accepts whatever per-frame feature width X_train has.
model.add(Conv1D(128, kernel_size=5, strides=1, padding='same', activation='relu', input_shape=(None, X_train.shape[-1])))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Conv1D(128, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))
model.add(Dropout(0.2))

model.add(Conv1D(64, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

# Collapse the time axis so clips of any length give a fixed-size vector.
model.add(GlobalMaxPooling1D())
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.3))

# Six output units, one per emotion class (the label map defines ids 0-5);
# the previous 8 left two outputs that no label could ever match.
model.add(Dense(units=6, activation='softmax'))

model.compile(optimizer = 'adam' , loss = 'sparse_categorical_crossentropy' , metrics = ['accuracy'])

model.summary()