In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns

from pydub import AudioSegment, effects
from pydub.generators import WhiteNoise
from pydub.playback import play
from pydub.utils import mediainfo
import librosa
from librosa import display   
import noisereduce as nr
import IPython.display as ipd
from IPython.display import Audio
from IPython.display import clear_output
import matplotlib.pyplot as plt
import pytz
import cv2

import tensorflow as tf
import tensorboard
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from json_tricks import dump, load

from DataModel import DataModel
from SERModel import SERModel
from Evaluation import Evaluation

from datetime import datetime
import warnings
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings from the libraries imported above; consider narrowing it to a
# specific category or module once the noisy source is identified.
warnings.filterwarnings('ignore')

# Record the library versions in the cell output so a run can be reproduced.
print("TensorFlow version:  ", tf.__version__)
print("TensorBoard version: ", tensorboard.__version__)

# Hong Kong timezone object. It is not used in the cells visible here —
# presumably consumed elsewhere for timestamping (TODO confirm).
tz = pytz.timezone('Asia/Hong_Kong')

TensorFlow version:   2.7.0
TensorBoard version:  2.10.0


In [2]:
# ---------------------------------------------------------------------------
# Dataset configuration
# ---------------------------------------------------------------------------
# Emotion classes kept for this run.
labelsToInclude = [
    'Anger',
    'Frustration',
    'Happiness',
    'Neutral',
    'Sadness',
]
mergeHappinessExcitement = True

# Clip handling, forwarded to DataModel as splitDuration / ignoreDuration
# (units are presumably seconds — TODO confirm against DataModel).
# NOTE(review): the variable name below says "Split4" but splitDuration is 8.
splitDuration, ignoreDuration = 8, 2

# Spectrogram settings.
transformByStft = True
hop_length, win_length, n_mels = 512, 2048, 128

# Label encoding flag handed to DataModel.
onehot = False

# Data Augmentation Parameters (currently disabled)
# multiply = 3
# pitchScaleSemitonesOffset=3.0
# timeStretchOffset=0.2
# randomGainOffset=0.2
# addNoiseMaxFactor=0.2

mixDataModel5LabelsSplit4Ignore2Stft = DataModel(
    labelsToInclude=labelsToInclude,
    mergeHappinessExcitement=mergeHappinessExcitement,
    splitDuration=splitDuration,
    ignoreDuration=ignoreDuration,
    transformByStft=transformByStft,
    hop_length=hop_length,
    win_length=win_length,
    n_mels=n_mels,
    onehot=onehot,
)

# IEMOCAP extraction is switched off here, so only EmoDB is loaded.
# mixDataModel5LabelsSplit4Ignore2Stft.extractIEMOCAPData()
mixDataModel5LabelsSplit4Ignore2Stft.extractEmoDBData()
mixDataModel5LabelsSplit4Ignore2Stft.processData()

Loading and Extracting EmoDB Data...
    Loaded and Extracted   407 data

Data Extration Completed
    Number of data: 407
      Neutral     : 79
      Frustration : 69
      Anger       : 126
      Sadness     : 62
      Happiness   : 71
      Excitement  : 0
      Surprise    : 0
      Disgust     : 0
      Fear        : 0
      Boredom     : 0

Splitting data...
Train Test Split Completed
    Training Size : 326
    Testing Size  : 81

Split or Add Padding for training data:
    Split Duration  : 8
    Ignore Duration : 2
Processing...
    Processed   326 data split and padding
Data Splitting and Padding For Training Completed!

Processing training data to Mel Spectrogram...
    Processed   326 Mel Spectrogram
Mel Spectrogram Processing For Training Completed
    Shape of training images: (326, 128, 251, 1)

Processing training labels...
Label Processing For Training Completed

Split or Add Padding for testing data
    Split Duration  : 8
    Ignore Duration : 2
Processing...
    Pr

In [5]:
# Instantiate the 'bestCNNModelLstmC' architecture under the throwaway
# experiment name "jkl", sized to a single training sample, so that its
# layer summary can be inspected in the next cell.
model = SERModel(
    'bestCNNModelLstmC',
    "jkl",
    input_shape=mixDataModel5LabelsSplit4Ignore2Stft.x_train[0].shape,
)

2023-01-20 02:26:37.778820: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.



########################################################
################### Training Section ###################
########################################################

Model Information:
    Model Choice     : bestCNNModelLstmC
    Experiment Name  : 01-20 02h26m37s jkl
    Log Directory    : /Users/alexto/Documents/Programming/HKU/FYP/Speech Emotion Recognition/IEMOCAP_ModelLog/01-20 02h26m37s jkl
    Result Directory : /Users/alexto/Documents/Programming/HKU/FYP/Speech Emotion Recognition/IEMOCAP_TrainedModel/01-20 02h26m37s jkl
    Optimizer        : adam
      Learning Rate  : 0.0001
      Decay          : 0.001
    Loss             : Sparse Categorical Crossentropy
    Metrics          : Accuracy



In [6]:
# Print the layer-by-layer architecture (output shapes and parameter counts)
# of the model built in the previous cell.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_1 (Conv2D)           (None, 63, 125, 120)      1200      
                                                                 
 batch_normalization (BatchN  (None, 63, 125, 120)     480       
 ormalization)                                                   
                                                                 
 max_pooling2d (MaxPooling2D  (None, 31, 62, 120)      0         
 )                                                               
                                                                 
 conv2d_2 (Conv2D)           (None, 29, 60, 256)       276736    
                                                                 
 batch_normalization_1 (Batc  (None, 29, 60, 256)      1024      
 hNormalization)                                                 
                                                        

### Experiment 53

In [None]:
def training53(dataModel, experimentName, modelName, epochs, early_stopping_patience, activation, optimizer, loss,
               learning_rate=0.00005, decay=0.0005):
  """Build, train and evaluate one SERModel for the given data.

  Parameters
  ----------
  dataModel
      Prepared data object; this function reads its ``labels_name``,
      ``x_train``, ``y_train`` and ``validation_percent`` attributes and
      hands the object itself to ``Evaluation``.
  experimentName
      Experiment label passed to ``SERModel``.
  modelName
      Architecture identifier passed to ``SERModel``.
  epochs
      Epoch count forwarded to ``cnnModel.fit``.
  early_stopping_patience
      Forwarded to ``cnnModel.fit`` as ``early_stopping_patience``.
  activation
      Forwarded to ``SERModel`` as ``activation``.
  optimizer
      Forwarded to ``SERModel`` as ``optimizerChoice``.
  loss
      Forwarded to ``SERModel`` as ``lossChoice``.
  learning_rate
      Forwarded to ``SERModel``; defaults to 0.00005, the value this
      experiment previously hard-coded.
  decay
      Forwarded to ``SERModel``; defaults to 0.0005, the value this
      experiment previously hard-coded.

  Returns
  -------
  The ``Evaluation`` instance, after ``evaluateAllHistory`` has been called
  on the training history.
  """
  # One output unit per label known to the data model.
  ySize = len(dataModel.labels_name)
  # The network input shape is taken from a single training sample.
  input_shape = dataModel.x_train[0].shape

  cnnModel = SERModel(modelName,
                      experimentName,
                      ySize=ySize,
                      optimizerChoice=optimizer,
                      learning_rate=learning_rate,
                      decay=decay,
                      lossChoice=loss,
                      input_shape=input_shape,
                      activation=activation)

  history = cnnModel.fit(dataModel.x_train,
                         dataModel.y_train,
                         epochs,
                         dataModel.validation_percent,
                         early_stopping_patience=early_stopping_patience)

  evaluation = Evaluation(dataModel, cnnModel.resultDir, cnnModel.logDir, model=cnnModel.model)
  evaluation.evaluateAllHistory(history)

  print('')
  # basename handles the platform's path separator instead of assuming '/'.
  print('File Name: ' + os.path.basename(cnnModel.logDir))

  return evaluation

# Settings for Experiment 53.
# NOTE(review): the experiment name mentions "IEMOCAP EmoDB", but the data
# cell visible in this notebook has the IEMOCAP extraction commented out —
# confirm the name matches the data actually loaded.
experimentName = "(Experiment53) Best CNN with LSTM (100 Epochs) (IEMOCAP EmoDB) (No Data Aug) (5 Emotions with Merge and Split 8 Ignore 2 STFT) (00005 lr 0005 decay Stop 3)"
modelName = "bestCNNModelLstm"
epochs = 100
early_stopping_patience = 3
activation = 'relu'
optimizer = 'adam'
loss = 'scce'  # presumably sparse categorical cross-entropy — verify in SERModel

# The final argument previously referenced the undefined name `losse`, which
# raised NameError before any training started; it now passes `loss`.
evaluation = training53(
    dataModel=mixDataModel5LabelsSplit4Ignore2Stft,
    experimentName=experimentName,
    modelName=modelName,
    epochs=epochs,
    early_stopping_patience=early_stopping_patience,
    activation=activation,
    optimizer=optimizer,
    loss=loss,
)