<a href="https://colab.research.google.com/github/supertime1/Speech_Emotion_Recognition/blob/main/SER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
# Switch the working directory to the project checkout before the project-local
# imports below run. The default is the original author's path; set the
# SER_PROJECT_DIR environment variable to point at a checkout on another machine.
PROJECT_DIR = os.environ.get(
    'SER_PROJECT_DIR',
    'C:/Users/57lzhang.US04WW4008/PycharmProjects/Speech_Emotion_Recognition',
)
os.chdir(PROJECT_DIR)
from data_handler import *
from audio_processor import AudioProcessor
import tensorflow as tf
import librosa.display
import matplotlib.pyplot as plt
import sklearn
from model.model_utils import decay
from tensorflow.keras import Model
from tensorflow.keras.layers import Conv2D, Conv1D, BatchNormalization, Input, Add, Activation, \
    MaxPooling1D, Dropout, Flatten, TimeDistributed, Bidirectional, Dense, LSTM, ZeroPadding1D, \
    AveragePooling1D, GlobalAveragePooling1D, Concatenate, Permute, Dot, Multiply, RepeatVector, \
    Lambda, Average, GlobalAveragePooling2D, DepthwiseConv2D, MaxPooling2D, ZeroPadding2D

In [2]:
# Override the default NCCL cross-device communication because this runs on Windows.
# Build the all-reduce implementation first, then hand it to the mirrored strategy.
cross_device_ops = tf.distribute.HierarchicalCopyAllReduce()
strategy = tf.distribute.MirroredStrategy(cross_device_ops=cross_device_ops)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


In [2]:
# Settings handed to DataHandler (values and units as given by the author).
raw_data_path = 'raw_data'
train_ratio = 0.9
val_ratio = 0.2
# Spans: block_span is in seconds, stride_span is in milliseconds.
block_span = 1
stride_span = 30
res_freq = 22050  # presumably the resampling frequency in Hz -- TODO confirm
random_seed = 10

# DataHandler receives its arguments positionally, in exactly this order.
data_handler = DataHandler(
    raw_data_path,
    train_ratio,
    val_ratio,
    res_freq,
    block_span,
    stride_span,
    random_seed,
)

In [3]:
# Settings handed to AudioProcessor; the sampling frequency mirrors res_freq.
sample_freq = res_freq
slice_span = 16  # milliseconds
overlap_ratio = 0.75
n_mels = 64

audio_processor = AudioProcessor(
    sample_freq,
    slice_span,
    overlap_ratio,
    n_mels,
)

## Visualize the preprocessing

In [5]:
# Take the first element of the first item yielded by the training generator
# and plot its spectrogram and mel spectrogram.
# NOTE(review): `train_data_generator` is not defined in any visible cell and the
# recorded output of this cell is a NameError -- confirm where it should come from.
# NOTE(review): another cell calls `audio_processor.spectrogram(data, 1)` and
# unpacks two return values; confirm the one-argument call below returns a bare array.
train_gen = train_data_generator()
testing_data = next(train_gen)[0]

# NOTE(review): specshow derives the time axis from hop_length (default 512);
# pass the AudioProcessor's hop length if it differs -- TODO confirm.
spec = audio_processor.spectrogram(testing_data)
librosa.display.specshow(spec, sr=sample_freq, x_axis='time', y_axis='linear');
plt.colorbar();
plt.title('Spectrogram')
plt.show()


mel_spec = audio_processor.mel_spectrogram(testing_data)
# Mel-scaled rows need the mel frequency axis; 'linear' would label the mel
# bins as if they were evenly spaced in Hz.
librosa.display.specshow(mel_spec, sr=sample_freq, x_axis='time', y_axis='mel');
plt.colorbar();
plt.title('Mel Spectrogram')
plt.show()

NameError: ignored

## Training

In [4]:
# Run one random dummy waveform of block_span * res_freq samples through the
# spectrogram step to discover the shape the model has to accept.
sample_data = np.random.rand(res_freq * block_span)
sample_mel, _ = audio_processor.spectrogram(sample_data, 1)
# Append a trailing axis of length 1 before reading off the shape.
sample_mel = np.expand_dims(sample_mel, axis=-1)
input_shape = sample_mel.shape
print(input_shape)

(257, 173, 1)


In [6]:
def simple_cnn(input_shape=None, dropout=0.2, classes=8):
    """Build an uncompiled Keras CNN classifier named 'cnn'.

    The network stacks four Conv2D -> BatchNormalization -> MaxPooling2D
    stages, a 1x1 Conv2D with 16 filters, and a Flatten -> Dense(32) ->
    BatchNormalization -> Dropout -> softmax Dense head.

    Args:
        input_shape: shape passed to the Keras `Input` layer.
        dropout: rate of the Dropout layer in the dense head.
        classes: number of units in the softmax output layer.

    Returns:
        A `Model` mapping the input tensor to the softmax output.
    """
    signal_input = Input(shape=input_shape)

    # One row per stage: (conv filters, conv kernel size, pooling strides).
    conv_stages = (
        (8, (1, 1), (2, 2)),
        (16, (3, 3), (1, 1)),
        (32, (3, 3), (1, 1)),
        (64, (3, 3), (1, 1)),
    )

    features = signal_input
    for n_filters, kernel_size, pool_strides in conv_stages:
        features = Conv2D(n_filters, kernel_size, strides=(1, 1), activation='relu')(features)
        features = BatchNormalization()(features)
        features = MaxPooling2D(pool_size=(2, 2), strides=pool_strides)(features)

    # 1x1 convolution down to 16 channels before flattening.
    features = Conv2D(16, (1, 1), strides=(1, 1), activation='relu')(features)
    flat = Flatten()(features)

    # Dense classification head.
    hidden = Dense(32, activation='relu')(flat)
    hidden = BatchNormalization()(hidden)
    hidden = Dropout(dropout)(hidden)
    out = Dense(classes, activation='softmax')(hidden)

    return Model(signal_input, out, name='cnn')

In [5]:
batch_size = 128
epochs = 100
train_filenames, train_num_samples = data_handler.get_filenames('data/train')
val_filenames, val_num_samples = data_handler.get_filenames('data/val')


def preprocess_dataset(files):
    """Build a tf.data pipeline over `files`.

    Each element is passed through `data_handler.get_waveform_and_label` and
    then `audio_processor.get_spectrogram_tensor`, both mapped with
    AUTOTUNE parallelism.

    Args:
        files: values accepted by `tf.data.Dataset.from_tensor_slices`.

    Returns:
        The mapped, unbatched dataset.
    """
    return (
        tf.data.Dataset.from_tensor_slices(files)
        .map(data_handler.get_waveform_and_label, num_parallel_calls=tf.data.AUTOTUNE)
        .map(audio_processor.get_spectrogram_tensor, num_parallel_calls=tf.data.AUTOTUNE)
    )


# Both splits use the same chain: preprocess -> batch -> cache -> prefetch.
# NOTE(review): no shuffle is applied here; confirm get_filenames already
# randomizes file order.
train_ds = (
    preprocess_dataset(train_filenames)
    .batch(batch_size)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)
val_ds = (
    preprocess_dataset(val_filenames)
    .batch(batch_size)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

Number of total examples: 79663
Example file tensor: tf.Tensor(b'data\\train\\3\\111_1983.wav', shape=(), dtype=string)
Number of total examples: 9307
Example file tensor: tf.Tensor(b'data\\val\\1\\8_58168.wav', shape=(), dtype=string)


In [6]:
# Stop once val_loss has gone 20 epochs without improving, and restore the
# best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True,
)

# Learning-rate schedule driven by the project's `decay` function.
lr_schedule = tf.keras.callbacks.LearningRateScheduler(decay)

callback_list = [early_stop, lr_schedule]

In [7]:
# Baseline CNN over the spectrogram input. The last Dense layer has no
# activation, so the model outputs raw logits for the 8 classes.
# NOTE(review): the `strategy` object created in an earlier cell is not used
# here; the model would have to be built and compiled inside
# `strategy.scope()` for the mirrored strategy to take effect.
model = tf.keras.models.Sequential([
    Input(shape=input_shape),
    Conv2D(32, 3, activation='relu'),
    Conv2D(64, 3, activation='relu'),
    MaxPooling2D(),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(8),
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    # The model emits logits, so the loss must apply the softmax itself; the
    # default from_logits=False would treat the logits as probabilities.
    # Assumes one-hot labels, as the original loss choice implies -- TODO confirm.
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    # Accuracy() counts exact equality between labels and predictions, which is
    # meaningless for float logits; CategoricalAccuracy compares argmaxes.
    # The name keeps the 'accuracy' / 'val_accuracy' history keys unchanged.
    metrics=[tf.keras.metrics.CategoricalAccuracy(name='accuracy')],
)

history = model.fit(
    train_ds,
    epochs=epochs,
    validation_data=val_ds,
    verbose=1,
    callbacks=callback_list,
)

Epoch 1/100


ResourceExhaustedError: ignored