In [1]:
from __future__ import division, print_function

import sys
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import numpy as np
import resampy
import soundfile as sf
import tensorflow as tf
from tensorflow.keras import Model, layers, optimizers


import params as yamnet_params
import yamnet as yamnet_model
import features as features_lib
from random import shuffle

import librosa
import json
import glob

from tqdm import tqdm


In [None]:
# Report the TensorFlow build in use; the trailing expression makes the
# notebook display the list of GPUs TensorFlow can see.
print(f"TensorFlow version: {tf.__version__}")
tf.config.list_physical_devices('GPU')

In [3]:
# Turn on memory growth for every GPU TensorFlow reports.
gpus = tf.config.list_physical_devices('GPU')
for device in gpus:
    tf.config.experimental.set_memory_growth(device, enable=True)

In [4]:
# Intensity label -> regression target (weak / medium / strong).
intense = {
    "약함": 0.0,
    "보통": 1.0,
    "강함": 2.0,
}

# Emotion label -> class index, numbered in the order listed
# (joy, loveliness, fear, anger, sadness, surprise, none).
emotion_enc = {
    label: index
    for index, label in enumerate(
        ("기쁨", "사랑스러움", "두려움", "화남", "슬픔", "놀라움", "없음")
    )
}

# Class index -> emotion label: the exact inverse of emotion_enc.
emotion_dec = {index: label for label, index in emotion_enc.items()}

In [8]:
def load_split_data_from_json(root_path, sr=16000, max_files=10):
    """Yield ``(audio, emotion_category, emotion_intensity)`` for each labelled clip.

    Label files are read from ``<root_path>/label/*.json``. The audio for a
    label file is ``<root_path>/data/<same stem>.wav`` and is loaded with
    ``librosa.load`` at ``sr`` Hz.

    Args:
        root_path: Directory that contains the ``label`` and ``data`` sub-folders.
        sr: Sample rate handed to ``librosa.load``.
        max_files: Upper bound on the number of label files read, taken from
            the sorted file list. Defaults to 10; pass ``None`` to read
            every file.

    Yields:
        ``(audio, emotion_category, emotion_intensity)`` where ``audio`` is a
        NumPy array and the labels come from the JSON keys
        ``"emotion_category"`` and ``"emotion_intense"``. A label stored as a
        string is converted through the notebook-level ``emotion_enc`` /
        ``intense`` tables; any other value is passed through unchanged.

    Raises:
        KeyError: If a label key is missing from a JSON file, or a string
            label is not present in its lookup table.
    """
    label_dir = os.path.join(root_path, "label")
    audio_dir = os.path.join(root_path, "data")

    # glob returns files in arbitrary order; sorting makes the selected subset
    # and the iteration order reproducible across runs and machines.
    label_files = sorted(glob.glob(os.path.join(label_dir, "*.json")))
    if max_files is not None:
        label_files = label_files[:max_files]

    for label_file in label_files:
        # Binary mode lets the json module detect the encoding itself, so the
        # result does not depend on the platform's locale codec.
        # NOTE(review): assumes the label files are UTF-8/16/32 as JSON requires.
        with open(label_file, "rb") as f:
            record = json.load(f)

        # splitext removes only the final extension, so stems that contain
        # dots still map to the right .wav file.
        stem = os.path.splitext(os.path.basename(label_file))[0]
        audio, _ = librosa.load(os.path.join(audio_dir, stem + ".wav"), sr=sr)

        category = record["emotion_category"]
        intensity = record["emotion_intense"]
        # The tf.data signature built on this generator declares an int32
        # class and a float32 intensity, so textual labels must be encoded.
        if isinstance(category, str):
            category = emotion_enc[category]
        if isinstance(intensity, str):
            intensity = intense[intensity]

        yield np.asarray(audio), category, intensity

In [9]:
# Root of the training split; load_split_data_from_json looks for "label" and
# "data" sub-folders beneath it. Set the EMOTION_TRAIN_DIR environment variable
# to point at the data on another machine instead of editing the notebook; the
# fallback is the original location.
train_path = os.environ.get("EMOTION_TRAIN_DIR", "D:/emotion_split/train")

In [10]:
# Layout of one generator item: (audio, emotion class, emotion intensity).
example_signature = (
    tf.TensorSpec(shape=(None,), dtype=tf.float32),  # 1-D audio, length varies
    tf.TensorSpec(shape=(), dtype=tf.int32),         # scalar class label
    tf.TensorSpec(shape=(), dtype=tf.float32),       # scalar intensity label
)

# The lambda defers reading train_path until the dataset is iterated, and gives
# tf.data a zero-argument callable that starts a fresh generator each time.
dataset = tf.data.Dataset.from_generator(
    lambda: load_split_data_from_json(train_path),
    output_signature=example_signature,
)

In [11]:
def preprocess(audio_segment, emotion_category, emotion_intensity):
    """Repackage one example as a ``(features, labels)`` pair.

    Args:
        audio_segment: Audio for the example; returned untouched.
        emotion_category: Label stored under ``'emotion_class_output'``.
        emotion_intensity: Label stored under ``'emotion_intensity_output'``.

    Returns:
        ``(audio_segment, labels)`` where ``labels`` is a dict holding the two
        labels under the keys listed above.
    """
    return audio_segment, dict(
        emotion_class_output=emotion_category,
        emotion_intensity_output=emotion_intensity,
    )

In [12]:
batch_size = 16

# Input pipeline, in order: split into (features, labels) -> shuffle with a
# 100-element buffer -> pad each batch's audio with zeros to a common length
# -> prefetch so loading overlaps with training.
dataset = (
    dataset
    .map(preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(buffer_size=100)
    .padded_batch(
        batch_size,
        # Only the audio axis is padded; both labels are scalars.
        padded_shapes=([None], {'emotion_class_output': [], 'emotion_intensity_output': []}),
        padding_values=(0.0, {'emotion_class_output': 0, 'emotion_intensity_output': 0.0}),
    )
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [13]:
def prediction_model(input_shape, num_classes):
    """Build the two-headed network for emotion class and emotion intensity.

    A shared 128-unit ReLU layer followed by 50% dropout feeds two heads:
    a ``num_classes``-way softmax named ``'emotion_class_output'`` and a
    single linear unit named ``'emotion_intensity_output'``.

    Args:
        input_shape: Shape of one input example, without the batch axis.
        num_classes: Number of units in the softmax head.

    Returns:
        A Keras ``Model`` named ``'emotion_recognition_model'`` whose outputs
        are ``[class_probabilities, intensity]``, in that order.
    """
    features = layers.Input(shape=input_shape)

    # Shared trunk.
    hidden = layers.Dense(128, activation='relu')(features)
    hidden = layers.Dropout(0.5)(hidden)

    # Task heads; the layer names double as keys for losses, metrics and labels.
    heads = [
        layers.Dense(num_classes, activation='softmax', name='emotion_class_output')(hidden),
        layers.Dense(1, activation='linear', name='emotion_intensity_output')(hidden),
    ]
    return Model(inputs=features, outputs=heads, name='emotion_recognition_model')

In [None]:
# Embedding backbone built by the local yamnet module from "yamnet.h5".
params = yamnet_params.Params()
embedding_model = yamnet_model.yamnet_embedding_model(params, "yamnet.h5")

# Freeze every backbone layer so only the prediction head is trained.
for backbone_layer in embedding_model.layers:
    backbone_layer.trainable = False

# Prediction head: one softmax unit per emotion label plus the intensity unit.
# NOTE(review): assumes the backbone emits 1024-dimensional embeddings - confirm.
num_classes = len(emotion_enc)
prediction_net = prediction_model(input_shape=(1024,), num_classes=num_classes)

# Attach the head directly to the backbone's own input/output tensors.
waveform_input = embedding_model.inputs[0]
embeddings = embedding_model.outputs[0]
predictions = prediction_net(embeddings)

# Expose the outputs as a dict so they line up with the label dict keys.
class_head, intensity_head = predictions[0], predictions[1]
model = Model(
    inputs=waveform_input,
    outputs={
        "emotion_class_output": class_head,
        "emotion_intensity_output": intensity_head,
    },
    name='emotion_recognition_model',
)

In [None]:
class SaveModelAtEpochEnd(tf.keras.callbacks.Callback):
    """Keras callback that exports the model as a SavedModel after every epoch.

    Each export is written to ``<save_dir>/epoch_NN``, where ``NN`` is the
    1-based epoch number zero-padded to two digits.
    """

    def __init__(self, save_dir):
        """Store ``save_dir`` and make sure it exists.

        Args:
            save_dir: Directory under which the per-epoch exports are written.

        Raises:
            FileExistsError: If ``save_dir`` exists but is not a directory.
        """
        super().__init__()
        self.save_dir = save_dir
        # exist_ok avoids the check-then-create race and matches how
        # on_epoch_end creates the per-epoch folders.
        os.makedirs(self.save_dir, exist_ok=True)

    def on_epoch_end(self, epoch, logs=None):
        """Export ``self.model`` to a folder named after the finished epoch.

        Args:
            epoch: Index of the epoch that just ended; 1 is added for naming.
            logs: Metrics passed by Keras; not used.
        """
        epoch_num = epoch + 1
        save_path = os.path.join(self.save_dir, f'epoch_{epoch_num:02d}')
        os.makedirs(save_path, exist_ok=True)
        tf.saved_model.save(self.model, save_path)
        print(f'\nSaved model at epoch {epoch_num} to {save_path}')

In [15]:
epoch = 5

# CosineDecay counts optimizer steps (batches), not epochs, and with the
# default alpha the learning rate reaches 0 once decay_steps have elapsed.
# One batch per epoch is correct only while the loader is capped at 10 clips
# and batch_size is 16; raise steps_per_epoch when training on more data,
# otherwise the learning rate hits zero after `epoch` batches.
steps_per_epoch = 1

lr_schedule = optimizers.schedules.CosineDecay(
    initial_learning_rate=0.01,
    decay_steps=epoch * steps_per_epoch,
)

optimizer = optimizers.Adam(lr_schedule)

# Loss and metric keys must match the model's output names / label dict keys.
model.compile(
    optimizer=optimizer,
    loss={
        'emotion_class_output': 'sparse_categorical_crossentropy',
        'emotion_intensity_output': 'mean_squared_error',
    },
    metrics={
        'emotion_class_output': 'accuracy',
        'emotion_intensity_output': 'mae',
    },
    # NOTE(review): eager execution skips graph compilation, which is easier
    # to debug but slower - confirm it is still needed.
    run_eagerly=True,
)

In [None]:
# Create the per-epoch export callback in this cell so training works on a
# fresh kernel (Restart & Run All); no other cell defines `save_callback`.
# NOTE(review): "saved_models" is relative to the notebook's working
# directory - point it elsewhere if exports should live somewhere else.
save_callback = SaveModelAtEpochEnd(save_dir="saved_models")

model.fit(dataset, epochs=epoch, verbose=1, callbacks=[save_callback])