In [1]:
from __future__ import division, print_function

import sys
import os

import numpy as np
import resampy
import soundfile as sf
import tensorflow as tf
from tensorflow.keras import Model, layers, optimizers


import params as yamnet_params
import yamnet as yamnet_model
import features as features_lib
from random import shuffle

import librosa
import json
import glob

from tqdm import tqdm


2024-10-09 21:47:37.650501: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Expose only CUDA device index 0 to this process.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [3]:
# Show the GPUs TensorFlow has registered; an empty list means no GPU is usable.
tf.config.list_physical_devices(device_type='GPU')

2024-10-09 21:47:40.538366: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-10-09 21:47:40.566396: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


[]

In [9]:
# Intensity label -> regression target (weak=0, normal=1, strong=2).
intense = dict(zip(("약함", "보통", "강함"), (0., 1., 2.)))

# Emotion labels in class-id order: joy, loveliness, fear, anger, sadness,
# surprise, none. Both lookup tables are derived from this single ordering so
# the encoder and decoder cannot drift apart.
_EMOTION_LABELS = ("기쁨", "사랑스러움", "두려움", "화남", "슬픔", "놀라움", "없음")
emotion_enc = {label: class_id for class_id, label in enumerate(_EMOTION_LABELS)}
emotion_dec = dict(enumerate(_EMOTION_LABELS))

In [10]:
# Parameter object from the local `params` module (imported as `yamnet_params`).
# `preprocess` passes it to `features_lib.pad_waveform`.
params = yamnet_params.Params()

In [11]:
def load_segments_from_json(paths, wav_prefix, target_sr=16000):
    """Yield labelled audio segments described by JSON label files.

    For every label file the referenced wav is loaded from
    ``<wav_prefix>/indoor`` (when the label path contains ``"indoor"``) or
    ``<wav_prefix>/outdoor`` (otherwise), resampled to ``target_sr`` and cut
    into one segment per entry of the file's ``"Conversation"`` list.

    Args:
        paths: Iterable of JSON label file paths.
        wav_prefix: Directory holding the ``indoor`` / ``outdoor`` wav folders.
        target_sr: Sampling rate in Hz of the yielded audio. Defaults to 16000.

    Yields:
        ``(audio_segment, emotion_category, emotion_intense)`` where
        ``audio_segment`` is a 1-D float32 array, ``emotion_category`` is the
        ``emotion_enc`` id of the entry's ``VerifyEmotionTarget`` and
        ``emotion_intense`` is the ``intense`` value of its
        ``VerifyEmotionLevel``. Entries whose time span selects no samples
        are skipped.

    Raises:
        KeyError: If an expected JSON field is missing or a label is not
            present in ``emotion_enc`` / ``intense``.
    """
    for path in paths:
        # The label files contain Korean text, so decode explicitly instead of
        # relying on the platform's default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        sr = int(data['Wav']['SamplingRate'])
        tmp_prefix = os.path.join(wav_prefix, "indoor" if "indoor" in path else "outdoor")
        wav_name = os.path.join(tmp_prefix, data['File']['FileName'] + ".wav")
        audio, _ = librosa.load(wav_name, sr=sr)
        audio_resampled = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)

        for entry in data['Conversation']:
            # Timestamps may contain "," separators; strip them before parsing.
            start_time = float(entry['StartTime'].replace(",", ""))
            end_time = float(entry['EndTime'].replace(",", ""))
            emotion_category = emotion_enc[entry['VerifyEmotionTarget']]
            emotion_intense = intense[entry['VerifyEmotionLevel']]

            start_sample = int(start_time * target_sr)
            end_sample = int(end_time * target_sr)
            audio_segment = audio_resampled[start_sample:end_sample]

            # A zero-length slice (start >= end, or a span past the end of the
            # recording) carries no audio; training on it would only teach the
            # model to map padding to a label.
            if audio_segment.size == 0:
                continue

            yield np.array(audio_segment, dtype=np.float32), emotion_category, emotion_intense

In [12]:
# Dataset layout: label JSONs live in <path>/<label_str>/{indoor,outdoor}/,
# and the audio root passed to the loader is <path>/<audio_source>.
path = "../jspark/emotion/train"
audio_source = "data"
label_str = "label"
indoor_ = ["indoor", "outdoor"]

_label_root = os.path.join(path, label_str)
indoor_files, outdoor_files = (
    glob.glob(os.path.join(_label_root, place, '*.json')) for place in indoor_
)

# Indoor files first, then outdoor, shuffled in place so the two recording
# conditions are interleaved.
full_data = [*indoor_files, *outdoor_files]
shuffle(full_data)

In [13]:
def _segment_generator():
    """Return a fresh generator over every labelled segment in `full_data`."""
    return load_segments_from_json(full_data, os.path.join(path, audio_source))


# One element = (variable-length waveform, class id, intensity target).
_segment_signature = (
    tf.TensorSpec(shape=(None,), dtype=tf.float32),
    tf.TensorSpec(shape=(), dtype=tf.int32),
    tf.TensorSpec(shape=(), dtype=tf.float32),
)

dataset = tf.data.Dataset.from_generator(
    _segment_generator,
    output_signature=_segment_signature,
)

In [14]:
def preprocess(audio_segment, emotion_category, emotion_intensity):
    """Pad one waveform and pack its targets under the model's output names.

    Args:
        audio_segment: Waveform tensor for a single segment.
        emotion_category: Class-id target for the segment.
        emotion_intensity: Intensity target for the segment.

    Returns:
        ``(waveform_padded, labels)`` where ``waveform_padded`` is the result
        of ``features_lib.pad_waveform(audio_segment, params)`` and ``labels``
        maps ``'emotion_class_output'`` / ``'emotion_intensity_output'`` to
        the corresponding targets.
    """
    # The padded waveform used to be computed and then thrown away, with the
    # raw segment returned instead. Return the padded one so the padding step
    # actually takes effect.
    # NOTE(review): presumably pad_waveform guarantees the minimum length the
    # feature frontend needs -- confirm against features.pad_waveform.
    waveform_padded = features_lib.pad_waveform(audio_segment, params)
    labels = {
        'emotion_class_output': emotion_category,
        'emotion_intensity_output': emotion_intensity
    }
    return waveform_padded, labels

In [15]:
batch_size = 32

# Targets are scalars, so only the waveform axis is actually padded; waveforms
# are zero-padded to the longest one in each batch.
_label_shapes = {'emotion_class_output': [], 'emotion_intensity_output': []}
_label_padding = {'emotion_class_output': 0, 'emotion_intensity_output': 0.0}

# NOTE: this rebinds `dataset`; re-running this cell without first re-running
# the cell that creates `dataset` stacks the pipeline on top of itself.
dataset = (
    dataset
    .map(preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(buffer_size=1000)
    .padded_batch(
        batch_size,
        padded_shapes=([None, ], _label_shapes),
        padding_values=(0.0, _label_padding),
    )
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [16]:
def prediction_model(input_shape, num_classes):
    """Build the two-headed classifier/regressor that sits on top of embeddings.

    Args:
        input_shape: Shape of one input embedding (without the batch axis).
        num_classes: Number of emotion classes for the softmax head.

    Returns:
        A Keras ``Model`` with outputs, in order, ``emotion_class_output``
        (softmax over ``num_classes``) and ``emotion_intensity_output``
        (single linear unit).
    """
    embedding_in = layers.Input(shape=input_shape)

    # Shared trunk: one hidden layer with dropout feeding both heads.
    hidden = layers.Dense(128, activation='relu')(embedding_in)
    hidden = layers.Dropout(0.5)(hidden)

    heads = [
        layers.Dense(num_classes, activation='softmax', name='emotion_class_output')(hidden),
        layers.Dense(1, activation='linear', name='emotion_intensity_output')(hidden),
    ]
    return Model(inputs=embedding_in, outputs=heads, name='emotion_recognition_model')

In [17]:
params = yamnet_params.Params()

# Embedding backbone from the local `yamnet` module, frozen layer by layer so
# only the prediction head is trained.
embedding_model = yamnet_model.yamnet_embedding_model(params)
for backbone_layer in embedding_model.layers:
    backbone_layer.trainable = False

# Prediction head over 1024-dimensional embeddings, one class per emotion label.
num_classes = len(emotion_enc)
prediction_net = prediction_model(input_shape=(1024,), num_classes=num_classes)

# Wire the head onto the backbone's first input/output tensors.
waveform_input = embedding_model.inputs[0]
embeddings = embedding_model.outputs[0]
predictions = prediction_net(embeddings)
_class_head, _intensity_head = predictions

# Outputs are keyed by name so they line up with the label dict of the dataset.
model = Model(
    inputs=waveform_input,
    outputs=dict(
        emotion_class_output=_class_head,
        emotion_intensity_output=_intensity_head,
    ),
    name='emotion_recognition_model',
)

In [18]:
epoch = 30

# CosineDecay's `decay_steps` is measured in optimizer steps (batches), not
# epochs. Passing `epoch` directly would anneal the learning rate to its floor
# after only 30 batches, so scale it by the number of batches per epoch.
# That number is estimated from the label files alone (JSON only, no audio
# decoding): each "Conversation" entry is one candidate training segment.
num_segments = 0
for label_path in full_data:
    with open(label_path, 'r', encoding='utf-8') as f:
        num_segments += len(json.load(f)['Conversation'])
# Ceil division: the final partial batch still counts as a step.
steps_per_epoch = max(1, -(-num_segments // batch_size))

lr_schedule = optimizers.schedules.CosineDecay(
    initial_learning_rate=0.01,
    decay_steps=epoch * steps_per_epoch
)

optimizer = optimizers.Adam(lr_schedule)

# Loss/metric keys must match the model's output names and the label dict keys.
model.compile(
    optimizer=optimizer,
    loss={
        'emotion_class_output': 'sparse_categorical_crossentropy',
        'emotion_intensity_output': 'mean_squared_error'
    },
    metrics={
        'emotion_class_output': 'accuracy',
        'emotion_intensity_output': 'mae'
    }
)

In [None]:
# Train on the batched dataset for `epoch` passes with per-epoch progress output.
model.fit(x=dataset, epochs=epoch, verbose=1)

Epoch 1/30


2024-10-09 21:47:22.123421: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]
2024-10-09 21:47:22.123690: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]
2024-10-09 21:47:33.258862: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 75 of 1000


In [None]:
# Persist the trained model in HDF5 format.
# NOTE(review): the file name says "10epoch" but `epoch` is set to 30 in the
# compile cell -- confirm which is intended before relying on this artifact.
model.save(filepath='10epoch.h5')