In [2]:
from __future__ import division, print_function

import sys

import numpy as np
import resampy
import soundfile as sf
import tensorflow as tf
from tf_keras import Model, layers


import params as yamnet_params
import yamnet as yamnet_model
import features as features_lib

import librosa
import json
import glob
import os
from tqdm import tqdm




In [3]:
# Report which GPUs TensorFlow can see; an empty list means none were detected.
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)

[]


In [4]:
# Intensity label -> numeric level used as the regression target.
intense = {
    "약함": 0.0,
    "보통": 1.0,
    "강함": 2.0,
}

# Emotion label -> integer class id used as the classification target.
emotion_enc = {
    "기쁨": 0,
    "사랑스러움": 1,
    "두려움": 2,
    "화남": 3,
    "슬픔": 4,
    "놀라움": 5,
    "없음": 6,
}

# Integer class id -> emotion label; derived from the encoder so the two
# tables cannot drift apart.
emotion_dec = {class_id: label for label, class_id in emotion_enc.items()}

In [5]:
# Parameter object from the local `params` module; later cells pass it to the
# feature helpers and to the embedding-model constructor.
params = yamnet_params.Params()

In [6]:
def load_segments_from_json(paths, wav_prefix, target_sr=16000):
    """Yield labelled utterance waveforms described by annotation JSON files.

    For every JSON file the matching ``.wav`` recording is loaded once at
    ``target_sr`` and then sliced into the time spans listed under the
    ``'Conversation'`` key.

    Args:
        paths: Iterable of annotation JSON file paths. A path containing
            "실내" selects the "실내" audio sub-directory; every other path
            selects "실외".
        wav_prefix: Directory that contains the "실내" / "실외" audio
            sub-directories.
        target_sr: Sampling rate (Hz) the audio is loaded at and that the
            start/end times are converted to sample indices with. Defaults to
            16000, the rate that was previously hard-coded.

    Yields:
        Tuples ``(audio_segment, emotion_category, emotion_intense)`` where
        ``audio_segment`` is a 1-D NumPy array of samples,
        ``emotion_category`` is the integer from ``emotion_enc`` and
        ``emotion_intense`` is the float from ``intense``.

    Raises:
        KeyError: If an entry carries an emotion or intensity label that is
            missing from ``emotion_enc`` / ``intense``, or a JSON file lacks
            one of the expected keys.
    """
    for path in paths:
        # NOTE(review): the file is read with the platform default encoding,
        # exactly as before; confirm the annotation encoding before pinning one.
        with open(path, 'r') as f:
            data = json.load(f)

        tmp_prefix = os.path.join(wav_prefix, "실내" if "실내" in path else "실외")
        wav_name = os.path.join(tmp_prefix, data['File']['FileName'] + ".wav")

        # Let librosa resample straight from the file's native rate to the
        # target rate; this avoids a second resampling pass.
        audio, _ = librosa.load(wav_name, sr=target_sr)

        for entry in data['Conversation']:
            # Times are stored as strings that may contain thousands separators.
            start_time = float(entry['StartTime'].replace(",", ""))
            end_time = float(entry['EndTime'].replace(",", ""))
            emotion_category = emotion_enc[entry['VerifyEmotionTarget']]
            emotion_intense = intense[entry['VerifyEmotionLevel']]

            start_sample = int(start_time * target_sr)
            end_sample = int(end_time * target_sr)
            audio_segment = audio[start_sample:end_sample]

            # A span with end <= start, or one that starts past the end of the
            # recording, produces no samples; skip it instead of emitting a
            # labelled empty waveform.
            if audio_segment.size == 0:
                continue

            yield np.array(audio_segment), emotion_category, emotion_intense

In [7]:
# Dataset layout: <path>/<audio or label directory>/<실내 | 실외>/...
path = "D:/134-1.감정이 태깅된 자유대화 (성인)/01-1.정식개방데이터/Training"
audio_source = "01.원천데이터"
label_str = "02.라벨링데이터"
indoor_ = ["실내", "실외"]

# Collect the annotation JSON files of each recording location, in the same
# order as the names in `indoor_`.
label_root = os.path.join(path, label_str)
indoor_files, outdoor_files = [
    glob.glob(os.path.join(label_root, location, '*.json')) for location in indoor_
]

In [8]:
def _all_segments():
    """Return a fresh generator over every indoor and outdoor segment."""
    label_files = indoor_files + outdoor_files
    return load_segments_from_json(label_files, os.path.join(path, audio_source))


# Element structure: (variable-length waveform, class id, intensity level).
segment_signature = (
    tf.TensorSpec(shape=(None,), dtype=tf.float32),
    tf.TensorSpec(shape=(), dtype=tf.int32),
    tf.TensorSpec(shape=(), dtype=tf.float32),
)

dataset = tf.data.Dataset.from_generator(
    _all_segments,
    output_signature=segment_signature,
)

In [9]:
def preprocess(audio_segment, emotion_category, emotion_intensity):
    """Pair a waveform with its labels, keyed by model output name.

    The waveform is passed through unchanged. The previous version also
    computed a padded copy of it but never used the result, so that dead
    computation has been removed.
    NOTE(review): the layer listing printed further down shows a pad op inside
    the model, which suggests padding happens there - confirm.

    Args:
        audio_segment: Waveform of one utterance.
        emotion_category: Class id of the utterance's emotion.
        emotion_intensity: Numeric intensity level of the emotion.

    Returns:
        ``(audio_segment, labels)`` where ``labels`` is a dict with the keys
        ``'emotion_class_output'`` and ``'emotion_intensity_output'``.
    """
    labels = {
        'emotion_class_output': emotion_category,
        'emotion_intensity_output': emotion_intensity,
    }
    return audio_segment, labels

In [10]:
batch_size = 1

# Input pipeline: attach label dicts, shuffle, pad each batch to its longest
# waveform, and prefetch so loading overlaps with training.
dataset = (
    dataset
    .map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    .shuffle(buffer_size=1000)
    .padded_batch(
        batch_size,
        padded_shapes=([None, ], {'emotion_class_output': [], 'emotion_intensity_output': []}),
        padding_values=(0.0, {'emotion_class_output': 0, 'emotion_intensity_output': 0.0}),
    )
    .prefetch(tf.data.AUTOTUNE)
)

In [20]:
def prediction_model(input_shape, num_classes):
    """Build the two-headed network that sits on top of the embeddings.

    A shared 128-unit ReLU layer followed by dropout (rate 0.5) feeds two
    outputs: a softmax over ``num_classes`` named ``emotion_class_output`` and
    a single linear unit named ``emotion_intensity_output``.

    Args:
        input_shape: Shape of one input feature vector (without batch axis).
        num_classes: Number of emotion classes for the softmax head.

    Returns:
        A Keras ``Model`` named ``emotion_recognition_model`` whose outputs
        are the list ``[class_probabilities, intensity]``.
    """
    features_in = layers.Input(shape=input_shape)

    # Shared trunk.
    hidden = layers.Dense(128, activation='relu')(features_in)
    hidden = layers.Dropout(0.5)(hidden)

    # Task-specific heads, classification first and regression second.
    heads = [
        layers.Dense(num_classes, activation='softmax', name='emotion_class_output')(hidden),
        layers.Dense(1, activation='linear', name='emotion_intensity_output')(hidden),
    ]

    return Model(inputs=features_in, outputs=heads, name='emotion_recognition_model')

In [27]:
# Assemble the end-to-end network: the embedding model feeding the prediction
# heads defined above.
params = yamnet_params.Params()
embedding_model = yamnet_model.yamnet_embedding_model(params)

# Mark every layer of the embedding network as non-trainable.
for layer in embedding_model.layers:
    layer.trainable = False

num_classes = len(emotion_enc)
prediction_net = prediction_model(input_shape=(1024,), num_classes=num_classes)

# Wire the heads onto the embedding model's first input and first output.
waveform_input = embedding_model.inputs[0]
embeddings = embedding_model.outputs[0]
predictions = prediction_net(embeddings)
class_probabilities, intensity_estimate = predictions[0], predictions[1]

model = Model(
    inputs=waveform_input,
    outputs={
        "emotion_class_output": class_probabilities,
        "emotion_intensity_output": intensity_estimate,
    },
    name='emotion_recognition_model',
)

In [28]:
# One loss and one metric per output, keyed by output name.
output_losses = {
    'emotion_class_output': 'sparse_categorical_crossentropy',
    'emotion_intensity_output': 'mean_squared_error',
}
output_metrics = {
    'emotion_class_output': 'accuracy',
    'emotion_intensity_output': 'mae',
}

model.compile(optimizer='adam', loss=output_losses, metrics=output_metrics)

In [29]:
# Print the name and output shape of each layer in the assembled model.
for keras_layer in model.layers:
    print(f"Layer Name: {keras_layer.name}, Output Shape: {keras_layer.output_shape}")

Layer Name: input_11, Output Shape: [(None,)]
Layer Name: tf.compat.v1.shape_10, Output Shape: (1,)
Layer Name: tf.__operators__.getitem_10, Output Shape: ()
Layer Name: tf.reshape_5, Output Shape: (None, None)
Layer Name: tf.compat.v1.shape_11, Output Shape: (2,)
Layer Name: tf.__operators__.getitem_11, Output Shape: ()
Layer Name: tf.math.subtract_16, Output Shape: ()
Layer Name: tf.cast_10, Output Shape: ()
Layer Name: tf.math.truediv_5, Output Shape: ()
Layer Name: tf.math.ceil_5, Output Shape: ()
Layer Name: tf.cast_11, Output Shape: ()
Layer Name: tf.math.subtract_15, Output Shape: ()
Layer Name: tf.math.multiply_5, Output Shape: ()
Layer Name: tf.math.maximum_5, Output Shape: ()
Layer Name: tf.math.subtract_17, Output Shape: ()
Layer Name: tf.__operators__.add_10, Output Shape: ()
Layer Name: tf.compat.v1.pad_5, Output Shape: (None, None)
Layer Name: tf.signal.stft_5, Output Shape: (None, None, 257)
Layer Name: tf.math.abs_5, Output Shape: (None, None, 257)
Layer Name: tf.tensor

In [30]:
# Train for up to 10 epochs on the batched dataset. The dataset comes from a
# generator, so the step count is unknown up front and the progress bar shows
# "N/Unknown".
model.fit(dataset, epochs=10, verbose=1)

Epoch 1/10


  43842/Unknown - 4941s 111ms/step - loss: 1.7759 - emotion_recognition_model_loss: 1.4653 - emotion_recognition_model_1_loss: 0.3105 - emotion_recognition_model_accuracy: 0.4322 - emotion_recognition_model_1_mae: 0.4188

KeyboardInterrupt: 

In [None]:
# Save the model to an HDF5 file in the current working directory.
# NOTE(review): the recorded training run above ended with a KeyboardInterrupt
# during epoch 1, so the "10epoch" file name may not match the weights - confirm.
model.save('10epoch.h5')