In [1]:
from __future__ import division, print_function

import sys
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import numpy as np
import resampy
import soundfile as sf
import tensorflow as tf
from tensorflow.keras import Model, layers, optimizers


import params as yamnet_params
import yamnet as yamnet_model
import features as features_lib
from random import shuffle

import librosa
import json
import glob

from tqdm import tqdm


In [2]:
print("TensorFlow version:", tf.__version__)
tf.config.list_physical_devices('GPU')

TensorFlow version: 2.10.0


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [4]:
intense = {"약함" : 0.,"보통":1.,"강함":2.}
emotion_enc = {"기쁨" : 0, "사랑스러움" : 1, "두려움" : 2, "화남" : 3, "슬픔" : 4, "놀라움" : 5, "없음" : 6}
emotion_dec = { 0 : "기쁨", 1 : "사랑스러움", 2 : "두려움", 3 : "화남", 4 : "슬픔", 5 : "놀라움", 6 : "없음"}

In [5]:
params = yamnet_params.Params()

In [5]:
def load_segments_from_json(paths, wav_prefix):
    for path in paths:
        with open(path, 'r') as f:
            data = json.load(f)
        sr = int(data['Wav']['SamplingRate'])
        tmp_prefix = os.path.join(wav_prefix, "indoor" if "indoor" in path else "outdoor")
        wav_name = os.path.join(tmp_prefix, data['File']['FileName'] + ".wav")
        audio, _ = librosa.load(wav_name, sr=sr)
        audio_resampled = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        #y_emotion_class = []
        #y_emotion_intensity = []
        #audios = []

        for entry in data['Conversation']:
            start_time = float(entry['StartTime'].replace(",",""))
            end_time = float(entry['EndTime'].replace(",",""))
            emotion_category = emotion_enc[entry['VerifyEmotionTarget']]
            emotion_intense = intense[entry['VerifyEmotionLevel']]
            start_sample = int(start_time * 16000)
            end_sample = int(end_time * 16000)
            audio_segment = audio_resampled[start_sample:end_sample]

            #audios.append(np.array(audio_segments))
            #y_emotion_class.append(emotion_category)
            #y_emotion_intensity.append(emotion_level)
            print(len(audio_segment), len(audio_segment)/16000, emotion_dec[emotion_category], emotion_intense)
            yield np.array(audio_segment), emotion_category, emotion_intense

        #return audios, np.array(y_emotion_class), np.array(y_emotion_intensity)

In [27]:
path = "Y:/emotion/train"
audio_source = "data"
label_str = "label"
indoor_ = ["indoor", "outdoor"] 
indoor_files = glob.glob(os.path.join(path,label_str,indoor_[0], '*.json'))
outdoor_files = glob.glob(os.path.join(path,label_str,indoor_[1], '*.json'))
indor_cnt = len(indoor_files)
full_data = indoor_files[:indor_cnt//4] + outdoor_files
#shuffle(full_data)
len(full_data)

3569

In [6]:
save_path = "D:/emotion_split/train"

In [7]:
os.makedirs(os.path.join(save_path, audio_source), exist_ok=True)
os.makedirs(os.path.join(save_path, label_str), exist_ok=True)

NameError: name 'audio_source' is not defined

In [30]:
for i, p in enumerate(tqdm(full_data)):
    with open(p, 'r') as f:
        data = json.load(f)
    sr = int(data['Wav']['SamplingRate'])
    tmp_prefix = os.path.join(os.path.join(path,audio_source), "indoor" if "indoor" in p else "outdoor")
    wav_name = os.path.join(tmp_prefix, data['File']['FileName'] + ".wav")
    audio, _ = librosa.load(wav_name, sr=sr)
    audio_resampled = librosa.resample(audio, orig_sr=sr, target_sr=16000)
    for j, entry in enumerate(data['Conversation']):
        start_time = float(entry['StartTime'].replace(",",""))
        end_time = float(entry['EndTime'].replace(",",""))
        emotion_category = emotion_enc[entry['VerifyEmotionTarget']]
        emotion_intense = intense[entry['VerifyEmotionLevel']]
        text = entry['Text']
        start_sample = int(start_time * 16000)
        end_sample = int(end_time * 16000)
        audio_segment = audio_resampled[start_sample:end_sample]
        #audios.append(np.array(audio_segments))
        #y_emotion_class.append(emotion_category)
        #y_emotion_intensity.append(emotion_level)
        res = {'text' : text, "emotion_category" : emotion_category, "emotion_intense" : emotion_intense}
        sf.write(os.path.join(save_path, audio_source, f"{i:08}_{j:05}.wav"), 
                audio_segment, 
                16000, 
                format='WAV')
        with open(os.path.join(save_path, label_str, f'{i:08}_{j:05}.json'), 'w', encoding="utf-8") as f:
            json.dump(res, f, ensure_ascii=False, indent="\t")
        #yield np.array(audio_segment), emotion_category, emotion_intense
    #return audios, np.array(y_emotion_class), np.array(y_emotion_intensity)

100%|██████████| 3569/3569 [42:47:43<00:00, 43.17s/it]    


In [8]:
def load_split_data_from_json(root_path):
    label_path = os.path.join(root_path, "label")
    data_path = os.path.join(root_path, "data")
    sr = 16000
    json_list = glob.glob(os.path.join(label_path,'*.json'))[:10]
    for path in json_list:
        with open(path, 'r') as f:
            data = json.load(f)
        
        wav_name = os.path.join(data_path, os.path.basename(path).split(".")[0] + ".wav")
        audio, _ = librosa.load(wav_name, sr=sr)

        text = data['text']
        emotion_category = data["emotion_category"]
        emotion_intense = data["emotion_intense"]
        
        yield np.array(audio), emotion_category, emotion_intense

        #return audios, np.array(y_emotion_class), np.array(y_emotion_intensity)

In [9]:
train_path = "D:/emotion_split/train"
#train_data_paths = glob.glob(os.path.join(train_path, "label", '*.json'))
#len(train_data_paths) #1065311

In [10]:
dataset = tf.data.Dataset.from_generator(
    lambda: load_split_data_from_json(train_path),
    output_signature=(
        tf.TensorSpec(shape=(None,), dtype=tf.float32),
        tf.TensorSpec(shape=(), dtype=tf.int32),
        tf.TensorSpec(shape=(), dtype=tf.float32)
    )
)

In [11]:
def preprocess(audio_segment, emotion_category, emotion_intensity):
    waveform_padded = features_lib.pad_waveform(audio_segment, params)
    #_, features = features_lib.waveform_to_log_mel_spectrogram_patches(
    #    waveform_padded, params)
    #num_patches = tf.shape(features)[1]
    #tf.print(tf.shape(features))
    #emotion_class_repeated = tf.repeat(emotion_category[tf.newaxis], num_patches, axis=0)
    #emotion_intensity_repeated = tf.repeat(emotion_intensity[tf.newaxis], num_patches, axis=0)
    labels = {
        'emotion_class_output': emotion_category,
        'emotion_intensity_output': emotion_intensity
    }
    #tf.print(tf.shape(emotion_class_repeated), len(labels["emotion_class_output"]))
    return audio_segment, labels

In [12]:
dataset = dataset.map(preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.shuffle(buffer_size=100)
batch_size = 4

dataset = dataset.padded_batch(
    batch_size,
    padded_shapes=([None, ], {'emotion_class_output': [], 'emotion_intensity_output': []}),
    padding_values=(0.0, {'emotion_class_output': 0, 'emotion_intensity_output': 0.0})
)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [13]:
def prediction_model(input_shape, num_classes):
    """Defines the prediction model for emotion classification and intensity regression."""
    inputs = layers.Input(shape=input_shape)
    x = layers.Dense(128, activation='relu')(inputs)
    x = layers.Dropout(0.5)(x)
    emotion_class_output = layers.Dense(num_classes, activation='softmax', name='emotion_class_output')(x)
    emotion_intensity_output = layers.Dense(1, activation='linear', name='emotion_intensity_output')(x)
    model = Model(inputs=inputs,  outputs= [emotion_class_output, emotion_intensity_output], name='emotion_recognition_model')
    return model

In [14]:
params = yamnet_params.Params()

embedding_model = yamnet_model.yamnet_embedding_model(params, "yamnet.h5")

for layer in embedding_model.layers:
    layer.trainable = False

num_classes = len(emotion_enc)
prediction_net = prediction_model(input_shape=(1024,), num_classes=num_classes)

waveform_input = embedding_model.inputs[0]
embeddings = embedding_model.outputs[0]
#waveform_input = layers.Input(shape=(None,), dtype=tf.float32, name='waveform_input')
#embeddings = embedding_model(waveform_input)
predictions = prediction_net(embeddings)

model = Model(inputs=waveform_input, outputs={"emotion_class_output": predictions[0],
             "emotion_intensity_output": predictions[1]}, name='emotion_recognition_model')

Successfully loaded weights from yamnet.h5
Weights file yamnet.h5 does not exist.


In [15]:
epoch = 5

lr_schedule = optimizers.schedules.CosineDecay(
    initial_learning_rate=0.01,
    decay_steps=epoch
)

optimizer = optimizers.Adam(lr_schedule)

model.compile(
    optimizer='adam',#optimizer,
    loss={
        'emotion_class_output': 'sparse_categorical_crossentropy',
        'emotion_intensity_output': 'mean_squared_error'
    },
    metrics={
        'emotion_class_output': 'accuracy',
        'emotion_intensity_output': 'mae'
    },
    run_eagerly=True
)

In [16]:
model.summary()

Model: "emotion_recognition_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 waveform_input (InputLayer)    [(None,)]            0           []                               
                                                                                                  
 tf.compat.v1.shape (TFOpLambda  (1,)                0           ['waveform_input[0][0]']         
 )                                                                                                
                                                                                                  
 tf.__operators__.getitem (Slic  ()                  0           ['tf.compat.v1.shape[0][0]']     
 ingOpLambda)                                                                                     
                                                                          

In [17]:
model.fit(dataset, epochs=epoch, verbose=1)

Epoch 1/5


KeyboardInterrupt: 

In [6]:
model.save('10epoch.h5')

NameError: name 'model' is not defined

In [14]:
for data in dataset.take(1):
    feature, label = data
    feature_np = feature
    label_np = label
    
    feature_size = sys.getsizeof(feature_np)
    label_size = sys.getsizeof(label_np)
    print(data)
    print(f"Feature size: {feature_size} bytes")
    print(f"Label size: {label_size} bytes")
    

(<tf.Tensor: shape=(4, 38720), dtype=float32, numpy=
array([[-1.4953613e-03, -2.1057129e-03, -2.6245117e-03, ...,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
       [-7.0190430e-04, -1.6174316e-03, -2.5939941e-03, ...,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
       [ 9.1552734e-03,  8.7890625e-03,  8.3923340e-03, ...,
        -6.1035156e-05, -9.1552734e-05, -6.1035156e-05],
       [ 7.2937012e-03,  5.2185059e-03,  3.4790039e-03, ...,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00]], dtype=float32)>, {'emotion_class_output': <tf.Tensor: shape=(4,), dtype=int32, numpy=array([0, 0, 0, 6])>, 'emotion_intensity_output': <tf.Tensor: shape=(4,), dtype=float32, numpy=array([1., 1., 2., 1.], dtype=float32)>})
Feature size: 192 bytes
Label size: 232 bytes


In [15]:
gpus = tf.config.experimental.list_physical_devices('GPU')


In [18]:

memory_info_after = tf.config.experimental.get_memory_info('GPU:0')
memory_info_after

{'current': 3328, 'peak': 3328}