In [12]:
import os

import librosa
import numpy as np
import pandas as pd
from numpy import array

from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, BatchNormalization
from keras.layers import Flatten, Input, Conv1D, MaxPooling1D, Dense, GlobalMaxPooling1D, Dropout
from keras.layers import Conv2D, MaxPooling2D, GlobalMaxPooling2D
from keras.layers.embeddings import Embedding
from keras.utils import to_categorical

# Source of the data:
# https://github.com/SenticNet/MELD/blob/master/baseline/baseline.py

# Read the train / dev / test CSVs from the local `data` directory in one pass.
split_files = ('emorynlp_train_final.csv', 'emorynlp_dev_final.csv', 'emorynlp_test_final.csv')
train, val, test = (pd.read_csv(os.path.join('data', name)) for name in split_files)

# Preview the first rows of the training split.
train.head()

Unnamed: 0,Utterance,Speaker,Emotion,Scene_ID,Utterance_ID,Season,Episode,Start_Time,End_Time
0,"What you guys don't understand is, for us, kis...",['Monica Geller'],Joyful,1,1,1,2,00:00:02.877,00:00:07.548
1,"Yeah, right!.......Y'serious?",['Joey Tribbiani'],Neutral,1,2,1,2,00:00:04.504,00:00:07.548
2,"Oh, yeah!",['Phoebe Buffay'],Joyful,1,3,1,2,00:00:07.924,00:00:09.508
3,Everything you need to know is in that first k...,['Rachel Green'],Powerful,1,4,1,2,00:00:11.970,00:00:17.683
4,Absolutely.,['Monica Geller'],Powerful,1,5,1,2,00:00:14.139,00:00:15.097


In [15]:
# Turn the emotion names into integer class ids stored in a new `label` column.
# The ids are kept as plain integer codes here; no one-hot encoding is applied.
emotion_as_category = train['Emotion'].astype('category')
train['label'] = emotion_as_category.cat.codes

In [None]:
# Audio / feature settings shared by every clip.
SAMPLE_RATE = 24000   # Hz; clips are resampled to this rate when loaded
CLIP_SECONDS = 2      # only the first two seconds of each utterance are read
# Fixed buffer length; shorter clips are zero-padded up to this many samples.
CLIP_SAMPLES = SAMPLE_RATE * CLIP_SECONDS + 1   # 48001
N_FFT = 512
HOP_LENGTH = 256
N_MELS = 96
# Expected feature shape: (mel bands, frames, channel) == (96, 188, 1).
SPECTRUM_SHAPE = (N_MELS, 1 + CLIP_SAMPLES // HOP_LENGTH, 1)


def _load_log_melgram(path):
    """Load one wav file and return its log-mel spectrogram.

    The clip is truncated to CLIP_SECONDS, zero-padded to CLIP_SAMPLES,
    converted to an N_MELS-band mel power spectrogram and then to decibels.

    Parameters
    ----------
    path : str
        Path of the audio file to read.

    Returns
    -------
    numpy.ndarray
        Array of shape SPECTRUM_SHAPE (a trailing channel axis is added).

    Raises
    ------
    AssertionError
        If the returned sample rate or the resulting shape is unexpected.
    """
    # `sr` is passed by keyword: newer librosa releases no longer accept it positionally.
    sample, sr = librosa.load(path, sr=SAMPLE_RATE, duration=CLIP_SECONDS)
    assert sr == SAMPLE_RATE, 'sample rate incorrect'

    # Zero-pad so every clip yields the same number of frames.
    padded_sample = np.zeros(CLIP_SAMPLES, dtype='float32')
    padded_sample[:sample.shape[0]] = sample

    melgram = librosa.feature.melspectrogram(
        y=padded_sample, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS)
    # melspectrogram returns a *power* spectrogram by default, so the matching
    # dB conversion is power_to_db (amplitude_to_db would double the dB values).
    log_melgram = librosa.power_to_db(melgram)
    log_melgram = np.expand_dims(log_melgram, axis=-1)
    assert log_melgram.shape == SPECTRUM_SHAPE
    return log_melgram


labels = []
spectrums = []

for index, row in train.iterrows():
    filename = 'sea%d_ep%d_sc%d_utt%d.wav' % (row.Season, row.Episode, row.Scene_ID, row.Utterance_ID)
    path = os.path.join('data', 'audio', 'train', filename)
    if not os.path.isfile(path):
        # Rows without a matching audio file are skipped (no label, no spectrum).
        print('file does not exist', path)
        continue

    spectrums.append(_load_log_melgram(path))
    labels.append(row.label)

    if index % 100 == 0:
        print('loaded %d from %d rows' % (index, len(train)))
    

loaded 0 from 7551 rows
loaded 100 from 7551 rows
loaded 200 from 7551 rows
loaded 300 from 7551 rows
loaded 400 from 7551 rows
loaded 500 from 7551 rows
loaded 600 from 7551 rows
loaded 700 from 7551 rows
loaded 800 from 7551 rows
loaded 900 from 7551 rows
loaded 1000 from 7551 rows
loaded 1100 from 7551 rows
loaded 1200 from 7551 rows
loaded 1300 from 7551 rows
loaded 1400 from 7551 rows
loaded 1500 from 7551 rows
loaded 1600 from 7551 rows
loaded 1700 from 7551 rows
loaded 1800 from 7551 rows
loaded 1900 from 7551 rows
loaded 2000 from 7551 rows
loaded 2100 from 7551 rows
loaded 2200 from 7551 rows
loaded 2300 from 7551 rows
loaded 2400 from 7551 rows
loaded 2500 from 7551 rows
loaded 2600 from 7551 rows
loaded 2700 from 7551 rows
loaded 2800 from 7551 rows
loaded 2900 from 7551 rows
loaded 3000 from 7551 rows
loaded 3100 from 7551 rows
loaded 3200 from 7551 rows
loaded 3300 from 7551 rows
loaded 3400 from 7551 rows
loaded 3500 from 7551 rows
loaded 3600 from 7551 rows
loaded 3700 f

In [None]:
from sklearn.utils import shuffle

# Fixed seed so the shuffled order is the same after Restart & Run All.
SEED = 42

if not spectrums:
    # np.stack would fail with a less helpful message on an empty list.
    raise ValueError('no spectrograms were loaded; check the audio directory')

# Stack the per-clip arrays into one feature tensor and one label vector.
x = np.stack(spectrums)
y = np.stack(labels)
# Shuffle features and labels together so rows stay aligned.
train_x, train_y = shuffle(x, y, random_state=SEED)
print(x.shape, y.shape)

In [None]:
# Derive the model dimensions from the data instead of relying on names that
# are not defined in the visible cells.
input_shape = train_x.shape[1:]                # per-sample shape of the spectrogram tensor
NUM_CLASSES = int(train['label'].max()) + 1    # labels are 0-based integer codes

model_input = Input(shape=input_shape)

# Three convolutional stages followed by a small dense classifier.
# The layer chain uses its own name (`hidden`) so the data array `x`
# created in the stacking cell is not overwritten.
hidden = Conv2D(32, 5, activation='relu')(model_input)
hidden = MaxPooling2D(3)(hidden)
hidden = BatchNormalization()(hidden)
hidden = Dropout(0.2)(hidden)
hidden = Conv2D(64, 5, activation='relu')(hidden)
hidden = MaxPooling2D(3)(hidden)
hidden = BatchNormalization()(hidden)
hidden = Dropout(0.2)(hidden)
hidden = Conv2D(128, 3, activation='relu')(hidden)
hidden = GlobalMaxPooling2D()(hidden)
hidden = BatchNormalization()(hidden)
hidden = Dropout(0.2)(hidden)
hidden = Dense(128, activation='relu')(hidden)
hidden = Dropout(0.2)(hidden)
model_output = Dense(NUM_CLASSES, activation='softmax')(hidden)

model = Model(model_input, model_output)
# train_y holds integer class codes (not one-hot vectors), so the sparse
# variant of categorical cross-entropy is the matching loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): val_x / val_y are not created in the cells shown here. They
# presumably need to be built from `val` the same way as the training tensors,
# with integer labels that use the same emotion-to-code mapping as `train` — TODO confirm.
model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=20, verbose=0)
