In [None]:
# Mount Google Drive when running on Colab.
# The google.colab package only exists inside the Colab runtime; guarding the
# import lets the notebook run top-to-bottom on a local kernel as well, where
# the data is read from the local DATA_PATH directory instead.
try:
    from google.colab import drive
except ImportError:
    print('google.colab not available; skipping Google Drive mount')
else:
    drive.mount('/content/gdrive')

In [None]:
import pandas as pd
import numpy as np
import os

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, BatchNormalization
from keras.layers import Flatten, Input, Conv1D, Conv2D, MaxPooling2D, GlobalMaxPooling2D, MaxPooling1D, Dense, GlobalMaxPooling1D, Dropout
from keras.layers.embeddings import Embedding
from keras.utils import to_categorical
import matplotlib.pyplot as plt
import librosa
from sklearn.utils import shuffle
import pickle

# download from:
# https://github.com/SenticNet/MELD/blob/master/baseline/baseline.py

# Root folder for all inputs. Swap in the commented line to read from the
# mounted Google Drive instead of the local directory.
# DATA_PATH = '/content/gdrive/Team Drives/IKE Data/Emotion detection'
DATA_PATH = 'data'
# Location of the pickled, already-preprocessed dataset.
DATASET_PATH = os.path.join(DATA_PATH, 'dataset.pkl')
# Number of emotion classes.
NUM_CLASSES = 7

# Read the three annotation tables in one pass: train, dev (validation), test.
train, val, test = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in (
        'emorynlp_train_final.csv',
        'emorynlp_dev_final.csv',
        'emorynlp_test_final.csv',
    )
)
train.head()


In [None]:
def preprocess(data, folder):
    """Convert one split of the utterance table into model inputs.

    For every row, the matching wav file is loaded (first 2 seconds at
    24 kHz), zero-padded to a fixed length, and turned into a dB-scaled mel
    spectrogram of shape (96, 188, 1). Rows whose audio file is missing are
    reported and skipped.

    Args:
        data: DataFrame with the columns 'Emotion', 'Season', 'Episode',
            'Scene_ID' and 'Utterance_ID'. A 'label' column holding the
            integer emotion code is added to it in place.
        folder: Name of the sub-folder below ``<DATA_PATH>/audio`` that holds
            the wav files of this split (e.g. 'train', 'dev', 'test').

    Returns:
        Tuple ``(x, y)``: ``x`` is the stacked spectrograms with shape
        (n, 96, 188, 1); ``y`` is the one-hot labels with shape
        (n, NUM_CLASSES). Both are shuffled together with a fixed seed.

    Raises:
        ValueError: If no audio file of the split could be found.
    """
    sample_rate = 24000
    # One extra sample beyond 2 s of audio; the shape assertion below pins the
    # resulting frame count to 188.
    padded_length = 48001

    # NOTE(review): the codes are derived from the categories present in THIS
    # split only. If a split lacks one emotion, its codes will not line up
    # with the other splits - confirm every split contains all classes.
    data['label'] = data['Emotion'].astype('category').cat.codes

    labels = []
    spectrums = []

    for index, row in data.iterrows():
        filename = 'sea%d_ep%d_sc%d_utt%d.wav' % (row.Season, row.Episode, row.Scene_ID, row.Utterance_ID)
        # Resolve audio relative to DATA_PATH so CSVs and wav files always
        # come from the same root.
        path = os.path.join(DATA_PATH, 'audio', folder, filename)
        if not os.path.isfile(path):
            print('file does not exist', path)
            continue

        # Keyword arguments: recent librosa releases no longer accept the
        # sample rate / signal positionally.
        sample, sr = librosa.load(path, sr=sample_rate, duration=2)
        assert sr == sample_rate, 'sample rate incorrect'

        # Zero-pad short clips so every spectrogram has the same width.
        padded_sample = np.zeros(padded_length, dtype='float32')
        padded_sample[:sample.shape[0]] = sample

        melgram = librosa.feature.melspectrogram(
            y=padded_sample, sr=sr, n_fft=512, hop_length=256, n_mels=96)
        # NOTE(review): melspectrogram returns a power spectrogram by default,
        # for which librosa documents power_to_db. amplitude_to_db is kept so
        # the features stay identical to previously cached datasets.
        log_melgram = librosa.amplitude_to_db(melgram)
        # Add a trailing channel axis for the 2D convolution layers.
        log_melgram = np.expand_dims(log_melgram, axis=-1)
        assert log_melgram.shape == (96, 188, 1)

        labels.append(row['label'])
        spectrums.append(log_melgram)

        if index % 500 == 0:
            print('loaded %d from %d rows' % (index, len(data)))

    if not spectrums:
        raise ValueError('no audio files found for split %r under %s'
                         % (folder, os.path.join(DATA_PATH, 'audio', folder)))

    x = np.stack(spectrums)
    y = np.stack(labels)

    # Fix the one-hot width so every split has NUM_CLASSES columns even when
    # the highest class code does not occur in it.
    y = to_categorical(y, num_classes=NUM_CLASSES)

    # shuffle
    x, y = shuffle(x, y, random_state=42)
    return x, y

# Build the spectrogram dataset once and keep it on disk; later runs reuse the
# pickled copy instead of re-reading every wav file.
# NOTE: pickle.load executes arbitrary code from the file - only use a cache
# file that this notebook wrote itself.
if not os.path.isfile(DATASET_PATH):
    train_x, train_y = preprocess(train, 'train')
    test_x, test_y = preprocess(test, 'test')
    val_x, val_y = preprocess(val, 'dev')
    print(train_x.shape, train_y.shape)

    with open(DATASET_PATH, 'wb') as handle:
        pickle.dump((train_x, train_y, test_x, test_y, val_x, val_y), handle)
else:
    print("loading from cache")
    with open(DATASET_PATH, 'rb') as handle:
        (train_x, train_y,
         test_x, test_y,
         val_x, val_y) = pickle.load(handle)
    print(train_x.shape, train_y.shape)

In [None]:
# Show the first few training spectrograms (at most 10, fewer if the split is
# smaller). Each array is (mel bands, frames, 1); the channel axis is dropped
# for display and origin='lower' puts the lowest mel band at the bottom.
for i in range(min(10, len(train_x))):
    fig, ax = plt.subplots()
    ax.imshow(np.squeeze(train_x[i]), origin='lower', aspect='auto')
    ax.set(
        title='train sample %d (class %d)' % (i, train_y[i].argmax()),
        xlabel='frame',
        ylabel='mel band',
    )
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)

In [None]:
# The classifier's output width must match the one-hot label width.
assert all(split_y.shape[1] == NUM_CLASSES for split_y in (train_y, val_y, test_y)), \
    'label width does not match NUM_CLASSES'

# 2D CNN over the spectrogram: the input shape is taken from the data
# (everything but the batch axis).
model_input = Input(shape=train_x.shape[1:])

# Three convolution stages with growing filter counts; each is followed by
# pooling, batch normalisation and dropout.
x = Conv2D(32, 5, activation='relu')(model_input)
x = MaxPooling2D(3)(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Conv2D(64, 5, activation='relu')(x)
x = MaxPooling2D(3)(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Conv2D(128, 3, activation='relu')(x)
# Global max pooling collapses the remaining spatial axes to one vector.
x = GlobalMaxPooling2D()(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)

x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)

# One softmax output per emotion class.
x = Dense(NUM_CLASSES, activation='softmax')(x)

model = Model(model_input, x)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=10, verbose=1)
# evaluate() returns [loss, accuracy]; report the accuracy on the test split.
print("Accuracy: %.2f" % model.evaluate(test_x, test_y)[1])

# base: 0.3417