In [1]:
import pandas as pd
import librosa as lr
import soundfile as sf
import time
import warnings
import numpy as np
from os.path import exists


#### Helper

In [2]:
from IPython.display import HTML, display

def progress(value, max=100):
    return HTML("""
        <progress
            value='{value}'
            max='{max}',
            style='width: 100%'
        >
            {value}
        </progress>
    """.format(value=value, max=max))


# Data Preperation (files and valence/arousal values)

In [3]:
audio_base_path = '/home/victor/IdeaProjects/music-emotion-detection/data/DEAM/audio/MEMD_audio/'
arousal_file = pd.read_csv('/home/victor/IdeaProjects/music-emotion-detection/data/DEAM/annotations/annotations averaged per song/dynamic (per second annotations)/arousal.csv')
valence_file = pd.read_csv('/home/victor/IdeaProjects/music-emotion-detection/data/DEAM/annotations/annotations averaged per song/dynamic (per second annotations)/valence.csv')

In [4]:
arousal_file.columns

Index(['song_id', 'sample_15000ms', 'sample_15500ms', 'sample_16000ms',
       'sample_16500ms', 'sample_17000ms', 'sample_17500ms', 'sample_18000ms',
       'sample_18500ms', 'sample_19000ms',
       ...
       'sample_622000ms', 'sample_622500ms', 'sample_623000ms',
       'sample_623500ms', 'sample_624000ms', 'sample_624500ms',
       'sample_625000ms', 'sample_625500ms', 'sample_626000ms',
       'sample_626500ms'],
      dtype='object', length=1225)

In [5]:
arousal_file.head()

Unnamed: 0,song_id,sample_15000ms,sample_15500ms,sample_16000ms,sample_16500ms,sample_17000ms,sample_17500ms,sample_18000ms,sample_18500ms,sample_19000ms,...,sample_622000ms,sample_622500ms,sample_623000ms,sample_623500ms,sample_624000ms,sample_624500ms,sample_625000ms,sample_625500ms,sample_626000ms,sample_626500ms
0,2,-0.109386,-0.114942,-0.116413,-0.118613,-0.126457,-0.133199,-0.136855,-0.144713,-0.138985,...,,,,,,,,,,
1,3,-0.110846,-0.123973,-0.131103,-0.135956,-0.140775,-0.144664,-0.163118,-0.165218,-0.158858,...,,,,,,,,,,
2,4,0.222327,0.179446,0.178388,0.184056,0.176042,0.17872,0.176345,0.175793,0.176154,...,,,,,,,,,,
3,5,-0.255613,-0.251579,-0.251958,-0.251124,-0.250763,-0.251957,-0.251957,-0.251957,-0.251957,...,,,,,,,,,,
4,7,0.464234,0.460789,0.460991,0.461046,0.45724,0.465702,0.471809,0.469918,0.473377,...,,,,,,,,,,


In [6]:
arousal_file.loc[:, arousal_file.columns != 'song_id'].describe()

Unnamed: 0,sample_15000ms,sample_15500ms,sample_16000ms,sample_16500ms,sample_17000ms,sample_17500ms,sample_18000ms,sample_18500ms,sample_19000ms,sample_19500ms,...,sample_622000ms,sample_622500ms,sample_623000ms,sample_623500ms,sample_624000ms,sample_624500ms,sample_625000ms,sample_625500ms,sample_626000ms,sample_626500ms
count,1802.0,1802.0,1802.0,1802.0,1802.0,1802.0,1802.0,1802.0,1802.0,1802.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
mean,0.123961,0.124999,0.125694,0.12637,0.126515,0.126578,0.126825,0.127375,0.127409,0.127622,...,-0.05,-0.05,-0.056,-0.17,-0.236,-0.332,-0.37,-0.392,-0.39,-0.378
std,0.266756,0.267682,0.268578,0.269261,0.26977,0.270248,0.270959,0.272065,0.272942,0.273953,...,,,,,,,,,,
min,-0.654305,-0.654348,-0.655628,-0.663082,-0.664821,-0.665411,-0.667466,-0.668622,-0.661271,-0.663044,...,-0.05,-0.05,-0.056,-0.17,-0.236,-0.332,-0.37,-0.392,-0.39,-0.378
25%,-0.071864,-0.07,-0.07225,-0.072093,-0.07375,-0.069962,-0.076,-0.076922,-0.077,-0.077,...,-0.05,-0.05,-0.056,-0.17,-0.236,-0.332,-0.37,-0.392,-0.39,-0.378
50%,0.139,0.14083,0.144438,0.144235,0.143667,0.140169,0.141,0.142247,0.142,0.144792,...,-0.05,-0.05,-0.056,-0.17,-0.236,-0.332,-0.37,-0.392,-0.39,-0.378
75%,0.341,0.341581,0.343351,0.344,0.343506,0.344971,0.34675,0.348424,0.350978,0.351033,...,-0.05,-0.05,-0.056,-0.17,-0.236,-0.332,-0.37,-0.392,-0.39,-0.378
max,0.678,0.687,0.688,0.696,0.693,0.688,0.695,0.698,0.702,0.696,...,-0.05,-0.05,-0.056,-0.17,-0.236,-0.332,-0.37,-0.392,-0.39,-0.378


In [7]:
songs = pd.DataFrame()

def add_mp3(value):
    return str(value) + '.mp3'
songs = songs.assign(file_name = arousal_file.loc[:, 'song_id'])
songs.file_name = songs['file_name'].transform(add_mp3)
songs = songs.assign(arousal_avg=arousal_file.loc[:, arousal_file.columns != 'song_id'].mean(axis=1))
songs = songs.assign(valence_avg=valence_file.loc[:, valence_file.columns != 'song_id'].mean(axis=1))

In [8]:
## Mean(Diviation of arousal float within the Songs)
print('\n\n Arousal Deviation within Songs')
print(arousal_file.loc[:, arousal_file.columns != 'song_id'].std(axis=1).describe())
print('\n\n Valence Deviation within Songs')
print(valence_file.loc[:, valence_file.columns != 'song_id'].std(axis=1).describe())



 Arousal Deviation within Songs
count    1802.000000
mean        0.040771
std         0.036643
min         0.003762
25%         0.018364
50%         0.030239
75%         0.050099
max         0.364901
dtype: float64


 Valence Deviation within Songs
count    1802.000000
mean        0.033866
std         0.025326
min         0.004015
25%         0.017378
50%         0.027297
75%         0.041837
max         0.236557
dtype: float64


There is not much deviation within the Songs. 
Meaning the position of the Window does not effect the arousal and valence values very heavily. Thats why we will only predict the Average values of a song not the specific time windows.

In [9]:
songs.describe()

Unnamed: 0,arousal_avg,valence_avg
count,1802.0,1802.0
mean,0.137346,0.097688
std,0.279826,0.234633
min,-0.673034,-0.637343
25%,-0.068675,-0.064162
50%,0.157133,0.109598
75%,0.369122,0.279371
max,0.752033,0.6376


In [10]:
songs

Unnamed: 0,file_name,arousal_avg,valence_avg
0,2.mp3,-0.197517,-0.215511
1,3.mp3,-0.193187,-0.265855
2,4.mp3,0.243072,0.155210
3,5.mp3,-0.236207,0.140160
4,7.mp3,0.376292,0.332455
...,...,...,...
1797,2054.mp3,-0.169137,0.095237
1798,2055.mp3,0.133553,0.082477
1799,2056.mp3,0.002820,0.154216
1800,2057.mp3,0.488453,-0.367627


In [11]:
train_size = 1000
test_size = 300
file_names_train = songs.loc[:train_size, 'file_name']
file_names_test = songs.loc[train_size:train_size+test_size, 'file_name']
y_a_train = songs.loc[:train_size, 'arousal_avg']
y_a_test = songs.loc[train_size:train_size+test_size, 'arousal_avg']
y_v_train = songs.loc[:train_size, 'valence_avg']
y_v_test = songs.loc[train_size:train_size+test_size, 'valence_avg']


In [12]:
y_v_train

0      -0.215511
1      -0.265855
2       0.155210
3       0.140160
4       0.332455
          ...   
996    -0.062767
997     0.046667
998    -0.215817
999     0.401883
1000    0.146517
Name: valence_avg, Length: 1001, dtype: float64

# MFCC Features

In [13]:
sr = 22050


In [14]:
def load_lr_files(files, save_wav=False):
    song_length = len(files)
    out = display(progress(0, song_length), display_id=True)

    samples = list()
    i = 1
    for file in files:
        wav_path = audio_base_path + file + '.wav'
        if exists(wav_path):
          sample, srn = lr.load(wav_path, sr=sr)
        else:
          sample, srn = lr.load(audio_base_path + file, sr=sr)
          if save_wav:
            sf.write(audio_base_path + file + '.wav', sample, sr)
        samples.append(sample)
        out.update(progress(i, song_length))
        if srn != sr:
            print('Wrong SR Now: %s' % srn)
        i+=1
    return samples

In [15]:
samples_train = load_lr_files(file_names_train, True)

In [16]:
len(samples_train)

1001

In [17]:
samples_test = load_lr_files(file_names_test, True)

In [18]:
def get_mfcc_list(lr_files, window_length=1920):
    mfcc_list = list()
    out = display(progress(0, len(lr_files)), display_id=True)
    i=0
    len_mfcc=(20, window_length)
    for audio in lr_files:
        mfcc  = lr.feature.mfcc(y=audio, sr=sr)[:, :window_length]
        if len_mfcc != mfcc.shape:
            print('Error! Not same %s' % str(mfcc.shape))
        len_mfcc = mfcc.shape
        mfcc_list.append(mfcc)
        i+=1
        out.update(progress(i, len(lr_files)))
    return np.array(mfcc_list)

In [19]:
mfcc_train = get_mfcc_list(samples_train)
mfcc_test = get_mfcc_list(samples_test)

In [20]:
mfcc_train.shape

(1001, 20, 1920)

In [21]:
test = mfcc_train.reshape(1001, 1920, 20)

In [22]:
samples_test

[array([ 0.10757446,  0.17294312,  0.12948608, ..., -0.08334351,
        -0.11959839,  0.        ], dtype=float32),
 array([ 0.0057373 ,  0.00848389,  0.00723267, ..., -0.01852417,
        -0.02325439,  0.        ], dtype=float32),
 array([0.00192261, 0.00280762, 0.00631714, ..., 0.05603027, 0.07394409,
        0.        ], dtype=float32),
 array([-0.02764893, -0.02560425, -0.03704834, ..., -0.02459717,
         0.00463867,  0.        ], dtype=float32),
 array([0.00326538, 0.01528931, 0.02987671, ..., 0.06704712, 0.08410645,
        0.        ], dtype=float32),
 array([-0.01040649, -0.00772095, -0.00686646, ..., -0.05984497,
        -0.08026123,  0.        ], dtype=float32),
 array([ 0.02642822,  0.02734375,  0.00256348, ..., -0.09448242,
        -0.12200928,  0.        ], dtype=float32),
 array([-0.02047729, -0.02706909, -0.03961182, ...,  0.04498291,
         0.05212402,  0.        ], dtype=float32),
 array([-0.01004028, -0.01461792, -0.01251221, ..., -0.08847046,
        -0.09973145

In [23]:
test.shape

(1001, 1920, 20)

In [24]:
y_a_test = y_a_test.reset_index()['arousal_avg']
y_v_test = y_v_test.reset_index()['valence_avg']

In [25]:
y_train = []
for i in range(len(y_a_train)):
  y_train.append([y_a_train[i], y_v_train[i]])

y_train = np.array(y_train)

y_test = []
for i in range(len(y_a_test)):
  y_test.append([y_a_test[i], y_v_test[i]])

y_test = np.array(y_test)

# Model

In [26]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.nn import Linear, ReLU, CrossEntropyLoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm2d, Dropout
from torch.optim import Adam, SGD

In [27]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [32]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.cnn_layers = Sequential(
            # Defining a 2D convolution layer
            Conv2d(1, 4, kernel_size=6, stride=1, padding=0),
            BatchNorm2d(4),
            ReLU(inplace=True),
            MaxPool2d(kernesl_size=6, stride=2),
            # Defining another 2D convolution layer
            Conv2d(4, 4, kernel_size=6, stride=1, padding=1),
            BatchNorm2d(4),
            ReLU(inplace=True),
            MaxPool2d(kernel_size=4, stride=2),
        )

        self.linear_layers = Sequential(
            Linear(1904, 2)
        )

    def forward(self, x):
        x = self.cnn_layers(x)
        x = x.view(x.size(0), -1)
        x = self.linear_layers(x)
        return x


In [33]:
mfcc_train_reshaped = mfcc_train.reshape(len(mfcc_train), 1, 1920, 20)
mfcc_test_reshaped = mfcc_test.reshape(len(mfcc_test), 1, 1920, 20)

In [34]:
net = Net().to(device)
print(net)

Net(
  (cnn_layers): Sequential(
    (0): Conv2d(1, 4, kernel_size=(6, 6), stride=(1, 1))
    (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=6, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(4, 4, kernel_size=(6, 6), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU(inplace=True)
    (7): MaxPool2d(kernel_size=4, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (linear_layers): Sequential(
    (0): Linear(in_features=1904, out_features=2, bias=True)
  )
)


In [35]:
x_training = torch.tensor(mfcc_train_reshaped, device=device)
y_training = torch.tensor(y_train, device=device)
x_validation = torch.tensor(mfcc_test_reshaped, device=device)
y_validation = torch.tensor(y_test, device=device)
net = Net().to(device)

In [36]:
x_training.shape

torch.Size([1001, 1, 1920, 20])

In [37]:
criterion = nn.HuberLoss()
optimizer = optim.Adam(net.parameters())

In [38]:
train_losses = []
val_losses = []
def train(epoch):
    net.train()
    tr_loss = 0

    # converting the data into GPU format
    """if torch.cuda.is_available():
        x_train = x_train.cuda()
        y_train = y_train.cuda()
        x_val = x_val.cuda()
        y_val = y_val.cuda()"""

    # clearing the Gradients of the model parameters
    optimizer.zero_grad()

    # prediction for training and validation set
    output_train = net(x_training)
    output_val = net(x_validation)
    # computing the training and validation loss
    loss_train = criterion(output_train.float(), y_training.float())
    loss_val = criterion(output_val.float(), y_validation.float())
    train_losses.append(loss_train)
    val_losses.append(loss_val)

    # computing the updated weights of all the model parameters
    loss_train.backward()
    optimizer.step()
    tr_loss = loss_train.item()
    if epoch%2 == 0:
        # printing the validation loss
        print('Epoch : ',epoch+1, '\t', 'loss :', float(loss_val), '\t', 'train_loss:', float(loss_train))

In [39]:
for i in range(100):
  train(i)

RuntimeError: Given input size: (4x952x2). Calculated output size: (4x475x0). Output size is too small

In [222]:
net(torch.tensor(mfcc_train[50].reshape(1,1,1935,20), device=device))

tensor([[0.7307, 0.1580]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [223]:
y_train[50]

array([0.58268833, 0.19256875])