In [1]:
import os
import pickle

import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from scipy.io import wavfile
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm


Using TensorFlow backend.


In [2]:
from keras.layers import Conv1D, MaxPool1D, Activation, Dense, Input, Flatten
from keras.losses import binary_crossentropy
from keras.utils import Sequence
import tensorflow as tf

# Prepare Training set

In [3]:
# Fixed seed so the 80/20 split (and everything derived from it) is the same
# after every Restart & Run All.
SPLIT_SEED = 42
WAV_DIR = '../data/MagnaTagATune/rawwav/'

annotations_path = '../data/MagnaTagATune/annotation_reduced.csv'
annotations = pd.read_csv(annotations_path, sep='\t')
train_set, test_set = train_test_split(annotations['mp3_path'], train_size=0.8, test_size=0.2,
                                       random_state=SPLIT_SEED)


def load_waveforms(mp3_paths):
    """Read the wav counterpart of every listed mp3 path into one array.

    Parameters
    ----------
    mp3_paths : iterable of str
        Relative mp3 paths as stored in the ``mp3_path`` annotation column;
        the last three characters are swapped for ``wav``.

    Returns
    -------
    numpy.ndarray
        The stacked samples with a trailing axis of size 1 added.
    """
    waveforms = []
    for value in tqdm(mp3_paths):
        path = WAV_DIR + value[:-3] + 'wav'
        _, data = wavfile.read(path)  # sample rate is not needed here
        waveforms.append(data)
    return np.array(waveforms)[:, :, np.newaxis]


# NOTE(review): this keeps every clip in memory at once, which is very large
# for the full dataset; the batch-wise Sequence further down avoids that.
x_train = load_waveforms(train_set)
x_test = load_waveforms(test_set)

# Protocol >= 4 is required to pickle objects larger than 4 GiB.
with open('x_train', 'wb') as outfile:
    pickle.dump(x_train, outfile, protocol=pickle.HIGHEST_PROTOCOL)
with open('x_test', 'wb') as outfile:
    pickle.dump(x_test, outfile, protocol=pickle.HIGHEST_PROTOCOL)

print("X train shape:", x_train.shape)
print("X test shape:", x_test.shape)

In [4]:
# Paths in the (shuffled) order produced by train_test_split.
train_set_paths = train_set.values
# Select the label rows through the split's own index so that row i of the
# labels belongs to train_set_paths[i]. Filtering with isin() would return the
# rows in the original CSV order instead, silently pairing clips with the
# wrong tags.
train_set_labels = annotations.loc[train_set.index].drop(columns=['mp3_path', 'Unnamed: 0']).values
train_set_size = len(train_set_paths)
assert len(train_set_labels) == train_set_size, "paths and labels must pair up one-to-one"
print("Train set size: {} ".format(train_set_size))

# Number of tag columns left after dropping the path and the saved index.
y_dimension = train_set_labels.shape[1]

# Number of samples per clip, measured on the first annotated file.
# NOTE(review): assumes every clip has this same length — confirm.
_, data = wavfile.read('../data/MagnaTagATune/rawwav/' + annotations['mp3_path'].iloc[0][:-3] + 'wav')
x_dimension = len(data)

print("X dimension: {}\nY dimension: {}".format(x_dimension, y_dimension))

Train set size: 20688 
X dimension: 465984
Y dimension: 40


In [5]:
class MagnaTagATuneSequence(Sequence):
    """Keras ``Sequence`` that reads wav clips from disk one batch at a time.

    Parameters
    ----------
    train_set_paths : sequence of str
        Relative mp3 paths; the last three characters of each are replaced by
        ``wav`` to locate the file under ``wav_dir``.
    train_set_labels : array-like
        Label rows, positionally aligned with ``train_set_paths``.
    batch_size : int
        Number of clips per batch (the final batch may be smaller).
    wav_dir : str, optional
        Directory holding the wav files. Defaults to the location used
        throughout this notebook.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive or paths and labels differ in length.
    """

    def __init__(self, train_set_paths, train_set_labels, batch_size,
                 wav_dir='../data/MagnaTagATune/rawwav/'):
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        if len(train_set_paths) != len(train_set_labels):
            raise ValueError("paths and labels must have the same length "
                             "({} != {})".format(len(train_set_paths), len(train_set_labels)))
        self.paths, self.y = train_set_paths, train_set_labels
        self.batch_size = batch_size
        self.wav_dir = wav_dir

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return int(np.ceil(len(self.paths) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Load batch ``idx`` and return it as ``(batch_x, batch_y)``."""
        start = idx * self.batch_size
        stop = start + self.batch_size
        batch_y = self.y[start:stop]
        batch_x = []
        for value in self.paths[start:stop]:
            path = os.path.join(self.wav_dir, value[:-3] + 'wav')
            _, data = wavfile.read(path)  # sample rate is discarded
            batch_x.append(data)
        # np.stack fails with a clear message if clips differ in length.
        # NOTE(review): samples are passed on exactly as read from the file,
        # without any scaling — confirm that is what the model expects.
        batch_x = np.stack(batch_x)[:, :, np.newaxis]
        return (batch_x, batch_y)

In [6]:
# Have TensorFlow grow its GPU allocation on demand, and register that session
# as the one Keras uses.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
keras.backend.set_session(session)

# Two strided Conv1D -> ReLU -> MaxPool stages over the raw waveform, then one
# sigmoid unit per tag.
model = keras.Sequential([
    Conv1D(filters=10, kernel_size=13, strides=6, input_shape=(x_dimension, 1)),
    Activation('relu'),
    MaxPool1D(pool_size=3),
    Conv1D(filters=4, kernel_size=3, strides=1),
    Activation('relu'),
    MaxPool1D(pool_size=2),
    Flatten(),
    Dense(units=y_dimension, activation='sigmoid'),
])

In [7]:
# Print each layer's output shape and parameter count for the model built above.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 77662, 10)         140       
_________________________________________________________________
activation_1 (Activation)    (None, 77662, 10)         0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 25887, 10)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 25885, 4)          124       
_________________________________________________________________
activation_2 (Activation)    (None, 25885, 4)          0         
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 12942, 4)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 51768)             0         
__________

In [8]:
# Each tag is scored by its own sigmoid unit, so the loss is binary
# cross-entropy.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [9]:
batch_size = 32

# Log training progress for TensorBoard under ./logs.
tensorboard = keras.callbacks.TensorBoard(
    log_dir='./logs',
    histogram_freq=0,
    batch_size=batch_size,
    write_graph=True,
    write_grads=False,
    write_images=False,
    embeddings_freq=0,
    embeddings_layer_names=None,
    embeddings_metadata=None,
)
callbacks = [tensorboard]

# Stream the clips from disk batch by batch rather than holding them in memory.
train_sequence = MagnaTagATuneSequence(train_set_paths, train_set_labels, batch_size)
model.fit_generator(train_sequence, epochs=2, callbacks=callbacks)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f6478574fd0>

# Build test set

In [11]:
# Paths in the (shuffled) order produced by train_test_split.
test_set_paths = test_set.values
# Select the label rows through the split's own index so that row i of the
# labels belongs to test_set_paths[i]. Filtering with isin() would return the
# rows in the original CSV order instead, so predictions could not be compared
# against the right ground truth.
test_set_labels = annotations.loc[test_set.index].drop(columns=['mp3_path', 'Unnamed: 0']).values
test_set_size = len(test_set_paths)
assert len(test_set_labels) == test_set_size, "paths and labels must pair up one-to-one"
print("Test set size: {} ".format(test_set_size))

Test set size: 5172 


In [12]:
# Score the test clips batch by batch, reusing the training batch size;
# the final layer is a sigmoid Dense, so each row holds one output per tag.
predictions = model.predict_generator(MagnaTagATuneSequence(test_set_paths, test_set_labels, batch_size), verbose=1)



In [32]:
# Spot-check the predicted tag vector of a single test clip (row 3000).
predictions[3000]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0.], dtype=float32)

In [43]:
# Inspect a single entry (tenth from the end) of the second weight array of
# the last layer — presumably the Dense bias vector; confirm.
model.layers[-1].get_weights()[1][-10]

-0.005707355

In [50]:
# Sanity check: feed one clip-length input of uniform noise in [6, 9) and look
# at the per-tag outputs.
model.predict((np.random.rand(1,x_dimension,1) + 2 )*3)

array([[0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 9.041844e-36, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        1.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00]],
      dtype=float32)