In [1]:
import keras
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

import matplotlib.pyplot as plt
import librosa
import librosa.display



from bokeh.plotting import figure, show
from bokeh.io import output_notebook

from keras.layers import Conv2D, MaxPool2D, Activation, Dense, Input, Flatten, BatchNormalization, Dropout
from keras.losses import binary_crossentropy
from keras.optimizers import SGD
from keras.utils import Sequence
import keras.backend as K

from scipy.io import wavfile
from scipy.fftpack import fft

from sklearn.model_selection import train_test_split

import time

from pylab import rcParams

%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
checkpoint_dir = './checkpoints_RAW/'

def find_best_checkpoint(prev_chkpts):
    """Return the checkpoint whose file name carries the lowest score.

    File names are expected to end in ``-<epoch>-<score>[.<ext>]``, e.g.
    ``weights-031-0.13050.hdf5``.  The epoch and score are parsed from the
    name itself instead of from fixed character positions, so epochs with
    more than three digits or scores with a different width are read
    correctly.  Entries that do not follow the pattern (for instance
    ``.ipynb_checkpoints``) are ignored rather than raising ``ValueError``.

    Parameters
    ----------
    prev_chkpts : iterable of str
        Candidate file names (typically the output of ``os.listdir``).

    Returns
    -------
    (str, int)
        Name and epoch of the best checkpoint; ``('', 0)`` when no entry
        could be parsed.
    """
    import re  # local import keeps this cell self-contained

    name_pattern = re.compile(r'-(\d+)-(\d+(?:\.\d+)?)(?:\.\w+)?$')

    best_ratio = float('inf')
    best_chkpt = ''
    best_epoch = 0
    for chkpt in prev_chkpts:
        parsed = name_pattern.search(chkpt)
        if parsed is None:
            # not a checkpoint file: skip it instead of crashing
            continue
        epoch = int(parsed.group(1))
        ratio = float(parsed.group(2))

        # strict comparison: on ties the first checkpoint seen wins
        if ratio < best_ratio:
            best_ratio = ratio
            best_chkpt = chkpt
            best_epoch = epoch
    print('\n starting from model {} \n'.format(best_chkpt))
    return best_chkpt, best_epoch

In [3]:
# Resume from the checkpoint with the lowest score found in checkpoint_dir.
previous_checkpoints = os.listdir(checkpoint_dir)
best_checkpoint, best_epoch = find_best_checkpoint(previous_checkpoints)

# The whole saved model is loaded from the file (not only its weights),
# so no model definition is needed in this notebook.
best_checkpoint_path = checkpoint_dir + best_checkpoint
model = keras.models.load_model(best_checkpoint_path)


 starting from model weights-031-0.13050.hdf5 





In [4]:
# Print the layer-by-layer description (output shapes, parameter counts) of the loaded model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 155328, 128)       512       
_________________________________________________________________
batch_normalization_1 (Batch (None, 155328, 128)       512       
_________________________________________________________________
activation_1 (Activation)    (None, 155328, 128)       0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 155328, 128)       49280     
_________________________________________________________________
batch_normalization_2 (Batch (None, 155328, 128)       512       
_________________________________________________________________
activation_2 (Activation)    (None, 155328, 128)       0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 51776, 128)        0         
__________

In [5]:
# symbolic input tensor of the loaded model
input_model = model.input
# number of samples in one input clip
song_lenght = 465984

# map every layer whose name contains 'activation' to the layer object,
# keyed by the layer name
layer_dict = {
    layer.name: layer
    for layer in model.layers
    if 'activation' in layer.name
}
layer_dict

{'activation_1': <keras.layers.core.Activation at 0x7f43c41c1ac8>,
 'activation_2': <keras.layers.core.Activation at 0x7f448a288cc0>,
 'activation_3': <keras.layers.core.Activation at 0x7f43c3d2d128>,
 'activation_4': <keras.layers.core.Activation at 0x7f43c3c4eeb8>,
 'activation_5': <keras.layers.core.Activation at 0x7f43c3ab80b8>,
 'activation_6': <keras.layers.core.Activation at 0x7f43c3940940>,
 'activation_7': <keras.layers.core.Activation at 0x7f43c389fb38>,
 'activation_8': <keras.layers.core.Activation at 0x7f43c3764b00>,
 'activation_9': <keras.layers.core.Activation at 0x7f43c3629a90>}

In [6]:
def normalize(x):
    """Scale a tensor by its root-mean-square value.

    The divisor is ``sqrt(mean(x ** 2))``, i.e. the RMS of ``x`` rather than
    its L2 norm; ``K.epsilon()`` is added so an all-zero tensor does not
    cause a division by zero.
    """
    rms = K.sqrt(K.mean(K.square(x)))
    return x / (rms + K.epsilon())

In [7]:
# --- dataset locations ---------------------------------------------------
annotations_path = '../data/MagnaTagATune/annotation_reduced_50.csv'
dataset_dir = '../data/MagnaTagATune/rawwav_2/'

annotations = pd.read_csv(annotations_path, sep='\t')

# --- (train + validation) / test split -------------------------------------
tot_t_size = 0.866203
tot_train_set, test_set = train_test_split(
    annotations,
    train_size=tot_t_size,
    test_size=(1 - tot_t_size),
    random_state=42,
)

print("Complete Train set size: {}".format(tot_train_set.shape[0]))
print("Test set size: {} \n".format(test_set.shape[0]))

# --- train / validation split ----------------------------------------------
t_size = 0.91429
train_set, val_set = train_test_split(
    tot_train_set,
    train_size=t_size,
    test_size=(1 - t_size),
    random_state=42,
)

print("Train set size: {}".format(train_set.shape[0]))
print("Validation set size: {} \n".format(val_set.shape[0]))

# --- file paths and label matrices -----------------------------------------
# every column except the path and the saved index column is a label
train_set_paths = train_set['mp3_path'].values
train_set_labels = train_set.drop(columns=['mp3_path', 'Unnamed: 0']).values

val_set_paths = val_set['mp3_path'].values
val_set_labels = val_set.drop(columns=['mp3_path', 'Unnamed: 0']).values

# --- input / output dimensions ---------------------------------------------
y_dimension = train_set_labels.shape[1]
# the clip listed at index label 0 serves as reference for the input length;
# the last three characters of the stored path are swapped for 'wav'
reference_wav = dataset_dir + annotations['mp3_path'][0][:-3] + 'wav'
_, data = wavfile.read(reference_wav)
x_dimension = len(data)

print("X dimension: {}\nY dimension: {} \n".format(x_dimension, y_dimension))
    

Complete Train set size: 22400
Test set size: 3460 

Train set size: 20480
Validation set size: 1920 

X dimension: 465984
Y dimension: 50 



In [8]:
class MagnaTagATuneSequence(Sequence):
    """Keras ``Sequence`` yielding batches of raw waveforms and their labels.

    Parameters
    ----------
    train_set_paths : sequence of str
        Stored clip paths; the last three characters of each path are
        replaced by ``'wav'`` to locate the audio file.
    train_set_labels : array-like
        Label rows aligned with ``train_set_paths``.
    batch_size : int
        Number of clips per batch (the last batch may be smaller).
    data_dir : str, optional
        Directory containing the wav files.  When omitted, the notebook-level
        ``dataset_dir`` is used, which is the original behaviour.
    """

    def __init__(self, train_set_paths, train_set_labels, batch_size, data_dir=None):
        self.paths, self.y = train_set_paths, train_set_labels
        self.batch_size = batch_size
        # os.path.join(d, '') appends a trailing separator only if it is missing
        self.data_dir = None if data_dir is None else os.path.join(data_dir, '')

    def __len__(self):
        """Number of batches, counting a final partial batch."""
        return int(np.ceil(len(self.paths) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return ``(batch_x, batch_y)`` for batch number ``idx``.

        ``batch_x`` gets a trailing axis of size 1 added to the stacked
        waveforms.  All clips of a batch must have the same length,
        otherwise the stacking/indexing fails.
        """
        batch = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        # fall back to the notebook global when no directory was supplied
        base_dir = dataset_dir if self.data_dir is None else self.data_dir

        waveforms = []
        for stored_path in self.paths[batch]:
            _, samples = wavfile.read(base_dir + stored_path[:-3] + 'wav')
            waveforms.append(samples)

        batch_x = np.array(waveforms)[:, :, np.newaxis]
        batch_y = self.y[batch]
        return (batch_x, batch_y)

In [9]:
# Replicate the model over several GPUs and predict on the training clips.
n_gpus = 4
parallel_model = keras.utils.multi_gpu_model(model, gpus=n_gpus)

# training paths/labels served in batches of 32
train_sequence = MagnaTagATuneSequence(train_set_paths, train_set_labels, 32)
y_pred = parallel_model.predict_generator(train_sequence, verbose=True)



In [11]:
import pickle as pk

# Persist the training-set predictions.  The context manager closes (and
# therefore flushes) the file even if pickling raises; the bare open() used
# before left the handle open.
with open('y_train_pred_raw.p', 'wb') as pred_file:
    pk.dump(y_pred, pred_file)

In [None]:
# Collect, for each layer name in `conv_layers`, the output that layer
# produces for one clip.
# NOTE(review): `conv_layers` and `song` are not defined in any visible cell;
# they must exist in the kernel before this cell runs - confirm where they
# come from.
outputs_layers = {}
for layer in conv_layers:
    # sub-model sharing the original input but stopping at the chosen layer
    layer_output = model.get_layer(layer).output
    intermediate_layer_model = keras.Model(inputs=model.input, outputs=layer_output)
    # add a leading and a trailing axis of size 1 around the clip
    outputs_layers[layer] = intermediate_layer_model.predict(song[np.newaxis, :, np.newaxis])
    


In [None]:
layer_idx = 0
# first (only) batch element of the chosen layer's output
outputs = outputs_layers[conv_layers[layer_idx]][0,:,:]

print('Filters shape: {}'.format(outputs.shape))

# `nUniquePts` used to be read from the kernel's global state, but it is only
# ever assigned as a local variable inside plot_fft(), so this cell raised a
# NameError on a fresh kernel.  Derive it from the signal length instead:
# a real signal of length n has n // 2 + 1 one-sided FFT bins.
nUniquePts = outputs.shape[0] // 2 + 1

# Magnitude spectrum of every column of `outputs`, transformed along axis 0;
# rows of `spectrum` are frequency bins
spectrum = np.abs(np.fft.rfft(outputs, axis=0))[0:nUniquePts]
plt.figure(figsize=(10,100))
plt.imshow(spectrum.T)

In [None]:
# For every label, start from random noise and run gradient descent on the
# INPUT signal so that the model output approaches the one-hot vector of that
# label.  The resulting signal is written to '<label>_sound.wav'.

annotations = pd.read_csv('../data/MagnaTagATune/annotation_reduced_50.csv', sep='\t')

# label names: every column except the first and the last one
label_names = annotations.columns.values[1:-1]
n_labels = len(label_names)

# `sr` was previously taken from leftover kernel state (it is only assigned in
# a later cell), which fails on a fresh kernel.  Read the sample rate from the
# same reference clip that is used to measure the input length.
sr, _ = wavfile.read(dataset_dir + annotations['mp3_path'][0][:-3] + 'wav')

# step size of each gradient-descent update
step = 1000.

for lbl_index, label_name in enumerate(label_names):
    print('Finding input minimizing label: {}'.format(label_name))
    start_time = time.time()

    # loss: mean squared error between the model output and the one-hot
    # target of the current label
    output = model.output
    des = np.zeros(n_labels)
    des[lbl_index] = 1
    desired_output = K.variable(np.array(des))
    loss = K.mean(K.pow(output - desired_output,2))

    # gradient of the loss with respect to the input signal
    grads = K.gradients(loss, input_model)[0]
    # normalization trick: rescale the gradient before using it
    grads = normalize(grads)
    # this function returns the loss and grads given the input signal
    iterate = K.function([input_model], [loss, grads])

    # start from uniform random noise in [-30000, 30000)
    input_song_data = np.float32(np.random.randint(-30000,30000, size = (1, song_lenght, 1)))

    for i in range(20):
        loss_value, grads_value = iterate([input_song_data])
        # descent: move against the gradient to reduce the loss
        input_song_data -= grads_value * step
        print('Current loss value:', loss_value)
    end_time = time.time()
    wavfile.write('{}_sound.wav'.format(label_name),sr, input_song_data[0,:,0])
    print('processed in {}s'.format(end_time - start_time))

In [None]:
# Model output for the generated signal currently held in `input_song_data`
# (after the optimisation loop this is the signal of the last label processed).
model.predict(input_song_data)

In [None]:
# `time_range` only exists as a local variable inside plot_song(), so it has
# to be built here: one time stamp (in seconds) per sample, using the same
# sample rate `sr` that is passed to wavfile.write for the generated clips.
time_range = np.arange(input_song_data.shape[1]) / sr
plt.plot(time_range, input_song_data[0,:,0])
plt.show()

In [None]:
file_dir = './grad_ascent_songs/'
S = []
# os.listdir returns entries in arbitrary order; sorting makes row i of S
# refer to the same clip on every run.  Entries that are not wav files
# (e.g. hidden folders) are skipped instead of crashing wavfile.read.
for file in sorted(os.listdir(file_dir)):
    if not file.lower().endswith('.wav'):
        continue
    sr, data = wavfile.read(file_dir + file)
    S.append(data)
# one row per clip; a 2-D array results only if all clips have equal length
S = np.array(S)

In [None]:
def plot_song(song, sr):
    """Plot the samples of ``song`` against time.

    ``sr`` is the sample rate: sample ``i`` is drawn at ``i * (1 / sr)``.
    """
    seconds_per_sample = 1/sr
    timestamps = np.arange(0, len(song)) * seconds_per_sample
    plt.plot(timestamps, song)
    
def plot_fft(song, sr):
    """Plot the one-sided magnitude spectrum of ``song``.

    Only the first ``ceil((n + 1) / 2)`` FFT bins are kept.  Every kept bin
    except the first one is doubled; for an even number of samples the last
    kept bin is left undoubled as well.  The x axis is ``bin * sr / n``.
    """
    n_samples = len(song)
    n_unique = int(np.ceil((n_samples + 1) / 2.0))
    magnitude = np.abs(fft(song)[0:n_unique])

    # odd length: double bins 1..end; even length: leave the last bin as is
    if n_samples % 2 > 0:
        stop = len(magnitude)
    else:
        stop = len(magnitude) - 1
    magnitude[1:stop] = magnitude[1:stop] * 2

    frequencies = np.arange(0, n_unique, 1.0) * (sr / n_samples)
    plt.plot(frequencies, magnitude, color='k')

In [None]:
# Waveform of row 0 of S, drawn with a sample rate of 16000.
plot_song(S[0,:], 16000)

In [None]:
# One-sided magnitude spectrum of row 49 of S, with a sample rate of 16000.
plot_fft(S[49,:], 16000)

In [None]:
# Waveform of row 2 of S, drawn with a sample rate of 16000.
plot_song(S[2,:], 16000)

In [None]:
# Waveform of row 40 of S, drawn with a sample rate of 16000.
plot_song(S[40,:], 16000)

In [None]:
# Scratch check: shape of a random-noise array drawn with size (1, song_lenght, 1).
np.random.randint(-30000,30000, size = (1, song_lenght, 1)).shape

In [None]:
# Random integer noise in [-30000, 30000) converted to floating point.
# np.float() only converts scalars (calling it on an array raises TypeError)
# and the alias no longer exists in current NumPy; assigning floats into an
# integer array would not change its dtype either.  Cast the whole array
# instead, using float32 like the starting signal of the optimisation loop.
input_song_data = np.random.randint(-30000, 30000, size=(1, song_lenght, 1)).astype(np.float32)