# Music classification and retrieval

In [None]:
#coding: utf-8

import warnings
warnings.simplefilter("ignore")

import os
import sys 
import wave
import time
import librosa, librosa.display
import numpy as np
from utils import *
import matplotlib.pyplot as plt
import IPython
from IPython.display import Audio
from StringIO import StringIO

%pylab inline

In [None]:
!wget https://www.dropbox.com/s/jtdc7y0bi00ii4p/genres.tar.gz?dl=0 -O genres.tar.gz 
!tar -xzf genres.tar.gz 

In [None]:
# Load one example track and embed an audio player for it in the notebook.
sound_file = './genres/blues/blues.00000.au'
x, sample_rate = librosa.load(path=sound_file)
Audio(data=x, rate=sample_rate)

# Sound as 1D-Signal

In [None]:
# Plot the first 10 seconds of the waveform; the x axis is time in seconds
# (sample index divided by the sample rate).
# Uses `plt` throughout so the cell does not depend on the `pylab` name
# injected by `%pylab inline`.
time_axis = np.arange(len(x)) / float(sample_rate)
plt.figure(figsize=(20, 4))
plt.plot(time_axis, x, 'k')
plt.xlim([0, 10])
plt.xlabel('time, s')
plt.ylabel('amplitude')
plt.show()

# Sound as 2D-Signal

In [None]:
# Mel spectrogram of the example track with 128 mel bands, then converted to a
# log scale using the spectrogram's own maximum as the reference power.
S = librosa.feature.melspectrogram(y=x, sr=sample_rate, n_mels=128)
log_S = librosa.logamplitude(S, ref_power=np.max)

In [None]:
# Draw the log-scaled mel spectrogram of the example track with a dB colour bar.
plt.figure(figsize=(20, 4))
librosa.display.specshow(
    log_S,
    sr=sample_rate,
    x_axis='time',
    y_axis='mel',
    cmap='hot',
)
plt.title('mel power spectrogram')
plt.colorbar(format='%+02.0f dB')
plt.tight_layout()

In [None]:
def get_spectrogram(fname, n_mels=128, n_frames=1200):
    """Load an audio file and return its log-scaled mel spectrogram.

    Parameters
    ----------
    fname : str
        Path of the audio file; passed straight to ``librosa.load``.
    n_mels : int, optional
        Number of mel bands requested from ``melspectrogram`` (default 128).
    n_frames : int, optional
        Maximum number of columns kept (default 1200). Longer spectrograms
        are truncated; shorter ones are returned unchanged.

    Returns
    -------
    The log-scaled spectrogram sliced to at most ``n_frames`` columns.
    """
    y, sr = librosa.load(fname)
    # Use the rate returned for *this* file. The original passed the
    # notebook-level `sample_rate`, which belongs to whichever track was
    # loaded last in another cell.
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    # NOTE(review): `logamplitude` is the API of older librosa releases;
    # kept as-is to match the rest of the notebook.
    log_S = librosa.logamplitude(S, ref_power=np.max)
    return log_S[:, :n_frames]

def plot_spectrogramm(log_S):
    """Plot a log-scaled mel spectrogram on a new 20x4 figure.

    `log_S` is handed to ``librosa.display.specshow``. The sample rate is read
    from the notebook-level ``sample_rate`` variable, which therefore has to be
    defined before this function is called.
    """
    plt.figure(figsize=(20, 4))
    specshow_options = dict(sr=sample_rate, x_axis='time', y_axis='mel', cmap='hot')
    librosa.display.specshow(log_S, **specshow_options)
    plt.title('mel power spectrogram')
    plt.colorbar(format='%+02.0f dB')
    plt.tight_layout()

# Prepare data

In [None]:
genres = ['blues', 'country', 'hiphop', 'metal', 'reggae', 'classical', 'disco', 'jazz', 'pop', 'rock']

# Map integer label -> genre name, then collect every track path (X_names)
# together with its integer genre label (y).
id2genre = dict(enumerate(genres))
X_names, y = [], []
for genre_id, genre in enumerate(genres):
    genre_dir = os.path.join('./genres/', genre)
    # sorted(): os.listdir returns entries in arbitrary order, which would make
    # the track order differ between machines and runs.
    for track in sorted(os.listdir(genre_dir)):
        is_audio = '.mp3' in track or '.au' in track
        # The original `A or B and C` parsed as `A or (B and C)`, so '.mp3'
        # names skipped the '_' filter. The filter now applies to both kinds.
        if is_audio and '_' not in track:
            trackfile = os.path.join(genre_dir, track)
            X_names.append(trackfile)
            y.append(genre_id)

In [None]:
from multiprocessing import Pool

# Compute the spectrogram of every track in parallel worker processes.
n_cpu = 5
pool = Pool(n_cpu)
try:
    X = pool.map(get_spectrogram, X_names)
finally:
    # Shut the workers down explicitly; the original never closed the pool.
    # (try/finally rather than `with`, which Pool only supports on Python 3.)
    pool.close()
    pool.join()

# Nearest Neighbors genre classification

In [None]:
# Shuffle spectrograms, file names and labels with one shared permutation so
# they stay aligned. A dedicated seeded generator makes the train/validation
# split reproducible after a kernel restart without touching numpy's global
# random state.
rng = np.random.RandomState(42)
idx = rng.permutation(len(y))
X = np.array(X)[idx].astype('float32')
X_names = np.array(X_names)[idx]
y = np.array(y)[idx]
# The original reshaped X to its own (shape[0], shape[1], shape[2]), which is
# a no-op; the name is kept because later cells may refer to it.
X_reshaped = X

# The first n_train shuffled tracks are used for training, the rest for validation.
n_train = 800
X_train, X_valid = X_reshaped[:n_train], X_reshaped[n_train:]
y_train, y_valid = y[:n_train], y[n_train:]

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Baseline: a k-nearest-neighbours genre classifier on the raw spectrograms.
# NOTE(review): X_train / X_valid are 3-D arrays at this point; scikit-learn
# estimators presumably need them flattened to (n_samples, n_features) — confirm.
clf = KNeighborsClassifier(n_jobs=n_cpu)

# TODO (exercise): the two right-hand sides below are placeholders; this cell
# does not run until they are filled in.
clf = #train clf
y_val_pred = #make prediction on validation set

# Report how well the predictions match the validation labels.
print accuracy_score(y_valid, y_val_pred)

# Convolution Neural Nets

![](http://benanne.github.io/images/spotify_convnet.png)

http://benanne.github.io/2014/08/05/spotify-cnns.html

In [None]:
import theano
import lasagne
import theano.tensor as T

In [None]:
# Symbolic inputs: a float32 3-D tensor for a batch of spectrograms and an
# int32 vector for the matching genre labels.
input_X, target_y = T.tensor3("X", dtype='float32'), T.vector("y", dtype='int32')
# Input layer: batch size left open (None), per-sample shape taken from X.
# NOTE(review): lasagne's 1-D layers presumably treat axis 1 as channels and
# axis 2 as the sequence axis — confirm against the Conv1DLayer docs.
nn = lasagne.layers.InputLayer(shape=(None, X.shape[1], X.shape[2]), input_var=input_X)

# TODO (exercise): placeholder; stack further layers on top of `nn` here.
nn = #Build your convnet using Conv1DLayer, MaxPool1DLayer, GlobalPoolLayer, or others

In [None]:
# Symbolic output of the network and the list of its trainable parameters.
y_predicted = lasagne.layers.get_output(nn)
params = lasagne.layers.get_all_params(nn, trainable=True)

# TODO (exercise): placeholders; the next cell compiles functions from
# `loss`, `accuracy` and `updates`, so all three must be defined here.
loss = #define loss function
accuracy = #define accuracy
updates = #here goes your favorite optimizer

In [None]:
# Compile the symbolic graph into callables.
# train_fn applies the optimizer updates and returns [loss, accuracy];
# test_fn returns the same two values without updating any parameter.
train_fn = theano.function([input_X, target_y], [loss, accuracy],
                           updates=updates, allow_input_downcast=True)
test_fn = theano.function([input_X, target_y], [loss, accuracy],
                          allow_input_downcast=True)
# predict_fn now accepts the same input dtypes as the other two functions and
# is built from the deterministic output, so layers that behave stochastically
# during training (e.g. dropout) do not randomize predictions. This matches the
# deterministic=True used for feature extraction later in the notebook.
predict_fn = theano.function([input_X],
                             lasagne.layers.get_output(nn, deterministic=True),
                             allow_input_downcast=True)

In [None]:
%%time 

conv_nn = train_net(nn, train_fn, test_fn, X_train, y_train, X_valid, y_valid, num_epochs=100, batch_size=50)

# Find Similar Tracks

<img src="./img/cnn_gr.png" width="500">

In [None]:
from sklearn.neighbors import NearestNeighbors

In [None]:
# TODO (exercise): placeholder; assign the layer whose activations will serve
# as the track embedding.
features_layer = #choose layer to use for feature extraction (please don't pick the last layer!)
# Symbolic activations of that layer, taken in deterministic mode.
features = lasagne.layers.get_output(features_layer, deterministic=True)
# Compiled function mapping a batch of spectrograms to the layer's activations.
features_fn = theano.function([input_X], features, allow_input_downcast=True)

In [None]:
def f(x):
    """Return the features of one spectrogram, wrapped as a batch of one."""
    return np.array(features_fn([x]))


# One feature array per track, training tracks first and validation tracks
# after, which is the same order as X_names.
# List comprehensions replace `map(...) + map(...)`, which only works where
# `map` returns a list (Python 2). The loop variable is deliberately not `x`,
# so the notebook-level audio signal `x` is not overwritten on Python 2.
track_vectors = [f(spec) for spec in X_train] + [f(spec) for spec in X_valid]
track_vectors = np.concatenate(track_vectors, axis=0)

In [None]:
# Brute-force nearest-neighbour index with cosine distance over all track vectors.
nn_pred = NearestNeighbors(algorithm='brute', metric='cosine').fit(track_vectors)

In [None]:
# Query with a (1, n_features) slice: `kneighbors` takes a 2-D array of query
# points, and a bare 1-D vector is rejected by current scikit-learn releases.
query = track_vectors[:1]
distances, neighbor_idx = nn_pred.kneighbors(query)
ans = list(X_names[neighbor_idx[0]])
print(ans)
# The query track is itself part of the fitted set, so ans[0] is normally the
# query track and the remaining entries are its neighbours.
#most of the nearest tracks should be from the same genre
#if the feature extraction works correctly

In [None]:
#nearest tracks should be similar
# Listen to the first retrieved track.
sound_file = ans[0]
x, sample_rate = librosa.load(path=sound_file)
Audio(data=x, rate=sample_rate)

In [None]:
# Listen to the second retrieved track.
sound_file = ans[1]
x, sample_rate = librosa.load(path=sound_file)
Audio(data=x, rate=sample_rate)

In [None]:
# Listen to the third retrieved track.
sound_file = ans[2]
x, sample_rate = librosa.load(path=sound_file)
Audio(data=x, rate=sample_rate)

# t-SNE visualization

Help: https://lts2.epfl.ch/blog/perekres/category/visualizing-hidden-structures-in-datasets-using-deep-learning/

In [None]:
from sklearn.manifold import TSNE

In [None]:
def f(x):
    """Return the features of one spectrogram, wrapped as a batch of one."""
    return np.array(features_fn([x]))


# Feature vectors for every track, training part first and validation part
# after. A comprehension replaces `map(...) + map(...)`, which only works where
# `map` returns a list (Python 2). The loop variables avoid the name `x` so the
# notebook-level audio signal is not overwritten on Python 2.
track_vectors = np.concatenate(
    [f(spec) for part in (X_train, X_valid) for spec in part],
    axis=0)

# Genre labels in the same order as track_vectors.
track_labels = np.concatenate([y_train, y_valid])

In [None]:
# TODO (exercise): placeholder; X_tsne must be assigned before this cell can run.
# The plotting cell reads columns 0 and 1 of X_tsne and selects its rows using
# track_labels, so it needs one row per track.
X_tsne = #apply tSNE
#sklearn t-SNE manual also recommends to reduce dimensions of your data with PCA before applying t-SNE
#http://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html

In [None]:
# Scatter plot of the 2-D embedding, one colour per genre.
plt.figure(figsize=(10, 10))
# `plt.cm` instead of the bare `cm` name, which only exists through the
# `%pylab` star import.
colors = plt.cm.plasma(np.linspace(0, 1.0, len(id2genre)))

# The loop variables are not named `idx`, so the shuffling permutation `idx`
# defined earlier in the notebook is not overwritten.
for label_id, genre_name in id2genre.items():
    mask = track_labels == label_id
    # `color=` rather than `c=`: a single RGBA row passed as `c` is ambiguous
    # (it can be read as four scalar values to colour-map).
    plt.scatter(X_tsne[mask, 0], X_tsne[mask, 1],
                color=colors[label_id], label=genre_name)

plt.legend(loc=0, ncol=5)
plt.show()

# Grading

Maximum grade for this notebook is 9 points

* train neural network for music style classification. Your grade will depend on validation set accuracy
    * 40% - 2 points
    * 60% - 4 points
    * 80% - 6 points
* music retrieval works correctly - 3 points

Correct music retrieval means:
* Same genre for most of the nearest neighbors. Using predicted probabilities as a feature to satisfy this rule is cheating!
* Reasonable level of perceptual similarity of nearest neighbours
* t-SNE plot looks like tsne_example.png or better (tight clusters and fine structure are better, random scatter is worse)