# Genre Identifier

Creates a neural network that recognizes the genre of a song

feature explanation courtesy of:
https://navdeepsinghh.medium.com/identifying-the-genre-of-a-song-with-neural-networks-851db89c42f0

In [129]:
import librosa
import librosa.feature
import librosa.display
import glob
import numpy as np
# import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.utils.np_utils import to_categorical
# from keras.models import model_from_json
from keras.models import load_model
from ..DancingAgent import

## 1. Load the data

Load the data into one vector containg all the information. Data we use for training and testing is the GTZAN data set (https://www.tensorflow.org/datasets/catalog/gtzan).

In [130]:
def extract_song_features(f):
    y, _ = librosa.load(f)

    # get Mel-frequency cepstral coefficients and normalize
    mfcc = librosa.feature.mfcc(y)
    mfcc /= np.amax(np.absolute(mfcc))
    return np.ndarray.flatten(mfcc)[:25000]

In [131]:
GENRES = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
all_features = []
all_labels = []

# load all songs from the gtzan data set
for genre in GENRES:
    sound_files = glob.glob('genres/' + genre + '/*.wav')
    print('Processing %d songs in %s genre...' % (len(sound_files), genre))
    for f in sound_files:
        extracted_features = extract_song_features(f)
        all_features.append(extracted_features)
        all_labels.append(genre)

# convert labels to one-hot encoding
label_uniq_ids, label_row_ids = np.unique(all_labels, return_inverse=True)
label_row_ids = label_row_ids.astype(np.int32, copy=False)
onehot_labels = to_categorical(label_row_ids, len(label_uniq_ids))

# store features and labels
features = np.stack(all_features)
labels = onehot_labels

Processing 100 songs in blues genre...
Processing 100 songs in country genre...
Processing 100 songs in disco genre...
Processing 100 songs in hiphop genre...
Processing 100 songs in jazz genre...
Processing 100 songs in metal genre...
Processing 100 songs in pop genre...
Processing 100 songs in reggae genre...
Processing 100 songs in rock genre...


In [132]:
# get shapes
print(np.shape(features))
print(np.shape(labels))

(900, 25000)
(900, 9)


## 2. Prepare the data set and build the model

In [138]:
def split_train_test(data, training_split=0.8):
    """
    this splits according to the ration we want to split with
    """
    np.random.shuffle(data)
    split_idx = int(len(data) * training_split)
    train, test = data[:split_idx, :], data[split_idx:, :]

    #      train data,     train labels,  test data,      test labels
    return train[:, :-10], train[:, -10:], test[:, :-10], test[:, -10:]

In [139]:
def build_model(train):
    """
    build the model; in our case it is a perceptron with 100 layers
    """
    print(f"shape of input data: {np.shape(train)[1]}")
    nn_model = Sequential([
        Dense(90, input_dim=np.shape(train)[1]),
        Activation('relu'),
        Dense(9),
        Activation('softmax'),
    ])


    # specify which techniques you want to use for training
    nn_model.compile(optimizer='adam',
                     loss='categorical_crossentropy',
                     metrics=['accuracy'])
    print(nn_model.summary())
    return nn_model

In [140]:
# last column has genre, turn it into unique ids
all_data = np.column_stack((features, labels))

# split into training and test data
train_input, train_labels, test_input, test_labels = split_train_test(all_data)

# build the model
model = build_model(train_input)

shape of input data: 25000
Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_19 (Dense)             (None, 90)                2250090   
_________________________________________________________________
activation_19 (Activation)   (None, 90)                0         
_________________________________________________________________
dense_20 (Dense)             (None, 9)                 819       
_________________________________________________________________
activation_20 (Activation)   (None, 9)                 0         
Total params: 2,250,909
Trainable params: 2,250,909
Non-trainable params: 0
_________________________________________________________________
None


In [141]:
print(train_input.shape)

(720, 25000)


## 3. Run the process

Now, we train our model and evaluate it

In [142]:
# train the model
model.fit(train_input, train_labels, epochs=10, batch_size=32,
          validation_split=0.2)

# now get the performance indicators
loss, acc = model.evaluate(test_input, test_labels, batch_size=32)

print("Done!")
print("Loss: %.4f, accuracy: %.4f" % (loss, acc))

Train on 576 samples, validate on 144 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Done!
Loss: 1.5543, accuracy: 0.4778


## 4. Store the classifier to use it in thinking.py

use JSON for simplicity

In [143]:
# save model
model.save("model.h5")
print("Saved model to disk")

Saved model to disk


## 5. Retrieve model and test it to see if everything works

make sure you have the correct version of h5py (2.10.0). This creates warnings at one end, but was the only thing that worked for me.

In [144]:
# load model
model = load_model('model.h5')
print("Loaded model from disk")
# summarize model.
model.summary()

Loaded model from disk
Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_19 (Dense)             (None, 90)                2250090   
_________________________________________________________________
activation_19 (Activation)   (None, 90)                0         
_________________________________________________________________
dense_20 (Dense)             (None, 9)                 819       
_________________________________________________________________
activation_20 (Activation)   (None, 9)                 0         
Total params: 2,250,909
Trainable params: 2,250,909
Non-trainable params: 0
_________________________________________________________________


In [148]:
# evaluate loaded model on single song
test_genre = "rock"
own_audio = extract_song_features("../recordings/output.wav")
some_audio = extract_song_features("genres/" + test_genre + "/" + test_genre + ".00000.wav")

# reshape for input
test_in = own_audio[np.newaxis,:]
test_in.shape

(1, 25000)

In [149]:
# now predict on test in
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
score = model.predict(test_in, verbose=0)

#np.set_printoptions(precision=2)
print(GENRES)
print(score[-1])

['blues', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
[1.05e-02 4.28e-03 6.06e-04 1.29e-03 8.19e-01 1.59e-01 1.08e-05 2.54e-03
 2.88e-03]


In [150]:
# now output the name
max_class = score.argmax(axis=-1)
print(f"real genre: {test_genre}\npredicted genre: {GENRES[max_class[0]]}")

real genre: rock
predicted genre: jazz
