# Genre Identifier

Creates a neural network that recognizes the genre of a song

feature explanation courtesy of:
https://navdeepsinghh.medium.com/identifying-the-genre-of-a-song-with-neural-networks-851db89c42f0

In [125]:
import librosa
import librosa.feature
import librosa.display
import glob
import numpy as np
# import matplotlib.pyplot as plt
from scipy.signal import butter, lfilter
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout,  Conv1D, Conv2D, Flatten, BatchNormalization, ZeroPadding2D,  MaxPooling2D, GlobalMaxPooling2D, GlobalAveragePooling1D, AveragePooling2D, Input, Add
from keras.utils.np_utils import to_categorical
# from keras.models import model_from_json
from keras.models import load_model
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

## 1. Load the data

Load the data into one vector containg all the information. Data we use for training and testing is the GTZAN data set (https://www.tensorflow.org/datasets/catalog/gtzan).

In [128]:
def filter_f(file):
    lo, hi = 600,10000
    y, sr = librosa.load(file)
    b, a = butter(N=6, Wn=[2*lo/sr, 2*hi/sr], btype='band')
    x = lfilter(b,a,y)
    return x, sr

def extract_song_features(f):
    #y, _ = librosa.load(f)
    y, sr = filter_f(f)

    # get Mel-frequency cepstral coefficients and normalize
    mfcc = librosa.feature.mfcc(y)
    mfcc /= np.amax(np.absolute(mfcc))
    return np.ndarray.flatten(mfcc)[:25000]
    
    # get melspectrogram
    #spect = librosa.feature.melspectrogram(y=y, sr=sr,n_fft=2048, hop_length=512)
    #spect = librosa.power_to_db(spect, ref=np.max)
    #spect /= np.amax(np.absolute(spect))
    #return np.ndarray.flatten(spect)[:25000]

In [129]:
# GENRES = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
GENRES = ['classical', 'jazz', 'metal']
all_features = []
all_labels = []

# load all songs from the gtzan data set
for genre in GENRES:
    sound_files = glob.glob('genres/' + genre + '/*.wav')
    print('Processing %d songs in %s genre...' % (len(sound_files), genre))
    for f in sound_files:
        extracted_features = extract_song_features(f)
        all_features.append(extracted_features)
        all_labels.append(genre)

# convert labels to one-hot encoding
label_uniq_ids, label_row_ids = np.unique(all_labels, return_inverse=True)
label_row_ids = label_row_ids.astype(np.int32, copy=False)
onehot_labels = to_categorical(label_row_ids, len(label_uniq_ids))

# store features and labels
features = np.stack(all_features)
labels = onehot_labels

Processing 100 songs in classical genre...
Processing 100 songs in jazz genre...
Processing 100 songs in metal genre...


In [44]:
# get shapes
print(np.shape(features))
print(np.shape(labels))

(400, 25000)
(400, 4)


# Perceptron
## 2. Prepare the data set and build the model

In [46]:
def split_train_test(data, training_split=0.8, n_classes=10):
    """
    this splits according to the ration we want to split with
    """
    np.random.shuffle(data)
    split_idx = int(len(data) * training_split)
    train, test = data[:split_idx, :], data[split_idx:, :]

    #      train data,     train labels,  test data,      test labels
    return train[:, :-n_classes], train[:, -n_classes:], test[:, :-n_classes], test[:, -n_classes:]

In [70]:
def build_model(train):
    """
    build the model; in our case it is a perceptron with 100 layers
    """
    input_shape = np.shape(train[0])
    print(f"shape of input data: {input_shape}")
    nn_model = Sequential([
        Dense(100, input_dim=np.shape(train)[1]),
        Activation('relu'),
        Dense(10),
        Activation('softmax'),
    ])

    # specify which techniques you want to use for training
    nn_model.compile(optimizer='adam',
                     loss='categorical_crossentropy',
                     metrics=['accuracy'])
    print(nn_model.summary())
    return nn_model


In [72]:
# last column has genre, turn it into unique ids
all_data = np.column_stack((features, labels))

# split into training and test data
train_input, train_labels, test_input, test_labels = split_train_test(all_data)

# Reshape for CNN input
train_in = np.array([x.reshape( (125, 200, 1) ) for x in train_input])
test_in = np.array([x.reshape( (125, 200, 1) ) for x in test_input])

# build the nn model
model = build_model(train_in)
print(f"shape of input data: {np.shape(train_input[np.newaxis,:,:])}")

shape of input data: (125, 200, 1)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_2 (Conv2D)            (None, 123, 198, 100)     1000      
_________________________________________________________________
average_pooling2d_1 (Average (None, 61, 99, 100)       0         
_________________________________________________________________
activation_1 (Activation)    (None, 61, 99, 100)       0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 603900)            0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 603900)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               60390100  
_________________________________________________________________
activation_2 (Acti

In [73]:
print(all_data.shape)
print(train_input.shape)
print(train_labels.shape)
#print(train_in.shape)

(1000, 25010)
(800, 25000)
(800, 10)


## 3. Run the process

Now, we train our model and evaluate it

In [None]:
# train the model
model.fit(train_in, train_labels, epochs=10, batch_size=32,
          validation_split=0.2)

# now get the performance indicators
loss, acc = model.evaluate(test_in, test_labels, batch_size=32)

print("Done!")
print("Loss: %.4f, accuracy: %.4f" % (loss, acc))


Train on 640 samples, validate on 160 samples
Epoch 1/10


## 4. Store the classifier to use it in thinking.py

use JSON for simplicity

In [31]:
# save model
model.save("model.h5")
print("Saved model to disk")

Saved model to disk


## 5. Retrieve model and test it to see if everything works

make sure you have the correct version of h5py (2.10.0). This creates warnings at one end, but was the only thing that worked for me.

In [32]:
# load model
model = load_model('model.h5')
print("Loaded model from disk")
# summarize model.
model.summary()

Loaded model from disk
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 100)               2500100   
_________________________________________________________________
activation_5 (Activation)    (None, 100)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 10)                1010      
_________________________________________________________________
activation_6 (Activation)    (None, 10)                0         
Total params: 2,501,110
Trainable params: 2,501,110
Non-trainable params: 0
_________________________________________________________________


In [46]:
y, _ = librosa.load("../recordings/output.wav")
# get Mel-frequency cepstral coefficients and normalize
mfcc = librosa.feature.mfcc(y)
mfcc /= np.amax(np.absolute(mfcc))
own_audio = np.ndarray.flatten(mfcc)[:25000]

# evaluate loaded model on single song
test_genre = "blues"
#own_audio = extract_song_features("../recordings/output.wav")
some_audio = extract_song_features("genres/" + test_genre + "/" + test_genre + ".00000.wav")

# reshape for input
test_in = own_audio[np.newaxis,:]
test_in.shape

(1, 25000)

In [47]:
# now predict on test in
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
score = model.predict(test_in, verbose=0)

#np.set_printoptions(precision=2)
print(GENRES)
print(score[-1])

['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
[1.0825003e-02 9.0991771e-01 3.7754679e-03 1.9286338e-02 5.9297367e-04
 1.8619577e-03 1.6781920e-05 3.7434977e-03 4.1233424e-02 8.7468550e-03]


In [49]:
# now output the name
max_class = score.argmax(axis=-1)
print(f"real genre: {test_genre}\npredicted genre: {GENRES[max_class[0]]}")

real genre: blues
predicted genre: classical


# Classifying genre with support vector machine

Replace the straightforward neural network with a SVM with RBF kernel

In [130]:
# store features and labels
features = np.stack(all_features)
labels = onehot_labels

# last column has genre, turn it into unique ids
all_data = np.column_stack((features, labels))

# split into training and test data
train_input, train_labels, test_input, test_labels = split_train_test(all_data, n_classes=3)
svm_labels = np.array([np.where(r==1)[0][0] for r in train_labels])

print(np.shape(features))
print(np.shape(labels))

(300, 25000)
(300, 3)


In [131]:
# create the svm model
def build_svm():
    clf = svm.SVC(kernel='rbf',C=1.0)
    return clf

print(np.shape(train_labels))

# train the svm
classifier = build_svm()
classifier.fit(train_input, svm_labels);

(240, 3)


In [134]:
# Predict datasetTest with svm
test_genre = "jazz"

# either use studio or recorded audio
audio = extract_song_features("../recordings/output.wav")
#audio = extract_song_features("genres/" + test_genre + "/" + test_genre + ".00000.wav")

test_in = audio[np.newaxis,:]
predictY = classifier.predict(test_in)
GENRES[predictY[0]]

'jazz'

### Cross Validation

In [132]:
# pre processing
scaler = StandardScaler()
scaler.fit(features)
X = scaler.transform(features)
y = np.array([np.where(r==1)[0][0] for r in labels])
X.shape, y.shape

((300, 25000), (300,))

In [133]:
score = []
rates = []
# KFold
kf = KFold(n_splits=10, shuffle=True)
for train_index, test_index in kf.split(X):
    # Split data to train and test set
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train
    clf.fit(X_train, y_train)

    # Print accuracy
    score.append(clf.score(X_test, y_test))
    print("Accuracy: \t" + str(clf.score(X_test, y_test)))

    # I make the predictions
    predicted = clf.predict(X_test)

    # I obtain the confusion matrix
    cm = confusion_matrix(y_test, predicted)

    # rate calculation
    tp_rate = []
    i = 0
    for row in cm:
        current = 0
        TP = 0
        FP = 0
        for g in row:
            if current == i:
                TP = g
            else:
                FP = FP + g
            current = current + 1
        tp_rate.append(TP / (TP + FP))
        i = i + 1
    rates.append(tp_rate)

rates = np.round(np.mean(rates, axis=0), 3)
print("")
print("accuracy mean:", np.mean(score))
i = 0
for r in rates:
    print(GENRES[i], r)
    i = i + 1

Accuracy: 	0.8
Accuracy: 	0.9333333333333333
Accuracy: 	0.9333333333333333
Accuracy: 	0.8
Accuracy: 	0.9666666666666667
Accuracy: 	0.8
Accuracy: 	0.8
Accuracy: 	0.9
Accuracy: 	0.8666666666666667
Accuracy: 	0.9333333333333333

accuracy mean: 0.8733333333333334
classical 0.888
jazz 0.764
metal 0.982


[-0.82804605 -0.90328598 -1.         ... -0.33702131 -0.23553459
 -0.19476902] 1


# ConvNet
use convolutional network to predict the genre

In [None]:
    nn_model = Sequential()
    nn_model.add(Conv2D(100, 3, input_shape=input_shape))
    nn_model.add(AveragePooling2D((2, 2), strides=(2,2)))
    nn_model.add(Activation('relu'))
    nn_model.add(Flatten())
    nn_model.add(Dropout(rate=0.5))
    nn_model.add(Dense(100,input_shape=input_shape))
    nn_model.add(Activation('relu'))
    nn_model.add(Dropout(rate=0.5))
    nn_model.add(Dense(10))
    nn_model.add(Activation('softmax'))