In [6]:
########################################################
#
# import requirements
#
########################################################
import logging
import os
import librosa
import math
import os
import re

from random import shuffle

import numpy as np

from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers import Dense
from keras.optimizers import Adam

from sklearn.model_selection import train_test_split

In [7]:
class FeaturesExtraction:

    "Music audio features for genre classification"
    hop_length = None
    genre_list = [
        "blues",
        "classical",
        "country",
        "disco",
        "hiphop",
        "jazz",
        "metal",
        "pop",
        "reggae",
        "rock",
        "bamileke",
        "bikutsi",
        "makossa",
        "salsa",
        "zouk"
    ]

    #ORIGINAL GTZAN AUDIO SAMPLES
    dir_blues = "./gtzan/gtzan+/gtzan/blues"
    dir_classical = "./gtzan/gtzan+/gtzan/classical"
    dir_country = "./gtzan/gtzan+/gtzan/country"
    dir_disco = "./gtzan/gtzan+/gtzan/disco"
    dir_hiphop = "./gtzan/gtzan+/gtzan/hiphop"
    dir_jazz = "./gtzan/gtzan+/gtzan/jazz"
    dir_metal = "./gtzan/gtzan+/gtzan/metal"
    dir_pop = "./gtzan/gtzan+/gtzan/pop"
    dir_reggae = "./gtzan/gtzan+/gtzan/reggae"
    dir_rock = "./gtzan/gtzan+/gtzan/rock"

    #AFRO EXTENDED GTZAN AUDIO SAMPLES
    dir_bamileke = "./gtzan/gtzan+/afro/bamileke"
    dir_bikutsi = "./gtzan/gtzan+/afro/bikutsi"
    dir_makossa = "./gtzan/gtzan+/afro/makossa"
    dir_salsa = "./gtzan/gtzan+/afro/salsa"
    dir_zouk = "./gtzan/gtzan+/afro/zouk"



    #dir_trainfolder = "./gtzan/_train"
    #dir_devfolder = "./gtzan/_validation"
    #dir_testfolder = "./gtzan/_test"
    #dir_all_files = "./gtzan"

    train_X_preprocessed_data = "./gtzan/data_train_input.npy"
    train_Y_preprocessed_data = "./gtzan/data_train_target.npy"
    dev_X_preprocessed_data = "./gtzan/data_validation_input.npy"
    dev_Y_preprocessed_data = "./gtzan/data_validation_target.npy"
    test_X_preprocessed_data = "./gtzan/data_test_input.npy"
    test_Y_preprocessed_data = "./gtzan/data_test_target.npy"

    train_X = train_Y = None
    dev_X = dev_Y = None
    test_X = test_Y = None

    def __init__(self):
        self.hop_length = 512

        self.timeseries_length_list = []
       # self.trainfiles_list = self.path_to_audiofiles(self.dir_trainfolder)
       # self.devfiles_list = self.path_to_audiofiles(self.dir_devfolder)
       # self.testfiles_list = self.path_to_audiofiles(self.dir_testfolder)

        self.all_files_list = []

        self.all_files_list.extend(self.path_to_audiofiles(self.dir_blues))
        self.all_files_list.extend(self.path_to_audiofiles(self.dir_classical))
        self.all_files_list.extend(self.path_to_audiofiles(self.dir_country))
        self.all_files_list.extend(self.path_to_audiofiles(self.dir_disco))
        self.all_files_list.extend(self.path_to_audiofiles(self.dir_hiphop))
        self.all_files_list.extend(self.path_to_audiofiles(self.dir_jazz))
        self.all_files_list.extend(self.path_to_audiofiles(self.dir_metal))
        self.all_files_list.extend(self.path_to_audiofiles(self.dir_pop))
        self.all_files_list.extend(self.path_to_audiofiles(self.dir_reggae))
        self.all_files_list.extend(self.path_to_audiofiles(self.dir_rock))

        self.all_files_list.extend(self.path_to_audiofiles(self.dir_bamileke))
        self.all_files_list.extend(self.path_to_audiofiles(self.dir_bikutsi))
        self.all_files_list.extend(self.path_to_audiofiles(self.dir_makossa))
        self.all_files_list.extend(self.path_to_audiofiles(self.dir_salsa))
        self.all_files_list.extend(self.path_to_audiofiles(self.dir_zouk))

        shuffle(self.all_files_list)

        self.trainfiles_list, self.testfiles_list, self.trainfiles_list, self.testfiles_list = train_test_split(self.all_files_list, self.all_files_list, test_size=0.2, random_state=1)
        self.trainfiles_list, self.devfiles_list, self.trainfiles_list, self.devfiles_list = train_test_split(self.trainfiles_list, self.trainfiles_list, test_size=0.25, random_state=1)

        
        self.all_files_list = []
        
        
        self.all_files_list.extend(self.trainfiles_list)
        self.all_files_list.extend(self.devfiles_list)
        self.all_files_list.extend(self.testfiles_list)
    
        self.timeseries_length = (
            128
        )   # sequence length == 128, default fftsize == 2048 & hop == 512 @ SR of 22050
        #  equals 128 overlapped windows that cover approx ~3.065 seconds of audio, which is a bit small!

    def load_preprocess_data(self):
        print("[DEBUG] total number of files: " + str(len(self.timeseries_length_list)))

        # Training set
        self.train_X, self.train_Y = self.extract_audio_features(self.trainfiles_list)
        
        print("\nTRAIN_X array:\n")
        print(self.train_X)
        
        print("\nTRAIN_Y array:\n")
        print(self.train_Y)
        
        
        with open(self.train_X_preprocessed_data, "wb") as f:
            np.save(f, self.train_X)
            
        with open(self.train_Y_preprocessed_data, "wb") as f:
            self.train_Y = self.one_hot(self.train_Y)
            np.save(f, self.train_Y)

        # Validation set
        self.dev_X, self.dev_Y = self.extract_audio_features(self.devfiles_list)
        with open(self.dev_X_preprocessed_data, "wb") as f:
            np.save(f, self.dev_X)
        with open(self.dev_Y_preprocessed_data, "wb") as f:
            self.dev_Y = self.one_hot(self.dev_Y)
            np.save(f, self.dev_Y)

        # Test set
        self.test_X, self.test_Y = self.extract_audio_features(self.testfiles_list)
        with open(self.test_X_preprocessed_data, "wb") as f:
            np.save(f, self.test_X)
        with open(self.test_Y_preprocessed_data, "wb") as f:
            self.test_Y = self.one_hot(self.test_Y)
            np.save(f, self.test_Y)

    def load_deserialize_data(self):

        self.train_X = np.load(self.train_X_preprocessed_data)
        self.train_Y = np.load(self.train_Y_preprocessed_data)

        self.dev_X = np.load(self.dev_X_preprocessed_data)
        self.dev_Y = np.load(self.dev_Y_preprocessed_data)

        self.test_X = np.load(self.test_X_preprocessed_data)
        self.test_Y = np.load(self.test_Y_preprocessed_data)

    def precompute_min_timeseries_len(self):
        for file in self.all_files_list:
            print("Loading " + str(file))
            y, sr = librosa.load(file)
            self.timeseries_length_list.append(math.ceil(len(y) / self.hop_length))

    def extract_audio_features(self, list_of_audiofiles):

        data = np.zeros(
            (len(list_of_audiofiles), self.timeseries_length, 33), dtype=np.float64
        )
        target = []

        for i, file in enumerate(list_of_audiofiles):
            y, sr = librosa.load(file)
            
            #MFCC selection
            mfcc = librosa.feature.mfcc(
                y=y, sr=sr, hop_length=self.hop_length, n_mfcc=13
            )
            
            #SPECTRAL_CENTER selection
            spectral_center = librosa.feature.spectral_centroid(
                y=y, sr=sr, hop_length=self.hop_length
            )
            
            #CHROMA selection
            chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=self.hop_length)
            
            #SPECTRAL CONTRAST selection
            spectral_contrast = librosa.feature.spectral_contrast(
                y=y, sr=sr, hop_length=self.hop_length
            )

            splits = re.split("[ .]", file)
            genre = re.split("[ /]", splits[1])[3]
            target.append(genre)

            # MFCC Extraction
            data[i, :, 0:13] = mfcc.T[0:self.timeseries_length, :]
            
            #SPECTRAL_CENTER Extraction
            data[i, :, 13:14] = spectral_center.T[0:self.timeseries_length, :]
            
            #CHROMA Extraction
            data[i, :, 14:26] = chroma.T[0:self.timeseries_length, :]
            
            #SPECTRAL_CONTRAST Extraction
            data[i, :, 26:33] = spectral_contrast.T[0:self.timeseries_length, :]

            print(
                "Extracted features audio track %i of %i."
                % (i + 1, len(list_of_audiofiles))
            )

        return data, np.expand_dims(np.asarray(target), axis=1)

    # ONE HOT ENCODING
    def one_hot(self, Y_genre_strings):
        y_one_hot = np.zeros((Y_genre_strings.shape[0], len(self.genre_list)))
        for i, genre_string in enumerate(Y_genre_strings):
            print(genre_string)
            index = self.genre_list.index(genre_string)
            y_one_hot[i, index] = 1
        return y_one_hot

    @staticmethod
    def path_to_audiofiles(dir_folder):
        list_of_audio = []
        for file in os.listdir(dir_folder):
            if file.endswith(".au") or file.endswith(".wav"):
                directory = "%s/%s" % (dir_folder, file)
                list_of_audio.append(directory)
        return list_of_audio


In [8]:
# set logging level
logging.getLogger("tensorflow").setLevel(logging.ERROR)

In [9]:
########################################################
#
# Check on pre-processed files. It generates those files and if they already exists use them.
#
########################################################
genre_features = FeaturesExtraction()

if (
    os.path.isfile(genre_features.train_X_preprocessed_data)
    and os.path.isfile(genre_features.train_Y_preprocessed_data)
    and os.path.isfile(genre_features.dev_X_preprocessed_data)
    and os.path.isfile(genre_features.dev_Y_preprocessed_data)
    and os.path.isfile(genre_features.test_X_preprocessed_data)
    and os.path.isfile(genre_features.test_Y_preprocessed_data)
):
    print("Preprocessed files exist, deserializing npy files")
    genre_features.load_deserialize_data()
else:
    print("Preprocessing raw audio files")
    genre_features.load_preprocess_data()

print("Training X shape: " + str(genre_features.train_X.shape))
print("Training Y shape: " + str(genre_features.train_Y.shape))
print("Dev X shape: " + str(genre_features.dev_X.shape))
print("Dev Y shape: " + str(genre_features.dev_Y.shape))
print("Test X shape: " + str(genre_features.test_X.shape))
print("Test Y shape: " + str(genre_features.test_Y.shape))

input_shape = (genre_features.train_X.shape[1], genre_features.train_X.shape[2])


Preprocessing raw audio files
[DEBUG] total number of files: 0
Extracted features audio track 1 of 900.
Extracted features audio track 2 of 900.
Extracted features audio track 3 of 900.
Extracted features audio track 4 of 900.
Extracted features audio track 5 of 900.
Extracted features audio track 6 of 900.
Extracted features audio track 7 of 900.
Extracted features audio track 8 of 900.
Extracted features audio track 9 of 900.
Extracted features audio track 10 of 900.
Extracted features audio track 11 of 900.
Extracted features audio track 12 of 900.
Extracted features audio track 13 of 900.
Extracted features audio track 14 of 900.
Extracted features audio track 15 of 900.
Extracted features audio track 16 of 900.
Extracted features audio track 17 of 900.
Extracted features audio track 18 of 900.
Extracted features audio track 19 of 900.
Extracted features audio track 20 of 900.
Extracted features audio track 21 of 900.
Extracted features audio track 22 of 900.
Extracted features aud

Extracted features audio track 193 of 900.
Extracted features audio track 194 of 900.
Extracted features audio track 195 of 900.
Extracted features audio track 196 of 900.
Extracted features audio track 197 of 900.
Extracted features audio track 198 of 900.
Extracted features audio track 199 of 900.
Extracted features audio track 200 of 900.
Extracted features audio track 201 of 900.
Extracted features audio track 202 of 900.
Extracted features audio track 203 of 900.
Extracted features audio track 204 of 900.
Extracted features audio track 205 of 900.
Extracted features audio track 206 of 900.
Extracted features audio track 207 of 900.
Extracted features audio track 208 of 900.
Extracted features audio track 209 of 900.
Extracted features audio track 210 of 900.
Extracted features audio track 211 of 900.
Extracted features audio track 212 of 900.
Extracted features audio track 213 of 900.
Extracted features audio track 214 of 900.
Extracted features audio track 215 of 900.
Extracted f

Extracted features audio track 384 of 900.
Extracted features audio track 385 of 900.
Extracted features audio track 386 of 900.
Extracted features audio track 387 of 900.
Extracted features audio track 388 of 900.
Extracted features audio track 389 of 900.
Extracted features audio track 390 of 900.
Extracted features audio track 391 of 900.
Extracted features audio track 392 of 900.
Extracted features audio track 393 of 900.
Extracted features audio track 394 of 900.
Extracted features audio track 395 of 900.
Extracted features audio track 396 of 900.
Extracted features audio track 397 of 900.
Extracted features audio track 398 of 900.
Extracted features audio track 399 of 900.
Extracted features audio track 400 of 900.
Extracted features audio track 401 of 900.
Extracted features audio track 402 of 900.
Extracted features audio track 403 of 900.
Extracted features audio track 404 of 900.
Extracted features audio track 405 of 900.
Extracted features audio track 406 of 900.
Extracted f

Extracted features audio track 575 of 900.
Extracted features audio track 576 of 900.
Extracted features audio track 577 of 900.
Extracted features audio track 578 of 900.
Extracted features audio track 579 of 900.
Extracted features audio track 580 of 900.
Extracted features audio track 581 of 900.
Extracted features audio track 582 of 900.
Extracted features audio track 583 of 900.
Extracted features audio track 584 of 900.
Extracted features audio track 585 of 900.
Extracted features audio track 586 of 900.
Extracted features audio track 587 of 900.
Extracted features audio track 588 of 900.
Extracted features audio track 589 of 900.
Extracted features audio track 590 of 900.
Extracted features audio track 591 of 900.
Extracted features audio track 592 of 900.
Extracted features audio track 593 of 900.
Extracted features audio track 594 of 900.
Extracted features audio track 595 of 900.
Extracted features audio track 596 of 900.
Extracted features audio track 597 of 900.
Extracted f

Extracted features audio track 766 of 900.
Extracted features audio track 767 of 900.
Extracted features audio track 768 of 900.
Extracted features audio track 769 of 900.
Extracted features audio track 770 of 900.
Extracted features audio track 771 of 900.
Extracted features audio track 772 of 900.
Extracted features audio track 773 of 900.
Extracted features audio track 774 of 900.
Extracted features audio track 775 of 900.
Extracted features audio track 776 of 900.
Extracted features audio track 777 of 900.
Extracted features audio track 778 of 900.
Extracted features audio track 779 of 900.
Extracted features audio track 780 of 900.
Extracted features audio track 781 of 900.
Extracted features audio track 782 of 900.
Extracted features audio track 783 of 900.
Extracted features audio track 784 of 900.
Extracted features audio track 785 of 900.
Extracted features audio track 786 of 900.
Extracted features audio track 787 of 900.
Extracted features audio track 788 of 900.
Extracted f

ValueError: array(['gtzan'], dtype='<U5') is not in list

In [None]:
########################################################
#
# Long short-term memory RNN using Adam optimization
#
########################################################

print("Build LSTM RNN model ...")
model = Sequential()

model.add(LSTM(units=128, dropout=0.05, recurrent_dropout=0.35, return_sequences=True, input_shape=input_shape))
model.add(LSTM(units=32,  dropout=0.05, recurrent_dropout=0.35, return_sequences=False))
model.add(Dense(units=genre_features.train_Y.shape[1], activation="softmax"))

print("Compiling model")
opt = Adam()
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
model.summary()

In [None]:
########################################################
#
# Training
# Batch size = 35
# Epochs = 400
#
########################################################

print("Training ...")
batch_size = 35
num_epochs = 400
model.fit(
    genre_features.train_X,
    genre_features.train_Y,
    batch_size=batch_size,
    epochs=num_epochs,
)

In [None]:
########################################################
#
# Validation
#
########################################################

print("Validating ...")
score, accuracy = model.evaluate(
    genre_features.dev_X, genre_features.dev_Y, batch_size=batch_size, verbose=1
)
print("Dev loss:  ", score)
print("Dev accuracy:  ", accuracy)


In [None]:
########################################################
#
# Testing
#
########################################################

print("Testing ...")
score, accuracy = model.evaluate(
    genre_features.test_X, genre_features.test_Y, batch_size=batch_size, verbose=1
)
print("Test loss:  ", score)
print("Test accuracy:  ", accuracy)

In [None]:
########################################################
#
# Store HDF5 file model
#
########################################################

model_filename = "lstm_genre_classifier_lstm.h5"
print("Saving model: " + model_filename)
model.save(model_filename)    

In [None]:
########################################################
#
# Prediction test
#
########################################################
import sys

def load_model(model_path, weights_path):
    "Load the trained LSTM model from directory for genre classification"
    with open(model_path, "r") as model_file:
        trained_model = model_from_json(model_file.read())
    trained_model.load_weights(weights_path)
    trained_model.compile(
        loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    return trained_model


def extract_audio_features(file):
    "Extract audio features from an audio file for genre classification"
    timeseries_length = 128
    features = np.zeros((1, timeseries_length, 33), dtype=np.float64)

    y, sr = librosa.load(file)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=512, n_mfcc=13)
    spectral_center = librosa.feature.spectral_centroid(y=y, sr=sr, hop_length=512)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=512)
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr, hop_length=512)

    features[0, :, 0:13] = mfcc.T[0:timeseries_length, :]
    features[0, :, 13:14] = spectral_center.T[0:timeseries_length, :]
    features[0, :, 14:26] = chroma.T[0:timeseries_length, :]
    features[0, :, 26:33] = spectral_contrast.T[0:timeseries_length, :]
    return features


def get_genre(model, music_path):
    "Predict genre of music using a trained model"
    prediction = model.predict(extract_audio_features(music_path))
    predict_genre = FeaturesExtraction().genre_list[np.argmax(prediction)]
    return predict_genre


if __name__ == "__main__":
    #PATH = sys.argv[1] if len(sys.argv) == 2 else "./audio/classical_music.mp3"
    PATH = "./audio/"
    f = []
    for (dirpath, dirnames, filenames) in os.walk(PATH):
        f.extend(filenames)
        break
    
    for f in filenames:
        print("Trying to predict the genre of "+f)
        GENRE = get_genre(model, PATH+f)
        print("Model predict: {}".format(GENRE))
    