In [1]:
from tensorflow import keras
import numpy as np
import math
import os
from tensorflow import keras
import librosa.display
import librosa
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
DATASET_PATH ='Dataset/genres_original/'
SAMPLE_RATE = 22050
DURATION = 30
SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION 

In [3]:
def save_mfcc(dataset_path  , n_mfcc = 13 , n_fft = 2048 , hop_length = 512 , num_segments = 5):
       """Extracts MFCCs from music dataset and saves them into variabel data
        :param dataset_path (str): Path to dataset
        :param num_mfcc (int): Number of coefficients to extract
        :param n_fft (int): Interval we consider to apply FFT. Measured in # of samples
        :param hop_length (int): Sliding window for FFT. Measured in # of samples
        :param: num_segments (int): Number of segments we want to divide sample tracks into
     
        """
    data = {
        # dictionary to store mapping, labels, and MFCCs
        'mapping' :[] ,
        'mfcc' :[] ,
        'labels' : []
    }
    num_sample_per_segment = int(SAMPLES_PER_TRACK / num_segments) 
    expected_num_mfcc_vector_per_segment = math.ceil(num_sample_per_segment / hop_length)
    print(f'{expected_num_mfcc_vector_per_segment} that is the length of the sequence')
    
    # dictionary to store mapping, labels, and MFCCs
    for i , (dirpath , dirnames , filenames) in enumerate(os.walk(dataset_path)):
        
        # ensure we're processing a genre sub-folder level
        if dirpath != dataset_path:
            
            # save genre label (i.e., sub-folder name) in the mapping
            semantic_label = dirpath.split('/')[-1]
            data['mapping'].append(semantic_label)
            
            # process all audio files in genre sub-dir
            for file in filenames:
                
                # load audio file
                file_path = os.path.join(dirpath , file)
                try :
                    signal , sr = librosa.load(file_path , sr = SAMPLE_RATE)
                    
                    # process all segments of audio file
                    for s in range(num_segments):
                        
                        # calculate start and finish sample for current segment
                        start_sample = num_sample_per_segment * s
                        finish_sample = start_sample + num_sample_per_segment
                        
                        # extract mfcc
                        mfcc = librosa.feature.mfcc(y = signal[start_sample:finish_sample] , sr = SAMPLE_RATE , 
                                                   n_mfcc=13 , n_fft = n_fft , hop_length = hop_length)
                        mfcc = mfcc.T
                        
                        # store only mfcc feature with expected number of vectors
                        if len(mfcc) == expected_num_mfcc_vector_per_segment:
                            data['mfcc'].append(mfcc.tolist())
                            data['labels'].append(i-1)
                except:
                    pass
                        
            print(f"{dirpath.split('/')[-1]} is loaded successfully")
                        
    return data

In [4]:
data_dict = save_mfcc(DATASET_PATH  , num_segments=10)

130 that is the length of the sequence
blues is loaded successfully
classical is loaded successfully
country is loaded successfully
disco is loaded successfully
hiphop is loaded successfully
jazz is loaded successfully
jazz\.ipynb_checkpoints is loaded successfully
metal is loaded successfully
pop is loaded successfully
reggae is loaded successfully
rock is loaded successfully


In [5]:
data = np.array(data_dict['mfcc'])
label = np.array(data_dict['labels']).reshape(-1 , 1)

In [6]:
data.shape

(9996, 130, 13)

In [7]:
label.shape

(9996, 1)

In [8]:
# create train, validation and test split
from sklearn.model_selection import train_test_split
X_train_val, X_test, y_train_val, y_test =train_test_split(data , label  ,test_size=0.2 ,shuffle= True)
X_train, X_valid, y_train, y_valid =train_test_split(X_train_val , y_train_val  ,test_size=0.2)

In [9]:
# build network topology
model = keras.models.Sequential([
    
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, 13]),
    keras.layers.LayerNormalization(),
    keras.layers.GRU(64, return_sequences=True),
    keras.layers.LayerNormalization(),
    keras.layers.GRU(32),
    
    keras.layers.Dense(11 ,  activation="softmax")
])

model.compile(loss ='sparse_categorical_crossentropy', optimizer = keras.optimizers.Adam() , metrics = ['accuracy'])

In [10]:
model.fit(X_train , y_train ,validation_data=(X_valid , y_valid) , epochs=30 , batch_size=8)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x202354ce7f0>

In [11]:
model.save('Music_genre_classification_GRU.h5')