In [1]:
!wget -O genres.tar.gz https://www.dropbox.com/s/4jw31k5mlzcmgis/genres.tar.gz?dl=0
!tar -xzvf genres.tar.gz
!rm genres.tar.gz

--2024-07-01 11:30:27--  https://www.dropbox.com/s/4jw31k5mlzcmgis/genres.tar.gz?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.65.18, 2620:100:6017:18::a27d:212
Connecting to www.dropbox.com (www.dropbox.com)|162.125.65.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.dropbox.com/scl/fi/logv7hsjc1t3daaysuhyh/genres.tar.gz?rlkey=vcu3jvliyletkjwxz8bv8jm7c&dl=0 [following]
--2024-07-01 11:30:28--  https://www.dropbox.com/scl/fi/logv7hsjc1t3daaysuhyh/genres.tar.gz?rlkey=vcu3jvliyletkjwxz8bv8jm7c&dl=0
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uca3dd192612bcebba675ad771ca.dl.dropboxusercontent.com/cd/0/inline/CV1l9cBX0vsKZf4IImUR-6vcmtmEwxA2yYSn4bB-2Kz79E_GvE5meG-PKsCWbMh_NRbSvvnFJhRcaShedOA5o4yd1wBVWHJAQVplJKGVnEmYUry6gOiCtfkxAclEo7lqi9Y/file# [following]
--2024-07-01 11:30:29--  https://uca3dd192612bcebba675ad771ca.dl.dropboxusercontent.com/cd/0/inline/CV

In [2]:
import os
import librosa
import librosa.display
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from scipy import interpolate
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score, classification_report, confusion_matrix, accuracy_score
from keras.models import Model
from keras.layers import Flatten, Dense, Dropout, GlobalAveragePooling2D
from keras.applications import DenseNet201
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.utils import to_categorical
from tqdm import tqdm

In [3]:
HOME_FOLDER= "/content/genres"
SAMPLE_RATE= 22050
N_MELS= 128
N_FFT= 2048
HOP_LENGTH= 512
BATCH_SIZE= 10
EPOCHS= 30
IMG_HEIGHT= 224
IMG_WIDTH= 224

In [4]:
# Load audio files and labels

audio_files= []
labels= []

for genre in os.listdir(HOME_FOLDER):
  genre_folder= os.path.join(HOME_FOLDER, genre)

  for file in os.listdir(genre_folder):
    audio_file= os.path.join(genre_folder, file)
    audio_files.append(audio_file)
    labels.append(genre)

In [5]:
# One-hot encode labels

le = LabelEncoder()
labels_encoded= le.fit_transform(labels)
labels_onehot= to_categorical(labels_encoded)

labels_onehot

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

In [6]:
# Function to extract spectogram having fixed shape

def spectrogram_extractor(audio_files, labels):
  spectrograms= []
  frequency= []
  time_frame= []
  updated_label= []

  for file in tqdm(audio_files, desc= "Extracting spectrograms", unit= "Audio"):
    try:
      audio, sr= librosa.load(file, sr= SAMPLE_RATE)
      spectrogram= librosa.feature.melspectrogram(y= audio, sr= sr, n_mels= N_MELS, n_fft= N_FFT, hop_length= HOP_LENGTH)
      spectrogram= librosa.power_to_db(spectrogram, ref= np.max)
      spectrograms.append(spectrogram)
      frequency, time_frame= spectrogram.shape
      updated_label.append(labels[audio_files.index(file)])
    except:
      print("Failed to extract feature")

  resized_shape= (int(np.quantile(frequency, 0.95)), int(np.quantile(time_frame, 0.95)))
  resized_spectrograms = []

  for item in spectrograms:
    frequency_ratio= item.shape[0] / resized_shape[0]
    time_ratio= item.shape[1] / resized_shape[1]

    x_freq= np.linspace(0, item.shape[0] - 1, item.shape[0])
    y_time= np.linspace(0, item.shape[1] - 1, item.shape[1])

    x_new_freq= np.linspace(0, item.shape[0] - 1, resized_shape[0])
    y_new_time= np.linspace(0, item.shape[1] - 1, resized_shape[1])

    f_freq= interpolate.interp1d(x_freq, item, axis= 0, kind= 'linear', fill_value= "extrapolate")
    interpolated_spectrogram_freq= f_freq(x_new_freq)

    f_time= interpolate.interp1d(y_time, interpolated_spectrogram_freq, axis= 1, kind= 'linear', fill_value= "extrapolate")
    resized_spectrogram= f_time(y_new_time)

    resized_spectrograms.append(resized_spectrogram)

  return np.array(resized_spectrograms), np.array(updated_label), resized_shape

In [7]:
X, Y, INPUT_SHAPE= spectrogram_extractor(audio_files, labels_onehot)

Extracting spectrograms: 100%|██████████| 1000/1000 [01:16<00:00, 13.13Audio/s]


In [8]:
len(X), len(Y), INPUT_SHAPE

(1000, 1000, (128, 1293))

In [9]:
INPUT_SHAPE= INPUT_SHAPE + (3,)
INPUT_SHAPE

(128, 1293, 3)

In [10]:
def data_generator(features, labels, batch_size):
  num_samples= features.shape[0]
  indices= np.arange(num_samples)
  while True:
    np.random.shuffle(indices)

    for i in range(0, num_samples, batch_size):
      batch_indices= indices[i:i + batch_size]
      features_batch= features[batch_indices]
      labels_batch= labels[batch_indices]

      features_batch= np.expand_dims(features_batch, axis= -1)
      features_batch= np.repeat(features_batch, 3, axis= -1)

      yield features_batch, labels_batch

In [11]:
# Split data into training, validation, and testing sets

train_audio_files, val_test_audio_files, train_labels, val_test_labels= train_test_split(X, Y, test_size= 0.3, stratify= Y)
val_audio_files, test_audio_files, val_labels, test_labels= train_test_split(val_test_audio_files, val_test_labels, test_size= 0.7, stratify= val_test_labels)

In [12]:
# Create generators for training, validation, and testing

train_generator= data_generator(train_audio_files, train_labels, BATCH_SIZE)
val_generator= data_generator(val_audio_files, val_labels, BATCH_SIZE)

In [13]:
# Model based on DenseNet201 architecture
base_model= DenseNet201(weights= 'imagenet', include_top= False, input_shape= INPUT_SHAPE)
base_model.trainable= False
x= base_model.output
x= GlobalAveragePooling2D()(x)
x= Flatten()(x)
x= Dense(1024, activation= 'relu')(x)
x= Dropout(0.2)(x)
x= Dense(512, activation= 'relu')(x)
x= Dropout(0.2)(x)
output= Dense(len(le.classes_), activation= 'softmax', name= 'classifier')(x)

model= Model(inputs= base_model.input, outputs= output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet201_weights_tf_dim_ordering_tf_kernels_notop.h5


In [14]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [15]:
train_steps_per_epoch= np.ceil(len(train_audio_files) / BATCH_SIZE)
val_steps_per_epoch= np.ceil(len(val_audio_files) / BATCH_SIZE)

In [16]:
#  Setting up callbacks
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.000001)
earlystopping = EarlyStopping(monitor='val_loss',min_delta = 0.0001, patience = 5, verbose = 1, restore_best_weights=True)

callbacks= [reduce_lr, earlystopping]

In [17]:
model.fit(train_generator,
          epochs= EPOCHS,
          steps_per_epoch= train_steps_per_epoch,
          validation_data= val_generator,
          validation_steps= val_steps_per_epoch,
          verbose= 1,
          callbacks= callbacks)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 20: early stopping


<keras.src.callbacks.History at 0x7f1c5812fe80>

In [20]:
model.fit(train_generator,
          epochs= 10,
          steps_per_epoch= train_steps_per_epoch,
          validation_data= val_generator,
          validation_steps= val_steps_per_epoch,
          verbose= 1,
          callbacks= [reduce_lr])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f1be4386050>

In [21]:
# Predicting on test data

test_spectrograms= np.expand_dims(test_audio_files, axis=-1)
test_spectrograms= np.repeat(test_spectrograms, 3, axis=-1)


test_pred = model.predict(test_spectrograms)
test_pred_class = np.argmax(test_pred, axis=1)
test_labels_class = np.argmax(test_labels, axis=1)



In [22]:
# Calculate accuracy, f-beta score, classification report, and confusion matrix
accuracy = accuracy_score(test_labels_class, test_pred_class)
print(f"Test accuracy: {accuracy:.3f}")


fbeta = fbeta_score(test_labels_class, test_pred_class, beta=2, average='macro')
print('F-beta score:', fbeta)

print('Classification report:')
print(classification_report(test_labels_class, test_pred_class))

print('Confusion matrix:')
print(confusion_matrix(test_labels_class, test_pred_class))

Test accuracy: 0.748
F-beta score: 0.7433536635158846
Classification report:
              precision    recall  f1-score   support

           0       0.59      0.76      0.67        21
           1       0.87      0.95      0.91        21
           2       0.63      0.57      0.60        21
           3       0.65      0.81      0.72        21
           4       0.83      0.95      0.89        21
           5       0.68      0.62      0.65        21
           6       1.00      1.00      1.00        21
           7       0.90      0.86      0.88        21
           8       0.67      0.57      0.62        21
           9       0.62      0.38      0.47        21

    accuracy                           0.75       210
   macro avg       0.74      0.75      0.74       210
weighted avg       0.74      0.75      0.74       210

Confusion matrix:
[[16  0  1  0  0  1  0  0  1  2]
 [ 0 20  0  0  0  1  0  0  0  0]
 [ 1  0 12  1  0  3  0  1  2  1]
 [ 1  0  1 17  1  0  0  0  1  0]
 [ 0  0  0  1 