In [1]:
import os
from datetime import datetime

import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Activation, Conv2D, Dense, Dropout, Flatten, Input, MaxPooling2D
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [3]:
# Load metadata and prepare paths
#
# The dataset root defaults to the original local install location, but it can
# be overridden with the URBANSOUND8K_DIR environment variable so the notebook
# runs on another machine without editing this cell.
dataset_root = os.environ.get(
    'URBANSOUND8K_DIR',
    'E:\\Users\\Sumit\\Downloads\\UrbanSound8K\\UrbanSound8K',
)

# The empty last component makes os.path.join end the path with a separator,
# matching the form of the original hard-coded string.
audio_dataset_path = os.path.join(dataset_root, 'audio', '')
metadata = pd.read_csv(os.path.join(dataset_root, 'metadata', 'UrbanSound8K.csv'))

In [5]:
# Function to extract MFCC features

def features_extractor(file, n_mfcc=40):
    """Return a fixed-length MFCC feature vector for one audio file.

    The clip is loaded with ``librosa.load`` (using the ``kaiser_fast``
    resampler), its MFCCs are computed, and each coefficient is averaged over
    time so that clips of any duration map to a vector of the same length.

    Parameters
    ----------
    file : str or path-like
        Audio file to load; passed straight to ``librosa.load``.
    n_mfcc : int, default 40
        Number of MFCC coefficients to compute. The default keeps the
        original behaviour.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_mfcc`` holding the per-coefficient mean.
    """
    audio, sample_rate = librosa.load(file, res_type='kaiser_fast')
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    # Transpose so coefficients are columns, then average down the rows
    # (frames) to collapse the time axis.
    mfccs_scaled = np.mean(mfccs.T, axis=0)
    return mfccs_scaled

In [11]:
# Extract features and labels
#
# Each metadata row names a clip stored at <audio root>/fold<N>/<slice_file_name>.
# The absolute audio root does not change between rows, so it is resolved once.
audio_root = os.path.abspath(audio_dataset_path)

extracted_features = []
# Iterate only the three columns that are used; iterrows() would build a full
# Series object for every row.
for fold, slice_file_name, final_class_labels in zip(
    metadata["fold"], metadata["slice_file_name"], metadata["class"]
):
    file_name = os.path.join(audio_root, 'fold' + str(fold), str(slice_file_name))
    data = features_extractor(file_name)
    extracted_features.append([data, final_class_labels])



In [13]:
# Convert to DataFrame: one row per clip, pairing its feature vector with its class label
feature_columns = ['feature', 'class']
extracted_features_df = pd.DataFrame(data=extracted_features, columns=feature_columns)

In [15]:
# Prepare the data
# Pull each column out as a plain Python list first, then let NumPy build the
# feature matrix and the label vector from those lists.
feature_vectors = extracted_features_df['feature'].tolist()
class_names = extracted_features_df['class'].tolist()

X = np.array(feature_vectors)
y = np.array(class_names)

In [17]:
# Label encoding
#
# The string labels are read from the DataFrame instead of from `y`, which
# makes this cell safe to re-run: it previously overwrote `y` with its own
# one-hot output, so a second run fed that 2-D matrix back into the encoder.
class_labels = np.array(extracted_features_df['class'].tolist())

labelencoder = LabelEncoder()
# Integer-encode the class names, then expand them to one-hot rows.
y = to_categorical(labelencoder.fit_transform(class_labels))

In [19]:
# Train-test split: hold out 20% of the samples, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Reshape the data for CNN: every 40-value feature vector becomes a
# 40 x 1 array with a single channel, which is the layout Conv2D consumes.
cnn_input_shape = (40, 1, 1)
X_train = np.reshape(X_train, (X_train.shape[0],) + cnn_input_shape)
X_test = np.reshape(X_test, (X_test.shape[0],) + cnn_input_shape)

In [35]:
# CNN classifier over the (40, 1, 1) feature arrays.
#
# The input shape is declared with an explicit Input layer. Passing
# `input_shape=` to the first Conv2D of a Sequential model makes Keras emit a
# UserWarning recommending an Input object instead.
model = Sequential([
    Input(shape=(40, 1, 1)),

    # First convolutional layer
    Conv2D(64, kernel_size=(3, 3), padding='same'),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 1)),  # pools along the first axis only (second axis has size 1)
    Dropout(0.3),

    # Second convolutional layer
    Conv2D(128, kernel_size=(3, 3), padding='same'),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 1)),
    Dropout(0.3),

    # Flatten the output for dense layers
    Flatten(),

    # Dense layers
    Dense(256),
    Activation('relu'),
    Dropout(0.5),

    # Output layer (softmax for multi-class classification);
    # one unit per column of the one-hot label matrix
    Dense(y.shape[1]),
    Activation('softmax'),
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [37]:
# Model summary
model.summary()

# Checkpoint to save the best model (save_best_only=True keeps only the best epoch's weights)
#
# Every backslash in the path is escaped. The original literal contained a
# bare "\D" ("Sumit\Downloads"), which is an invalid escape sequence and makes
# Python emit a SyntaxWarning; the resulting path string is unchanged.
checkpoint_path = 'E:\\Users\\Sumit\\Downloads\\UrbanSound8K\\UrbanSound8K\\saved_models_CNN\\audio_classification_CNN.keras'
checkpointer = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_best_only=True)

  checkpointer = ModelCheckpoint(filepath='E:\\Users\\Sumit\Downloads\\UrbanSound8K\\UrbanSound8K\\saved_models_CNN\\audio_classification_CNN.keras', verbose=1, save_best_only=True)


In [None]:
# Train the model
num_epochs = 100
num_batch_size = 32

# Wall-clock timing wraps only the fit call.
start = datetime.now()
model.fit(
    X_train,
    y_train,
    batch_size=num_batch_size,
    epochs=num_epochs,
    validation_data=(X_test, y_test),  # the held-out split doubles as the validation set
    callbacks=[checkpointer],
)
duration = datetime.now() - start
print(f"Training completed in time: {duration}")

# Evaluate the model on test data
# evaluate() returns the loss followed by the compiled metrics, so index 1 is accuracy.
test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy[1]}")

Epoch 1/100
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2541 - loss: 2.4373
Epoch 1: val_loss improved from inf to 1.31112, saving model to E:\Users\Sumit\Downloads\UrbanSound8K\UrbanSound8K\saved_models_CNN\audio_classification_CNN.keras
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.2545 - loss: 2.4350 - val_accuracy: 0.5467 - val_loss: 1.3111
Epoch 2/100
[1m217/219[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - accuracy: 0.4808 - loss: 1.5103
Epoch 2: val_loss improved from 1.31112 to 1.05077, saving model to E:\Users\Sumit\Downloads\UrbanSound8K\UrbanSound8K\saved_models_CNN\audio_classification_CNN.keras
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.4811 - loss: 1.5093 - val_accuracy: 0.6651 - val_loss: 1.0508
Epoch 3/100
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5614 - loss: 1.2835
Epoc