# Required libraries

In [5]:
import sys
import os
import IPython as IP
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import pickle
import helpers
import glob
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from IPython.display import clear_output, display
from scipy.stats import kurtosis, skew
from tensorflow.keras.utils import to_categorical
import warnings
warnings.filterwarnings('ignore')

# Loading Meta_Data

In [19]:
# Set your path to the dataset
# Load the metadata from the generated CSV
meta_data=pd.read_csv('C:/Users/VSBAG/Desktop/DSE_Milan/3rd_sem_subject/Machine Learning/Project/Sound_classification/UrbanSound8K/metadata/UrbanSound8K.csv')
audio_dataset_path='C:/Users/VSBAG/Desktop/DSE_Milan/3rd_sem_subject/Machine Learning/Project/Sound_classification/UrbanSound8K/audio/'
print(meta_data.shape)
meta_data.head(10)

(8732, 8)


Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing
5,100263-2-0-143.wav,100263,71.5,75.5,1,5,2,children_playing
6,100263-2-0-161.wav,100263,80.5,84.5,1,5,2,children_playing
7,100263-2-0-3.wav,100263,1.5,5.5,1,5,2,children_playing
8,100263-2-0-36.wav,100263,18.0,22.0,1,5,2,children_playing
9,100648-1-0-0.wav,100648,4.823402,5.471927,2,10,1,car_horn


# Method of  MFCC coefficients extraction

In [8]:
# Iterate through all audio files and extract MFCC
features = []
labels = []
frames_max = 0
counter = 0
total_samples = len(meta_data)
n_mfcc = 40

for index, row in meta_data.iterrows():
    file_path = os.path.join(os.path.abspath(audio_path), 'fold' + str(row["fold"]), str(row["slice_file_name"]))
    class_label = row["class"]

    # Extract MFCCs (do not add padding)
    mfccs = helpers.get_mfcc(file_path, 0, n_mfcc)
    
    # Save current frame count
    num_frames = mfccs.shape[1]
    
    # Add row (feature / label)
    features.append(mfccs)
    labels.append(class_label)

    # Update frames maximum
    if (num_frames > frames_max):
        frames_max = num_frames

    # Notify update every N files
    if (counter == 500):
        print("Status: {}/{}".format(index+1, total_samples))
        counter = 0

    counter += 1
    
print("Finished: {}/{}".format(index, total_samples))

Status: 501/8732
Status: 1001/8732
Status: 1501/8732
Status: 2001/8732
Status: 2501/8732
Status: 3001/8732
Status: 3501/8732
Status: 4001/8732
Status: 4501/8732
Status: 5001/8732
Status: 5501/8732
Status: 6001/8732
Status: 6501/8732
Status: 7001/8732
Status: 7501/8732
Status: 8001/8732
Status: 8501/8732
Finished: 8731/8732


In [12]:
padded = []

# Add padding
mels_max_padding = frames_max
for i in range(len(features)):
    size = len(features[i][0])
    if (size < mels_max_padding):
        pad_width = mels_max_padding - size
        px = np.pad(features[i], 
                    pad_width=((0, 0), (0, pad_width)), 
                    mode='constant', 
                    constant_values=(0,))
    
    padded.append(px)

In [13]:
# Add padding to features with less than frames than frames_max
padded_features = helpers.add_padding(features, frames_max)

# Save MFCC features

In [27]:
# Convert features (X) and labels (y) to Numpy arrays
X = np.array(padded)
y = np.array(labels)

# Optionally save the features to disk
np.save("C:/Users/VSBAG/Desktop/DSE_Milan/3rd_sem_subject/Machine Learning/Project/Sound_classification/UrbanSound8K/extracted_features/X-mfcc", X)
np.save("C:/Users/VSBAG/Desktop/DSE_Milan/3rd_sem_subject/Machine Learning/Project/Sound_classification/UrbanSound8K/extracted_features/y-mfcc", y)

# Feature Extraction : Mel Spectogram

In [20]:
# Iterate through all audio files and extract mel spectogram
features = []
labels = []
frames_max = 0
counter = 0
total_samples = len(meta_data)
n_mels=40

for index, row in meta_data.iterrows():
    file_path = os.path.join(os.path.abspath(audio_dataset_path), 'fold' + str(row["fold"]), str(row["slice_file_name"]))
    class_label = row["class"]

    # Extract Log-Mel Spectrograms (do not add padding)
    mels = helpers.get_mel_spectrogram(file_path, 0, n_mels=n_mels)
    
    # Save current frame count
    num_frames = mels.shape[1]
    
    # Add row (feature / label)
    features.append(mels)
    labels.append(class_label)

    # Update frames maximum
    if (num_frames > frames_max):
        frames_max = num_frames

    # Notify update every N files
    if (counter == 500):
        print("Status: {}/{}".format(index+1, total_samples))
        counter = 0

    counter += 1
    
print("Finished: {}/{}".format(index, total_samples))

Status: 501/8732
Status: 1001/8732
Status: 1501/8732
Status: 2001/8732
Status: 2501/8732
Status: 3001/8732
Status: 3501/8732
Status: 4001/8732
Status: 4501/8732
Status: 5001/8732
Status: 5501/8732
Status: 6001/8732
Status: 6501/8732
Status: 7001/8732
Status: 7501/8732
Status: 8001/8732
Status: 8501/8732
Finished: 8731/8732


In [21]:
padded = []

# Add padding
mels_max_padding = frames_max
for i in range(len(features)):
    size = len(features[i][0])
    if (size < mels_max_padding):
        pad_width = mels_max_padding - size
        px = np.pad(features[i], 
                    pad_width=((0, 0), (0, pad_width)), 
                    mode='constant', 
                    constant_values=(0,))
    
    padded.append(px)

In [22]:
# Add padding to features with less than frames than frames_max
padded_features = helpers.add_padding(features, frames_max)

In [25]:
## Convert features (X) and labels (y) to Numpy arrays

X = np.array(padded)
y = np.array(labels)

# Optionally save the features to disk
np.save("C:/Users/VSBAG/Desktop/DSE_Milan/3rd_sem_subject/Machine Learning/Project/Sound_classification/UrbanSound8K/extracted_features/X-mel_spec-augmented", X)
np.save("C:/Users/VSBAG/Desktop/DSE_Milan/3rd_sem_subject/Machine Learning/Project/Sound_classification/UrbanSound8K/extracted_features/y-mel_spec-augmented", y)

# Feature Extraction : Chromagram

In [35]:
# Iterate through all audio files and extract chromagram
features = []
labels = []
frames_max = 0
counter = 0
total_samples = len(meta_data)
n_chroma = 40


#Iterating through all audio files and extracting chromagram
for index, row in meta_data.iterrows():
    file_path = os.path.join(os.path.abspath('UrbanSound8K/audio'), 'fold' + str(row["fold"]), str(row["slice_file_name"]))
    class_label = row["class"]
    try:
        y, sr = librosa.load(file_path)
        normalized_y = librosa.util.normalize(y)
        chromagram = librosa.feature.chroma_stft(y=normalized_y,
                                        sr=sr)
        
        normalized_chroma = librosa.util.normalize(chromagram)
        shape = normalized_chroma.shape[1]
        chroma = normalized_chroma
        num_frames = chroma.shape[1]
        features.append(chroma)
        labels.append(class_label)
        if (num_frames > frames_max):
            frames_max = num_frames
        if (counter == 1):
            print("Status: {}/{}".format(index+1, total_samples))
            counter = 0
        counter += 1
    except Exception:
        pass
print("Finished: {}/{}".format(index, total_samples))
    

Finished: 8731/8732


## Padding the feature variable

In [36]:
# Given an numpy array of features, zero-pads each ocurrence to max_padding
def add_padding(features, chroma_max_padding=174):
    padded = []
    for i in range(len(features)):
        px = features[i]
        size = len(px[0])
        # Add padding if required
        if (size < chroma_max_padding):
            xDiff = chroma_max_padding - size
            xLeft = xDiff//2
            xRight = xDiff-xLeft
            px = np.pad(px, pad_width=((0,0), (xLeft, xRight)), mode='constant')
        
        padded.append(px)

    return padded

In [37]:
# Add padding to features with less than frames than frames_max
padded_features = add_padding(features, frames_max)

In [38]:
# Convert features (X) and labels (y) to Numpy arrays

X = np.array(padded) #Padded Feature are converted to numpy array & stored in X
y = np.array(labels) #Labels are coverted to numpy array & stored in y

# Optionally save the features to disk
np.save("C:/Users/VSBAG/Desktop/DSE_Milan/3rd_sem_subject/Machine Learning/Project/Sound_classification/UrbanSound8K/extracted_features/X-Chromo", X)
np.save("C:/Users/VSBAG/Desktop/DSE_Milan/3rd_sem_subject/Machine Learning/Project/Sound_classification/UrbanSound8K/extracted_features/y-Chromo", y)