### Model 1 - Enviormental sound classifiers

In [46]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [47]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import librosa
import pandas as pd
import numpy as np

import os
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.layers import Input
from keras.optimizers import Adam
from keras.utils import np_utils
from keras.utils import to_categorical
from sklearn import metrics 
# Load metadata
metadata = pd.read_csv('C:/Users/zzzl0/Desktop/predicting-and-avoiding-dog-barking-behaviour/predicting-and-avoiding-dog-barking-behaviour/UrbanSound8K/metadata/model1.csv')

In [69]:
# mfcc
max_frames = 174
def mfcc_mels(file_name):
    audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    mels = librosa.feature.melspectrogram(y=audio, sr=sample_rate)

    # Padding
    pad_width = max_frames - mfccs.shape[1]
    mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')

    pad_width_mels = max_frames - mels.shape[1]
    mels = np.pad(mels, pad_width=((0, 0), (0, pad_width_mels)), mode='constant')

    return mfccs, mels


In [54]:
# cnn
def cnn(num_rows=40, num_columns=174, num_labels=10):
    model = Sequential()
    model.add(Conv2D(filters=16, kernel_size=2, input_shape=(num_rows, num_columns, 1), activation='relu'))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.2))
    model.add(Conv2D(filters=32, kernel_size=2, activation='relu'))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.2))
    model.add(Conv2D(filters=64, kernel_size=2, activation='relu'))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.2))
    model.add(Conv2D(filters=128, kernel_size=2, activation='relu'))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.2))
    model.add(GlobalAveragePooling2D())
    model.add(Dense(num_labels, activation='softmax'))
    model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
    return model

In [34]:
import matplotlib.pyplot as plt
import seaborn as sns
import random
def display_sample_data(features):
    random_index = np.random.randint(0, len(features))
    random_mfcc = features[random_index][0]
    
    # Remove the extra dimension if present
    if random_mfcc.ndim > 1:
        random_mfcc = random_mfcc.squeeze()
    
    random_mfcc = np.transpose(random_mfcc)  # Transpose the data
    
    plt.figure(figsize=(10, 4))
    sns.heatmap(random_mfcc, cmap='viridis')
    plt.title('MFCC')
    plt.show()

In [70]:
# extract feature
for idx, row in metadata.iterrows():
    filename = os.path.join('C:/Users/zzzl0/Desktop/predicting-and-avoiding-dog-barking-behaviour/predicting-and-avoiding-dog-barking-behaviour/UrbanSound8K/audio',
                            'fold' + str(row['fold']), row['slice_file_name'])
    mfccs, mels = mfcc_mels(filename)
    mels_db = librosa.power_to_db(mels, ref=np.max)
    
    # Save MFCC and mel spectrogram as numpy arrays
    np.save(f'mfcc/{row["slice_file_name"]}.npy', mfccs)
    np.save(f'spectrograms/{row["slice_file_name"]}.npy', mels_db)

In [None]:
def multi_input_cnn(input_shape_mfcc, input_shape_mels, num_classes):
    # mfcc
    input_mfcc = Input(shape=input_shape_mfcc)
    x = Conv2D(32, kernel_size=(3, 3), activation='relu')(input_mfcc)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    x = Flatten()(x)

    # mels
    input_mels = Input(shape=input_shape_mels)
    y = Conv2D(32, kernel_size=(3, 3), activation='relu')(input_mels)
    y = MaxPooling2D(pool_size=(2, 2))(y)
    y = Flatten()(y)

    combined = concatenate([x, y])

    z = Dense(64, activation='relu')(combined)
    z = Dense(num_classes, activation='softmax')(z)

    model = Model(inputs=[input_mfcc, input_mels], outputs=z)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [68]:
fold_accuracies = []

for i in range(1, 11):  # 10-fold
    print(f"Processing fold {i}")
    test_data = metadata[metadata['fold'] == i]
    train_data = metadata[metadata['fold'] != i]

    X_train_mfcc = np.array([np.load(f'mfcc/{filename}.npy') for filename in train_data['slice_file_name']])
    X_test_mfcc = np.array([np.load(f'mfcc/{filename}.npy') for filename in test_data['slice_file_name']])
    X_train_mels = np.array([np.load(f'spectrograms/{filename}.npy') for filename in train_data['slice_file_name']])
    X_test_mels = np.array([np.load(f'spectrograms/{filename}.npy') for filename in test_data['slice_file_name']])

    y_train = np.array(train_data.classID.tolist())
    y_test = np.array(test_data.classID.tolist())

    le = LabelEncoder()
    y_train = to_categorical(le.fit_transform(y_train))
    y_test = to_categorical(le.fit_transform(y_test))

    X_train_mfcc = X_train_mfcc.reshape(list(X_train_mfcc.shape) + [1])
    X_test_mfcc = X_test_mfcc.reshape(list(X_test_mfcc.shape) + [1])
    X_train_mels = X_train_mels.reshape(list(X_train_mels.shape) + [1])
    X_test_mels = X_test_mels.reshape(list(X_test_mels.shape) + [1])

    model = multi_input_cnn(X_train_mfcc.shape[1:], X_train_mels.shape[1:], y_train.shape[1])

    model.fit([X_train_mfcc, X_train_mels], y_train, epochs=50, batch_size=256, verbose=1)

    predictions = model.predict([X_test_mfcc, X_test_mels])
    predicted_classes = np.argmax(predictions, axis=1)
    true_classes = np.argmax(y_test, axis=1)

    fold_accuracy = accuracy_score(true_classes, predicted_classes)
    print(f"Accuracy for fold {i}: {fold_accuracy}")
    fold_accuracies.append(fold_accuracy)

print(f"10-fold cross validation accuracy: {np.mean(fold_accuracies)}")


Processing fold 1


  X_train_mels = np.array([np.load(f'spectrograms/{filename}.npy') for filename in train_data['slice_file_name']])


ValueError: could not broadcast input array from shape (128,14) into shape (128,)

In [51]:
"""fold_accuracies = []

# Loading MFCC features from the CSV file
features_df = pd.read_csv('mfcc.csv')

for i in range(1, 11):  # 10-fold
    print(f"Processing fold {i}")
    test_data = metadata[metadata['fold'] == i]
    train_data = metadata[metadata['fold'] != i]

    X_train_mfcc = np.array([np.load(f'mfcc/{filename}.npy') for filename in train_data['slice_file_name']])
    X_test_mfcc = np.array([np.load(f'mfcc/{filename}.npy') for filename in test_data['slice_file_name']])
    X_train_mels = np.array([np.load(f'spectrograms/{filename}.npy') for filename in train_data['slice_file_name']])
    X_test_mels = np.array([np.load(f'spectrograms/{filename}.npy') for filename in test_data['slice_file_name']])

    y_train = np.array(train_data.classID.tolist())
    y_test = np.array(test_data.classID.tolist())


    le = LabelEncoder()
    y_train = to_categorical(le.fit_transform(y_train))
    y_test = to_categorical(le.fit_transform(y_test))

    print(X_train)
    X_train = X_train.reshape(X_train.shape[0], 40, 174, 1)
    X_test = X_test.reshape(X_test.shape[0], 40, 174, 1)

    model = cnn()

    model.fit(X_train, y_train, epochs=50, batch_size=256, verbose=1)

    predictions = model.predict(X_test)
    predicted_classes = np.argmax(predictions, axis=1)
    true_classes = np.argmax(y_test, axis=1)
    fold_accuracy = accuracy_score(true_classes, predicted_classes)
    print(f"Accuracy for fold {i}: {fold_accuracy}")
    fold_accuracies.append(fold_accuracy)

print(f"10-fold cross validation accuracy: {np.mean(fold_accuracies)}")
"""

Processing fold 1
['[[-335.0899    -176.70966    -98.33869   ...    0.           0.\n     0.       ]\n [ 123.720276    93.342224    73.104996  ...    0.           0.\n     0.       ]\n [-107.11308   -130.0589    -140.83406   ...    0.           0.\n     0.       ]\n ...\n [  -0.7318518   -3.3524568    1.4595318 ...    0.           0.\n     0.       ]\n [  -1.2285595   -6.2156386   -3.4106543 ...    0.           0.\n     0.       ]\n [   1.3734672    0.8792412   -2.305119  ...    0.           0.\n     0.       ]]'
 '[[-4.9213760e+02 -4.5506793e+02 -4.4995038e+02 ... -4.2620395e+02\n  -4.3866394e+02  0.0000000e+00]\n [ 9.9069496e+01  1.0991300e+02  1.1022446e+02 ...  8.2632080e+01\n   7.8532516e+01  0.0000000e+00]\n [-2.5339600e+01 -2.3277763e+01 -2.6389969e+01 ... -3.6936787e+01\n  -3.5359982e+01  0.0000000e+00]\n ...\n [ 3.0628052e+00  1.4575152e+00  2.5828087e+00 ... -9.7630253e+00\n  -4.8640299e+00  0.0000000e+00]\n [ 1.4688897e+00  1.4382362e-02  1.7576518e+00 ...  1.6039455e+01\n  

ValueError: cannot reshape array of size 7079 into shape (7079,40,174,1)