In [2]:
# imports
import librosa
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import keras
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Conv2D, MaxPooling2D, Flatten
from keras.utils import to_categorical
from sklearn.metrics import classification_report

In [89]:
# feature extraction
df = pd.read_csv('ESC-50-master/meta/esc50.csv')
def extract_mfcc(file):
    audio, sample_rate = librosa.load(file, res_type='kaiser_fast') 
    #audio = random_segment(audio, sample_rate,2)
    centered_waveform = audio - np.mean(audio)
    normalized_waveform = centered_waveform / np.std(centered_waveform)
    if not np.isfinite(normalized_waveform).all():
        normalized_waveform = np.nan_to_num(normalized_waveform) 
    mfccs = librosa.feature.mfcc(y=normalized_waveform, sr=sample_rate, n_mfcc=40)
    return mfccs

def extract_melspectrogram(file):
    audio, sample_rate = librosa.load(file, res_type='kaiser_fast') 
    #audio = random_segment(audio, sample_rate,2)
    centered_waveform = audio - np.mean(audio)
    normalized_waveform = centered_waveform / np.std(centered_waveform)
    if not np.isfinite(normalized_waveform).all():
        normalized_waveform = np.nan_to_num(normalized_waveform) 
    melspectrogram = librosa.feature.melspectrogram(y=normalized_waveform, sr=sample_rate)
    return melspectrogram

def random_segment(waveform,sr, duration):
    max_start_time = len(waveform) - sr * duration
    start_time = np.random.uniform(0, max_start_time)
    end_time = start_time + sr * duration
    segment = waveform[int(start_time):int(end_time)]
    return segment

def random_sample(waveform, sr, duration, threshold=0.01):
    non_silent_intervals = librosa.effects.split(waveform, top_db=threshold)
    sample_length = sr * duration
    
    if len(non_silent_intervals) == 0 or non_silent_intervals[-1][1] < sample_length:
        max_start_idx = len(waveform) - sample_length
        start_idx = np.random.randint(0, max_start_idx if max_start_idx > 0 else 1)
    else:
        longest_interval = max(non_silent_intervals, key=lambda interval: interval[1] - interval[0])
        interval_length = longest_interval[1] - longest_interval[0]

        if interval_length >= sample_length:
            max_start_idx = longest_interval[1] - sample_length
            start_idx = np.random.randint(longest_interval[0], max_start_idx)
        else:
            start_idx = longest_interval[0]

    end_idx = start_idx + sample_length
    segment = np.concatenate([waveform[start_idx:end_idx], np.zeros(max(0, sample_length - len(waveform[start_idx:end_idx])))])

    return segment

for index, row in df.iterrows():
    file = 'ESC-50-master/audio/' + row['filename']
    mfccs = extract_mfcc(file)
    np.save(file.replace('.wav', '_preprocess_mfcc.npy'), mfccs)
    
    melspectrogram = extract_melspectrogram(file)
    np.save(file.replace('.wav', '_preprocess_melspectrogram.npy'), melspectrogram)

In [53]:
# read features
df = pd.read_csv('ESC-50-master/meta/esc50.csv')

df['mfccs'] = df['filename'].apply(lambda file: np.load('ESC-50-master/audio/' + file.replace('.wav', '_mfcc.npy')))
df['melspectrogram'] = df['filename'].apply(lambda file: np.load('ESC-50-master/audio/' +  file.replace('.wav', '_melspectrogram.npy')))

X_train, X_test, y_train, y_test = train_test_split(df[['mfccs', 'melspectrogram']], to_categorical(df['target']), test_size=0.2)

In [3]:
# read pre-processed features
df = pd.read_csv('ESC-50-master/meta/esc50.csv')

df['mfccs'] = df['filename'].apply(lambda file: np.load('ESC-50-master/audio/' + file.replace('.wav', '_preprocess_mfcc.npy')))
df['melspectrogram'] = df['filename'].apply(lambda file: np.load('ESC-50-master/audio/' +  file.replace('.wav', '_preprocess_melspectrogram.npy')))

X_train, X_test, y_train, y_test = train_test_split(df[['mfccs', 'melspectrogram']], to_categorical(df['target']), test_size=0.2)

In [None]:
def mixup_data(x, y, alpha=0.2):
    batch_size = x.shape[0]
    lam = np.random.beta(alpha, alpha, batch_size)
    index = np.random.permutation(batch_size)

    mixed_x = lam.reshape(batch_size, 1, 1, 1) * x + (1 - lam).reshape(batch_size, 1, 1, 1) * x[index, :]
    mixed_y = lam.reshape(batch_size, 1) * y + (1 - lam).reshape(batch_size, 1) * y[index, :]

    return mixed_x, mixed_y

x_train_mixed, y_train_mixed = mixup_data(x_train, y_train, alpha=0.2)

In [None]:
# mfcc visulisation
import matplotlib.pyplot as plt
import librosa
import librosa.display
df = pd.read_csv('ESC-50-master/meta/esc50.csv')

unique_classes = df['category'].unique()

fig, axs = plt.subplots(10, 5, figsize=(15, 30))  # adjust this to display 50 images in a manner you find suitable
axs = axs.flatten()

for i, category in enumerate(unique_classes):
    sample_file = df[df['category'] == category].iloc[0]['filename']
    file_path = f'ESC-50-master/audio/{sample_file}'
    
    y, sr = librosa.load(file_path)
    mfccs = librosa.feature.mfcc(y=y, sr=sr)
    
    librosa.display.specshow(mfccs, sr=sr, x_axis='time', ax=axs[i])
    axs[i].set_title(category)

plt.tight_layout()
plt.show()


In [None]:
# mel visulisation
unique_classes = df['category'].unique()

fig, axs = plt.subplots(10, 5, figsize=(15, 30)) 
axs = axs.flatten()

for i, category in enumerate(unique_classes):
    sample_file = df[df['category'] == category].iloc[0]['filename']
    file_path = f'ESC-50-master/audio/{sample_file}'
    
    y, sr = librosa.load(file_path)
    mel_spect = librosa.feature.melspectrogram(y=y, sr=sr)
    
    log_mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
    
    librosa.display.specshow(log_mel_spect, sr=sr, x_axis='time', y_axis='mel', ax=axs[i])
    axs[i].set_title(category)

plt.tight_layout()
plt.show()


In [127]:
for i in X_train['melspectrogram'].tolist():
    print(np.array(i).shape)

(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)
(128, 216)

In [11]:
from keras.optimizers import Adam

learning_rate = 0.001
batch_size = 32
num_epochs = 20
optimizer = Adam(learning_rate=learning_rate)

In [None]:
# base mfcc
# structure inspired from https://github.com/karolpiczak/paper-2015-esc-convnet/tree/master
from keras import models, layers

INPUTSHAPE = X_train['mfccs'].iloc[0].shape[0], X_train['melspectrogram'].iloc[0].shape[1], 1
filter_count = 64
class_count = 50
model1 = models.Sequential([
    layers.Conv2D(filter_count, kernel_size=(3, 3), activation='relu', input_shape=INPUTSHAPE, padding='valid'),
    layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid'),
    layers.Conv2D(filter_count, kernel_size=(3, 3), activation='relu', padding='valid'),
    layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid'),
    layers.Flatten(),
    layers.Dense(5000, activation='relu'),
    layers.Dense(5000, activation='relu'),
    layers.Dense(class_count, activation='softmax')
])

model1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model1.summary()
model1.fit(np.array([i[..., np.newaxis] for i in X_train['mfccs'].tolist()]), y_train, epochs=10, batch_size=32)

X_test_reshaped = np.array([i[..., np.newaxis] for i in X_test['mfccs'].tolist()])
y_pred = model1.predict(X_test_reshaped)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

print(classification_report(y_true_classes, y_pred_classes))
model1.save('model1.h5')

In [96]:
#mfcc structure from https://www.kaggle.com/code/kalibrahim/audio-processing-features-cnn-training
from keras import models, layers
from keras.callbacks import EarlyStopping

input1 = Input(shape=(X_train['mfccs'].iloc[0].shape[0], X_train['mfccs'].iloc[0].shape[1], 1))

mfcc_model = models.Sequential([
    layers.Conv2D(32 , (3,3),activation = 'relu',padding='valid', input_shape = (X_train['mfccs'].iloc[0].shape[0], X_train['mfccs'].iloc[0].shape[1], 1)),  
    layers.MaxPooling2D(2, padding='same'),
    layers.Conv2D(128, (3,3), activation='relu',padding='valid'),
    layers.MaxPooling2D(2, padding='same'),
    layers.Dropout(0.3),
    layers.Conv2D(128, (3,3), activation='relu',padding='valid'),
    layers.MaxPooling2D(2, padding='same'),
    layers.Dropout(0.3),
    layers.GlobalAveragePooling2D(),
    layers.Dense(512 , activation = 'relu'),
    layers.Dense(50 , activation = 'softmax')
])

mfcc_model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['acc'])

print(np.array([i[..., np.newaxis] for i in X_train['mfccs'].tolist()]).shape)
early_stopping = EarlyStopping(monitor='val_loss', patience=3)  # stops after 3 epochs of no improvement

res = mfcc_model.fit(np.array([i[..., np.newaxis] for i in X_train['mfccs'].tolist()]), y_train, epochs=40, batch_size=8)
#res = mfcc_model.fit(np.array([i[..., np.newaxis] for i in X_train['mfccs'].tolist()]), y_train, epochs=40, batch_size=8, validation_split=0.2, callbacks=[early_stopping])


X_test_reshaped = np.array([i[..., np.newaxis] for i in X_test['mfccs'].tolist()])
y_pred = mfcc_model.predict(X_test_reshaped)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

print(classification_report(y_true_classes, y_pred_classes))
mfcc_model.save('mfcc_model.h5')


(1600, 40, 216, 1)
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
              precision    recall  f1-score   support

           0       0.61      1.00      0.76        11
           1       0.75      0.75      0.75         8
           2       0.00      0.00      0.00         2
           3       0.80      0.67      0.73         6
           4       0.62      0.71      0.67         7
           5       0.64      0.64      0.64        14
           6       0.67      0.50      0.57         8
           7       0.75      0.43      0.55         7
           8       0.54      0.70      0.61        10
           9       0.72      0.93      0.81        14
          10       0.60      0.75  

In [97]:
mfcc_model.summary()


Model: "sequential_28"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_103 (Conv2D)         (None, 38, 214, 32)       320       
                                                                 
 max_pooling2d_82 (MaxPoolin  (None, 19, 107, 32)      0         
 g2D)                                                            
                                                                 
 conv2d_104 (Conv2D)         (None, 17, 105, 128)      36992     
                                                                 
 max_pooling2d_83 (MaxPoolin  (None, 9, 53, 128)       0         
 g2D)                                                            
                                                                 
 dropout_48 (Dropout)        (None, 9, 53, 128)        0         
                                                                 
 conv2d_105 (Conv2D)         (None, 7, 51, 128)      

In [None]:
#mel
from keras import models, layers
from keras.callbacks import EarlyStopping


mels_model = models.Sequential([
    layers.Conv2D(32 , (3,3),activation = 'relu',padding='valid', input_shape = (X_train['melspectrogram'].iloc[0].shape[0], X_train['melspectrogram'].iloc[0].shape[1], 1)),  
    layers.MaxPooling2D(2, padding='same'),
    layers.Conv2D(128, (3,3), activation='relu',padding='valid'),
    layers.MaxPooling2D(2, padding='same'),
    layers.Dropout(0.3),
    layers.Conv2D(128, (3,3), activation='relu',padding='valid'),
    layers.MaxPooling2D(2, padding='same'),
    layers.Dropout(0.3),
    layers.GlobalAveragePooling2D(),
    layers.Dense(512 , activation = 'relu'),
    layers.Dense(50 , activation = 'softmax')
])

mels_model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['acc'])

print(np.array([i[..., np.newaxis] for i in X_train['melspectrogram'].tolist()]).shape)
early_stopping = EarlyStopping(monitor='val_loss', patience=3)  # stops after 3 epochs of no improvement

res = mels_model.fit(np.array([i[..., np.newaxis] for i in X_train['melspectrogram'].tolist()]), y_train, epochs=40, batch_size=8)
#res = mfcc_model.fit(np.array([i[..., np.newaxis] for i in X_train['mfccs'].tolist()]), y_train, epochs=40, batch_size=8, validation_split=0.2, callbacks=[early_stopping])


X_test_reshaped = np.array([i[..., np.newaxis] for i in X_test['melspectrogram'].tolist()])
y_pred = mels_model.predict(X_test_reshaped)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

print(classification_report(y_true_classes, y_pred_classes))
mfcc_model.save('mels_model.h5')


In [19]:
from keras.layers import Conv2D, MaxPooling2D, UpSampling2D

# assume X_train_mel and X_test_mel are your Mel spectrogram features for training and testing

scaler = MinMaxScaler(feature_range=(0,1))

# reshape data for scaling
X_train_mel_reshaped = X_train_mel.reshape((-1, X_train_mel.shape[-2]*X_train_mel.shape[-1]))
X_test_mel_reshaped = X_test_mel.reshape((-1, X_test_mel.shape[-2]*X_test_mel.shape[-1]))

# apply MinMaxScaler
X_train_mel_scaled = scaler.fit_transform(X_train_mel_reshaped)
X_test_mel_scaled = scaler.transform(X_test_mel_reshaped)

# reshape data back to original shape
X_train_mel_scaled = X_train_mel_scaled.reshape((-1, X_train_mel.shape[1], X_train_mel.shape[2], 1))
X_test_mel_scaled = X_test_mel_scaled.reshape((-1, X_test_mel.shape[1], X_test_mel.shape[2], 1))

# define the convolutional autoencoder
input_img = Input(shape=(X_train_mel_scaled.shape[1], X_train_mel_scaled.shape[2], 1))

x = Conv2D(32, (3, 3), activation='relu', padding='same')(input_img)
x = MaxPooling2D((2, 2), padding='same')(x)
x = Conv2D(32, (3, 3), activation='relu', padding='same')(x)
encoded = MaxPooling2D((2, 2), padding='same')(x)

x = Conv2D(32, (3, 3), activation='relu', padding='same')(encoded)
x = UpSampling2D((2, 2))(x)
x = Conv2D(32, (3, 3), activation='relu', padding='same')(x)
x = UpSampling2D((2, 2))(x)
decoded = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)

autoencoder = Model(input_img, decoded)
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

# train the autoencoder
autoencoder.fit(X_train_mel_scaled, X_train_mel_scaled, epochs=50, batch_size=256, shuffle=True, validation_data=(X_test_mel_scaled, X_test_mel_scaled))

# use the encoder part of the autoencoder to reduce the dimension of Mel spectrogram
encoder = Model(input_img, encoded)
X_train_mel_encoded = encoder.predict(X_train_mel_scaled)
X_test_mel_encoded = encoder.predict(X_test_mel_scaled)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [32]:
print(X_train['mfccs'].shape)
print(X_train_mel_encoded.shape)
print(X_train['mfccs'].iloc[0].shape)
    
print(X_train_mel_encoded.shape)


(1600,)
(1600, 32, 54, 32)
(40, 216)
(1600, 32, 54, 32)


In [33]:
    from keras.models import Model
    from keras.layers import Input, Dense, Dropout, Conv2D, MaxPooling2D, GlobalAveragePooling2D, Concatenate

    # mfcc input
    mfcc_input = Input(shape=(X_train['mfccs'].iloc[0].shape[0], X_train['mfccs'].iloc[0].shape[1], 1))

    mfcc_x = Conv2D(32 , (3,3),activation = 'relu',padding='valid')(mfcc_input)  
    mfcc_x = MaxPooling2D(2, padding='same')(mfcc_x)
    mfcc_x = Conv2D(128, (3,3), activation='relu',padding='valid')(mfcc_x)
    mfcc_x = MaxPooling2D(2, padding='same')(mfcc_x)
    mfcc_x = Dropout(0.3)(mfcc_x)
    mfcc_x = Conv2D(128, (3,3), activation='relu',padding='valid')(mfcc_x)
    mfcc_x = MaxPooling2D(2, padding='same')(mfcc_x)
    mfcc_x = Dropout(0.3)(mfcc_x)
    mfcc_output = GlobalAveragePooling2D()(mfcc_x)

    # melspectrogram input
    mel_input = Input(shape=X_train_mel_encoded.shape[1:])
    mel_x = Flatten()(mel_input)
    mel_output = Dense(128, activation='relu')(mel_x)

    combined = Concatenate()([mfcc_output, mel_output])
    fc = Dense(512 , activation = 'relu')(combined)

    output = Dense(50 , activation = 'softmax')(fc)

    multi_input_model = Model(inputs=[mfcc_input, mel_input], outputs=output)

    #multi_input_model.fit([X_train['mfccs'], X_train_mel_encoded], y_train, epochs=40, batch_size=8)
    multi_input_model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['acc'])
    X_train_mfccs = np.array(X_train['mfccs'].tolist())
    multi_input_model.fit([X_train_mfccs, X_train_mel_encoded], y_train, epochs=40, batch_size=8)

    X_test_mel_encoded = encoder.predict(X_test_mel_scaled)
    y_pred = multi_input_model.predict([X_test['mfccs'], X_test_mel_encoded])


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).

In [39]:

from sklearn.metrics import classification_report

X_test_mfccs = np.array(X_test['mfccs'].tolist())
y_pred = multi_input_model.predict([X_test_mfccs, X_test_mel_encoded])
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test, axis=1)
print(classification_report(y_test_labels, y_pred_labels))



              precision    recall  f1-score   support

           0       0.71      0.83      0.77         6
           1       1.00      0.50      0.67        12
           2       0.78      0.88      0.82         8
           3       0.86      0.67      0.75         9
           4       0.50      0.40      0.44        10
           5       0.73      1.00      0.84         8
           6       0.67      0.80      0.73         5
           7       0.19      0.75      0.30         4
           8       0.78      0.78      0.78         9
           9       0.71      0.83      0.77         6
          10       0.50      1.00      0.67         5
          11       0.88      0.64      0.74        11
          12       0.88      0.64      0.74        11
          13       0.60      0.43      0.50         7
          14       0.62      0.80      0.70        10
          15       0.57      0.50      0.53         8
          16       0.46      0.67      0.55         9
          17       0.89    

NameError: name 'plot_model' is not defined

In [45]:
from keras.utils.vis_utils import plot_model
multi_input_model.summary()
multi_input_model.save('multi_input_model.h5')
plot_model(multi_input_model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

Model: "model_23"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_31 (InputLayer)          [(None, 40, 216, 1)  0           []                               
                                ]                                                                 
                                                                                                  
 conv2d_61 (Conv2D)             (None, 38, 214, 32)  320         ['input_31[0][0]']               
                                                                                                  
 max_pooling2d_52 (MaxPooling2D  (None, 19, 107, 32)  0          ['conv2d_61[0][0]']              
 )                                                                                                
                                                                                           