In [28]:
import json
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow.keras as keras
from tensorflow.keras import layers, models
import tensorflow as tf
import matplotlib.pyplot as plt
import keras_tuner as kt
import librosa
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.applications import MobileNetV2

In [2]:
TRAFFIC_NOISES_FILENAME = './traffic_noises_mels.npy'
CAR_HORN_MELS_FILENAME = './car_horn_mels.npy'

In [22]:
datas = {
    "mapping" : [],
    "labels" : [],
    "mels" : []
}

datas["mapping"].append("car_horn")
datas["mapping"].append("traffic_noise")

load_car_horn_mels = np.load(CAR_HORN_MELS_FILENAME)
load_traffic_noises = np.load(TRAFFIC_NOISES_FILENAME)

for mels in load_car_horn_mels:
    datas["mels"].append(mels)

for index,mels in enumerate(load_traffic_noises):
    # combined_mel = (mels + datas["mels"][index]) / 2
    combined_mel = mels
    datas["labels"].append(combined_mel)


In [23]:
# # convert lists to numpy arrays
y = np.array(datas["mels"])
X = np.array(datas["labels"])

# Create training datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [27]:
print(X.shape)
print(y.shape)
print(np.isnan(X_train).sum())  # Should return 0 if no NaNs
print(np.min(X_train))
print(np.min(y_train))
print(np.max(X_train))
print(np.max(y_train))

(40, 128, 128, 1)
(40, 128, 128, 1)
0
-80.0
-80.0
-0.00643379
-0.00037137142


In [30]:
def unet_model(input_shape, dropout_rate=0.3):
    inputs = layers.Input(shape=input_shape)

    # Encoder
    c1 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(inputs)
    c1 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(c1)
    p1 = layers.MaxPooling2D((2, 2))(c1)
    p1 = layers.Dropout(dropout_rate)(p1)  # Dropout after pooling


    c2 = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(p1)
    c2 = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(c2)
    p2 = layers.MaxPooling2D((2, 2))(c2)
    p2 = layers.Dropout(dropout_rate)(p2)  # Dropout after pooling


    c3 = layers.Conv2D(256, (3, 3), activation='relu', padding='same')(p2)
    c3 = layers.Conv2D(256, (3, 3), activation='relu', padding='same')(c3)
    p3 = layers.MaxPooling2D((2, 2))(c3)
    p3 = layers.Dropout(dropout_rate)(p3)  # Dropout after pooling


    c4 = layers.Conv2D(512, (3, 3), activation='relu', padding='same')(p3)
    c4 = layers.Conv2D(512, (3, 3), activation='relu', padding='same')(c4)
    p4 = layers.MaxPooling2D((2, 2))(c4)
    p4 = layers.Dropout(dropout_rate)(p4)  # Dropout after pooling

    # Bottleneck
    c5 = layers.Conv2D(1024, (3, 3), activation='relu', padding='same')(p4)
    c5 = layers.Conv2D(1024, (3, 3), activation='relu', padding='same')(c5)

    # Decoder
    u6 = layers.Conv2DTranspose(512, (2, 2), strides=(2, 2), padding='same')(c5)
    u6 = layers.concatenate([u6, c4])
    c6 = layers.Conv2D(512, (3, 3), activation='relu', padding='same')(u6)
    c6 = layers.Conv2D(512, (3, 3), activation='relu', padding='same')(c6)

    u7 = layers.Conv2DTranspose(256, (2, 2), strides=(2, 2), padding='same')(c6)
    u7 = layers.concatenate([u7, c3])
    c7 = layers.Conv2D(256, (3, 3), activation='relu', padding='same')(u7)
    c7 = layers.Conv2D(256, (3, 3), activation='relu', padding='same')(c7)

    u8 = layers.Conv2DTranspose(128, (2, 2), strides=(2, 2), padding='same')(c7)
    u8 = layers.concatenate([u8, c2])
    c8 = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(u8)
    c8 = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(c8)

    u9 = layers.Conv2DTranspose(64, (2, 2), strides=(2, 2), padding='same')(c8)
    u9 = layers.concatenate([u9, c1])
    c9 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(u9)
    c9 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(c9)

    outputs = layers.Conv2D(1, (1, 1), activation='sigmoid')(c9)

    model = models.Model(inputs=[inputs], outputs=[outputs])
    return model

# Example input shape for spectrogram-like data (128x128 size with 1 channel)
# input_shape = (128, 128, 1)
input_shape = (X.shape[1], X.shape[2], X.shape[3])
model = unet_model(input_shape)
optimiser = keras.optimizers.Adam(learning_rate=0.0001, clipnorm=1.0)
model.compile(optimizer=optimiser, loss='binary_crossentropy', metrics=['accuracy'])

model.summary()


Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, 128, 128, 1  0           []                               
                                )]                                                                
                                                                                                  
 conv2d_95 (Conv2D)             (None, 128, 128, 64  640         ['input_7[0][0]']                
                                )                                                                 
                                                                                                  
 conv2d_96 (Conv2D)             (None, 128, 128, 64  36928       ['conv2d_95[0][0]']              
                                )                                                           

In [31]:
# Assume you have 'x_train' as input spectrograms and 'y_train' as target clean car sound spectrograms
history = model.fit(X_train, y_train, epochs=50, batch_size=16, validation_data=(X_test, y_test))

Epoch 1/50


2024-09-23 05:55:16.903056: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:1014] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inmodel_5/dropout/dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
result = model.predict(np.array(X_test[0]))

In [50]:
log_mel_spectrogram = result[0].reshape(128,128)
print(np.array(log_mel_spectrogram).shape)

# Plot the Mel-spectrogram
plt.figure(figsize=(10, 6))
librosa.display.specshow(log_mel_spectrogram, sr=16000, x_axis='time', y_axis='mel', fmax=8000)
plt.colorbar(format='%+2.0f dB')
plt.title('Log Mel-Spectrogram')
plt.tight_layout()
plt.show()

(128, 128)


IndexError: cannot do a non-empty take from an empty axes.

<Figure size 1000x600 with 0 Axes>