Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.fftpack import fft,dct
import librosa
from scipy import signal
import soundfile as sf
from librosa.display import specshow
import glob

Importing data

In [2]:
# # separate 

carTrain = glob.glob("cars/train/*.wav")
carTest = glob.glob("cars/test/*.wav")

tramTrain = glob.glob("trams/train/*.wav")
tramTest = glob.glob("trams/test/*.wav")

dataset=[]
labels = []


In [3]:
def importFiles(files, label):
    dataset = []
    labels = []
    for file in files:
        data,sr = librosa.load(file)
        data = librosa.effects.trim(data, top_db=20, frame_length=1024, hop_length=512)[0] 
        # print(data.size)
        # print(data.shape)# Desired length in samples
        desired_length = sr * 5
        # # Initialize a new array of zeros with the desired length
        fixed_length_data = np.zeros(desired_length)
        #  Check the length of the original data
        original_length = len(data)
        # # If original data is longer than desired length, truncate it
        # # If it is shorter, pad with zeros
        if original_length > desired_length:
            fixed_length_data = data[:desired_length]
        else:
            fixed_length_data[:original_length] = data
        # # Now use fixed_length_data as your adjusted data
        data = fixed_length_data




        labels.append(label)
        dataset.append(data)

    return dataset,labels

In [4]:
# # separate

car_dataset, car_label  = importFiles(carTrain, 0)
car_test_dataset, car_test_label = importFiles(carTest, 0)
tram_dataset, tram_label = importFiles(tramTrain, 1)
tram_test_dataset, tram_test_label = importFiles(tramTest, 1)

  data,sr = librosa.load(file)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [5]:
tram_dataset_array = np.array(tram_dataset)
car_dataset_array = np.array(car_dataset)
tram_label_array = np.array(tram_label)
car_label_array = np.array(car_label)


In [6]:
dataset = np.concatenate([tram_dataset, car_dataset], axis=0)
labels = np.concatenate([tram_label, car_label], axis=0)


In [7]:
dataset_test = np.concatenate([tram_test_dataset,car_test_dataset], axis=0)
labels_test = np.concatenate([tram_test_label, car_test_label], axis=0)


Feature Extraction

In [8]:
def padding(data, target_height, target_width):
    h = data.shape[0]
    w = data.shape[1]
    
    a = max((target_height - h) // 2,0)
    aa = max(0,target_height - a - h)
    
    b = max(0,(target_width - w) // 2)
    bb = max(target_width - b - w,0)
    
    return np.pad(data, pad_width=((a, aa), (b, bb)), mode='constant')

In [29]:
fs = 44000
f2=[]

def extractFeatures(dataset, model):
    features = []
    max_size = 1000
    
    for audio in dataset:
        mfccs = librosa.feature.mfcc(y=np.asarray(audio), sr=fs, n_mfcc=50)
    
        # spectral spread
        spectral_bandwidth = librosa.feature.spectral_bandwidth(y=np.asarray(audio), sr=fs)

        # spectral energy
        spectral_centroid = librosa.feature.spectral_centroid(y=np.asarray(audio), sr=fs)

        # spectral density
        spectral_contrast = librosa.feature.spectral_contrast(y=np.asarray(audio), sr=fs)

        #  rate of sign-changes in the signal
        zerocrossing_rate = librosa.feature.zero_crossing_rate(y=np.asarray(audio))

        #  frequency below which a certain percentage of the power spectrum is concentrated
        spectral_rolloff = librosa.feature.spectral_rolloff(y=np.asarray(audio), sr=fs)
        
        # combined_features = np.hstack([np.mean(mfccs, axis=1), np.std(mfccs, axis=1),
        #                                np.mean(spectral_bandwidth), np.std(spectral_bandwidth),
        #                                np.mean(spectral_centroid), np.std(spectral_centroid),
        #                                np.mean(spectral_contrast), np.std(spectral_contrast),
        #                                np.mean(zerocrossing_rate), np.std(zerocrossing_rate),
        #                                np.mean(spectral_rolloff), np.std(spectral_rolloff)])
        
        # combined_features_for_CNN = np.hstack([spectral_bandwidth, spectral_centroid])
        # combined_features_for_CNN2= np.hstack([zerocrossing_rate, spectral_rolloff])
        # combined_features2=np.vstack([combined_features_for_CNN,combined_features_for_CNN2])

        if model == "KNN1":
            # combined features include spectral bandwidth, spectral centroid, zero-crossing rate, spectral rolloff
            mfccs = np.reshape(mfccs, (1,-1))
            spectral_contrast = np.reshape(spectral_contrast, (1,-1))
            combined_features = np.hstack([spectral_bandwidth, spectral_centroid, zerocrossing_rate, spectral_rolloff, mfccs, spectral_contrast])
            features.append(combined_features)
        elif model == "KNN2":
            mfccs = np.reshape(mfccs, (1,-1))
            spectral_contrast = np.reshape(spectral_contrast, (1,-1))
            combined_features = np.hstack([spectral_bandwidth, spectral_centroid, spectral_rolloff, mfccs])
            features.append(combined_features)
        else:
            combined_features = np.vstack([padding(spectral_bandwidth, 1, max_size),
                                           padding(spectral_centroid, 1, max_size),
                                           padding(zerocrossing_rate, 1, max_size), 
                                           padding(spectral_rolloff, 1, max_size),
                                           padding(spectral_contrast, 1, max_size)])
            for i in range(0,mfccs.shape[0]):
                combined_features = np.append(combined_features, padding(spectral_bandwidth, 1, max_size), axis=0)
                combined_features = np.append(combined_features, padding(spectral_centroid, 1, max_size), axis=0)
                combined_features = np.append(combined_features, padding(zerocrossing_rate, 1, max_size), axis=0)
                combined_features = np.append(combined_features, padding(spectral_rolloff, 1, max_size), axis=0)
                combined_features = np.append(combined_features, padding(spectral_contrast, 1, max_size), axis=0)
            
                if combined_features.shape[0] > mfccs.shape[0]:
                    difference = combined_features.shape[0] - mfccs.shape[0]
                    combined_features = combined_features[:-difference, :]
                    break
                
                
            combined_features = np.dstack((combined_features, padding(mfccs, mfccs.shape[0], max_size)))
            
            features.append(combined_features)

    features = np.array(features)
    return features

In [30]:
features = extractFeatures(dataset, "KNN1")

In [31]:
features2 = extractFeatures(dataset,"KNN2")

In [32]:
print(len(features))
print(len(features2))

146
146


In [33]:
features_test = extractFeatures(dataset_test,"KNN1")
features_test2 = extractFeatures(dataset_test,"KNN2")
# features_test2=extractFeatures(dataset_test,"CNN")

In [174]:
# fs = 44000
# features=[]
# features2=[]
# f2=[]

# for audio in dataset:
#     mfcc = librosa.feature.mfcc(y=np.asarray(audio), sr=fs, n_mfcc=50)    
#     # spectral spread
#     spectral_bandwidth = librosa.feature.spectral_bandwidth(y=np.asarray(audio), sr=fs)

#     # spectral energy
#     spectral_centroid = librosa.feature.spectral_centroid(y=np.asarray(audio), sr=fs)

#     # spectral density
#     spectral_contrast = librosa.feature.spectral_contrast(y=np.asarray(audio), sr=fs)

#     #  rate of sign-changes in the signal
#     zerocrossing_rate = librosa.feature.zero_crossing_rate(y=np.asarray(audio))

#     #  frequency below which a certain percentage of the power spectrum is concentrated
#     spectral_rolloff = librosa.feature.spectral_rolloff(y=np.asarray(audio), sr=fs)

#     combined_features = np.hstack([np.mean(mfcc, axis=1), np.std(mfcc, axis=1),
#                                    np.mean(spectral_bandwidth), np.std(spectral_bandwidth),
#                                    np.mean(spectral_centroid), np.std(spectral_centroid),
#                                    np.mean(spectral_contrast), np.std(spectral_contrast),
#                                    np.mean(zerocrossing_rate), np.std(zerocrossing_rate),
#                                    np.mean(spectral_rolloff), np.std(spectral_rolloff)])
    
#     combined = np.hstack([spectral_bandwidth, spectral_centroid])
#     combo= np.hstack([zerocrossing_rate, spectral_rolloff])
#     co=np.vstack([combined,combo])
#     f2.append(co)

#     # #combined = np.concatenate(mfcc,spectral_bandwidth, spectral_centroid, spectral_contrast, zerocrossing_rate, spectral_rolloff)
#     # x=spectral_bandwidth.shape

#     combined_features_2d = combined_features.reshape(1, -1)

#     # Append combined features as 2D array
#     features.append(combined_features_2d)
#     #features2.append(f2)

#     #features.append(combined_features)
#     features2.append(mfcc)


In [34]:
features = np.asarray(features)
features2 = np.asarray(features2)

In [35]:
features_test= np.asarray(features_test)
features_test2= np.asarray(features_test2)

In [36]:
print(features.shape)
print(features2.shape)

print(features_test.shape)
print(features_test2.shape)

(146, 1, 13176)
(146, 1, 11448)
(36, 1, 13176)
(36, 1, 11448)


In [37]:
features_knn = features.reshape((features.shape[0],-1))
features_knn2 = features2.reshape((features2.shape[0],-1))

print(features_knn.shape)
print(features_knn2.shape)

(146, 13176)
(146, 11448)


In [38]:
features_test_knn = features_test.reshape((features_test.shape[0],-1))
features_test_knn2 = features_test2.reshape((features_test2.shape[0],-1))

In [39]:
print(features_test_knn.shape)
print(features_test_knn2.shape)

(36, 13176)
(36, 11448)


Nearest Neighbour

In [41]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [42]:
# # first knn
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(features_knn, labels)

y_pred = knn.predict(features_test_knn)

precision = precision_score(labels_test, y_pred)
recall = recall_score(labels_test, y_pred)
accuracy = accuracy_score(labels_test, y_pred)

print(precision)
print(recall)
print(accuracy)


0.7647058823529411
0.8125
0.8055555555555556


In [43]:
# X_train2, X_test2, y_train2, y_test2 = train_test_split(features3, labels, test_size=0.2, random_state=42)

knn2 = KNeighborsClassifier(n_neighbors=1)
knn2.fit(features_knn2, labels)

y_pred2 = knn2.predict(features_test_knn2)

precision2 = precision_score(labels_test, y_pred2)
recall2 = recall_score(labels_test, y_pred2)
accuracy2 = accuracy_score(labels_test, y_pred2)

print(precision2)
print(recall2)
print(accuracy2)

0.7647058823529411
0.8125
0.8055555555555556


### CNN
#### This section is no longer used

In [267]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout

In [282]:
## original model
# input_shape = (2, 432, 1)
input_shape = (50,1000,1)

model = Sequential()
model.add(Conv2D(4, kernel_size=(2,2), activation='relu', input_shape=input_shape))
#model.add(Conv2D(16, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_20 (Conv2D)          (None, 49, 999, 4)        20        
                                                                 
 flatten_10 (Flatten)        (None, 195804)            0         
                                                                 
 dense_32 (Dense)            (None, 16)                3132880   
                                                                 
 dense_33 (Dense)            (None, 8)                 136       
                                                                 
 dense_34 (Dense)            (None, 1)                 9         
                                                                 
Total params: 3133045 (11.95 MB)
Trainable params: 3133045 (11.95 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [275]:
## alternative model
input_shape = (50,1000,2)

model = Sequential()
model.add(Conv2D(32, kernel_size=(3,3), activation='relu', input_shape=input_shape))
model.add(MaxPooling2D((2,2)))
model.add(Dropout(0.2))
model.add(Conv2D(64, kernel_size=(3,3), activation='relu', input_shape=input_shape))
model.add(MaxPooling2D((2,2)))
model.add(Dropout(0.2))

model.add(Conv2D(32, kernel_size=(3,3), activation='relu', input_shape=input_shape))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(24, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_16 (Conv2D)          (None, 48, 998, 32)       608       
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 24, 499, 32)       0         
 g2D)                                                            
                                                                 
 dropout_2 (Dropout)         (None, 24, 499, 32)       0         
                                                                 
 conv2d_17 (Conv2D)          (None, 22, 497, 64)       18496     
                                                                 
 max_pooling2d_3 (MaxPoolin  (None, 11, 248, 64)       0         
 g2D)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 11, 248, 64)     

In [283]:
X_train, X_test, y_train, y_test = train_test_split(features2, labels, test_size=0.2, random_state=42)

In [284]:
history = model.fit(x=features2,y=labels,batch_size=5,epochs=10,validation_split=0.2,shuffle=True)

Epoch 1/10


ValueError: in user code:

    File "/home/smitty/miniconda3/envs/audioproc/lib/python3.8/site-packages/keras/src/engine/training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "/home/smitty/miniconda3/envs/audioproc/lib/python3.8/site-packages/keras/src/engine/training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/smitty/miniconda3/envs/audioproc/lib/python3.8/site-packages/keras/src/engine/training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "/home/smitty/miniconda3/envs/audioproc/lib/python3.8/site-packages/keras/src/engine/training.py", line 1080, in train_step
        y_pred = self(x, training=True)
    File "/home/smitty/miniconda3/envs/audioproc/lib/python3.8/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/smitty/miniconda3/envs/audioproc/lib/python3.8/site-packages/keras/src/engine/input_spec.py", line 280, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'sequential_15' (type Sequential).
    
    Input 0 of layer "conv2d_20" is incompatible with the layer: expected axis -1 of input shape to have value 1, but received input with shape (None, 50, 1000, 2)
    
    Call arguments received by layer 'sequential_15' (type Sequential):
      • inputs=tf.Tensor(shape=(None, 50, 1000, 2), dtype=float32)
      • training=True
      • mask=None


In [109]:
output= model.predict(features_test)



In [110]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
predictions = [1 if x > 0.5 else 0 for x in output]

accuracy2 = accuracy_score(labels_test, predictions)
precision2 = precision_score(labels_test, predictions)
recall2 = recall_score(labels_test, predictions)


Results

In [121]:
print("Nearest Neighbour:")
print("Precision:", precision)
print("Recall:", recall)

print("CNN:")
print("Accuracy:", accuracy2)
print("Precision:", precision2)
print("Recall:", recall2)

Nearest Neighbour:
Precision: 0.8461538461538461
Recall: 0.6111111111111112
CNN:
Accuracy: 0.4444444444444444
Precision: 0.4444444444444444
Recall: 1.0
