In [1]:
# import libraries 
import librosa, librosa.display
import numpy as np
import os
import collections
import pandas as pd
import tensorflow as tf
import gc
import keras
from tensorflow.keras import layers, models
from sklearn.model_selection import KFold
from sklearn import metrics

In [2]:
# load training audio files using librosa with a sampling rate of 16000 Hz
audio_files={}
sr=16000

def load_files(directory, audio_files, sr):
    for filename in os.listdir(directory):
        audio, sr = librosa.load(directory+'/'+filename, sr=sr)
        audio_files[filename]=audio

In [3]:
load_files('/kaggle/input/patient-train-audio-files/train_wav', audio_files, sr)

In [4]:
# sort the training audio files
audio_files = collections.OrderedDict(sorted(audio_files.items()))

In [5]:
# load and sort the training labels
labels=pd.read_csv('/kaggle/input/patient-labels/Train.csv')
labels.sort_values(labels.columns[0], axis=0, inplace=True)
labels=labels[['File_name','Label']]
labels['File_name'] = labels['File_name'].str.replace('.mp3','.wav')
labels.loc[labels['Label'] > 0, 'Label'] = 1

In [6]:
# define a function to split audio files into segments
def segment_audio(index, audio, sr, segment_duration, segmented_audio, y, labels):
    segment_length = int(segment_duration * sr)
    hop_length = segment_length//2
    for i in range(0, len(audio), hop_length):
        segmented_audio.append(audio[i:i+segment_length])
        y.append(labels.iloc[index, 1])

In [7]:
# split the audio files into 5 second segments
segmented_audio=[]
y=[]
index=0
segment_duration=5

for key in audio_files:
    segment_audio(index, audio_files[key], sr, segment_duration, segmented_audio, y, labels)
    index+=1

In [8]:
# convert the segmented audio files into mel-spectrograms and append the mel-spectrograms and corresponding labels into lists
X=[]
y_filtered=[]

for i in range(0, len(segmented_audio)):
    if len(segmented_audio[i]) < segment_duration * sr:
        continue
    else:
        mels = librosa.feature.melspectrogram(y=segmented_audio[i], sr=sr, n_fft=2048, hop_length=512)
        mels_db = librosa.power_to_db(S=mels, ref=np.max)
        X.append(mels_db)
        y_filtered.append(y[i])

X=np.asarray(X)
y_filtered=np.asarray(y_filtered).astype('float32')

In [9]:
# configure kfold cross-validation
kfold = KFold(n_splits=5, shuffle=True)

In [10]:
# define function to create, train, and evaluate model
def create_run_model(X_train, X_cv, y_train, y_cv, fold_loss, fold_accuracy, fold_precision, fold_recall, predict):
    
    # define CNN model
    model = models.Sequential()
    model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(128, 157, 1)))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(128, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Flatten())
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dropout(0.10))
    model.add(layers.Dense(1, activation='sigmoid'))
    
    # define the early stop callback function to monitor validation loss
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
    
    # compile the model
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),loss='binary_crossentropy',metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy'), tf.keras.metrics.Precision(name='precision'), tf.keras.metrics.Recall(name='recall')])

    # train the model
    history = model.fit(X_train, y_train, epochs=60, callbacks=[early_stop], validation_data=(X_cv, y_cv))

    # evaluate the model
    evaluation = model.evaluate(X_cv, y_cv)
    
    print()
    print(f"Evaluation Results: Validation Loss - {evaluation[0]}, Validation Accuracy - {evaluation[1]}, Validation Precision - {evaluation[2]}, Validation Recall - {evaluation[3]}")
    print()
    
    if predict==1:
        return model
    
    # append evaluation metrics in lists
    else:
        fold_loss.append(evaluation[0])
        fold_accuracy.append(evaluation[1])
        fold_precision.append(evaluation[2])
        fold_recall.append(evaluation[3])

        # delete the model after training and evaluating to reset weights
        del model
        keras.backend.clear_session()
        gc.collect()
        
        return 0

In [11]:
# initialize kfold metrics
fold = 1

fold_loss=[]
fold_accuracy=[]
fold_precision=[]
fold_recall=[]

In [12]:
# run kfold cross-validation on the training data only
for train, cv in kfold.split(X, y_filtered):
    
    X_train = X[train]
    X_cv = X[cv]
    y_train = y_filtered[train]
    y_cv = y_filtered[cv]

    print("Fold number:", fold)

    create_run_model(X_train, X_cv, y_train, y_cv, fold_loss, fold_accuracy, fold_precision, fold_recall, 0)

    fold+=1

Fold number: 1


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/60


I0000 00:00:1726978588.458067     138 service.cc:145] XLA service 0x78180000e400 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1726978588.458125     138 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0


[1m 16/331[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3s[0m 11ms/step - accuracy: 0.7728 - loss: 0.5856 - precision: 0.8213 - recall: 0.9301

I0000 00:00:1726978593.010371     138 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 26ms/step - accuracy: 0.7846 - loss: 0.5201 - precision: 0.7967 - recall: 0.9797 - val_accuracy: 0.8044 - val_loss: 0.4504 - val_precision: 0.8052 - val_recall: 0.9981
Epoch 2/60
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.8032 - loss: 0.4500 - precision: 0.8055 - recall: 0.9947 - val_accuracy: 0.8055 - val_loss: 0.4266 - val_precision: 0.8055 - val_recall: 0.9995
Epoch 3/60
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.8108 - loss: 0.4136 - precision: 0.8175 - recall: 0.9832 - val_accuracy: 0.8195 - val_loss: 0.3810 - val_precision: 0.8251 - val_recall: 0.9840
Epoch 4/60
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.8254 - loss: 0.3913 - precision: 0.8363 - recall: 0.9689 - val_accuracy: 0.8625 - val_loss: 0.3522 - val_precision: 0.8786 - val_recall: 0.9620
Epoch 5/60
[1m331/331[0m [3

In [13]:
# print kfold results
fold_loss=np.array(fold_loss)
fold_accuracy=np.array(fold_accuracy)
fold_precision=np.array(fold_precision)
fold_recall=np.array(fold_recall)

print(f'Average Loss: {np.mean(fold_loss)}')
print(f'Average Accuracy: {np.mean(fold_accuracy)} (+/- {np.std(fold_accuracy)})')
print(f'Average Precision: {np.mean(fold_precision)} (+/- {np.std(fold_precision)})')
print(f'Average Recall: {np.mean(fold_recall)} (+/- {np.std(fold_recall)})')

Average Loss: 0.14885483980178832
Average Accuracy: 0.9470464944839477 (+/- 0.006029010913729038)
Average Precision: 0.9633484482765198 (+/- 0.0030857301700145854)
Average Recall: 0.97052481174469 (+/- 0.0064684238830068205)


In [14]:
# load the test audio files
audio_files_test={}
sr=16000

load_files('/kaggle/input/patient-test-audio-files', audio_files_test, sr)

In [15]:
# sort the test audio files
audio_files_test = collections.OrderedDict(sorted(audio_files_test.items()))

In [16]:
# load and sort the test audio file labels
labels_test=pd.read_csv('/kaggle/input/solution/Solution.csv')
labels_test.sort_values(labels_test.columns[0], axis=0, inplace=True)
labels_test=labels_test[['Participant','Label']]
labels_test['Participant'] = labels_test['Participant']+'.wav'
labels_test.loc[labels_test['Label'] > 0, 'Label'] = 1

In [17]:
# append segmented audio files and corresponding labels to lists
segmented_audio_test=[]
y_test=[]
index=0

for key in audio_files_test:
    segment_audio(index, audio_files_test[key], sr, segment_duration, segmented_audio_test, y_test, labels_test)
    index+=1

In [18]:
# convert the segmented audio files into mel-spectrograms and append the mel-spectrograms and corresponding labels into lists 
X_test=[]
y_filtered_test=[]

for i in range(0, len(segmented_audio_test)):
    if len(segmented_audio_test[i]) < segment_duration * sr:
        continue
    else:
        mels = librosa.feature.melspectrogram(y=segmented_audio_test[i], sr=sr, n_fft=2048, hop_length=512)
        mels_db = librosa.power_to_db(S=mels, ref=np.max)
        X_test.append(mels_db)
        y_filtered_test.append(y_test[i])

y_filtered_test=np.asarray(y_filtered_test).astype('float32')
X_test=np.asarray(X_test)

X_train=X
y_train=y_filtered

In [19]:
# train and evaluate the model on the full training and testing dataset 
model = create_run_model(X_train, X_test, y_train, y_filtered_test, fold_loss, fold_accuracy, fold_precision, fold_recall, 1)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/60
[1m414/414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 23ms/step - accuracy: 0.7546 - loss: 0.6829 - precision: 0.7969 - recall: 0.9239 - val_accuracy: 0.8032 - val_loss: 0.4464 - val_precision: 0.8034 - val_recall: 0.9996
Epoch 2/60
[1m414/414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.8005 - loss: 0.4459 - precision: 0.8039 - recall: 0.9916 - val_accuracy: 0.7935 - val_loss: 0.4303 - val_precision: 0.8075 - val_recall: 0.9755
Epoch 3/60
[1m414/414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.8126 - loss: 0.4127 - precision: 0.8231 - recall: 0.9751 - val_accuracy: 0.8076 - val_loss: 0.4317 - val_precision: 0.8353 - val_recall: 0.9473
Epoch 4/60
[1m414/414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.8345 - loss: 0.3724 - precision: 0.8471 - recall: 0.9679 - val_accuracy: 0.7926 - val_loss: 0.4464 - val_precision: 0.8122 - val_recall: 0.9649
Epoch 5/60
[1m414/

In [20]:
# define the function to conduct prediction aggregation
def prediction_aggregation(model, directory, segment_duration, sr):
    segment_length = int(segment_duration * sr)
    hop_length = segment_length//2
    final_predictions_dict={}
    
    for filename in os.listdir(directory):
        segmented_audio_test=[]
        audio, sr = librosa.load(directory+'/'+filename, sr=sr)
        for i in range(0, len(audio), hop_length):
            segmented_audio_test.append(audio[i:i+segment_length])
        
        example=[]
        for i in range(0, len(segmented_audio_test)):
            if len(segmented_audio_test[i]) < segment_duration * sr:
                continue
            else:
                mels = librosa.feature.melspectrogram(y=segmented_audio_test[i], sr=sr, n_fft=2048, hop_length=512)
                mels_db = librosa.power_to_db(S=mels, ref=np.max)
                example.append(mels_db)
        example=np.asarray(example)
        
        # predict the test examples and use 25th percentile to obtain final prediction
        prediction = model.predict(example)
        prediction=np.asarray(prediction)
        final_predictions_dict[filename]=np.percentile(prediction, 25)
    
    # sort the final predictions and change all values >=0.5 to 1 and all values <0.5 to 0
    final_predictions_dict = collections.OrderedDict(sorted(final_predictions_dict.items()))
    for key in final_predictions_dict:
        if final_predictions_dict[key]>=0.5:
            final_predictions_dict[key]=1
        else:
            final_predictions_dict[key]=0
    
    # append and return a list of the sorted final predictions
    final_predictions=[]
    
    for key in final_predictions_dict:
        final_predictions.append(final_predictions_dict[key])
    
    return final_predictions   

In [21]:
# make final predictions on the test audio files for the entire file
final_predictions = prediction_aggregation(model, '/kaggle/input/patient-test-audio-files', segment_duration, sr)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 286ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 117ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 408ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 515ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

In [22]:
# print out predicted labels and expected labels
actual_labels=labels_test['Label']
actual_labels=np.array(actual_labels)
final_predictions=np.array(final_predictions)

print('Predicted Labels:', final_predictions)
print('Expected Labels:', actual_labels)

Predicted Labels: [1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0]
Expected Labels: [1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0]


In [23]:
# print out test metrics
accuracy = metrics.accuracy_score(actual_labels, final_predictions)
precision = metrics.precision_score(actual_labels, final_predictions)
recall = metrics.recall_score(actual_labels, final_predictions)
print('Test Accuracy:', accuracy)
print('Test Precision:', precision)
print('Test Recall:', recall)

Test Accuracy: 0.9583333333333334
Test Precision: 0.9285714285714286
Test Recall: 1.0
