In [5]:
# Install third-party dependencies into the running kernel's environment.
# NOTE(review): versions are unpinned, so a fresh install may resolve to
# different releases than the ones this notebook was last run with — consider pinning.
%pip install flask tensorflow resampy librosa opencv-python pydub matplotlib

Collecting matplotlib
  Using cached matplotlib-3.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Using cached fonttools-4.57.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (102 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Using cached kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.2 kB)
Collecting pillow>=8 (from matplotlib)
  Using cached pillow-11.2.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (8.9 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Using cached pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)
Using cached matplotli

In [2]:
import tensorflow as tf

2025-04-27 22:49:32.025842: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-27 22:49:32.026216: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-27 22:49:32.028617: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-27 22:49:32.034920: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745790572.045756  153135 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745790572.04

In [3]:
# Load the trained Keras model from disk. The result is kept in the global
# `model`, which the Flask `classify` handler further down calls `predict` on.
model = tf.keras.models.load_model('./model/cnn.keras')


2025-04-27 22:49:36.094506: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [None]:
# Display the architecture summary of the loaded model.
model.summary()


In [6]:
from numpy import ndarray
import numpy as np
import librosa
import matplotlib.pyplot as plt
import cv2 as cv
import io
import base64

# Mel-spectrogram settings for the audio preprocessing functions below.
HOP_LENGTH = 512        # number of samples between successive frames
WINDOW_LENGTH = 512     # length of the window in samples
N_MEL = 128             # number of Mel bands to generate


def spectrogram_fixed_length(audio, rate, total_samples = 128) -> ndarray:
    """Compute a dB-scaled mel spectrogram with a fixed number of time frames.

    Args:
        audio: Audio time series, passed to librosa as ``y``.
        rate: Sampling rate of ``audio``, passed to librosa as ``sr``.
        total_samples: Number of time frames (columns) the result must have.

    Returns:
        The mel spectrogram converted to dB relative to its own peak
        (``ref=np.max``), truncated or padded along axis 1 so that it has
        exactly ``total_samples`` columns.
    """
    mel_power = librosa.feature.melspectrogram(
        y=audio,
        sr=rate,
        n_fft=512,
        hop_length=HOP_LENGTH,
        win_length=WINDOW_LENGTH,
        # Pass the module constant explicitly so that editing N_MEL actually
        # takes effect; its current value (128) equals librosa's default.
        n_mels=N_MEL,
    )
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    # fix_length truncates or pads along the time axis and hands the input
    # back unchanged when it already has the requested length, so no separate
    # length check is needed.
    # NOTE(review): padding uses zeros, and on this dB scale 0 is the peak
    # level rather than silence. Left as-is on purpose — confirm it matches how
    # the model's training data was prepared before switching to e.g. -80.
    return librosa.util.fix_length(mel_db, size=total_samples, axis=1)


def generate_spectrogram(audio, sample_rate, output_path="uploads/spectrogram.png"):
    """Render the mel spectrogram of ``audio`` to an image file.

    Args:
        audio: Audio time series.
        sample_rate: Sampling rate of ``audio``.
        output_path: Where the image is written. Defaults to the location the
            rest of this notebook reads the spectrogram back from.

    The image is saved without axes or padding. Nothing is returned.
    """
    spectrogram = spectrogram_fixed_length(audio, sample_rate)

    # Draw on a dedicated figure instead of pyplot's implicit current figure.
    # With the implicit figure, every call made while the server is running
    # adds another mesh to the same never-closed axes, which leaks memory and
    # draws new spectrograms on top of old ones.
    fig, ax = plt.subplots()
    try:
        librosa.display.specshow(spectrogram, cmap='viridis', ax=ax)
        ax.axis('tight')
        ax.axis('off')
        fig.tight_layout()
        fig.savefig(output_path, bbox_inches="tight", pad_inches=0.0)
    finally:
        # Always release the figure, even if rendering or saving fails.
        plt.close(fig)



# TODO: use sliding window
def preprocess_audio(audio):
    """Turn an audio file into the image batch that is fed to the model.

    The first 3 seconds of the audio are loaded, rendered to a spectrogram
    image via ``generate_spectrogram``, read back with OpenCV, resized to
    256x256 and scaled by 1/255.

    Args:
        audio: Path or file-like object accepted by ``librosa.load``.

    Returns:
        Array reshaped to ``(-1, 256, 256, 3)`` with pixel values divided by
        255. Channel order is whatever ``cv.imread`` returns.

    Raises:
        RuntimeError: If the rendered spectrogram image cannot be read back.
    """
    DURATION = 3                                   # seconds of audio to analyse
    IMG_SIZE = 256                                 # side length of the square image, in pixels
    SPECTROGRAM_PATH = "uploads/spectrogram.png"   # where generate_spectrogram writes by default

    # Use a separate name for the decoded samples so the `audio` argument is
    # not silently rebound from "file" to "array".
    samples, sample_rate = librosa.load(audio, duration=DURATION, res_type='kaiser_fast')
    generate_spectrogram(samples, sample_rate)

    # cv.imread signals failure by returning None instead of raising; check it
    # here so the error is explicit rather than an opaque failure in cv.resize.
    image = cv.imread(SPECTROGRAM_PATH)
    if image is None:
        raise RuntimeError(f"Could not read spectrogram image at {SPECTROGRAM_PATH!r}")

    resized = cv.resize(image, (IMG_SIZE, IMG_SIZE))

    print("Done!")

    return np.array(resized).reshape(-1, IMG_SIZE, IMG_SIZE, 3) / 255


In [None]:
import os
import secrets

from flask import Flask, render_template, request, session

# from pydub import AudioSegment

app = Flask("Audio Classifier")
app.config['UPLOAD_FOLDER'] = 'uploads/'

# Flask signs the session cookie with this key, so a guessable literal would
# let any client forge session contents. Read it from the environment when
# provided; otherwise fall back to a random per-process key (sessions are then
# reset every time the server restarts).
app.secret_key = os.environ.get('FLASK_SECRET_KEY') or secrets.token_hex(32)

# Class names. Position i in this list labels element i of the model's
# prediction vector (see the enumerate() loop in `classify`).
LABELS = [
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music"
]

@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the main page using the preferences stored in the session.

    The template receives the full label list, the saved threshold (0 when
    none is stored) and the saved category indices (empty when none are
    stored), both coerced to ``int``.
    """
    stored_categories = session.get('selected_categories', [])

    context = {
        "categories": LABELS,
        "threshold": int(session.get('threshold', 0)),
        "selected_categories": [int(value) for value in stored_categories],
    }
    return render_template('./index.html', **context)


@app.route('/process_categories', methods=['POST'])
def process_categories():
    """Validate the posted preferences and store them in the session.

    Expects a JSON object with two optional keys:
      * ``selected_categories``: list of values convertible to ``int``
        (defaults to an empty list);
      * ``threshold``: a finite number (defaults to 50).

    Returns:
        ``{"success": True}`` once the values are stored, or a 400 response
        ``{"success": False, "error": <message>}`` when the payload is
        malformed. Rejecting bad values here keeps them out of the session,
        where they would otherwise break the ``int(...)`` / ``float(...)``
        conversions done by the other views on every later request.
    """
    import math

    # silent=True yields None for a missing or unparsable body so it can be
    # answered with a 400 here instead of failing later.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return {"success": False, "error": "Expected a JSON object."}, 400

    raw_categories = payload.get("selected_categories", [])
    if not isinstance(raw_categories, list):
        return {"success": False, "error": "selected_categories must be a list."}, 400

    try:
        # convert them to integers
        selected_categories = [int(x) for x in raw_categories]
        threshold = float(payload.get('threshold', 50))
    except (TypeError, ValueError, OverflowError):
        return {"success": False, "error": "Invalid category or threshold value."}, 400

    # NaN / infinity survive float() but cannot be converted with int() later.
    if not math.isfinite(threshold):
        return {"success": False, "error": "threshold must be a finite number."}, 400

    session['selected_categories'] = selected_categories
    session['threshold'] = threshold

    return {"success": True}


@app.route('/classify', methods=['POST'])
def classify():
    """Classify the uploaded audio clip and return the filtered predictions.

    Reads the ``audio_file`` upload, runs it through ``preprocess_audio`` and
    the global ``model``, then keeps only the labels whose index is in the
    session's ``selected_categories`` and whose confidence (in percent) is at
    least the session's ``threshold``.

    Returns:
        A dict with:
          * ``prediction``: the kept label with the highest confidence, or an
            explanatory message when no label was kept;
          * ``spectrogram``: the spectrogram PNG, base64-encoded;
          * ``predictions``: mapping of kept label -> confidence in percent.
    """
    # Get the audio file from the request
    audio_file = request.files['audio_file']

    # Process the audio file using the trained model
    audio_data = preprocess_audio(audio_file)
    print("predicting...")
    prediction = model.predict(audio_data)

    print(prediction)

    # categories to include
    selected_categories = session.get('selected_categories', [])
    confidence_threshold = float(session.get('threshold', 0.))

    p = {}
    pred = prediction.flatten()

    print(pred)
    # The loop variable is not called `index` so it cannot be confused with
    # the `index` view function defined in this module.
    for label_idx, label in enumerate(LABELS):
        if label_idx not in selected_categories:
            continue

        confidence = float(pred[label_idx] * 100.0)

        if confidence < confidence_threshold:
            continue

        p[label] = confidence

    print("labels:", p)

    print(selected_categories)
    print(confidence_threshold)

    # create a predicted label
    if len(p) == 0:
        if len(selected_categories) == 0:
            predicted = "None Found. You must select a nuisance."
        else:
            predicted = "None Found. Try lowering the detection threshold."
    else:
        predicted = max(p, key=p.get)

    # create spectrogram image; the context manager guarantees the file handle
    # is closed on every request instead of being left open.
    with open("uploads/spectrogram.png", 'rb') as image_file:
        image_buffer = image_file.read()
    spectrogram = base64.b64encode(image_buffer).decode('utf-8')

    # Return the classification result
    return {
        'prediction': predicted,
        'spectrogram': spectrogram,
        'predictions': p,
    }

# Start Flask's built-in development server. This call blocks the cell until
# the server is stopped.
app.run()


 * Serving Flask app 'Audio Classifier'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
[33mPress CTRL+C to quit[0m
127.0.0.1 - - [27/Apr/2025 22:50:17] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [27/Apr/2025 22:50:17] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


Done!
predicting...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step


127.0.0.1 - - [27/Apr/2025 22:50:33] "POST /classify HTTP/1.1" 200 -


[[1.5943624e-02 5.0715951e-04 2.6035190e-07 1.8002161e-06 1.8001093e-02
  7.5254063e-03 1.5043658e-01 8.0353839e-06 9.8518399e-07 8.0757511e-01]]
[1.5943624e-02 5.0715951e-04 2.6035190e-07 1.8002161e-06 1.8001093e-02
 7.5254063e-03 1.5043658e-01 8.0353839e-06 9.8518399e-07 8.0757511e-01]
labels: {}
[]
0.0


127.0.0.1 - - [27/Apr/2025 22:50:37] "POST /process_categories HTTP/1.1" 200 -


Done!
predicting...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step


127.0.0.1 - - [27/Apr/2025 22:50:38] "POST /classify HTTP/1.1" 200 -


[[1.5943624e-02 5.0715951e-04 2.6035190e-07 1.8002161e-06 1.8001093e-02
  7.5254063e-03 1.5043658e-01 8.0353839e-06 9.8518399e-07 8.0757511e-01]]
[1.5943624e-02 5.0715951e-04 2.6035190e-07 1.8002161e-06 1.8001093e-02
 7.5254063e-03 1.5043658e-01 8.0353839e-06 9.8518399e-07 8.0757511e-01]
labels: {'air_conditioner': 1.5943623781204224, 'car_horn': 0.05071594938635826, 'children_playing': 2.6035189875983633e-05, 'dog_bark': 0.00018002161232288927, 'drilling': 1.8001092672348022, 'engine_idling': 0.752540647983551, 'gun_shot': 15.043658256530762, 'jackhammer': 0.0008035383652895689, 'siren': 9.851840150076896e-05, 'street_music': 80.75750732421875}
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
0.0
Done!
predicting...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step


127.0.0.1 - - [27/Apr/2025 22:50:50] "POST /classify HTTP/1.1" 200 -


[[8.96489099e-02 1.07952874e-04 2.14714433e-08 8.05605964e-08
  3.35717021e-04 2.42355213e-01 4.73787168e-07 1.20922588e-04
  7.04253864e-08 6.67430580e-01]]
[8.96489099e-02 1.07952874e-04 2.14714433e-08 8.05605964e-08
 3.35717021e-04 2.42355213e-01 4.73787168e-07 1.20922588e-04
 7.04253864e-08 6.67430580e-01]
labels: {'air_conditioner': 8.96489143371582, 'car_horn': 0.010795287787914276, 'children_playing': 2.147144414266222e-06, 'dog_bark': 8.056059414229821e-06, 'drilling': 0.03357170149683952, 'engine_idling': 24.23552131652832, 'gun_shot': 4.7378714953083545e-05, 'jackhammer': 0.012092258781194687, 'siren': 7.042538527457509e-06, 'street_music': 66.74305725097656}
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
0.0
Done!
predicting...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step


127.0.0.1 - - [27/Apr/2025 22:51:00] "POST /classify HTTP/1.1" 200 -


[[1.54124852e-18 1.00000000e+00 1.54039645e-33 7.52226459e-32
  2.56430432e-10 2.04958945e-16 2.09292188e-20 1.71366653e-23
  1.17039766e-33 2.90045787e-12]]
[1.54124852e-18 1.00000000e+00 1.54039645e-33 7.52226459e-32
 2.56430432e-10 2.04958945e-16 2.09292188e-20 1.71366653e-23
 1.17039766e-33 2.90045787e-12]
labels: {'air_conditioner': 1.541248577678614e-16, 'car_horn': 100.0, 'children_playing': 1.5403964793796856e-31, 'dog_bark': 7.522264827183131e-30, 'drilling': 2.5643043244372166e-08, 'engine_idling': 2.049589508800343e-14, 'gun_shot': 2.0929218393288606e-18, 'jackhammer': 1.7136666015137696e-21, 'siren': 1.1703976424159922e-31, 'street_music': 2.9004579293889776e-10}
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
0.0
Done!
predicting...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step


127.0.0.1 - - [27/Apr/2025 22:51:12] "POST /classify HTTP/1.1" 200 -


[[2.8470708e-09 3.5641133e-05 9.3421803e-18 1.2503124e-17 4.9738896e-06
  2.1669535e-09 5.4061126e-12 3.1690072e-06 2.2689687e-17 9.9995613e-01]]
[2.8470708e-09 3.5641133e-05 9.3421803e-18 1.2503124e-17 4.9738896e-06
 2.1669535e-09 5.4061126e-12 3.1690072e-06 2.2689687e-17 9.9995613e-01]
labels: {'air_conditioner': 2.8470708457462024e-07, 'car_horn': 0.003564113285392523, 'children_playing': 9.342180485151855e-16, 'dog_bark': 1.2503123214638876e-15, 'drilling': 0.0004973889444954693, 'engine_idling': 2.1669535499313497e-07, 'gun_shot': 5.406112779660077e-10, 'jackhammer': 0.0003169007250107825, 'siren': 2.2689687551502493e-15, 'street_music': 99.99561309814453}
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
0.0
Done!
predicting...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step


127.0.0.1 - - [27/Apr/2025 22:51:22] "POST /classify HTTP/1.1" 200 -


[[4.2240527e-02 1.1058249e-04 3.2128757e-08 1.9111523e-07 2.4907875e-03
  1.9736134e-03 3.4965470e-01 2.0541845e-06 1.0297865e-07 6.0352737e-01]]
[4.2240527e-02 1.1058249e-04 3.2128757e-08 1.9111523e-07 2.4907875e-03
 1.9736134e-03 3.4965470e-01 2.0541845e-06 1.0297865e-07 6.0352737e-01]
labels: {'air_conditioner': 4.224052429199219, 'car_horn': 0.01105824951082468, 'children_playing': 3.212875753888511e-06, 'dog_bark': 1.9111523215542547e-05, 'drilling': 0.24907875061035156, 'engine_idling': 0.19736133515834808, 'gun_shot': 34.96546936035156, 'jackhammer': 0.00020541844423860312, 'siren': 1.0297865628672298e-05, 'street_music': 60.35273742675781}
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
0.0
Done!
predicting...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step


127.0.0.1 - - [27/Apr/2025 22:51:50] "POST /classify HTTP/1.1" 200 -


[[1.91808507e-01 4.04110360e-05 1.09364906e-10 2.69344352e-10
  4.85577111e-05 1.16679184e-02 2.27655939e-09 1.50007822e-04
  9.08670528e-10 7.96284556e-01]]
[1.91808507e-01 4.04110360e-05 1.09364906e-10 2.69344352e-10
 4.85577111e-05 1.16679184e-02 2.27655939e-09 1.50007822e-04
 9.08670528e-10 7.96284556e-01]
labels: {'air_conditioner': 19.180850982666016, 'car_horn': 0.004041103646159172, 'children_playing': 1.0936490824065004e-08, 'dog_bark': 2.6934435126690914e-08, 'drilling': 0.004855771083384752, 'engine_idling': 1.1667917966842651, 'gun_shot': 2.276559314395854e-07, 'jackhammer': 0.01500078197568655, 'siren': 9.086705432537201e-08, 'street_music': 79.62845611572266}
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
0.0
Done!
predicting...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step


127.0.0.1 - - [27/Apr/2025 22:51:59] "POST /classify HTTP/1.1" 200 -


[[7.95357525e-02 1.60415966e-05 1.04699227e-14 8.28692990e-14
  8.62345041e-06 1.03089005e-01 1.14325346e-12 2.19760279e-10
  4.13993384e-13 8.17350566e-01]]
[7.95357525e-02 1.60415966e-05 1.04699227e-14 8.28692990e-14
 8.62345041e-06 1.03089005e-01 1.14325346e-12 2.19760279e-10
 4.13993384e-13 8.17350566e-01]
labels: {'air_conditioner': 7.953575134277344, 'car_horn': 0.0016041597118601203, 'children_playing': 1.0469922455055847e-12, 'dog_bark': 8.286930169854045e-12, 'drilling': 0.0008623450412414968, 'engine_idling': 10.308900833129883, 'gun_shot': 1.1432534779975967e-10, 'jackhammer': 2.1976028108383616e-08, 'siren': 4.139933881819324e-11, 'street_music': 81.73505401611328}
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
0.0


 * Serving Flask app 'Audio Classifier'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [02/Apr/2025 14:43:57] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [02/Apr/2025 14:44:03] "GET /?audio_file=siren.wav HTTP/1.1" 200 -
127.0.0.1 - - [02/Apr/2025 14:44:33] "GET /?audio_file=silence.wav HTTP/1.1" 200 -
