In [1]:
%pip install flask tensorflow resampy librosa opencv-python pydub




[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [None]:
import tensorflow as tf

In [3]:
# Load the trained Keras model from ./model/cnn.keras (a path relative to the
# current working directory). The /classify route calls model.predict on it.
model = tf.keras.models.load_model('./model/cnn.keras')

In [3]:
# Show the loaded model's summary as a quick check that loading succeeded.
model.summary()

In [4]:
import base64
import io
import math
import os
import secrets

import cv2 as cv
import librosa
import matplotlib.pyplot as plt
import numpy as np
from numpy import ndarray

# Mel-spectrogram settings consumed by spectrogram_fixed_length.
HOP_LENGTH = 512        # number of samples between successive frames
WINDOW_LENGTH = 512     # length of the window in samples
# NOTE(review): N_MEL is not referenced anywhere in the visible code; confirm
# whether it was meant to be passed as n_mels to librosa.feature.melspectrogram.
N_MEL = 128             # number of Mel bands to generate


def spectrogram_fixed_length(audio, rate, total_samples = 128) -> ndarray:
    """Compute a dB-scaled mel spectrogram with exactly ``total_samples`` frames.

    Args:
        audio: Audio time series, forwarded to
            ``librosa.feature.melspectrogram`` as ``y``.
        rate: Sampling rate of ``audio``, forwarded as ``sr``.
        total_samples: Required size of axis 1 of the returned array.

    Returns:
        The mel spectrogram converted with ``librosa.power_to_db`` relative to
        its own maximum, with axis 1 forced to ``total_samples`` entries by
        ``librosa.util.fix_length``.
    """
    mel_power = librosa.feature.melspectrogram(
        y=audio,
        sr=rate,
        n_fft=512,
        win_length=WINDOW_LENGTH,
        hop_length=HOP_LENGTH,
    )
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    # Already the requested width: nothing to trim or pad.
    if mel_db.shape[1] == total_samples:
        return mel_db

    # fix_length runs with its default padding; the alternative
    # constant_values=(0, -80) was left disabled by the original author.
    # NOTE(review): with ref=np.max the loudest bin sits at 0 dB, so default
    # zero padding would fill short clips at peak level -- confirm this matches
    # how the training images were produced before changing it.
    return librosa.util.fix_length(mel_db, size=total_samples, axis=1)


def generate_spectrogram(audio, sample_rate, output_path="uploads/spectrogram.png"):
    """Render the fixed-length mel spectrogram of ``audio`` to an image file.

    The picture is drawn without axes or padding so that only the spectrogram
    itself ends up in the saved file.

    Args:
        audio: Audio time series passed on to ``spectrogram_fixed_length``.
        sample_rate: Sampling rate of ``audio``.
        output_path: Destination of the rendered image. Defaults to
            ``uploads/spectrogram.png``, the file ``preprocess_audio`` and
            ``classify`` read back.

    Returns:
        None. The only result is the file written to ``output_path``.
    """
    spectrogram = spectrogram_fixed_length(audio, sample_rate)

    # Draw on a dedicated figure instead of pyplot's implicit "current" one.
    # This function runs once per request; reusing the implicit figure would
    # stack another mesh on the same axes every time and never release it.
    fig, ax = plt.subplots()
    try:
        librosa.display.specshow(spectrogram, cmap='viridis', ax=ax)
        ax.axis('tight')
        ax.axis('off')
        fig.tight_layout()
        fig.savefig(output_path, bbox_inches="tight", pad_inches=0.0)
    finally:
        # Always free the figure, even if rendering or saving fails.
        plt.close(fig)



# TODO: use sliding window
def preprocess_audio(audio):
    """Convert an audio source into a single-image batch for the model.

    At most the first three seconds are loaded with librosa and rendered to
    ``uploads/spectrogram.png`` by ``generate_spectrogram``. That image is
    then read back with OpenCV, resized to 256x256 and divided by 255.

    Args:
        audio: Anything ``librosa.load`` accepts; ``classify`` passes the
            uploaded file object taken from the request.

    Returns:
        The resized image reshaped to ``(-1, 256, 256, 3)`` and divided by 255.
    """
    clip_seconds = 3
    side = 256

    waveform, sr = librosa.load(audio, duration=clip_seconds, res_type='kaiser_fast')
    generate_spectrogram(waveform, sr)

    # The model input is the rendered picture, not the raw dB matrix.
    picture = cv.imread("uploads/spectrogram.png")
    picture = cv.resize(picture, (side, side))

    print("Done!")

    batch = np.array(picture).reshape(-1, side, side, 3)
    return batch / 255


In [None]:
from flask import Flask, render_template, request, session

# import os
# from pydub import AudioSegment

# Flask application object; the route handlers below are registered on it.
app = Flask("Audio Classifier")
app.config['UPLOAD_FOLDER'] = 'uploads/'
# The session cookie is signed with this key, so a guessable literal would let
# any client forge the stored threshold and category selection. Read the key
# from the FLASK_SECRET_KEY environment variable instead. When it is unset, a
# random per-process key is generated, so sessions do not survive a restart.
app.secret_key = os.environ.get('FLASK_SECRET_KEY') or secrets.token_hex(32)

# Class names in model-output order: classify() pairs LABELS[i] with the i-th
# value of the flattened prediction, and the session stores the user's selected
# categories as indices into this list, so the order must not change.
# NOTE(review): these look like the UrbanSound8K classes -- confirm against the
# training notebook.
LABELS = [
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music"
]

@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the landing page with the settings remembered in the session.

    The threshold (0 when the session has none) and the selected category
    indices (empty when the session has none) are converted to ``int`` and
    handed to ``index.html`` together with the full ``LABELS`` list.
    """
    stored_threshold = session.get('threshold', 0)
    stored_categories = session.get('selected_categories', [])

    context = {
        "categories": LABELS,
        "threshold": int(stored_threshold),
        "selected_categories": [int(choice) for choice in stored_categories],
    }
    return render_template('./index.html', **context)


@app.route('/process_categories', methods=['POST'])
def process_categories():
    """Store the user's category selection and detection threshold in the session.

    Expects a JSON object body with two optional keys:

    * ``selected_categories``: a list of values convertible to ``int``
      (indices into ``LABELS``); defaults to an empty list.
    * ``threshold``: a finite number, or a value convertible to one;
      defaults to 50.

    Returns:
        ``{"success": True}`` once both values are stored, or a 400 response
        carrying ``{"success": False, "error": ...}`` when the body is
        malformed. Nothing is written to the session in the error case.
    """
    payload = request.get_json()
    if not isinstance(payload, dict):
        return {"success": False, "error": "Expected a JSON object."}, 400

    raw_categories = payload.get("selected_categories", [])
    if not isinstance(raw_categories, list):
        return {"success": False, "error": "selected_categories must be a list."}, 400

    try:
        # convert them to integers
        selected_categories = [int(x) for x in raw_categories]
        threshold = float(payload.get('threshold', 50))
    except (TypeError, ValueError, OverflowError):
        return {
            "success": False,
            "error": "selected_categories and threshold must be numeric.",
        }, 400

    # index() applies int() and classify() applies float() to the stored
    # threshold. A non-finite value would make every later page load fail for
    # this session, so reject it before it is stored.
    if not math.isfinite(threshold):
        return {"success": False, "error": "threshold must be a finite number."}, 400

    session['selected_categories'] = selected_categories
    session['threshold'] = threshold

    return {"success": True}


@app.route('/classify', methods=['POST'])
def classify():
    """Classify an uploaded audio clip and report the matching categories.

    The ``audio_file`` upload is converted to a spectrogram image batch by
    ``preprocess_audio`` and fed to the model. Each class confidence is scaled
    to percent, then filtered by the category selection and threshold stored
    in the session. Accessing a missing ``audio_file`` upload makes Flask
    abort the request with a 400 response.

    Returns:
        A dict with:
          * ``prediction``: the label with the highest remaining confidence,
            or an explanatory message when nothing passed the filters;
          * ``spectrogram``: the rendered spectrogram PNG, base64-encoded;
          * ``predictions``: mapping of each remaining label to its
            confidence in percent.
    """
    # Get the audio file from the request
    audio_file = request.files['audio_file']

    # Process the audio file using the trained model
    audio_data = preprocess_audio(audio_file)
    print("predicting...")
    prediction = model.predict(audio_data)

    print(prediction)

    # categories to include
    selected_categories = session.get('selected_categories', [])
    confidence_threshold = float(session.get('threshold', 0.))

    pred = prediction.flatten()
    print(pred)

    # Keep only the selected classes whose confidence reaches the threshold.
    # The loop variable is deliberately not called `index`, which would shadow
    # the view function of that name.
    p = {}
    for (label_index, label) in enumerate(LABELS):
        if label_index not in selected_categories:
            continue

        confidence = float(pred[label_index] * 100.0)

        if confidence < confidence_threshold:
            continue

        p[label] = confidence

    print("labels:", p)

    print(selected_categories)
    print(confidence_threshold)

    # create a predicted label
    if len(p) == 0:
        if len(selected_categories) == 0:
            predicted = "None Found. You must select a nuisance."
        else:
            predicted = "None Found. Try lowering the detection threshold."
    else:
        predicted = max(p, key=p.get)

    # create spectrogram image
    # Read the picture that preprocess_audio just rendered; the context manager
    # guarantees the file handle is released after every request.
    # NOTE(review): every request writes and reads this same path, so
    # concurrent requests could pick up each other's image -- confirm whether
    # the server ever handles requests in parallel.
    with open("uploads/spectrogram.png", 'rb') as image_file:
        image_buffer = image_file.read()
    spectrogram = base64.b64encode(image_buffer).decode('utf-8')

    # Return the classification result
    return {
        'prediction': predicted,
        'spectrogram': spectrogram,
        'predictions': p,
    }

# Start Flask's built-in server with its default settings. This call blocks the
# cell until the server is stopped, so any cell placed after it will not run
# in the meantime.
app.run()


 * Serving Flask app 'Audio Classifier'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [14/Apr/2025 15:28:14] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:20] "POST /process_categories HTTP/1.1" 200 -


Done!
predicting...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step


127.0.0.1 - - [14/Apr/2025 15:28:21] "POST /classify HTTP/1.1" 200 -


[[3.5287377e-11 2.9387916e-08 1.5286508e-02 1.0979405e-03 1.9418595e-04
  5.1260150e-07 1.5160484e-10 2.8667942e-04 2.1877477e-01 7.6435930e-01]]
[3.5287377e-11 2.9387916e-08 1.5286508e-02 1.0979405e-03 1.9418595e-04
 5.1260150e-07 1.5160484e-10 2.8667942e-04 2.1877477e-01 7.6435930e-01]
confidence 3.5287377186943303e-09
confidence 2.938791610063163e-06
confidence 1.528650801628828
confidence 0.10979404905810952
confidence 0.01941859518410638
confidence 5.1260150257803616e-05
confidence 1.516048397931513e-08
confidence 0.028667942387983203
confidence 21.877476572990417
labels: {'air_conditioner': 3.5287377186943303e-09, 'car_horn': 2.938791610063163e-06, 'children_playing': 1.528650801628828, 'dog_bark': 0.10979404905810952, 'drilling': 0.01941859518410638, 'engine_idling': 5.1260150257803616e-05, 'gun_shot': 1.516048397931513e-08, 'jackhammer': 0.028667942387983203, 'siren': 21.877476572990417}
[0, 1, 2, 3, 4, 5, 6, 7, 8]
0.0
Done!
predicting...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[

127.0.0.1 - - [14/Apr/2025 15:28:21] "POST /classify HTTP/1.1" 200 -


[[3.5287377e-11 2.9387916e-08 1.5286508e-02 1.0979405e-03 1.9418595e-04
  5.1260150e-07 1.5160484e-10 2.8667942e-04 2.1877477e-01 7.6435930e-01]]
[3.5287377e-11 2.9387916e-08 1.5286508e-02 1.0979405e-03 1.9418595e-04
 5.1260150e-07 1.5160484e-10 2.8667942e-04 2.1877477e-01 7.6435930e-01]
confidence 3.5287377186943303e-09
confidence 2.938791610063163e-06
confidence 1.528650801628828
confidence 0.10979404905810952
confidence 0.01941859518410638
confidence 5.1260150257803616e-05
confidence 1.516048397931513e-08
confidence 0.028667942387983203
confidence 21.877476572990417
confidence 76.43592953681946
labels: {'air_conditioner': 3.5287377186943303e-09, 'car_horn': 2.938791610063163e-06, 'children_playing': 1.528650801628828, 'dog_bark': 0.10979404905810952, 'drilling': 0.01941859518410638, 'engine_idling': 5.1260150257803616e-05, 'gun_shot': 1.516048397931513e-08, 'jackhammer': 0.028667942387983203, 'siren': 21.877476572990417, 'street_music': 76.43592953681946}
[0, 1, 2, 3, 4, 5, 6, 7, 8,

127.0.0.1 - - [14/Apr/2025 15:28:25] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:26] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:26] "POST /process_categories HTTP/1.1" 200 -


Done!
predicting...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step


127.0.0.1 - - [14/Apr/2025 15:28:29] "POST /classify HTTP/1.1" 200 -


[[3.5287377e-11 2.9387916e-08 1.5286508e-02 1.0979405e-03 1.9418595e-04
  5.1260150e-07 1.5160484e-10 2.8667942e-04 2.1877477e-01 7.6435930e-01]]
[3.5287377e-11 2.9387916e-08 1.5286508e-02 1.0979405e-03 1.9418595e-04
 5.1260150e-07 1.5160484e-10 2.8667942e-04 2.1877477e-01 7.6435930e-01]
confidence 3.5287377186943303e-09
confidence 2.938791610063163e-06
confidence 1.528650801628828
confidence 0.10979404905810952
confidence 0.01941859518410638
confidence 5.1260150257803616e-05
confidence 1.516048397931513e-08
confidence 0.028667942387983203
confidence 21.877476572990417
labels: {'air_conditioner': 3.5287377186943303e-09, 'car_horn': 2.938791610063163e-06, 'children_playing': 1.528650801628828, 'dog_bark': 0.10979404905810952, 'drilling': 0.01941859518410638, 'engine_idling': 5.1260150257803616e-05, 'gun_shot': 1.516048397931513e-08, 'jackhammer': 0.028667942387983203, 'siren': 21.877476572990417}
[0, 1, 2, 3, 4, 5, 6, 7, 8]
0.0
Done!
predicting...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[

127.0.0.1 - - [14/Apr/2025 15:28:34] "POST /classify HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:36] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:36] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:36] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:36] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:36] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:36] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:36] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:36] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:36] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:36] "POST /process_categories HTTP/1.1" 200 -


Done!
predicting...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step


127.0.0.1 - - [14/Apr/2025 15:28:39] "POST /classify HTTP/1.1" 200 -


[[3.5287377e-11 2.9387916e-08 1.5286508e-02 1.0979405e-03 1.9418595e-04
  5.1260150e-07 1.5160484e-10 2.8667942e-04 2.1877477e-01 7.6435930e-01]]
[3.5287377e-11 2.9387916e-08 1.5286508e-02 1.0979405e-03 1.9418595e-04
 5.1260150e-07 1.5160484e-10 2.8667942e-04 2.1877477e-01 7.6435930e-01]
confidence 3.5287377186943303e-09
confidence 2.938791610063163e-06
confidence 1.528650801628828
confidence 0.10979404905810952
confidence 0.01941859518410638
confidence 5.1260150257803616e-05
confidence 1.516048397931513e-08
confidence 0.028667942387983203
confidence 21.877476572990417
labels: {'siren': 21.877476572990417}
[0, 1, 2, 3, 4, 5, 6, 7, 8]
12.0


127.0.0.1 - - [14/Apr/2025 15:28:49] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:50] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:50] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:50] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:50] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:50] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:50] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:50] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:50] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:50] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:28:56] "POST /process_categories HTTP/1.1" 200 -


Done!
predicting...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step


127.0.0.1 - - [14/Apr/2025 15:28:57] "POST /classify HTTP/1.1" 200 -


[[3.5287377e-11 2.9387916e-08 1.5286508e-02 1.0979405e-03 1.9418595e-04
  5.1260150e-07 1.5160484e-10 2.8667942e-04 2.1877477e-01 7.6435930e-01]]
[3.5287377e-11 2.9387916e-08 1.5286508e-02 1.0979405e-03 1.9418595e-04
 5.1260150e-07 1.5160484e-10 2.8667942e-04 2.1877477e-01 7.6435930e-01]
confidence 3.5287377186943303e-09
confidence 2.938791610063163e-06
confidence 1.528650801628828
confidence 0.10979404905810952
confidence 0.01941859518410638
confidence 5.1260150257803616e-05
confidence 1.516048397931513e-08
confidence 0.028667942387983203
labels: {}
[0, 1, 2, 3, 4, 5, 6, 7]
27.0


127.0.0.1 - - [14/Apr/2025 15:29:00] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:29:00] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:29:00] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:29:00] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:29:00] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:29:00] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:29:00] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:29:00] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:29:00] "POST /process_categories HTTP/1.1" 200 -
127.0.0.1 - - [14/Apr/2025 15:29:00] "POST /process_categories HTTP/1.1" 200 -


Done!
predicting...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step


127.0.0.1 - - [14/Apr/2025 15:29:02] "POST /classify HTTP/1.1" 200 -


[[3.5287377e-11 2.9387916e-08 1.5286508e-02 1.0979405e-03 1.9418595e-04
  5.1260150e-07 1.5160484e-10 2.8667942e-04 2.1877477e-01 7.6435930e-01]]
[3.5287377e-11 2.9387916e-08 1.5286508e-02 1.0979405e-03 1.9418595e-04
 5.1260150e-07 1.5160484e-10 2.8667942e-04 2.1877477e-01 7.6435930e-01]
confidence 3.5287377186943303e-09
confidence 2.938791610063163e-06
confidence 1.528650801628828
confidence 0.10979404905810952
confidence 0.01941859518410638
confidence 5.1260150257803616e-05
confidence 1.516048397931513e-08
confidence 0.028667942387983203
labels: {'air_conditioner': 3.5287377186943303e-09, 'car_horn': 2.938791610063163e-06, 'children_playing': 1.528650801628828, 'dog_bark': 0.10979404905810952, 'drilling': 0.01941859518410638, 'engine_idling': 5.1260150257803616e-05, 'gun_shot': 1.516048397931513e-08, 'jackhammer': 0.028667942387983203}
[0, 1, 2, 3, 4, 5, 6, 7]
0.0


 * Serving Flask app 'Audio Classifier'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [02/Apr/2025 14:43:57] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [02/Apr/2025 14:44:03] "GET /?audio_file=siren.wav HTTP/1.1" 200 -
127.0.0.1 - - [02/Apr/2025 14:44:33] "GET /?audio_file=silence.wav HTTP/1.1" 200 -
