In [None]:
%pip install flask tensorflow resampy librosa opencv-python pydub matplotlib

In [None]:
import tensorflow as tf

In [None]:
# Load the trained Keras model from disk. `model` becomes a notebook-level
# global; the /classify route further down calls `model.predict` on it.
# The path is relative to the notebook's working directory.
model = tf.keras.models.load_model('./model/cnn_very_old.keras')


In [None]:
# Print the loaded model's layer summary as a quick sanity check of what was loaded.
model.summary()


In [None]:
from numpy import ndarray
import numpy as np
import librosa
import matplotlib.pyplot as plt
import cv2 as cv
import io
import base64

# Spectrogram parameters used by the helper functions defined in this cell.
HOP_LENGTH = 512        # number of samples between successive frames
WINDOW_LENGTH = 512     # length of the window in samples
# NOTE(review): N_MEL is not passed to melspectrogram() in this cell, so the
# library default applies — confirm the two agree before relying on this value.
N_MEL = 128             # number of Mel bands to generate


def spectrogram_fixed_length(audio, rate, total_samples = 128) -> ndarray:
    """Compute a dB-scaled mel spectrogram with exactly ``total_samples`` frames.

    Args:
        audio: Audio signal passed to ``librosa.feature.melspectrogram`` as ``y``.
        rate: Sampling rate of ``audio``.
        total_samples: Number of time frames (columns) in the returned array.

    Returns:
        The spectrogram in dB relative to its own maximum, cropped or padded
        along the time axis to ``total_samples`` columns.

    NOTE(review): ``librosa.util.fix_length`` pads with zeros by default, and
    0 dB is the loudest value on this max-referenced scale — confirm that this
    matches how the model was trained before changing the padding.
    """
    mel_power = librosa.feature.melspectrogram(
        y=audio,
        sr=rate,
        n_fft=512,
        win_length=WINDOW_LENGTH,
        hop_length=HOP_LENGTH,
    )
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    # Already the requested width: hand it back untouched.
    if mel_db.shape[1] == total_samples:
        return mel_db

    return librosa.util.fix_length(mel_db, size=total_samples, axis=1)


def generate_spectrogram(audio, sample_rate):
    """Render the fixed-length mel spectrogram of ``audio`` to ``uploads/spectrogram.png``.

    The image is drawn with the axes switched off and no padding, so the file
    contains only the spectrogram itself.

    Args:
        audio: Audio signal, forwarded to ``spectrogram_fixed_length``.
        sample_rate: Sampling rate of ``audio``.

    Returns:
        None. The PNG file is the only result; the ``uploads`` directory is
        created if it does not exist yet.
    """
    import os
    # Explicit submodule import: a bare ``import librosa`` does not expose
    # ``librosa.display`` on every librosa release.
    import librosa.display

    spectrogram = spectrogram_fixed_length(audio, sample_rate)

    os.makedirs("uploads", exist_ok=True)

    # Use a dedicated figure rather than pyplot's "current" one, and always
    # close it: otherwise every call leaves an open figure behind and later
    # calls keep drawing into shared state.
    fig, ax = plt.subplots()
    try:
        librosa.display.specshow(spectrogram, cmap='viridis', ax=ax)
        ax.axis('tight')
        ax.axis('off')
        fig.tight_layout()
        fig.savefig("uploads/spectrogram.png", bbox_inches="tight", pad_inches=0.0)
    finally:
        plt.close(fig)



# TODO: use sliding window
def preprocess_audio(audio):
    """Convert the first 3 seconds of an audio source into one model-ready image batch.

    The clip is rendered to ``uploads/spectrogram.png`` by
    ``generate_spectrogram``, read back with OpenCV, resized to 256x256 and
    divided by 255.

    Args:
        audio: Path or file-like object accepted by ``librosa.load``.

    Returns:
        Array reshaped to ``(-1, 256, 256, 3)`` holding the scaled image.

    Note: a later cell in this notebook redefines this function (and
    ``generate_spectrogram`` with an extra ``output_path`` argument).
    """
    clip_seconds = 3
    side = 256

    signal, sr = librosa.load(audio, duration=clip_seconds, res_type='kaiser_fast')
    generate_spectrogram(signal, sr)

    # Round-trip through the PNG written by generate_spectrogram.
    rendered = cv.imread("uploads/spectrogram.png")
    resized = cv.resize(rendered, (side, side))

    print("Done!")

    batch = np.asarray(resized).reshape(-1, side, side, 3)
    return batch / 255


In [None]:
from numpy import ndarray
import numpy as np
import librosa
import matplotlib.pyplot as plt
import cv2 as cv
import io
import base64

# Spectrogram and windowing parameters used by the helper functions in this cell.
HOP_LENGTH = 512        # number of samples between successive frames
WINDOW_LENGTH = 512     # length of the window in samples
# NOTE(review): N_MEL is not passed to melspectrogram() in this cell, so the
# library default applies — confirm the two agree before relying on this value.
N_MEL = 128             # number of Mel bands to generate
WINDOW_STRIDE_SECONDS = 1    # stride, in seconds, between the starts of consecutive sliding windows

def spectrogram_fixed_length(audio, rate, total_samples = 128) -> ndarray:
    """Compute a dB-scaled mel spectrogram with exactly ``total_samples`` frames.

    Prints the number of frames produced before any cropping or padding.

    Args:
        audio: Audio signal passed to ``librosa.feature.melspectrogram`` as ``y``.
        rate: Sampling rate of ``audio``.
        total_samples: Number of time frames (columns) in the returned array.

    Returns:
        The spectrogram in dB relative to its own maximum, cropped or padded
        along the time axis to ``total_samples`` columns.

    NOTE(review): ``librosa.util.fix_length`` pads with zeros by default, and
    0 dB is the loudest value on this max-referenced scale — confirm that this
    matches how the model was trained before changing the padding.
    """
    mel_db = librosa.power_to_db(
        librosa.feature.melspectrogram(
            y=audio,
            sr=rate,
            n_fft=512,
            win_length=WINDOW_LENGTH,
            hop_length=HOP_LENGTH,
        ),
        ref=np.max,
    )

    n_frames = mel_db.shape[1]
    print(n_frames)

    # Nothing to adjust when the width is already right.
    if n_frames == total_samples:
        return mel_db
    return librosa.util.fix_length(mel_db, size=total_samples, axis=1)

def generate_spectrogram(audio, sample_rate, output_path):
    """Render the fixed-length mel spectrogram of ``audio`` to an image file.

    The image is drawn with the axes switched off and no padding, so the file
    contains only the spectrogram itself.

    Args:
        audio: Audio signal, forwarded to ``spectrogram_fixed_length``.
        sample_rate: Sampling rate of ``audio``.
        output_path: Destination passed to ``Figure.savefig``. When it is a
            filesystem path, its parent directory is created if missing.

    Returns:
        None. The written image is the only result.
    """
    import os
    # Explicit submodule import: a bare ``import librosa`` does not expose
    # ``librosa.display`` on every librosa release.
    import librosa.display

    spectrogram = spectrogram_fixed_length(audio, sample_rate)

    if isinstance(output_path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(output_path))
        if parent:
            os.makedirs(parent, exist_ok=True)

    # Draw on a dedicated figure instead of pyplot's "current" one (which may
    # belong to another cell), and close it even when rendering or saving
    # fails so figures do not accumulate across windows.
    fig, ax = plt.subplots()
    try:
        librosa.display.specshow(spectrogram, cmap='viridis', ax=ax)
        ax.axis('tight')
        ax.axis('off')
        fig.tight_layout()
        fig.savefig(output_path, bbox_inches="tight", pad_inches=0.0)
    finally:
        plt.close(fig)

def preprocess_audio(audio):
    """Turn up to 10 seconds of audio into a list of model-ready spectrogram images.

    The audio is cut into sliding windows whose starts are
    ``WINDOW_STRIDE_SECONDS`` apart. Each window is rendered to
    ``uploads/spectrogram_<i>.png``, read back with OpenCV, resized to 256x256
    and divided by 255.

    Every window is sliced to a bounded length *before* its spectrogram is
    computed. Passing the whole remainder of the clip would redo the analysis
    of later audio for every window, and would make the window's dB scale
    (which is relative to the peak of whatever is passed in) depend on audio
    outside the window.

    Args:
        audio: Path or file-like object accepted by ``librosa.load``.

    Returns:
        List with one array per window, each reshaped to ``(-1, 256, 256, 3)``.
        Empty only when no audio samples could be loaded.

    Raises:
        RuntimeError: If a rendered spectrogram image cannot be read back.
    """
    MAX_SECONDS = 10     # process at most the first 10 seconds of audio
    # Same clip length as the single-clip version of this function in the
    # earlier cell, which loads 3 seconds of audio.
    WINDOW_SECONDS = 3
    IMG_SIZE = 256       # side length of the square image handed to the model

    signal, sample_rate = librosa.load(audio, duration=MAX_SECONDS, res_type='kaiser_fast')
    if len(signal) == 0:
        return []
    sample_rate = int(sample_rate)

    # Window starts span the whole seconds of loaded audio. Clips shorter than
    # one second still get a single window instead of none at all.
    whole_seconds = int(min(len(signal) / sample_rate, MAX_SECONDS))
    stride = max(1, int(WINDOW_STRIDE_SECONDS * sample_rate))
    starts = range(0, max(whole_seconds, 1) * sample_rate, stride)
    window_length = WINDOW_SECONDS * sample_rate

    spectrograms = []
    for window, start in enumerate(starts):
        output_path = f"uploads/spectrogram_{window}.png"
        generate_spectrogram(signal[start:start + window_length], sample_rate, output_path)

        image = cv.imread(output_path)
        if image is None:
            # cv.imread signals failure by returning None rather than raising.
            raise RuntimeError(f"Could not read rendered spectrogram: {output_path}")

        resized = cv.resize(image, (IMG_SIZE, IMG_SIZE))
        spectrograms.append(resized.reshape(-1, IMG_SIZE, IMG_SIZE, 3) / 255)

    return spectrograms


In [None]:
# Smoke-test the preprocessing pipeline on a sample recording. The path is
# relative to the notebook's working directory; as the cell's last expression,
# the returned value is displayed as the cell output.
preprocess_audio("../untitled.ogg")

In [None]:
import os
import secrets

from flask import Flask, render_template, request, session

# Flask application that serves the classifier page and its JSON endpoints.
app = Flask("Audio Classifier")
app.config['UPLOAD_FOLDER'] = 'uploads/'

# Flask signs the session cookie with this key, so it must not be a guessable
# constant. Set FLASK_SECRET_KEY to keep sessions valid across restarts;
# otherwise a fresh random key is generated each time this cell runs, which
# invalidates previously issued session cookies.
app.secret_key = os.environ.get('FLASK_SECRET_KEY') or secrets.token_hex(32)

# Class names shown in the UI. classify() pairs each position in this list with
# the same position in the model's output vector, and the category ids kept in
# the session are indices into this list, so the order is significant.
# NOTE(review): presumably this order matches the label encoding used when the
# model was trained — confirm.
LABELS = [
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music"
]

@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the landing page with the settings saved in the session.

    GET and POST are handled identically. The threshold defaults to 0 and the
    selected categories to an empty list when the session holds no value.
    """
    saved_threshold = session.get('threshold', 0)
    saved_categories = session.get('selected_categories', [])

    context = {
        'categories': LABELS,
        'threshold': int(saved_threshold),
        'selected_categories': [int(item) for item in saved_categories],
    }
    return render_template('./index.html', **context)


@app.route('/process_categories', methods=['POST'])
def process_categories():
    """Store the selected category indices and detection threshold in the session.

    Expects a JSON object with optional keys ``selected_categories`` (list of
    values convertible to int; defaults to an empty list) and ``threshold``
    (a finite number; defaults to 50).

    Returns:
        ``{"success": True}`` on success. Malformed input yields
        ``{"success": False, "error": ...}`` with HTTP status 400 instead of
        an unhandled exception, and nothing is written to the session.
    """
    import math

    payload = request.get_json()
    if not isinstance(payload, dict):
        return {"success": False, "error": "Expected a JSON object."}, 400

    try:
        # convert them to integers
        selected_categories = [int(x) for x in payload.get("selected_categories", [])]
        threshold = float(payload.get('threshold', 50))
    except (TypeError, ValueError, OverflowError):
        return {
            "success": False,
            "error": "selected_categories must be a list of integers and threshold a number.",
        }, 400

    # Reject NaN/inf here: the other routes convert the stored threshold with
    # int()/float() and would fail or misbehave on it.
    if not math.isfinite(threshold):
        return {"success": False, "error": "threshold must be a finite number."}, 400

    session['selected_categories'] = selected_categories
    session['threshold'] = threshold

    return {"success": True}


def _confident_labels(scores, selected_categories, confidence_threshold):
    """Map each selected label to its confidence in percent, dropping those below the threshold."""
    selected = set(selected_categories)
    confident = {}
    for index, label in enumerate(LABELS):
        if index not in selected:
            continue

        confidence = float(scores[index] * 100.0)
        if confidence < confidence_threshold:
            continue

        confident[label] = confidence
    return confident


@app.route('/classify', methods=['POST'])
def classify():
    """Classify an uploaded audio file and report the labels that pass the user's filters.

    The upload (form field ``audio_file``) is split into spectrogram windows by
    ``preprocess_audio``. All windows are scored in one ``model.predict`` call
    and the per-class scores are averaged over the windows, so the result
    reflects the whole clip rather than only its final window.

    Only categories selected in the session are reported, and only when their
    confidence (in percent) reaches the session's threshold.

    Returns:
        Dict with ``prediction`` (the most confident reported label, or a hint
        message when none qualifies), ``spectrogram`` (base64-encoded PNG of
        the first window) and ``predictions`` (label -> confidence percent).
        When no audio could be extracted, ``{'error': ...}`` with status 400.
    """
    # Get the audio file from the request
    audio_file = request.files['audio_file']

    windows = preprocess_audio(audio_file)
    if len(windows) == 0:
        return {'error': 'No audio could be extracted from the uploaded file.'}, 400

    # Each window is a batch of one image; stack them so the model runs once.
    batch = np.concatenate(windows, axis=0)
    window_scores = np.asarray(model.predict(batch))
    scores = window_scores.reshape(len(windows), -1).mean(axis=0)

    # categories to include
    selected_categories = session.get('selected_categories', [])
    confidence_threshold = float(session.get('threshold', 0.))

    p = _confident_labels(scores, selected_categories, confidence_threshold)

    # create a predicted label
    if len(p) == 0:
        if len(selected_categories) == 0:
            predicted = "None Found. You must select a nuisance."
        else:
            predicted = "None Found. Try lowering the detection threshold."
    else:
        predicted = max(p, key=p.get)

    # Spectrogram image of the first window; the context manager makes sure
    # the file handle is released.
    with open("uploads/spectrogram_0.png", 'rb') as image_file:
        spectrogram = base64.b64encode(image_file.read()).decode('utf-8')

    # Return the classification result
    return {
        'prediction': predicted,
        'spectrogram': spectrogram,
        'predictions': p,
    }

# Start Flask's built-in development server with its default settings.
# This call blocks the cell until the server is stopped.
app.run()
