# In [45]:
import streamlit as st
import os
import numpy as np
import librosa
import tensorflow as tf
from keras.models import load_model
import matplotlib.pyplot as plt

MODEL_PATH = './model/audio_classifier_LSTM_model_v1.h5'

# Streamlit re-executes this script on every widget interaction, so an
# uncached load_model() call would re-read the .h5 file each time a user
# uploads something. Cache the loaded model as a shared resource instead.
# Older Streamlit releases expose the same facility under a different name;
# if neither is available, fall back to a plain (uncached) load.
_cache_resource = (
    getattr(st, 'cache_resource', None)
    or getattr(st, 'experimental_singleton', None)
    or (lambda func: func)
)


@_cache_resource
def _load_classifier():
    """Load the trained Keras classifier stored at MODEL_PATH."""
    return load_model(MODEL_PATH)


model = _load_classifier()

def pad_audio_if_needed(audio, n_fft):
    """Right-pad *audio* with zeros so it holds at least *n_fft* samples.

    Args:
        audio: One-dimensional sequence of audio samples.
        n_fft: Minimum number of samples required.

    Returns:
        The input object itself when it is already long enough; otherwise a
        new array of length ``n_fft`` with zeros appended at the end.
    """
    shortfall = n_fft - len(audio)
    if shortfall > 0:
        audio = np.pad(audio, pad_width=(0, shortfall), mode='constant')
    return audio

def pad_or_truncate(array, max_length):
    """Force a 2-D array to have exactly *max_length* columns.

    Args:
        array: Two-dimensional array; only its second axis is adjusted.
        max_length: Desired size of the second axis.

    Returns:
        The array zero-padded on the right when it has fewer than
        ``max_length`` columns, otherwise its first ``max_length`` columns.
    """
    width = array.shape[1]
    if width >= max_length:
        # Wide enough already: keep only the leading columns.
        return array[:, :max_length]
    missing = max_length - width
    return np.pad(array, pad_width=((0, 0), (0, missing)), mode='constant')
    
def extract_features(file_path, max_time_steps=109, SAMPLE_RATE=16000, N_MELS=128):
    """Build a stacked spectral feature matrix for one audio file.

    The audio is loaded with librosa at ``SAMPLE_RATE``, padded to at least
    512 samples, and turned into five feature matrices (mel spectrogram in
    dB, 13 MFCCs, chroma, spectral contrast, tonnetz). Each matrix is padded
    or truncated to ``max_time_steps`` columns and the results are stacked
    along axis 0 in that order.

    Args:
        file_path: Audio source handed straight to ``librosa.load``.
        max_time_steps: Number of columns every feature matrix is forced to.
        SAMPLE_RATE: Sampling rate used for loading and feature extraction.
        N_MELS: Number of mel bands in the mel spectrogram.

    Returns:
        A 2-D array with ``max_time_steps`` columns containing all features.
    """
    waveform, _ = librosa.load(file_path, sr=SAMPLE_RATE)
    waveform = pad_audio_if_needed(waveform, n_fft=512)

    # Mel power spectrogram converted to dB relative to its own peak.
    mel_db = librosa.power_to_db(
        librosa.feature.melspectrogram(y=waveform, sr=SAMPLE_RATE, n_mels=N_MELS),
        ref=np.max,
    )

    # Order matters: rows are stacked in exactly this sequence.
    feature_blocks = (
        mel_db,
        librosa.feature.mfcc(y=waveform, sr=SAMPLE_RATE, n_mfcc=13),
        librosa.feature.chroma_stft(y=waveform, sr=SAMPLE_RATE),
        librosa.feature.spectral_contrast(y=waveform, sr=SAMPLE_RATE),
        librosa.feature.tonnetz(y=waveform, sr=SAMPLE_RATE),
    )

    # Give every feature the same width so they can be concatenated row-wise.
    aligned = [pad_or_truncate(block, max_time_steps) for block in feature_blocks]
    return np.concatenate(aligned, axis=0)

def classify_audio(audio_file):
    """Classify one audio input as human or AI-generated speech.

    Args:
        audio_file: Audio source forwarded to ``extract_features`` as its
            ``file_path`` argument.

    Returns:
        ``"Human Voice"`` when the model's highest-scoring class index is 1,
        otherwise ``"AI Generated Voice"``.
    """
    features = np.array(extract_features(file_path=audio_file))
    # The model is called with a leading batch axis of size one.
    batch = features[np.newaxis, ...]
    predicted_class = np.argmax(model.predict(batch), axis=1)
    return "Human Voice" if predicted_class == 1 else "AI Generated Voice"
    
# classify_audio('../audio-deepfake-detection-main/TestEvaluation/LA_E_4785445.flac')


# Streamlit app: page title, upload widget, and classification output.
st.title('Human vs AI Voice Classifier')

uploaded_file = st.file_uploader(
    "Choose an audio file...",
    type=['wav', 'mp3', 'ogg', 'flac'],
)

# The uploader returns None until a file has been chosen, so classification
# only runs once there is something to classify.
if uploaded_file is not None:
    result = classify_audio(uploaded_file)
    st.write(f'The audio file is classified as: {result}')
