In [29]:
import sounddevice as sd
from scipy.io.wavfile import write
import librosa
import numpy as np
from tensorflow.keras.models import load_model
import joblib
from sklearn.decomposition import PCA

In [2]:
def standardize_mfcc_length(mfcc, target_length=100):
    """
    Standardize a single MFCC matrix to a fixed number of time steps.

    The matrix is truncated or zero-padded along the time axis (axis 1) and
    then flattened row by row. The input dtype is preserved in both cases, so
    short (padded) and long (truncated) clips yield features of the same dtype.

    Parameters:
        mfcc (np.ndarray): 2D MFCC array (n_mfcc, time_steps)
        target_length (int): Desired number of time steps (must be >= 0)

    Returns:
        np.ndarray: 1D array of length n_mfcc * target_length

    Raises:
        ValueError: If `mfcc` is not 2D or `target_length` is negative.
    """
    mfcc = np.asarray(mfcc)
    if mfcc.ndim != 2:
        raise ValueError(
            f"mfcc must be 2D (n_mfcc, time_steps), got shape {mfcc.shape}"
        )
    if target_length < 0:
        # A negative value would otherwise be treated as "slice from the end".
        raise ValueError(f"target_length must be non-negative, got {target_length}")

    current_length = mfcc.shape[1]

    if current_length >= target_length:
        # Truncate (a no-op slice when the length already matches)
        standardized = mfcc[:, :target_length]
    else:
        # Pad the end of the time axis with zeros; np.pad keeps the input
        # dtype, whereas stacking with a float64 zeros block would upcast.
        missing = target_length - current_length
        standardized = np.pad(mfcc, ((0, 0), (0, missing)), mode="constant")

    return standardized.flatten()


In [18]:
fs = 16000  # Sample rate in Hz
seconds = 3  # Duration of the recording in seconds

# PLEASE NOTE: pause for about a second after the recording starts before speaking.
# The recording tends to cut off the very start, so the first second of audio is
# trimmed away before the clip is fed to the models. Leaving that pause gives
# the best prediction results.

# Announce the recording *before* starting it: sd.rec() returns immediately and
# keeps capturing in the background, so printing afterwards would show the
# prompt only once audio is already being recorded.
print("Recording...")
myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=1)
sd.wait()  # Block until the background recording has finished
print("Finished recording")
write('output.wav', fs, myrecording)  # Save the captured samples as a WAV file

Recording...
Finished recording


In [32]:
# Step 1: Load the audio
file_path = 'output.wav'
sample_rate = 16000  # Hz; librosa resamples the file to this rate on load
skip_samples = sample_rate  # one second's worth of samples

y, _ = librosa.load(file_path, sr=sample_rate)

# Discard the first second of the recording and keep everything after it
# (the start of the capture is unreliable, so the speaker is asked to pause).
if len(y) <= skip_samples:
    raise ValueError(
        f"{file_path!r} contains only {len(y)} samples at {sample_rate} Hz; "
        f"more than {skip_samples} are needed because the first second is discarded."
    )
y = y[skip_samples:]

# Step 2: Extract MFCCs (13 coefficients per frame)
mfcc = librosa.feature.mfcc(y=y, sr=sample_rate, n_mfcc=13)

# Step 3: Standardize MFCC length
# The helper pads/truncates to (13, 100) and returns the flattened vector.
standardized_mfcc = standardize_mfcc_length(mfcc, target_length=100)
# Same features for the CNN cell; copied so the two names never alias one array.
cnn_standardized_mfcc = standardized_mfcc.copy()



In [33]:
# Classify the recording with the tuned logistic-regression model.

# Restore the fitted estimator from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files that come from a trusted source.
model = joblib.load("tuned_logistic_model.pkl")
print("Tuned Logistic Regression model loaded.")

# Present the single feature vector as a one-row batch before calling predict().
standardized_mfcc = np.reshape(standardized_mfcc, (1, -1))

# Run inference and report the label predicted for that one row.
predicted_label = model.predict(standardized_mfcc)
print(f"Predicted class label: {predicted_label[0]}")

Tuned Logistic Regression model loaded.
Predicted class label: down


In [34]:
# Classify the same recording with the SVM model.

# Restore the fitted estimator from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files that come from a trusted source.
model = joblib.load("svm_model.pkl")
print("SVM model loaded.")

# Make sure the features form a one-row batch. This leaves the array
# unchanged if an earlier cell has already reshaped it to (1, n_features).
standardized_mfcc = np.reshape(standardized_mfcc, (1, -1))

# Run inference and report the label predicted for that one row.
predicted_label = model.predict(standardized_mfcc)
print(f"Predicted class label: {predicted_label[0]}")

SVM model loaded.
Predicted class label: right


In [None]:
# Test with CNN

# Load the trained Keras model from disk
model = load_model('cnn_model.h5')
print("CNN model loaded.")

# Prepare the CNN input by adding a leading batch axis and a trailing channel axis.
# cnn_standardized_mfcc is the flattened vector returned by
# standardize_mfcc_length, so the result has shape (1, n_features, 1).
# NOTE(review): a 2D-convolutional model would need (batch, height, width,
# channels) instead; confirm against model.input_shape.
input_data = cnn_standardized_mfcc[np.newaxis, ..., np.newaxis]

# Label list in correct encoder order
label_list = ['down', 'go', 'left', 'right', 'stop', 'up']

# Step 5: Predict
prediction = model.predict(input_data)

# Guard against a model whose output width does not match the label list;
# otherwise the lookup below could fail or silently pick the wrong label.
if prediction.shape[-1] != len(label_list):
    raise ValueError(
        f"Model produced {prediction.shape[-1]} outputs but "
        f"{len(label_list)} labels are defined."
    )

# Take the arg-max over the single sample's row rather than the whole batch array.
predicted_index = int(np.argmax(prediction[0]))
predicted_label = label_list[predicted_index]

# When this notebook was last run the model returned unnormalised scores
# (values far outside [0, 1]), so they are reported as scores, not probabilities.
print(f"Prediction scores (raw model output): {prediction}")
print(f"Predicted class index: {predicted_index}")
print(f"Predicted class label: {predicted_label}")

CNN model loaded.
Prediction probabilities: [[ 4582.053   -159.0854 -3739.2175 -2097.907  -1558.0814 -1358.1765]]
Predicted class index: 0
Predicted class label: down
