In [80]:
import glob
from pathlib import Path

import librosa
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import LeaveOneOut, train_test_split, KFold
from sklearn.svm import SVC

In [81]:
def extract_mfcc(files, n_mfccs):
    """Summarise each audio file as a vector of time-averaged MFCCs.

    Parameters
    ----------
    files : iterable of str
        Paths of the audio files to process; each is read with
        ``librosa.load`` using its default arguments.
    n_mfccs : int
        Number of MFCC coefficients requested from ``librosa.feature.mfcc``.

    Returns
    -------
    pandas.DataFrame
        One row per input file (in input order) and one column per
        coefficient, named ``"mfcc_1"`` ... ``"mfcc_<n_mfccs>"``.
    """
    # One (initially empty) list per output column, keyed by column name.
    column_names = ["mfcc_" + str(k) for k in range(1, n_mfccs + 1)]
    columns = {name: [] for name in column_names}

    for path in files:
        # Decode the audio, then compute its MFCC matrix.
        signal, sample_rate = librosa.load(path)
        coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfccs)

        # Collapse the time axis so every file contributes exactly one value
        # per coefficient.
        per_coefficient_mean = np.mean(coefficients.T, axis=0)

        # Append this file's values to the matching columns.
        for position, value in enumerate(per_coefficient_mean, start=1):
            columns["mfcc_" + str(position)].append(value)

    return pd.DataFrame(columns)

In [82]:
def _leave_one_out_accuracy(model, features, target, cv):
    """Return the accuracy of ``model`` pooled over every split yielded by ``cv``.

    For each split the model is refitted on the training rows and asked to
    predict the held-out row; the first prediction of every split is compared
    against the first held-out label once all splits have been processed.
    """
    actual_class = []
    predicted_class = []

    for train_index, test_index in cv.split(features):
        # cv.split yields *positions*, so both features and labels must be
        # selected positionally. Label-based ``target[train_index]`` only works
        # when the frame happens to carry a default 0..n-1 index.
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        actual_class.append(y_test.iloc[0])
        predicted_class.append(y_pred[0])

    return accuracy_score(actual_class, predicted_class)


def run_models(df, random_state=0):
    """Evaluate a linear SVM and a random forest with leave-one-out cross-validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"Class"`` column holding the labels; every other
        column is used as a feature. The frame's index may be anything.
    random_state : int or None, default 0
        Seed forwarded to ``RandomForestClassifier`` so repeated runs print
        the same number. Pass ``None`` for an unseeded forest.

    Returns
    -------
    dict
        ``{"SVM": accuracy, "RF": accuracy}`` with accuracies as fractions.
        The same values are also printed as percentages, one line per model.
    """
    # Prep features and target
    features = df.drop(columns=["Class"])
    target = df["Class"]
    cv = LeaveOneOut()

    models = {
        # No gamma here: SVC only uses gamma for the rbf/poly/sigmoid kernels.
        "SVM": SVC(kernel="linear"),
        "RF": RandomForestClassifier(random_state=random_state),
    }

    accuracies = {}
    for name, model in models.items():
        accuracies[name] = _leave_one_out_accuracy(model, features, target, cv)
        print(f"{name} Accuracy: {accuracies[name]*100:.2f}%")

    return accuracies

In [88]:
# Locate all audio files from the new testing samples folder.
# glob returns matches in arbitrary order, so sort them to keep the row order
# of the resulting dataframes stable from run to run.
all_audio_files = sorted(glob.glob("../microphone-sampling/TestingSamples2BatterySetup/*/*.WAV"))
if not all_audio_files:
    # Fail here with a clear message instead of a later "No objects to
    # concatenate" error from pd.concat.
    raise FileNotFoundError(
        "No .WAV files found under ../microphone-sampling/TestingSamples2BatterySetup/*/"
    )

# Dictionary to store file names by class. The class label is the name of the
# folder that directly contains the file, taken via pathlib so it does not
# depend on the path separator or on how deep the base folder is.
class_dict = {}
for file in all_audio_files:
    file_class = Path(file).parent.name
    class_dict.setdefault(file_class, []).append(file)

# Extract MFCCs for each class, try several numbers of MFCCs, and evaluate the
# models on the combined dataframe for each setting.
potential_num_mfccs = np.arange(10, 15)
for n_mfcc in potential_num_mfccs:
    all_class_dfs_list = []
    for key, class_files in class_dict.items():
        df = extract_mfcc(class_files, n_mfcc)
        df["Class"] = key
        all_class_dfs_list.append(df)

    # ignore_index gives the combined frame a fresh 0..n-1 index.
    all_classes_df = pd.concat(all_class_dfs_list, ignore_index=True)
    print(f'# MFCCs = {n_mfcc}')
    run_models(all_classes_df)
    print()

# MFCCs = 10
SVM Accuracy: 89.62%
RF Accuracy: 91.51%

# MFCCs = 11
SVM Accuracy: 89.62%
RF Accuracy: 91.51%

# MFCCs = 12
SVM Accuracy: 89.62%
RF Accuracy: 94.34%

# MFCCs = 13
SVM Accuracy: 89.62%
RF Accuracy: 94.34%

# MFCCs = 14
SVM Accuracy: 89.62%
RF Accuracy: 95.28%

