In [29]:
import ast
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix

In [46]:
# Lower bounds a result row has to reach on each metric to be kept.
# The numbers in the trailing comments look like alternative (stricter)
# thresholds used in another run -- TODO confirm their origin.
min_acc = 0.34  # accuracy    (alt. 0.8577)
min_sen = 0.0   # sensitivity (alt. 0.8759)
min_spe = 1.0   # specificity (alt. 0.8394)

In [51]:
# Load the file_information table.
# The file is an Excel workbook (.xlsx), which pd.read_csv cannot parse, so it
# is read with pd.read_excel (pandas needs an Excel engine such as openpyxl).
# The path is relative to the notebook's working directory.
table_info = pd.read_excel(Path("datasets", "file_information.xlsx"))
table_info

FileNotFoundError: [Errno 2] No such file or directory: 'datasets\\file_information.xlsx'

In [47]:
# Get all result files (sorted so that the row order is reproducible)
results_dir = Path("results")
file_list = sorted(results_dir.glob("**/results.csv"))

# Column layout of the combined results table
result_columns = ["folder", "options", "acc", "sen", "spe"]

# Collect one frame per result file and concatenate once at the end;
# growing a DataFrame row by row is quadratic and yields object-typed columns.
frames = []
for file in file_list:
    # The run identifier (UUID) is the first directory below results_dir.
    # Path.parts works with any path separator, unlike splitting on '\\'.
    relative_parts = file.parent.relative_to(results_dir).parts
    if not relative_parts:
        # results.csv lying directly in results_dir belongs to no run folder
        continue
    folder = relative_parts[0]

    table = pd.read_csv(file)
    # Positional selection: first two columns and last two columns of the file
    # are mapped to options, acc, sen and spe respectively.
    selected = table.iloc[:, [0, 1, -2, -1]].copy()
    selected.columns = result_columns[1:]
    selected.insert(0, "folder", folder)
    frames.append(selected)

if frames:
    table_results = pd.concat(frames, ignore_index=True)
else:
    # No result files found: keep an empty table with the expected columns
    table_results = pd.DataFrame(columns=result_columns)

# Drop low results: a row is removed as soon as one metric is below its minimum.
# (Rows with NaN metrics are kept, because NaN < x evaluates to False.)
below_minimum = pd.Series(False, index=table_results.index)
for col, metric in zip(["acc", "sen", "spe"], [min_acc, min_sen, min_spe]):
    below_minimum |= (table_results[col] < metric).astype(bool)
table_results = table_results.loc[~below_minimum].copy()

table_results

Unnamed: 0,folder,options,acc,sen,spe
3,14c71937-6673-47c1-a377-52f70c643e17,"(0.05, 20, 'rbf', 'auto', 1)",0.384615,0.074627,1.0


In [48]:
# Train a classifier for each retained result row and list the samples it
# misclassifies over the union of the training and test sets.
for name, option in zip(table_results.folder, table_results.options):
    # Dataset loading
    training_data_dir = Path("training_data").joinpath(name)

    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # that were produced by this project.
    # Context managers make sure the file handles are closed on every iteration.
    with open(training_data_dir.joinpath("train_set.pk"), "rb") as train_file:
        train_set = pickle.load(train_file)
    with open(training_data_dir.joinpath("test_set.pk"), "rb") as test_file:
        test_set = pickle.load(test_file)

    input_train = np.array(train_set["data"])
    input_test = np.array(test_set["data"])
    input_total = np.concatenate((input_train, input_test), axis=0)

    # np.concatenate (instead of `+`) gives the same result for lists and does
    # not turn into an element-wise addition if the values are arrays.
    label_train = np.array(train_set["labels"])
    label_test = np.array(test_set["labels"])
    label_total = np.concatenate((label_train, label_test), axis=0)

    index_total = np.concatenate(
        (np.array(train_set["index"]), np.array(test_set["index"])), axis=0
    )

    # Classifier options: the CSV stores them as the repr of a tuple,
    # e.g. "(0.05, 20, 'rbf', 'auto', 1)". literal_eval restores the original
    # types, so a numeric gamma reaches SVC as a number and not as a string.
    parsed_option = ast.literal_eval(option) if isinstance(option, str) else option
    c, weight, kernel, gamma, degree = parsed_option
    clf = svm.SVC(
        kernel=kernel,
        gamma=gamma,
        degree=int(degree),
        class_weight={0: weight / 10},  # stored weight is ten times the class-0 weight
        C=float(c),
    )

    # Training the classifier
    clf.fit(input_train, label_train)

    # # Validating the classifier (just to be sure about the reproducibility)
    # y_pred = clf.predict(input_test)
    # TN, FP, FN, TP = confusion_matrix(label_test, y_pred).ravel()
    # acc = accuracy_score(label_test, y_pred)
    # sen = recall_score(label_test, y_pred)
    # spe = TN / (TN + FP)
    # print((c, weight, kernel, gamma, degree), acc, sen, spe)

    # Predicting ALL values to see which samples are incorrectly identified
    y_pred = clf.predict(input_total)
    # Incorrect predictions
    print(index_total[y_pred != label_total])
    
    

[2445 2098 1777 ... 1635 1993 2416]


In [27]:
from pathlib import Path

# Recursively collect every .wav file below the datasets directory
path = Path("datasets")
files = [wav_file for wav_file in path.glob("**/*.wav")]

# Compare the number of paths found with the number of distinct paths
files_series = pd.Series(files)
total_shape = files_series.shape
print(total_shape)
files_series.unique().shape

(2041,)


(2041,)

In [21]:
import numpy as np
import pickle

# Load the pickled training and test splits from the working directory.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# that were produced by this project.
# Context managers close the file handles, which a bare open() leaves dangling.
with open("train_set.pk", "rb") as train_file:
    train_set = pickle.load(train_file)
with open("test_set.pk", "rb") as test_file:
    test_set = pickle.load(test_file)

# Per-split arrays, converted once and reused below
dataset = {}
dataset["train_input"] = np.array(train_set["data"])
dataset["train_label"] = np.array(train_set["labels"])
dataset["test_input"] = np.array(test_set["data"])
dataset["test_label"] = np.array(test_set["labels"])

# Training entries followed by test entries
dataset["index"] = np.concatenate((np.array(train_set["index"]), np.array(test_set["index"])), axis=0)
dataset["input"] = np.concatenate((dataset["train_input"], dataset["test_input"]), axis=0)
dataset["labels"] = np.concatenate((dataset["train_label"], dataset["test_label"]), axis=0)

# Number of test rows versus number of distinct test rows:
# a smaller second shape means the test set contains duplicated rows.
print(dataset["test_input"].shape)
np.unique(dataset["test_input"], axis=0).shape


(377, 127)


(355, 127)

In [26]:
# Number of test-set indices versus number of distinct indices;
# a smaller second value means some indices occur more than once.
test_index = test_set["index"]
print(len(test_index))
pd.Series(test_index).unique().shape

377


(355,)