### Train SVM
This script first creates the training, validation, and test datasets. It then uses an SVM to classify the samples.

In [26]:
import numpy as np
from numpy import genfromtxt
# Load the dataset as a float array (genfromtxt's default dtype); fields that
# cannot be parsed as numbers come back as NaN.
my_data = np.genfromtxt('smell_dataset.csv', delimiter=',')

In [27]:
# Row 0 is skipped (presumably the CSV header row -- confirm against the file).
# Of the remaining columns, 3..13 are the features (X) and 14 onward the labels (Y).
# To experiment with a single feature, use `X = my_data[1:, [11]]` instead.
X, Y = np.hsplit(my_data[1:, 3:], [11])

In [28]:
from sklearn.model_selection import StratifiedShuffleSplit

def get_data(X, Y, test_size=0.3, random_state=0):
    """Split the samples into a single stratified train/test partition.

    Parameters
    ----------
    X : array
        Samples; must support indexing with an integer index array.
    Y : array
        Labels aligned with ``X``; also used to stratify the split.
    test_size : float, optional
        Forwarded to ``StratifiedShuffleSplit``. Defaults to 0.3, the value
        that used to be hard-coded.
    random_state : int, optional
        Seed forwarded to ``StratifiedShuffleSplit``. Defaults to 0, the value
        that used to be hard-coded.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``.
    """
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    # n_splits=1 yields exactly one (train, test) index pair, so take it directly
    # instead of returning from inside a loop.
    train_index, test_index = next(iter(sss.split(X, Y)))
    return X[train_index], Y[train_index], X[test_index], Y[test_index]

In [29]:
from sklearn.utils import resample, shuffle


def balance_dataset(x, y, random_state=145):
    """Return a class-balanced, shuffled version of a binary-labelled dataset.

    Whichever class has more rows is randomly downsampled (without
    replacement) to the size of the other class, the two classes are
    concatenated, and rows and labels are shuffled together.

    Parameters
    ----------
    x : numpy array
        Samples, one row per sample.
    y : numpy array
        Labels aligned with ``x``; 0 marks a benign sample, 1 a smelly one.
        It is flattened with ``ravel()`` before use.
    random_state : int, optional
        Seed for both the downsampling and the final shuffle. Defaults to 145,
        the seed previously used for the downsampling alone; the shuffle used
        to be unseeded, which made the output order differ between runs.

    Returns
    -------
    tuple
        ``(x_balanced, y_balanced)`` where ``y_balanced`` is a 1-D ``int16``
        array of 0/1 labels.
    """
    labels = y.ravel()
    x_benign = x[labels == 0]
    x_smelly = x[labels == 1]

    # Downsample the larger class, whichever it is: resample(replace=False)
    # raises when asked for more rows than the array holds, which is what
    # happened when benign was the minority class.
    if len(x_benign) >= len(x_smelly):
        x_benign = resample(x_benign, replace=False, n_samples=len(x_smelly),
                            random_state=random_state)
    else:
        x_smelly = resample(x_smelly, replace=False, n_samples=len(x_benign),
                            random_state=random_state)

    # Smelly rows go first, so the first len(x_smelly) labels are 1 and the rest 0.
    x_balanced = np.concatenate((x_smelly, x_benign))
    y_balanced = np.zeros(len(x_balanced), dtype=np.int16)
    y_balanced[:len(x_smelly)] = 1

    # Shuffle rows and labels in unison so the classes are interleaved.
    x_balanced, y_balanced = shuffle(x_balanced, y_balanced, random_state=random_state)
    return x_balanced, y_balanced

def get_balanced_training_data(X, Y):
    """Split the data like ``get_data`` and class-balance the training part.

    The stratified train/test split is delegated to ``get_data`` (defined in
    the previous cell) instead of being duplicated here. Only the training
    split is passed through ``balance_dataset``; the test split is returned
    as produced by the split, i.e. with its original class distribution.

    Parameters
    ----------
    X : array
        Samples.
    Y : array
        Labels aligned with ``X``.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)`` with a balanced training set.
    """
    x_train, y_train, x_test, y_test = get_data(X, Y)
    x_train, y_train = balance_dataset(x_train, y_train)
    return x_train, y_train, x_test, y_test

In [30]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

x_train, y_train, x_test, y_test = get_balanced_training_data(X, Y)

# SVMs are not scale invariant, so standardise the features first. Putting the
# scaler in a pipeline means its statistics are learned from the training split
# only and then re-applied unchanged when predicting on the test split.
classifier = make_pipeline(StandardScaler(), SVC(kernel='rbf', random_state=1))
classifier.fit(x_train, y_train.ravel())
# NOTE: the decision-tree cell further down reassigns `y_pred`; evaluate the
# SVM predictions before running it if SVM metrics are wanted.
y_pred = classifier.predict(x_test)

In [33]:
from sklearn.tree import DecisionTreeClassifier

x_train, y_train, x_test, y_test = get_balanced_training_data(X, Y)

# Fix the seed: the tree permutes candidate features at every split, so an
# unseeded tree can differ (and score differently) from one run to the next.
clf = DecisionTreeClassifier(random_state=1)
clf.fit(x_train, y_train)  # fit() returns the estimator itself; no reassignment needed
y_pred = clf.predict(x_test)

In [34]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
# Correct predictions sit on the main diagonal of the confusion matrix, so
# accuracy is their total divided by the number of test samples.
n_correct = float(np.trace(cm))
accuracy = n_correct / len(y_test)
print("\nModel accuracy: ", accuracy)


Model accuracy:  0.9982840800762631


In [35]:
from sklearn.metrics import recall_score, precision_score

# Score the current predictions against the test labels with both metrics.
precision, recall = (
    metric(y_test, y_pred) for metric in (precision_score, recall_score)
)
print (f"precision: {precision}, recall: {recall}")

precision: 0.9680715197956578, recall: 0.9973684210526316
