<h2>Data Load</h2>

In [7]:
import pandas as pd

# Location of the input dataset and of the directory that receives exported
# models, both relative to the notebook's working directory.
DATASET_FILEPATH = "../_data/MATCHING_DISTANCE_VECTORS.csv"
MODELS_EXPORT_PATH = "../_models/"

# Read the comma-separated dataset; its first row supplies the column names.
df = pd.read_csv(
    DATASET_FILEPATH,
    sep=",",
    header=0,
)

In [8]:
from sklearn.model_selection import train_test_split

# Separate the "MATCH" target column from the remaining feature columns.
X = df.drop(columns=["MATCH"])
y = df["MATCH"].values

# Hold out 30% of the rows for testing; the fixed seed keeps the shuffled
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    shuffle=True,
)

# Report the shape of every partition, one per line.
print("\n".join(
    "{}: {}".format(name, part.shape)
    for name, part in (
        ("X_train", X_train),
        ("y_train", y_train),
        ("X_test", X_test),
        ("y_test", y_test),
    )
))

X_train: (375246, 10)
y_train: (375246,)
X_test: (160821, 10)
y_test: (160821,)


<h2>Utils</h2>

<h3>Model Evaluation</h3>

In [9]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
import itertools
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

def plot_confusion_matrix(cm, classes):
    """Draw a confusion matrix as a row-normalised heat map via pyplot.

    Every cell is annotated with its raw count followed by the percentage of
    its row total that the count represents. Rows are the actual labels
    (y axis) and columns the predicted labels (x axis).

    Args:
        cm: 2-D array-like of counts, indexed as ``cm[actual, predicted]``.
        classes: Iterable of class labels used for the tick labels on both
            axes, in the same order as the rows/columns of ``cm``.
    """
    cm_counts = np.asarray(cm)
    # Materialise the labels so any iterable (e.g. a set) has a stable order
    # and a length for the tick positions.
    classes = list(classes)

    # Normalise each row by its total. A row without any samples would divide
    # by zero and produce NaN (which also poisons the colour threshold below),
    # so such rows are left at 0 instead.
    row_totals = cm_counts.sum(axis=1, keepdims=True)
    cm = np.divide(
        cm_counts,
        row_totals,
        out=np.zeros(cm_counts.shape, dtype=float),
        where=row_totals != 0,
    )

    cmap = plt.cm.Blues

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # White text on cells above half the maximum intensity, black otherwise,
    # so the annotation stays readable against the colour map.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, str(cm_counts[i, j]) + " (" + str(round(cm[i, j] * 100, 2)) + "%)",
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    # Lay out last so the axis labels are taken into account.
    plt.tight_layout()

def display_model_evaluation(y_actual, y_pred, classes):
    """Plot the confusion matrix of a set of predictions and print the accuracy.

    Args:
        y_actual: Ground-truth labels.
        y_pred: Predicted labels; must have the same length as ``y_actual``.
        classes: Class labels forwarded to ``plot_confusion_matrix`` for the
            axis ticks.

    Raises:
        ValueError: If ``y_actual`` and ``y_pred`` differ in length.
    """
    n_actual, n_pred = len(y_actual), len(y_pred)
    if n_actual != n_pred:
        raise ValueError("Lengths of provided y_actual and y_pred do not match.")

    # Compute both metrics before drawing anything.
    counts = confusion_matrix(y_actual, y_pred)
    accuracy_fraction = accuracy_score(y_actual, y_pred)
    accuracy_percent = round(accuracy_fraction * 100, 2)

    plot_confusion_matrix(counts, classes)

    print("Accuracy: {}%".format(accuracy_percent))

<h3>Model Persistence</h3>

In [10]:
from joblib import dump

def save_model_to_file(clf, filename):
    """Serialise a model into the models export directory with ``joblib.dump``.

    The export directory (``MODELS_EXPORT_PATH``) is created first when it does
    not exist yet, so the save does not fail on a missing folder.

    Args:
        clf: Object to persist.
        filename: Name of the file to write inside ``MODELS_EXPORT_PATH``.
    """
    import os

    os.makedirs(MODELS_EXPORT_PATH, exist_ok=True)
    # Join the parts instead of concatenating so the result does not depend
    # on MODELS_EXPORT_PATH ending with a path separator.
    dump(clf, os.path.join(MODELS_EXPORT_PATH, filename))

<h2>Model Training</h2>

<h3>K Nearest Neighbors</h3>

<h3>Support Vector Machines</h3>

In [None]:
from sklearn import svm

# SVC hyperparameters.
# NOTE(review): these values look like the result of a hyperparameter search;
# confirm their provenance and record it here.
C = 450.07963080130486
gamma = 9.711499749455284e-06

# With probability=True scikit-learn shuffles the data randomly while fitting
# the probability estimates; fixing random_state makes the fitted model
# reproducible across runs.
svm_clf = svm.SVC(C=C, gamma=gamma, probability=True, random_state=1)
svm_clf.fit(X_train, y_train)

# Predicted labels for the held-out test set.
y_pred = svm_clf.predict(X_test)

In [None]:
# Build the tick labels from every class present in either array, in sorted
# order, so they line up with the rows/columns of the confusion matrix even
# when the classifier never predicts one of the classes.
display_model_evaluation(y_test, y_pred, sorted(set(y_test) | set(y_pred)))

In [None]:
# Persist the fitted SVM classifier under the models export directory.
save_model_to_file(clf=svm_clf, filename="svm_smartphones.joblib")