In [1]:
%reload_ext autoreload
%autoreload 2
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import LeaveOneGroupOut, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn_porter import Porter
from tqdm import tqdm

import data_handler

In [2]:
# Load the measurements through the project-local data_handler module.
# from_csv() is called without arguments, so the CSV location is decided inside data_handler.
measurement_frame = data_handler.from_csv()
# NOTE(review): presumably reduces the raw measurements to summary features
# (later cells show columns such as alpha_min ... gamma_std) - confirm in data_handler.
aggregated_frame = data_handler.aggregate(measurement_frame)
# X: feature table (a DataFrame - later cells use X.columns and X.drop);
# y: the targets handed to the classifiers.
X, y = data_handler.split_x_y(aggregated_frame)

In [3]:
# Display names for the models below. names[i] labels classifiers[i] (the two
# lists are zipped together later), so they must stay in the same order.
names = [
        "Nearest Neighbors", # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
        "RBF SVM",
        "Linear SVM", 
        "Decision Tree", 
        "Random Forest", 
        "Naive Bayes"
]

# Candidate models with fixed hyper-parameters. random_state=0 is set on every
# estimator constructed with one here, so repeated runs use the same seed.
classifiers = [
    KNeighborsClassifier(10),
    SVC(kernel="rbf", C=1.0, gamma="auto", random_state=0),
    SVC(kernel="linear", C=1.0, random_state=0),
    DecisionTreeClassifier(max_depth=12, random_state=0),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=0),
    GaussianNB()
]

# Remember the feature names separately; they are paired with the RFE mask later on.
column_names = list(X.columns)
# NOTE(review): the scaler is fitted on ALL rows before any train/test split, so
# statistics of the held-out rows influence the scaling used for training.
# Consider fitting the scaler on each training fold only.
X_scaled = StandardScaler().fit_transform(X)

In [4]:
def _rows(data, index):
    """Select rows by position for pandas objects and plain arrays alike."""
    # Split indices are positions; plain [] on a pandas object would be
    # label-based (Series) or column-based (DataFrame).
    return data.iloc[index] if hasattr(data, "iloc") else data[index]


def evaluate(clfs, names, splits, X, y, silent=False, labels=None):
    """Fit and score every classifier on every train/test split.

    Parameters
    ----------
    clfs : sequence of estimators
        Objects exposing ``fit``, ``predict`` and ``score``. Each one is
        re-fitted in place on every split, so after the call it holds the
        model trained on the last split.
    names : sequence of str
        Display names for ``clfs``. Not needed for the computation; kept so
        existing callers keep working.
    splits : sequence of (train_index, test_index)
        Positional row indices per split. Must support ``len()``.
    X, y : array-like or pandas object
        Features and targets, indexed row-wise by the split indices.
    silent : bool, default False
        Disable the progress bar.
    labels : sequence, optional
        Class labels, in the row/column order of the confusion matrices.
        Defaults to ``['Standing', 'Walking', 'Running']``.

    Returns
    -------
    mean_scores : ndarray of shape (len(clfs),)
        Mean of ``clf.score`` over the splits.
    std_scores : ndarray of shape (len(clfs),)
        Standard deviation of ``clf.score`` over the splits.
    confusions : list
        One confusion matrix per classifier, summed over all splits
        (``None`` entries if ``splits`` is empty).
    """
    if labels is None:
        labels = ['Standing', 'Walking', 'Running']
    else:
        labels = list(labels)

    # Size the result containers from the argument, not from a notebook global.
    n_clfs = len(clfs)
    scores = np.zeros((len(splits), n_clfs))
    confusions = [None] * n_clfs

    # Iterate over every split
    for split_idx, (train_index, test_index) in tqdm(enumerate(splits), total=len(splits), disable=silent):
        X_train, X_test = _rows(X, train_index), _rows(X, test_index)
        y_train, y_test = _rows(y, train_index), _rows(y, test_index)

        for clf_idx, clf in enumerate(clfs):
            clf.fit(X_train, y_train)

            # `labels` must be passed by keyword: it is keyword-only in
            # current scikit-learn releases.
            cm = confusion_matrix(y_test, clf.predict(X_test), labels=labels)

            if confusions[clf_idx] is None:
                confusions[clf_idx] = cm
            else:
                confusions[clf_idx] = confusions[clf_idx] + cm
            scores[split_idx, clf_idx] = clf.score(X_test, y_test)

    return np.mean(scores, axis=0), np.std(scores, axis=0), confusions

def printStatistics(names, mean_accuracies, std_accuracies, confusions):
    """Print a plain-text report for each classifier.

    The four arguments are parallel sequences (one entry per classifier):
    display name, mean accuracy, accuracy standard deviation and confusion
    matrix. Each report ends with two blank lines.
    """
    class_header = ['Standing', 'Walking', 'Running']
    per_classifier = zip(names, mean_accuracies, std_accuracies, confusions)
    for clf_name, acc_mean, acc_std, matrix in per_classifier:
        report = (
            clf_name,
            'Mean accuracy: ' + str(acc_mean),
            'Std accuracy: ' + str(acc_std),
            'Confusion Matrix:',
            class_header,
            matrix,
            "\n",
        )
        # One item per line; the trailing "\n" item plus print's own newline
        # yields the blank lines that separate two reports.
        print(*report, sep="\n")

### Leave-One-Subject-Out Cross-Validation

In [5]:
# Create 10 splits (each containing one subject of each class).
# Removing the letters from the subject label leaves only its number, so rows
# whose labels differ only in their letters fall into the same group.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave the labels unchanged and alter the grouping.
aggregated_frame["subject"] = aggregated_frame["subject"].str.replace(r"[a-zA-Z]", "", regex=True)
splits = list(LeaveOneGroupOut().split(X, y, groups=aggregated_frame["subject"]))

mean_accuracies, std_accuracies, confusions = evaluate(classifiers, names, splits, X_scaled, y, silent=False)
printStatistics(names, mean_accuracies, std_accuracies, confusions)

100%|██████████| 10/10 [00:00<00:00, 16.60it/s]

Nearest Neighbors
Mean accuracy: 0.9118896913363918
Std accuracy: 0.04738953235929725
Confusion Matrix:
['Standing', 'Walking', 'Running']
[[340  13   2]
 [ 19 331   1]
 [  9  49 302]]


RBF SVM
Mean accuracy: 0.9202036219696803
Std accuracy: 0.03901108973190615
Confusion Matrix:
['Standing', 'Walking', 'Running']
[[337  15   3]
 [ 18 329   4]
 [  6  38 316]]


Linear SVM
Mean accuracy: 0.9351752888046345
Std accuracy: 0.0365536404501705
Confusion Matrix:
['Standing', 'Walking', 'Running']
[[334  16   5]
 [ 16 328   7]
 [  6  18 336]]


Decision Tree
Mean accuracy: 0.8871934460248404
Std accuracy: 0.06138194667906884
Confusion Matrix:
['Standing', 'Walking', 'Running']
[[295  25  35]
 [ 10 324  17]
 [ 10  22 328]]


Random Forest
Mean accuracy: 0.9220785345251375
Std accuracy: 0.03331148726180644
Confusion Matrix:
['Standing', 'Walking', 'Running']
[[319  28   8]
 [ 11 333   7]
 [  5  23 332]]


Naive Bayes
Mean accuracy: 0.9362715765428877
Std accuracy: 0.038934611833233886
Confusion 




### 10-Fold Cross-Validation

In [6]:
# Shuffled 10-fold cross-validation. The shuffle is seeded (same seed as the
# classifiers) so that re-running the notebook reproduces the same folds and
# therefore the same scores.
splits = list(KFold(n_splits=10, shuffle=True, random_state=0).split(X, y))

mean_accuracies, std_accuracies, confusions = evaluate(classifiers, names, splits, X_scaled, y, silent=False)
printStatistics(names, mean_accuracies, std_accuracies, confusions)

100%|██████████| 10/10 [00:00<00:00, 17.10it/s]

Nearest Neighbors
Mean accuracy: 0.92119555633927
Std accuracy: 0.01884289131819344
Confusion Matrix:
['Standing', 'Walking', 'Running']
[[345   8   2]
 [ 16 334   1]
 [  9  48 303]]


RBF SVM
Mean accuracy: 0.92778169634985
Std accuracy: 0.024111727130497147
Confusion Matrix:
['Standing', 'Walking', 'Running']
[[343   9   3]
 [ 19 329   3]
 [  6  37 317]]


Linear SVM
Mean accuracy: 0.9418092047257979
Std accuracy: 0.011861773979081029
Confusion Matrix:
['Standing', 'Walking', 'Running']
[[339  13   3]
 [ 16 329   6]
 [  6  18 336]]


Decision Tree
Mean accuracy: 0.9230999823664258
Std accuracy: 0.019500724212332263
Confusion Matrix:
['Standing', 'Walking', 'Running']
[[332  15   8]
 [ 12 319  20]
 [  7  20 333]]


Random Forest
Mean accuracy: 0.9277552459883619
Std accuracy: 0.022323164897578462
Confusion Matrix:
['Standing', 'Walking', 'Running']
[[327  23   5]
 [ 13 330   8]
 [  4  24 332]]


Naive Bayes
Mean accuracy: 0.9455739728442953
Std accuracy: 0.02093458991066978
Confusion 




### Recursive Feature Elimination

In [7]:
# Rank the features with a random forest and keep the 10 best, removing one
# feature per iteration (step=1).
# n_features_to_select must be passed by keyword: it is keyword-only in
# current scikit-learn releases.
# NOTE(review): the selection is fitted on all rows, i.e. also on rows that
# later serve as test data in the cross-validation below.
clf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=0)
rfe = RFE(clf, n_features_to_select=10, step=1)
rfe.fit(X, y)
# Pair every feature name with its keep (True) / drop (False) flag.
columns = list(zip(rfe.support_, column_names))
columns

[(True, 'alpha_min'),
 (True, 'alpha_max'),
 (True, 'alpha_median'),
 (True, 'alpha_std'),
 (False, 'beta_min'),
 (True, 'beta_max'),
 (True, 'beta_median'),
 (True, 'beta_std'),
 (True, 'gamma_min'),
 (True, 'gamma_max'),
 (False, 'gamma_median'),
 (True, 'gamma_std')]

### Training with Eliminated Features

In [8]:
# Keep exactly the columns RFE selected instead of hand-copying the names of
# the rejected ones ("beta_min", "gamma_median" in the recorded run), so this
# cell stays correct if the data or the selection changes.
X_new = X.loc[:, rfe.support_]
X_new_scaled = StandardScaler().fit_transform(X_new)

In [9]:
# Create 10 splits (each containing one subject of each class).
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave the labels unchanged and alter the grouping.
aggregated_frame["subject"] = aggregated_frame["subject"].str.replace(r"[a-zA-Z]", "", regex=True)
# Only the number of rows matters for the split, so X and X_new are interchangeable here.
splits = list(LeaveOneGroupOut().split(X, y, groups=aggregated_frame["subject"]))

mean_accuracies, std_accuracies, confusions = evaluate(classifiers, names, splits, X_new_scaled, y, silent=False)
printStatistics(names, mean_accuracies, std_accuracies, confusions)

100%|██████████| 10/10 [00:00<00:00, 19.18it/s]

Nearest Neighbors
Mean accuracy: 0.9212112652423909
Std accuracy: 0.04304242670298699
Confusion Matrix:
['Standing', 'Walking', 'Running']
[[343  11   1]
 [ 14 334   3]
 [  8  46 306]]


RBF SVM
Mean accuracy: 0.9251707692362826
Std accuracy: 0.03628190250479269
Confusion Matrix:
['Standing', 'Walking', 'Running']
[[340  13   2]
 [ 18 329   4]
 [  6  36 318]]


Linear SVM
Mean accuracy: 0.9322663853039156
Std accuracy: 0.038074105344245185
Confusion Matrix:
['Standing', 'Walking', 'Running']
[[333  17   5]
 [ 17 327   7]
 [  5  20 335]]


Decision Tree
Mean accuracy: 0.893501976325871
Std accuracy: 0.05781769434430133
Confusion Matrix:
['Standing', 'Walking', 'Running']
[[302  35  18]
 [ 10 322  19]
 [ 13  18 329]]


Random Forest
Mean accuracy: 0.9154766511495869
Std accuracy: 0.03434040664645536
Confusion Matrix:
['Standing', 'Walking', 'Running']
[[331  17   7]
 [ 33 311   7]
 [  7  18 335]]


Naive Bayes
Mean accuracy: 0.9426757933783831
Std accuracy: 0.029765609850852328
Confusion




### Using Porter to convert Model

In [10]:
# Train a fresh Gaussian Naive Bayes model on the full data set and export it
# as JavaScript source via sklearn-porter.
# NOTE(review): the model is fitted on the unscaled X with all features, whereas
# the evaluation above used scaled (and, later, reduced) features - confirm
# this is the input the JavaScript side will provide.
best_clf = GaussianNB()
best_clf.fit(X, y)
porter = Porter(best_clf, language='js')
export = porter.export(embed_data=True)
# The context manager closes the file even if the write fails.
with open("naive_bayes_clf.js", "w") as f:
    f.write(export)