# Übung zu: Kontextsensitive Systeme

In [209]:
#import libs 

import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn_porter import Porter

In [210]:
# read data & show first few lines
data1 = pd.read_csv("data_tableUp.csv", sep=';', decimal=".", index_col=2)
data2 = pd.read_csv("data_tableDown.csv", sep=';', decimal=".", index_col=2)
data3 = pd.read_csv("data_hands2.csv", sep=';', decimal=".", index_col=2)

# make index datetime format
data1.index = pd.to_datetime(data1.index, unit='ms')
data2.index = pd.to_datetime(data2.index, unit='ms')
data3.index = pd.to_datetime(data3.index, unit='ms')

data = pd.concat([data1, data2, data3])

In [211]:
# create windows

# choose features to calc
features = ['min', 'max', 'median', 'std']

# aggregate data with features
aggregated_frame = data.drop(['sampleId', 'activity'], axis=1).groupby(pd.Grouper(freq='1000ms')).aggregate(features).dropna()

# create a new dataframe with re-labled columns (& 1d columns)
aggregated_frame_1d = aggregated_frame.copy()
aggregated_frame_1d.columns = [col[0] + "_" + col[1] for col in aggregated_frame_1d.columns]

# create new dataframe with re-added old labels
aggregated_strings = data[['sampleId', 'activity']].groupby(pd.Grouper(freq='1000ms')).first()
aggregated_frame_1d_withLabels = aggregated_frame_1d.join(aggregated_strings)


In [212]:
#aggregated_frame_1d_withLabels.head()

# Training

In [213]:
# import libs

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import LeaveOneGroupOut, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

#from sklearn_porter import Porter

from tqdm import tqdm

In [214]:
# split the data into target label and depending variables
y = aggregated_frame_1d_withLabels["activity"]
X = aggregated_frame_1d_withLabels.drop(["sampleId", "activity"], axis=1)


In [215]:
y.head()

stamp
2021-10-28 10:03:58    tableUp
2021-10-28 10:03:59    tableUp
2021-10-28 10:04:00    tableUp
2021-10-28 10:04:01    tableUp
2021-10-28 10:04:02    tableUp
Name: activity, dtype: object

In [216]:
X.head()

Unnamed: 0_level_0,alpha_min,alpha_max,alpha_median,alpha_std,beta_min,beta_max,beta_median,beta_std,gamma_min,gamma_max,...,x_median,x_std,y_min,y_max,y_median,y_std,z_min,z_max,z_median,z_std
stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-10-28 10:03:58,4.5,4.5,4.5,0.0,-0.701,-0.701,-0.701,0.0,0.5,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-10-28 10:03:59,4.5,4.5,4.5,0.0,-0.701,-0.701,-0.701,0.0,0.5,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.1,0.0,0.0,0.011043
2021-10-28 10:04:00,4.5,4.5,4.5,0.0,-0.701,-0.701,-0.701,0.0,0.5,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-10-28 10:04:01,4.5,4.5,4.5,0.0,-0.701,-0.701,-0.701,0.0,0.5,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-10-28 10:04:02,4.5,4.5,4.5,0.0,-0.701,-0.701,-0.701,0.0,0.5,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [217]:
names = [
        "Nearest Neighbors", 
        "RBF SVM",
        "Linear SVM (via kernel)",
        "Linear SVM (native)",
        "Decision Tree", 
        "Random Forest", 
        "Naive Bayes"
]

classifiers = [
    KNeighborsClassifier(10),
    SVC(kernel="rbf", C=1.0, gamma="auto", random_state=0),
    SVC(kernel="linear", C=1.0, random_state=0),
    LinearSVC(random_state=0, max_iter=10000),
    DecisionTreeClassifier(max_depth=12, random_state=0),
    RandomForestClassifier(max_depth=23, n_estimators=10, random_state=0),
    GaussianNB()
]

column_names = list(X.columns)
X_scaled = StandardScaler().fit_transform(X)

In [218]:
def evaluate(clfs, names, splits, X, y, silent=False):
    scores = np.zeros((len(splits), len(classifiers)))
    confusions = [None] * len(classifiers)
    
    # Iterate over every split
    for split_idx, (train_index, test_index) in tqdm(enumerate(splits), total=len(splits), disable=silent):
        y_train, y_test = y[train_index], y[test_index]
        X_train, X_test = X[train_index], X[test_index]

        
        for clf_idx, (name, clf) in enumerate(zip(names, clfs)):
            clf.fit(X_train, y_train)
            
            cm = confusion_matrix(y_test, clf.predict(X_test), ['tableDown', 'tableUp', 'hands'])
            
            if confusions[clf_idx] is None:
                confusions[clf_idx] = cm
            else:
                confusions[clf_idx] = confusions[clf_idx] + cm
            scores[split_idx, clf_idx] = clf.score(X_test, y_test)
    return np.mean(scores, axis=0), np.std(scores, axis=0), confusions

def printStatistics(names, mean_accuracies, std_accuracies, confusions):
    for name, mean_accuracy, std_accuracy, confusion in zip(names, mean_accuracies, std_accuracies, confusions):
        print(name)
        print('Mean accuracy:', mean_accuracy)
        print('Std accuracy:', std_accuracy)
        print('Confusion Matrix:')
        print(['tableDown', 'tableUp', 'hands'])
        print(confusion)
        print("\n")

In [219]:
# replace subjectlabels with numbers to group them
i = 0
for id_str in aggregated_frame_1d_withLabels["sampleId"].unique().tolist():
    id_str = str(id_str)
    i_str = str(i)
    aggregated_frame_1d_withLabels["sampleId"] = aggregated_frame_1d_withLabels["sampleId"].str.replace(id_str,i_str)
    i += 1

In [220]:
aggregated_frame_1d_withLabels.head()

Unnamed: 0_level_0,alpha_min,alpha_max,alpha_median,alpha_std,beta_min,beta_max,beta_median,beta_std,gamma_min,gamma_max,...,y_min,y_max,y_median,y_std,z_min,z_max,z_median,z_std,sampleId,activity
stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-10-28 10:03:58,4.5,4.5,4.5,0.0,-0.701,-0.701,-0.701,0.0,0.5,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,tableUp
2021-10-28 10:03:59,4.5,4.5,4.5,0.0,-0.701,-0.701,-0.701,0.0,0.5,0.5,...,0.0,0.0,0.0,0.0,-0.1,0.0,0.0,0.011043,0,tableUp
2021-10-28 10:04:00,4.5,4.5,4.5,0.0,-0.701,-0.701,-0.701,0.0,0.5,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,tableUp
2021-10-28 10:04:01,4.5,4.5,4.5,0.0,-0.701,-0.701,-0.701,0.0,0.5,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,tableUp
2021-10-28 10:04:02,4.5,4.5,4.5,0.0,-0.701,-0.701,-0.701,0.0,0.5,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,tableUp


In [221]:
splits = list(LeaveOneGroupOut().split(X, y, groups=aggregated_frame_1d_withLabels["sampleId"]))

#print(splits)

mean_accuracies, std_accuracies, confusions = evaluate(classifiers, names, splits, X_scaled, y, silent=True)
printStatistics(names, mean_accuracies, std_accuracies, confusions)

Nearest Neighbors
Mean accuracy: 0.8106461008880335
Std accuracy: 0.25286155690416434
Confusion Matrix:
['tableDown', 'tableUp', 'hands']
[[232  16   7]
 [ 24 151  55]
 [  0  49 316]]


RBF SVM
Mean accuracy: 0.8643731952072918
Std accuracy: 0.2246941394514676
Confusion Matrix:
['tableDown', 'tableUp', 'hands']
[[230   5  20]
 [  9 164  57]
 [  0  15 350]]


Linear SVM (via kernel)
Mean accuracy: 0.992337544827918
Std accuracy: 0.015205218144001283
Confusion Matrix:
['tableDown', 'tableUp', 'hands']
[[248   0   7]
 [  0 230   0]
 [  0   0 365]]


Linear SVM (native)
Mean accuracy: 0.9069743668388136
Std accuracy: 0.11987923300482864
Confusion Matrix:
['tableDown', 'tableUp', 'hands']
[[231  19   5]
 [  1 212  17]
 [  9  30 326]]


Decision Tree
Mean accuracy: 0.9579983706274096
Std accuracy: 0.11965553278037008
Confusion Matrix:
['tableDown', 'tableUp', 'hands']
[[251   1   3]
 [  0 230   0]
 [ 33   2 330]]


Random Forest
Mean accuracy: 0.982008558554122
Std accuracy: 0.04427060683982

In [222]:
# 10-Fold Cross-Validation
splits = list(KFold(n_splits=10, shuffle=True).split(X, y))

mean_accuracies, std_accuracies, confusions = evaluate(classifiers, names, splits, X_scaled, y, silent=True)
printStatistics(names, mean_accuracies, std_accuracies, confusions)

Nearest Neighbors
Mean accuracy: 0.9788235294117648
Std accuracy: 0.00880389973358576
Confusion Matrix:
['tableDown', 'tableUp', 'hands']
[[246   4   5]
 [  0 225   5]
 [  0   4 361]]


RBF SVM
Mean accuracy: 0.9823529411764707
Std accuracy: 0.009485009115645367
Confusion Matrix:
['tableDown', 'tableUp', 'hands']
[[243   0  12]
 [  0 229   1]
 [  0   2 363]]


Linear SVM (via kernel)
Mean accuracy: 0.991764705882353
Std accuracy: 0.00753308733815629
Confusion Matrix:
['tableDown', 'tableUp', 'hands']
[[248   0   7]
 [  0 230   0]
 [  0   0 365]]


Linear SVM (native)
Mean accuracy: 0.9800000000000001
Std accuracy: 0.011823383083671667
Confusion Matrix:
['tableDown', 'tableUp', 'hands']
[[247   4   4]
 [  0 228   2]
 [  2   5 358]]


Decision Tree
Mean accuracy: 0.9941176470588236
Std accuracy: 0.005882352941176449
Confusion Matrix:
['tableDown', 'tableUp', 'hands']
[[251   1   3]
 [  0 230   0]
 [  0   1 364]]


Random Forest
Mean accuracy: 0.9952941176470589
Std accuracy: 0.0078038230

In [223]:
# Recursive Feature Elimination

clf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=0)
rfe = RFE(clf,10,step=1)
rfe.fit(X, y)
columns = list(zip(rfe.support_, column_names))
print(columns)

[(False, 'alpha_min'), (True, 'alpha_max'), (True, 'alpha_median'), (False, 'alpha_std'), (True, 'beta_min'), (True, 'beta_max'), (True, 'beta_median'), (False, 'beta_std'), (True, 'gamma_min'), (True, 'gamma_max'), (False, 'gamma_median'), (True, 'gamma_std'), (False, 'x_min'), (False, 'x_max'), (False, 'x_median'), (False, 'x_std'), (True, 'y_min'), (False, 'y_max'), (False, 'y_median'), (False, 'y_std'), (False, 'z_min'), (True, 'z_max'), (False, 'z_median'), (False, 'z_std')]


In [224]:
# Training with Eliminated Features
X_new = X.drop(["beta_min", "gamma_median"], axis=1)
X_new_scaled = StandardScaler().fit_transform(X_new)

# Cerate splits (each containing one subject of each class)
i = 0
for id_str in aggregated_frame_1d_withLabels["sampleId"].unique().tolist():
    id_str = str(id_str)
    i_str = str(i)
    aggregated_frame_1d_withLabels["sampleId"] = aggregated_frame_1d_withLabels["sampleId"].str.replace(id_str,i_str)
    i += 1
splits = list(LeaveOneGroupOut().split(X, y, groups=aggregated_frame_1d_withLabels["sampleId"]))

mean_accuracies, std_accuracies, confusions = evaluate(classifiers, names, splits, X_new_scaled, y, silent=True)
printStatistics(names, mean_accuracies, std_accuracies, confusions)

Nearest Neighbors
Mean accuracy: 0.783216274863606
Std accuracy: 0.26941857943201347
Confusion Matrix:
['tableDown', 'tableUp', 'hands']
[[230  19   6]
 [ 31 144  55]
 [  0  63 302]]


RBF SVM
Mean accuracy: 0.8523566747601045
Std accuracy: 0.23576906741390474
Confusion Matrix:
['tableDown', 'tableUp', 'hands']
[[226   9  20]
 [ 17 162  51]
 [  0  19 346]]


Linear SVM (via kernel)
Mean accuracy: 0.9900145109078454
Std accuracy: 0.01738698151778782
Confusion Matrix:
['tableDown', 'tableUp', 'hands']
[[247   1   7]
 [  0 230   0]
 [  0   1 364]]


Linear SVM (native)
Mean accuracy: 0.910310048017684
Std accuracy: 0.11341425895195766
Confusion Matrix:
['tableDown', 'tableUp', 'hands']
[[234  16   5]
 [  1 212  17]
 [ 10  29 326]]


Decision Tree
Mean accuracy: 0.8570036325701682
Std accuracy: 0.24408469970427424
Confusion Matrix:
['tableDown', 'tableUp', 'hands']
[[227   1  27]
 [  0 191  39]
 [ 42   3 320]]


Random Forest
Mean accuracy: 0.9509926791153953
Std accuracy: 0.08913161744839

In [225]:
# we fit the best model
best_clf = classifiers[5]
best_clf.fit(X, y)

# create the model
porter = Porter(rfe, language='js')
export = porter.export(embed_data=True)

# save the model
f = open("model.js", "w")
f.write(export)
f.close()

ValueError: Currently the given estimator 'RFE(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                     class_weight=None, criterion='gini',
                                     max_depth=5, max_features=1,
                                     max_leaf_nodes=None, max_samples=None,
                                     min_impurity_decrease=0.0,
                                     min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=10, n_jobs=None,
                                     oob_score=False, random_state=0, verbose=0,
                                     warm_start=False),
    n_features_to_select=10, step=1, verbose=0)' isn't supported.