# AndyData 08/13/2020


In [1]:
import sklearn
assert sklearn.__version__ >= "0.21", "Use the conda_cpac kernel!"

# Standard library
import os
import re

# Third party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.utils import parallel_backend

# Local
import utils

# matplotlib hackery
%matplotlib inline

In [2]:
df_orig = utils.load_dataset("s3://cpac/ORIG/AndyData/AndyData_08_13_2020.csv")
df_orig.describe()

Unnamed: 0,ParticipantID,Sequence,Trial,timestamp,orientation_T8_q0,orientation_T8_q1,orientation_T8_q2,orientation_T8_q3,orientation_RightUpperLeg_q0,orientation_RightUpperLeg_q1,...,jointAngle_jT9T8_x,jointAngle_jT9T8_y,jointAngle_jT9T8_z,jointAngle_jRightHip_x,jointAngle_jRightHip_y,jointAngle_jRightHip_z,jointAngle_jLeftHip_x,jointAngle_jLeftHip_y,jointAngle_jLeftHip_z,t_video
count,314344.0,314344.0,314344.0,314344.0,314344.0,314344.0,314344.0,314344.0,314344.0,314344.0,...,314344.0,314344.0,314344.0,314344.0,314344.0,314344.0,314344.0,314344.0,314344.0,31439.0
mean,1.447659,4.40971,1.947007,1523121000.0,0.595104,-0.033517,0.124574,0.120162,0.567625,0.051015,...,0.564447,0.031747,3.958778,5.474374,-5.69571,10.486858,6.544351,-9.199373,11.214838,56.55184
std,0.497254,1.083229,0.828869,5593497.0,0.392911,0.226485,0.228513,0.597394,0.494934,0.159237,...,0.760874,0.918489,3.145551,9.212882,7.936679,27.994923,9.321108,9.480311,27.80247,35.346617
min,1.0,3.0,1.0,1518085000.0,-0.609348,-0.792389,-0.914047,-0.99944,-0.982567,-0.681638,...,-1.600951,-4.086461,0.073836,-16.300463,-33.14045,-28.314121,-8.662225,-36.286657,-26.123699,0.0
25%,1.0,3.0,1.0,1518085000.0,0.149235,-0.03744,0.010368,-0.05477,0.12459,-0.022204,...,0.1396,-0.367433,1.423898,0.889143,-10.361242,-8.239682,0.661641,-16.792758,-9.059988,27.291667
50%,1.0,4.0,2.0,1518087000.0,0.720124,0.009847,0.038492,0.010885,0.835207,-0.000116,...,0.330103,0.189099,2.586307,2.608627,-4.674045,-4.583296,3.499716,-9.333517,-2.690533,54.583333
75%,2.0,5.0,3.0,1529334000.0,0.998528,0.041856,0.148271,0.64317,0.998491,0.063539,...,0.888811,0.4463,6.955912,6.771196,0.705641,25.928286,9.591413,-1.475992,26.310191,81.854167
max,2.0,6.0,3.0,1529336000.0,0.999964,0.723482,0.914586,0.999936,0.999978,0.680975,...,4.891509,4.20476,10.750022,46.62774,20.645715,98.027701,44.721174,18.562071,96.878583,167.208333


In [4]:
test_selector = df_orig["ParticipantID"] == 2
df_train, df_test = df_orig[~test_selector], df_orig[test_selector]

scaler = StandardScaler()
X_train = scaler.fit_transform(df_train.loc[:, "orientation_T8_q0":"jointAngle_jLeftHip_z"])
y_train = df_train["mode"]
X_test = scaler.transform(df_test.loc[:, "orientation_T8_q0":"jointAngle_jLeftHip_z"])
y_test = df_test["mode"]

print(f"Train: {X_train.shape} -> {y_train.shape}")
print(f"Test: {X_test.shape} -> {y_test.shape}")

Train: (173625, 66) -> (173625,)
Test: (140719, 66) -> (140719,)


In [5]:
classifiers = {
#    "Nearest Neighbors": KNeighborsClassifier(3),
#    "Linear SVM": SVC(kernel="linear", C=0.025),
#    "RBF SVM": SVC(gamma=2, C=1),
#    "Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0)),
    "Decision Tree": DecisionTreeClassifier(max_depth=5),
    "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "Neural Net": MLPClassifier(alpha=1, max_iter=1000),
#    "AdaBoost": AdaBoostClassifier(),
#    "Naive Bayes": GaussianNB(),
#    "QDA": QuadraticDiscriminantAnalysis(),
    "LASSO": LogisticRegression(penalty='l1', C=0.1, solver="liblinear")
}

for name, clf in classifiers.items():
    with parallel_backend('threading'):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print(f"{name}: {score:.3f}")

Decision Tree: 0.453
Random Forest: 0.840
Neural Net: 0.781


  " = {}.".format(effective_n_jobs(self.n_jobs)))


LASSO: 0.710
