In [1]:
import pandas as pd
import json
import sklearn.svm as svm
import numpy as np
from sklearn.utils import shuffle

In [2]:
## Change the name to get your path
user = 'Andrew'
file1 = 'AG01-01.csv'
file2 = 'GTH01-01.csv'

In [3]:
def load_Data(user, filename):
   data = json.load(open('dir.json'))
   dir = data[user] + filename
#    data = pd.read_csv(dir, header=2)
   return dir

In [6]:
data = pd.read_csv(load_Data(user, file1), header=0)
data['start.time'] = pd.to_datetime(data['start.time'])
## selecting the times when the ground truth was observed
observedData = data.copy(deep=True)
observedData = observedData[(observedData['start.time'] >= '2017-10-03 16:44:00') & (observedData['start.time'] <= '2017-10-03 18:44:00')]
observedData.loc[:,'index'] = range(len(observedData))

In [7]:
gt = pd.read_csv(load_Data(user, file2), header=0)
gt['index'] = gt['time']
# gt.head()

In [8]:
observedData = pd.merge(observedData,gt[['index','coding']])

In [9]:
observedData = shuffle(observedData)
pivot = round(len(observedData) * 0.7)
train = observedData[:pivot]
test = observedData[pivot:]
## this might be an issue since there is ordinality and that makes things weird.  The later parts of this trial
## were more likely to be sedentary.
X = train[['mean.vm','sd.vm','mean.ang','sd.ang','p625','dfreq','ratio.df']]
y = train['coding']

testX = test[['mean.vm','sd.vm','mean.ang','sd.ang','p625','dfreq','ratio.df']]
testY = test['coding']

In [10]:
clf = svm.SVC(decision_function_shape='ovo')
clf.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovo', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [11]:
clf.predict([[1,.0004,1,1,1,9,1],[.5,.002,.1,3,1,.5,1]])

array(['non-sedentary', 'sedentary'], dtype=object)

In [12]:
## get accuracy of the svm
clf.score(testX,testY)

0.74242424242424243

In [13]:
linearKernel = svm.LinearSVC()
linearKernel.fit(X, y)
linearKernel.score(testX, testY)

0.65013774104683197

In [17]:
## what if we tried a neural net for fun?
from sklearn.neural_network import MLPClassifier

In [18]:
n_net = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(2000, 1000), random_state=1)
n_net.fit(X,y)
## test accuracy of neural net
np.sum(n_net.predict(X) == y) / len(X)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(2000, 1000), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

## Run GridSearch for parameter tuning on Neural Network

In [14]:
from sklearn.model_selection import GridSearchCV

In [18]:
parameters = {'activation': ['logistic', 'relu', 'tanh'], 
              'hidden_layer_sizes': [(2000, 1000), (1000, 500, 250, 125), (500, 400, 300, 200, 100, 50),
                                     (200, 200, 150, 150, 100, 100, 50, 50, 25, 25, 10, 10)]}
model = MLPClassifier()

In [19]:
nn_grid = GridSearchCV(model, parameters)

In [20]:
nn_grid.fit(X, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'activation': ['logistic', 'relu', 'tanh'], 'hidden_layer_sizes': [(2000, 1000), (1000, 500, 250, 125), (500, 400, 300, 200, 100, 50), (200, 200, 150, 150, 100, 100, 50, 50, 25, 25, 10, 10)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [21]:
nn_grid.best_params_

{'activation': 'logistic', 'hidden_layer_sizes': (1000, 500, 250, 125)}

In [22]:
nn_grid.best_score_

0.67066692897894942

## Run GridSearch for parameter tuning on SVM

In [23]:
clf = svm.SVC(cache_size=7000)
parameter_grid = {'C': [0.001, 0.01, 0.1, 1, 10], 
                  'kernel': ['linear', 'rbf', 'sigmoid'], 
                  'gamma': [0.001, 0.01, 0.1, 1]}
svm_search = GridSearchCV(clf, parameter_grid)

In [24]:
svm_search.fit(X, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=7000, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10], 'kernel': ['linear', 'rbf', 'sigmoid'], 'gamma': [0.001, 0.01, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [26]:
print(svm_search.best_params_)
print(svm_search.best_score_)

{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
0.717883139878
