In [12]:
import pandas as pd
import json
import sklearn.svm as svm
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
import script as sc

user = "Hans"

In [13]:
data = pd.DataFrame(sc.get_complete(user))

In [14]:
copy = data[['mean.vm','sd.vm','mean.ang','sd.ang','p625','dfreq','ratio.df']].copy(deep=True)
copy.loc[-1] = copy.loc[0]  # adding a row
copy.index = copy.index + 1  # shifting index
copy.sort_index(inplace=True)
copy.columns = 'last.' + copy.columns
data = pd.concat([data, copy], axis = 1)
data = data.drop(data.index[len(data)-1])

In [16]:
## shuffle the data (for training and testing)
observedData = data.sample(frac = 1)

n = int(0.75 * len(observedData))

train = observedData[:n]
test = observedData[n:]
## this might be an issue since there is ordinality and that makes things weird.  The later parts of this trial
## were more likely to be sedentary.
trainX = train[['mean.vm','sd.vm','mean.ang','sd.ang','p625','dfreq','ratio.df','last.mean.vm', 
                'last.sd.vm', 'last.mean.ang', 'last.sd.ang', 'last.p625', 'last.dfreq', 'last.ratio.df']]
trainY = train['coding']
testX = test[['mean.vm','sd.vm','mean.ang','sd.ang','p625','dfreq','ratio.df','last.mean.vm', 
              'last.sd.vm', 'last.mean.ang', 'last.sd.ang', 'last.p625', 'last.dfreq', 'last.ratio.df']]
testY = test['coding']

trainX = preprocessing.scale(trainX)
testX = preprocessing.scale(testX)

In [19]:
clf = svm.SVC(cache_size=7000)
clf.fit(trainX, trainY)
## get accuracy of the svm
clf.score(testX,testY)

0.80254311095627939

In [20]:
## get an f1-score (better to show accuracy)
predictedVals = clf.predict(testX)
print (confusion_matrix(testY,predictedVals,labels = ['sedentary','non-sedentary']))
f1_score(testY == 'sedentary', predictedVals == 'sedentary',labels = ['sedentary','non-sedentary'])

[[ 9197  3396]
 [ 2272 13840]]


0.76444185853212532

In [22]:
## a KNN approach
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=200,weights = 'distance')
neigh.fit(trainX, trainY)
neigh2.score(testX,testY)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=200, p=2,
           weights='distance')

In [27]:
neigh.score(testX,testY)

0.79843232886256754

In [23]:
predictedVals = neigh.predict(testX)
print (confusion_matrix(testY,predictedVals,labels = ['sedentary','non-sedentary']))
f1_score(testY == 'sedentary', predictedVals == 'sedentary',labels = ['sedentary','non-sedentary'])

[[ 9266  3327]
 [ 2459 13653]]


0.76206924911588125

In [24]:
## random forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=5, random_state=0)
rf.fit(trainX,trainY)
rf.score(testX,testY)

0.78355687162515242

In [25]:
predictedVals = rf.predict(testX)
print (confusion_matrix(testY,predictedVals,labels = ['sedentary','non-sedentary']))
f1_score(testY == 'sedentary', predictedVals == 'sedentary',labels = ['sedentary','non-sedentary'])

[[ 8693  3900]
 [ 2313 13799]]


0.73672613246324004