In [1]:
#construct a decision tree classifier

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import DataProcess as DP

### Data Processing & Feature extraction

In [2]:
for subj_n in range(1,10):
    #load the data
    subj_filename='./PAMAP2_Dataset/Protocol/subject10'+str(subj_n)+'.dat'
    HR_rest,HR_max=DP.HR_lim[subj_n]
    dp=DP.dataprocess(subj_filename,HR_rest,HR_max)
    np.save('data'+str(subj_n)+'.npy', (dp.feat_labels,dp.data_segmented))

In [2]:
datalabels=[]
for i in range(1,10):
    feature_names,datalabelsi=np.load('data'+str(i)+'.npy')
    datalabels.append(datalabelsi)

In [3]:
[x.shape for x in datalabels]

[(1153, 125),
 (1209, 125),
 (807, 125),
 (1061, 125),
 (1262, 125),
 (1155, 125),
 (1051, 125),
 (1245, 125),
 (23, 125)]

In [4]:
# Stack data from different subjects into one chunk:
datalabels=np.vstack(datalabels)

# 1) Decision Tree Classification:

In [5]:
#shuffle data
np.random.shuffle(datalabels)

traindata=datalabels[:,:-1]
trainlabels=datalabels[:,-1]

In [6]:
cls=DecisionTreeClassifier(max_depth=8,min_impurity_decrease=0.015,max_leaf_nodes=14)
cls.fit(traindata,trainlabels)
cross_val_score(cls, traindata, trainlabels)

array([ 0.83093886,  0.81659973,  0.81373534])

#### Find out important nodes:

In [7]:
y=cls.feature_importances_
a=[[feature_names[i],y[i]] for i in range(len(y))]
a.sort(key=lambda x:-x[1])
a[:15]

[['chest_acc16g_y_std', 0.18200077225066538],
 ['chest_acc16g_z_mean', 0.15845163992146358],
 ['chest_acc16g_y_peak', 0.14467783205349496],
 ['hand_acc16g_x_std', 0.11078914282575206],
 ['hand_acc16g_y_peak', 0.10516702476359059],
 ['hand_mag_x_median', 0.10184077345563092],
 ['hand_temp_mean', 0.082141704181639755],
 ['chest_gyro_y_std', 0.045476254611924699],
 ['heart_rate_peak', 0.040702668546070901],
 ['chest_mag_y_mean', 0.028752187389767026],
 ['heart_rate_mean', 0.0],
 ['hand_acc16g_x_mean', 0.0],
 ['hand_acc16g_y_mean', 0.0],
 ['hand_acc16g_z_mean', 0.0],
 ['hand_gyro_x_mean', 0.0]]

In [19]:
from sklearn import tree
from sklearn.externals.six import StringIO
import pydotplus   #using pydotplus in windows10, python 3.6.X
import pydot

In [28]:
dot_data = StringIO()
class_name=[DP.activity_dict[x] for x in np.unique(trainlabels)]

tree.export_graphviz(cls, out_file=dot_data, 
                         feature_names=feature_names[:-1],  
                         class_names=class_name, 
                         filled=True, rounded=True,  
                         special_characters=True)  
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# graph = pydot.graph_from_dot_data(dot_data.getvalue())
# Image(graph.create_png())
# file_path='tree.png'
# pydot.graph_from_dot_data(dot_data.getvalue()).write_png(file_path)
# i = misc.imread(file_path)
# plt.imshow(i)

#### LOSO cross validation:

In [106]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [22]:
#check LOSO cross validation!

mean_scores=[]
for a in [14]:
    scores=[]
    for iout in range(1,9):

        datalabels=[]
        for i in range(1,iout):
            feature_names,datalabelsi=np.load('data'+str(i)+'.npy')
            datalabels.append(datalabelsi)

        for i in range(iout+1,9):
            feature_names,datalabelsi=np.load('data'+str(i)+'.npy')
            datalabels.append(datalabelsi)

        datalabels=np.vstack(datalabels)

        feature_names,validdatalabels=np.load('data'+str(iout)+'.npy')
        validdata=validdatalabels[:,:-1]
        validlabels=validdatalabels[:,-1]

        #shuffle data
        np.random.shuffle(datalabels)

        traindata=datalabels[:,:-1]
        trainlabels=datalabels[:,-1]

        cls=DecisionTreeClassifier(max_depth=8,min_impurity_decrease=0.015,max_leaf_nodes=14)
        cls.fit(traindata,trainlabels)

        scores.append(cls.score(validdata,validlabels))
    mean_scores.append(np.mean(scores))
    print('a=',a,'mean_score=',np.mean(scores))

a= 14 mean_score= 0.677375244675


In [32]:
print('validation scores:', scores)

validation scores: [0.83174327840416307, 0.73035566583953682, 0.6406443618339529, 0.69651272384542884, 0.62599049128367668, 0.79567099567099564, 0.75832540437678397, 0.33975903614457831]


In [26]:
y_pred=cls.predict(validdata)

confusion_matrix(validlabels, y_pred)

array([[  0, 117,   0,   0,   0,   0,   0,   0,   0,   0,   1,   0],
       [  0, 104,   0,   0,   0,   0,   0,   0,   0,   7,   0,   0],
       [ 51,   6,  52,   0,   0,   0,   0,   0,   0,  14,   0,   0],
       [  0,   0,   0,  67,   0,   0,   0,   0,  90,   0,   0,   0],
       [ 18,   0,   0,   1,   0,   0,   0,   0,  58,   0,   0,   0],
       [  0,   0,   0,   2,   0, 102,   0,   0,   9,   0,  12,   0],
       [  0,   0,   0,  35,   0,   0,   0,   0, 108,   0,   0,   0],
       [  0,   0,   0,   9,   0,   0,   0,   0,  28,   2,   2,   0],
       [  0,   0,   0,   9,   0,   0,   0,   0,  20,   1,   0,   0],
       [  0,   0,   1,   2,   0,   0,   0,   0,   0,  78,  38,   0],
       [  0,   5,  10,   0,   0,   0,   0,   0,   0, 150,   0,   0],
       [  1,   0,   0,   0,   0,   0,   0,   0,  33,   0,   2,   0]], dtype=int64)

### Changing parameters in DecisionTreeClassifier:

In [31]:
X=datalabels[:,:98]
y=datalabels[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

**Change depth of tree:**

In [66]:
cv_scores = []
for n in [5, 10, 15]:
    cls=DecisionTreeClassifier(max_depth=n, random_state=42)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([ 0.8166249 ,  0.77434183,  0.78490566]),
 array([ 0.91478697,  0.92352695,  0.90985325]),
 array([ 0.90685046,  0.92770581,  0.91027254])]

**Change minimum number of sample required for a leaf node: (default = 1)**

In [68]:
cv_scores = []
for n in [1, 4, 8]:
    cls=DecisionTreeClassifier(max_depth=12, min_samples_leaf=n, random_state=42)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([ 0.91060986,  0.92310907,  0.9115304 ]),
 array([ 0.90726817,  0.91558713,  0.90733753]),
 array([ 0.90392648,  0.90472211,  0.90356394])]

** Change maximum number of leaf nodes: (default = None)**

In [72]:
cv_scores = []
for n in [12, 20, 30, 40, None]:
    cls=DecisionTreeClassifier(max_depth=12, min_samples_leaf=1, max_leaf_nodes=n, random_state=42)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([ 0.78822055,  0.77893857,  0.77484277]),
 array([ 0.82915622,  0.83159214,  0.82641509]),
 array([ 0.86758563,  0.87547012,  0.85744235]),
 array([ 0.88220551,  0.89134977,  0.8754717 ]),
 array([ 0.91060986,  0.92310907,  0.9115304 ])]

**Testing with test samples:**

In [73]:
cls=DecisionTreeClassifier(max_depth=12, min_samples_leaf=1, max_leaf_nodes=None, random_state=42)
cls.fit(X_train,y_train)
cls.score(X_test,y_test)

0.94202898550724634

# 2) Random Forest Classifier:

In [33]:
from sklearn.ensemble import RandomForestClassifier

#### Changing number of trees: (default n_estimators=10)

In [65]:
cv_scores = []
for n in [5, 10, 15, 20]:
    cls=RandomForestClassifier(n_estimators=n, random_state=42)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([ 0.95238095,  0.93773506,  0.9408805 ]),
 array([ 0.96867168,  0.96113665,  0.96436059]),
 array([ 0.97410192,  0.97074802,  0.96603774]),
 array([ 0.97869674,  0.97158379,  0.97232704])]

**Change depth of trees: (default max_depth=None)**

In [50]:
cv_scores = []
for n in [1, 2, 3, 4, 7, 10, 15, None]:
    cls=RandomForestClassifier(n_estimators=12, max_depth=n, random_state=42)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([ 0.35839599,  0.35812787,  0.35639413]),
 array([ 0.44444444,  0.51358128,  0.4524109 ]),
 array([ 0.67710944,  0.71291266,  0.71530398]),
 array([ 0.80743525,  0.8069369 ,  0.7932914 ]),
 array([ 0.92940685,  0.94149603,  0.92285115]),
 array([ 0.96073517,  0.97033013,  0.95345912]),
 array([ 0.9657477 ,  0.97033013,  0.96771488]),
 array([ 0.96908939,  0.96573339,  0.96771488])]

**Testing with test samples:**

In [51]:
cls=RandomForestClassifier(n_estimators=12, max_depth=12, random_state=42)
cls.fit(X_train,y_train)
cls.score(X_test,y_test)

0.97547380156075814

# 3) K-Nearest Neighbors Classifier:

In [53]:
from sklearn.neighbors import KNeighborsClassifier

#### Change number of nearest neighbors: (default n_neighbors=5)

In [57]:
cv_scores = []
for n in [1, 2, 3, 5, 7, 9]:
    cls=KNeighborsClassifier(n_neighbors=n)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([ 0.93817878,  0.93815295,  0.92997904]),
 array([ 0.91102757,  0.91558713,  0.91446541]),
 array([ 0.91854637,  0.92478061,  0.91991614]),
 array([ 0.91102757,  0.9164229 ,  0.91236897]),
 array([ 0.90309106,  0.91307982,  0.90901468]),
 array([ 0.89807853,  0.90597576,  0.90398323])]

**Change the weight function used in prediction: (default='uniform')**

In [74]:
cv_scores = []
for n in ['uniform', 'distance']:
    cls=KNeighborsClassifier(n_neighbors=3, weights=n)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([ 0.91854637,  0.92478061,  0.91991614]),
 array([ 0.92815372,  0.93063101,  0.92494759])]

**Testing with test samples:**

In [75]:
cls=KNeighborsClassifier(n_neighbors=3, weights='distance')
cls.fit(X_train,y_train)
cls.score(X_test,y_test)

0.94425863991081382

# 4) Support Vector Machine:

In [63]:
from sklearn.svm import SVC

**Change kernel type: [‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’]**

In [79]:
cv_scores = []
for n in ['linear','poly', 'rbf', 'sigmoid']:
    cls=SVC(kernel=n)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([ 0.96825397,  0.96615127,  0.96477987]),
 array([ 0.9695071 ,  0.96865859,  0.9706499 ]),
 array([ 0.47660819,  0.4755537 ,  0.48050314]),
 array([ 0.13074353,  0.13079816,  0.13081761])]

**Change parameters correspond to different kernel: gamma and degree. Both parameters don't seem to affect too much.**

In [81]:
cv_scores = []
for n in ['auto', 1/100, 1/50, 1/200]:
    cls=SVC(kernel='linear', gamma=n)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([ 0.96825397,  0.96615127,  0.96477987]),
 array([ 0.96825397,  0.96615127,  0.96477987]),
 array([ 0.96825397,  0.96615127,  0.96477987]),
 array([ 0.96825397,  0.96615127,  0.96477987])]

In [83]:
cv_scores = []
for n in [1, 2, 3, 4, 5]:
    cls=SVC(kernel='poly', degree=n)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([ 0.96741855,  0.96740493,  0.96519916]),
 array([ 0.9732665 ,  0.96907647,  0.97148847]),
 array([ 0.9695071 ,  0.96865859,  0.9706499 ]),
 array([ 0.96282373,  0.96698705,  0.9672956 ]),
 array([ 0.96073517,  0.96406185,  0.96268344])]

**Testing with test samples:**

In [84]:
cls=SVC(kernel='poly', degree=2)
cls.fit(X_train,y_train)
cls.score(X_test,y_test)

0.97770345596432551

# 5) Gaussian Naive Bayes:

In [35]:
from sklearn.naive_bayes import GaussianNB

In [91]:
cls=GaussianNB()
cls.fit(X_train,y_train)
cls.score(X_test,y_test)

0.91806020066889638

**It seems that GaussianNB is not doing super well. Is it because it assumes that the features are independent?**

# 6) Gradient Boost classifier:

In [94]:
from sklearn.ensemble import GradientBoostingClassifier

**Change learning_rate: default = 0.1**

In [97]:
cv_scores = []
for n in [0.05, 0.1, 0.5]:
    cls=GradientBoostingClassifier(learning_rate=n)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([ 0.98245614,  0.97325533,  0.96855346]),
 array([ 0.98746867,  0.9799415 ,  0.97651992]),
 array([ 0.98120301,  0.97910573,  0.97400419])]

**Change the number of boosting stages to perform: default n_estimators = 100**

In [98]:
cv_scores = []
for n in [20, 50, 120]:
    cls=GradientBoostingClassifier(learning_rate=0.1, n_estimators=n)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([ 0.96324144,  0.95528625,  0.94465409]),
 array([ 0.98287385,  0.97450898,  0.96813417]),
 array([ 0.98913952,  0.98203092,  0.97861635])]

**Change the maximum depth of the individual regression estimators. The maximum depth limits the number of nodes in the tree.**

In [103]:
cv_scores = []
for n in [1, 3, 5]:
    cls=GradientBoostingClassifier(learning_rate=0.2, n_estimators=75, max_depth=n)
    cv_scores.append(cross_val_score(cls, X_train, y_train))
    
cv_scores

[array([ 0.9770259 ,  0.96782282,  0.96352201]),
 array([ 0.98788638,  0.98035938,  0.97903564]),
 array([ 0.98746867,  0.97868784,  0.9769392 ])]

**Tesing with the test samples:**

In [102]:
cls=GradientBoostingClassifier(learning_rate=0.2, n_estimators=75, max_depth=3)
cls.fit(X_train,y_train)
cls.score(X_test,y_test)

0.99163879598662208

# 7) Neural Network Multi-layer Perceptron classifier:

In [104]:
from sklearn.neural_network import MLPClassifier

In [105]:
clf = MLPClassifier(activation='relu', solver='adam', alpha=1e-5, random_state=42)
cls.fit(X_train,y_train)
cls.score(X_test,y_test)

0.98996655518394649

In [108]:
y_pred=cls.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[200,   0,   0,   0,   0,   0,   0,   0,   0,   2,   0,   0],
       [  0, 161,   1,   0,   0,   0,   0,   0,   0,   2,   0,   0],
       [  0,   0, 190,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0, 228,   0,   0,   0,   0,   1,   0,   0,   0],
       [  0,   0,   0,   0,  95,   0,   0,   0,   1,   0,   0,   0],
       [  0,   0,   0,   0,   0, 160,   0,   1,   0,   1,   0,   0],
       [  0,   0,   0,   1,   0,   0, 176,   0,   0,   1,   0,   0],
       [  0,   0,   0,   1,   0,   0,   0,  83,   0,   1,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   1,  70,   1,   0,   0],
       [  0,   0,   1,   0,   0,   0,   0,   1,   0, 151,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 217,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   0,  45]], dtype=int64)

In [109]:
clf = MLPClassifier(hidden_layer_sizes=(25, 2), activation='relu', solver='adam', alpha=1e-5, random_state=42)
cls.fit(X_train,y_train)
cls.score(X_test,y_test)

0.98885172798216281

In [112]:
clf = MLPClassifier(hidden_layer_sizes=(10, 20, 10, 2), activation='relu', solver='adam', alpha=1e-5, random_state=42)
cls.fit(X_train,y_train)
cls.score(X_test,y_test)

0.98773690078037901