In [39]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn import preprocessing

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm

from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [40]:
# df = pd.read_csv('./data/processed_data/all/features_from_processed_data/all_data.csv')
df = pd.read_csv('./data/processed_data/all/resample_to_50_per_act/time_features/time_w50.csv')

In [41]:
df.shape

(5026, 58)

In [42]:
raw_df_data = np.array(df.drop('label',axis=1))

In [43]:
raw_df_data.shape

(5026, 57)

In [44]:
df_labels = df['label']

In [45]:
df_labels.shape

(5026,)

In [46]:
pca = PCA(n_components=0.9)
pca.fit(raw_df_data)
df_data = pca.transform(raw_df_data)
df_data.shape
pca.explained_variance_ratio_
pca.explained_variance_

PCA(copy=True, iterated_power='auto', n_components=0.9, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

(5026, 27)

array([ 0.14441308,  0.12742006,  0.08947059,  0.07081318,  0.06673207,
        0.04188902,  0.0321859 ,  0.02743308,  0.02413862,  0.022988  ,
        0.02177545,  0.02021839,  0.01940706,  0.01797794,  0.01755773,
        0.01725329,  0.01643671,  0.01608013,  0.01554442,  0.01477554,
        0.01462052,  0.01374667,  0.01251437,  0.01197577,  0.01149326,
        0.01021804,  0.0097561 ])

array([ 8.23318351,  7.26438887,  5.10083839,  4.03715437,  3.80448519,
        2.38814939,  1.83496146,  1.56399655,  1.37617502,  1.310577  ,
        1.24144753,  1.1526778 ,  1.10642242,  1.02494661,  1.0009899 ,
        0.98363307,  0.93707897,  0.91674992,  0.88620837,  0.84237353,
        0.83353559,  0.78371638,  0.71346129,  0.68275481,  0.65524606,
        0.58254427,  0.5562082 ])

In [47]:
df_data.mean(axis=0)
df_data.std(axis=0)

array([ -2.29555067e-16,   7.04216276e-17,  -4.02737489e-16,
         6.80624589e-16,   4.14643780e-16,   1.81968558e-16,
        -1.47580582e-16,   1.75777949e-16,   1.72243614e-16,
        -1.86215282e-17,   2.23767593e-17,  -5.59750327e-17,
         1.26551287e-16,  -4.47976978e-17,   6.83672953e-18,
         2.65737823e-17,  -2.54472130e-17,   1.38722654e-17,
        -6.36622116e-17,  -4.54824753e-17,   3.69558918e-17,
         2.19128778e-17,   6.53852000e-18,  -4.37373973e-17,
         2.55355714e-17,  -9.50515256e-17,  -4.52394897e-17])

array([ 2.86906699,  2.69498488,  2.25827888,  2.00906723,  1.95031491,
        1.54521009,  1.35447273,  1.25047406,  1.17298816,  1.14469046,
        1.11409179,  1.07352152,  1.05176151,  1.01229575,  1.00039529,
        0.99168411,  0.96793209,  0.95737533,  0.94129275,  0.91771778,
        0.91289087,  0.8851895 ,  0.84458234,  0.82620758,  0.80939218,
        0.76316995,  0.74571947])

In [48]:
train_data, test_data, train_labels, test_labels = train_test_split(df_data, df_labels, test_size=0.1)

### 1. kNN

In [49]:
#knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn = KNeighborsClassifier()

In [50]:
# 对数据集进行指定次数的交叉验证并为每次验证效果评测
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
# cv = KFold(n_splits=20)
scores = cross_val_score(knn, df_data, df_labels, cv=cv)

In [51]:
scores

array([ 0.86878728,  0.91650099,  0.90258449,  0.87673956,  0.89860835,
        0.89065606,  0.88667992,  0.87673956,  0.88270378,  0.90059642])

In [52]:
scores.mean()

0.8900596421471173

In [53]:
knn.fit(train_data, train_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [54]:
knn_labels = knn.predict(test_data)

In [55]:
knn_labels.size

503

In [56]:
test_labels.size

503

In [57]:
#knn_labels[0] == test_labels[0]

In [58]:
print(classification_report(test_labels, knn_labels,digits=5))

             precision    recall  f1-score   support

      bweep    0.87162   0.89583   0.88356       144
      clean    0.89855   0.79487   0.84354        78
      daily    0.85542   0.83529   0.84524        85
       dump    0.84483   0.87500   0.85965        56
        run    1.00000   1.00000   1.00000         6
      sweep    0.88430   0.93860   0.91064       114
       walk    0.88889   0.80000   0.84211        20

avg / total    0.87517   0.87475   0.87409       503



### 2. DesicisionTree

In [59]:
dt = DecisionTreeClassifier(criterion='entropy')

In [60]:
# 对数据集进行指定次数的交叉验证并为每次验证效果评测
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
scores = cross_val_score(dt, df_data, df_labels, cv=cv)

In [61]:
scores

array([ 0.7693837 ,  0.78926441,  0.75944334,  0.77932406,  0.78926441,
        0.73956262,  0.76143141,  0.75149105,  0.75944334,  0.7693837 ])

In [62]:
scores.mean()

0.76679920477137187

In [63]:
dt.fit(train_data, train_labels)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [64]:
dt_labels = dt.predict(test_data)

In [65]:
print(classification_report(test_labels, dt_labels, digits=5))

             precision    recall  f1-score   support

      bweep    0.77083   0.77083   0.77083       144
      clean    0.71429   0.70513   0.70968        78
      daily    0.66667   0.70588   0.68571        85
       dump    0.67241   0.69643   0.68421        56
        run    1.00000   0.50000   0.66667         6
      sweep    0.84956   0.84211   0.84581       114
       walk    0.77778   0.70000   0.73684        20

avg / total    0.75436   0.75149   0.75172       503



### 3. Naive Bayes

In [66]:
gnb = GaussianNB()
#gnb = BernoulliNB()

In [67]:
# 对数据集进行指定次数的交叉验证并为每次验证效果评测
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
scores = cross_val_score(gnb, df_data, df_labels, cv=cv)

In [68]:
scores

array([ 0.61630219,  0.56858847,  0.63021869,  0.62823062,  0.58846918,
        0.57654076,  0.59244533,  0.59045726,  0.57654076,  0.58449304])

In [69]:
scores.mean()

0.59522862823061629

In [70]:
gnb.fit(train_data, train_labels)

GaussianNB(priors=None)

In [71]:
gnb_labels = gnb.predict(test_data)

In [72]:
print(classification_report(test_labels, gnb_labels, digits=5))

             precision    recall  f1-score   support

      bweep    0.51124   0.63194   0.56522       144
      clean    0.68571   0.61538   0.64865        78
      daily    0.62162   0.27059   0.37705        85
       dump    0.82500   0.58929   0.68750        56
        run    0.75000   1.00000   0.85714         6
      sweep    0.67143   0.82456   0.74016       114
       walk    0.46667   0.70000   0.56000        20

avg / total    0.62926   0.61431   0.60289       503



### 4. SVM

In [73]:
clf = svm.SVC(C=5, cache_size=1000)

In [74]:
# 对数据集进行指定次数的交叉验证并为每次验证效果评测
# cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
# cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1)
scores = cross_val_score(clf, df_data, df_labels, cv=cv)

In [75]:
scores

array([ 0.92644135,  0.92047714,  0.90656064,  0.92047714,  0.91848907,
        0.92047714,  0.90457256,  0.90258449,  0.91650099,  0.91650099])

In [76]:
scores.mean()

0.91530815109343955

In [77]:
# T = 10
# ave_scores =[]
# for t in range(T):
#     ave_scores.append( cross_val_score(clf, df_data, df_labels, cv=cv).mean() )
# ave_scores = np.array(ave_scores)
# ave_scores
# ave_scores.mean()

In [78]:
clf.fit(train_data, train_labels)

SVC(C=5, cache_size=1000, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [79]:
#这个计算出的实际上是recall的加权平均值
clf.score(test_data, test_labels)

0.89264413518886676

In [80]:
clf_labels =clf.predict(test_data)

In [81]:
#这个计算的是f1-score的加权平均值
f1_score(test_labels, clf_labels, average='weighted')  

0.89228636619786417

In [82]:
print(classification_report(test_labels, clf_labels,digits=5))

             precision    recall  f1-score   support

      bweep    0.88158   0.93056   0.90541       144
      clean    0.93151   0.87179   0.90066        78
      daily    0.87059   0.87059   0.87059        85
       dump    0.87273   0.85714   0.86486        56
        run    1.00000   0.83333   0.90909         6
      sweep    0.89744   0.92105   0.90909       114
       walk    0.93750   0.75000   0.83333        20

avg / total    0.89371   0.89264   0.89229       503

