In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing

# from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit
# from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.naive_bayes import BernoulliNB
from sklearn import svm

from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [2]:
df = pd.read_csv('./data/processed_data/all/resample_to_50_per_act/frequency_features/freq_w50.csv')

In [3]:
df.shape

(5026, 34)

In [4]:
df_data = np.array(df.drop('label',axis=1))

In [5]:
df_data.shape

(5026, 33)

In [6]:
df_labels = df['label']

In [7]:
df_labels.shape

(5026,)

In [8]:
train_data, test_data, train_labels, test_labels = train_test_split(df_data, df_labels, test_size=0.1)

### 1. kNN

In [9]:
#knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn = KNeighborsClassifier()

In [10]:
# 对数据集进行指定次数的交叉验证并为每次验证效果评测
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
# cv = KFold(n_splits=20)
scores = cross_val_score(knn, df_data, df_labels, cv=cv)

In [11]:
scores

array([ 0.73359841,  0.74751491,  0.7693837 ,  0.77733598,  0.76143141,
        0.75347913,  0.75745527,  0.75347913,  0.77534791,  0.75149105])

In [12]:
scores.mean()

0.75805168986083493

In [13]:
knn.fit(train_data, train_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [14]:
knn_labels = knn.predict(test_data)

In [15]:
knn_labels.size

503

In [16]:
test_labels.size

503

In [17]:
#knn_labels[0] == test_labels[0]

In [18]:
print(classification_report(test_labels, knn_labels,digits=5))

             precision    recall  f1-score   support

      bweep    0.72289   0.81633   0.76677       147
      clean    0.88710   0.73333   0.80292        75
      daily    0.71591   0.69231   0.70391        91
       dump    0.81132   0.76786   0.78899        56
        run    0.71429   0.83333   0.76923         6
      sweep    0.79439   0.80189   0.79812       106
       walk    0.75000   0.68182   0.71429        22

avg / total    0.77211   0.76740   0.76760       503



### 2. DesicisionTree

In [19]:
dt = DecisionTreeClassifier(criterion='entropy')

In [20]:
# 对数据集进行指定次数的交叉验证并为每次验证效果评测
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
scores = cross_val_score(dt, df_data, df_labels, cv=cv)

In [21]:
scores

array([ 0.68787276,  0.67992048,  0.73161034,  0.71968191,  0.70974155,
        0.70377734,  0.71371769,  0.72564612,  0.69980119,  0.70974155])

In [22]:
scores.mean()

0.70815109343936389

In [23]:
dt.fit(train_data, train_labels)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [24]:
dt_labels = dt.predict(test_data)

In [25]:
print(classification_report(test_labels, dt_labels, digits=5))

             precision    recall  f1-score   support

      bweep    0.64671   0.73469   0.68790       147
      clean    0.66197   0.62667   0.64384        75
      daily    0.63441   0.64835   0.64130        91
       dump    0.84783   0.69643   0.76471        56
        run    0.71429   0.83333   0.76923         6
      sweep    0.78788   0.73585   0.76098       106
       walk    0.85000   0.77273   0.80952        22

avg / total    0.70860   0.70179   0.70314       503



### 3. Naive Bayes

In [26]:
gnb = GaussianNB()
#gnb = BernoulliNB()

In [27]:
# 对数据集进行指定次数的交叉验证并为每次验证效果评测
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
scores = cross_val_score(gnb, df_data, df_labels, cv=cv)

In [28]:
scores

array([ 0.46918489,  0.45924453,  0.48906561,  0.48707753,  0.49701789,
        0.49304175,  0.45725646,  0.49502982,  0.47514911,  0.47514911])

In [29]:
scores.mean()

0.47972166998011934

In [30]:
gnb.fit(train_data, train_labels)

GaussianNB(priors=None)

In [31]:
gnb_labels = gnb.predict(test_data)

In [32]:
print(classification_report(test_labels, gnb_labels, digits=5))

             precision    recall  f1-score   support

      bweep    0.54930   0.26531   0.35780       147
      clean    0.37681   0.34667   0.36111        75
      daily    0.49462   0.50549   0.50000        91
       dump    0.43820   0.69643   0.53793        56
        run    0.50000   0.83333   0.62500         6
      sweep    0.62222   0.79245   0.69710       106
       walk    0.30556   0.50000   0.37931        22

avg / total    0.50544   0.49702   0.47970       503



### 4. SVM

In [33]:
clf = svm.SVC(C=5, cache_size=1000)

In [34]:
# 对数据集进行指定次数的交叉验证并为每次验证效果评测
# cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
# cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1)
scores = cross_val_score(clf, df_data, df_labels, cv=cv)

In [35]:
scores

array([ 0.80119284,  0.84294235,  0.82107356,  0.80318091,  0.80516899,
        0.8111332 ,  0.80516899,  0.79324056,  0.7972167 ,  0.82306163])

In [36]:
scores.mean()

0.8103379721669981

In [37]:
clf.fit(train_data, train_labels)

SVC(C=5, cache_size=1000, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [38]:
#这个计算出的实际上是recall的加权平均值
clf.score(test_data, test_labels)

0.80516898608349896

In [39]:
clf_labels =clf.predict(test_data)

In [40]:
#这个计算的是f1-score的加权平均值
f1_score(test_labels, clf_labels, average='weighted')  

0.80568245059011123

In [41]:
print(classification_report(test_labels, clf_labels,digits=5))

             precision    recall  f1-score   support

      bweep    0.72515   0.84354   0.77987       147
      clean    0.86441   0.68000   0.76119        75
      daily    0.76136   0.73626   0.74860        91
       dump    1.00000   0.83929   0.91262        56
        run    0.83333   0.83333   0.83333         6
      sweep    0.83036   0.87736   0.85321       106
       walk    0.90000   0.81818   0.85714        22

avg / total    0.81417   0.80517   0.80568       503

