In [61]:
from siml.sk_utils import *
from siml.signal_analysis_utils import *
import numpy as np
import pywt
import scipy.stats
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn import svm

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split    

## 加载数据文件

In [62]:
typeDescription = {
    0: 'normal',
    1: 'pothole',
    2: 'transverse',
}


def readFile(filename):
    return np.loadtxt(filename)

In [63]:
folderPath = '../../data/Final_Version/all/datasets/'
dataFile = ['dataX.txt', 'dataY.txt', 'dataZ.txt']
labelFile = ['dataLabel.txt']

signals = []
for file in dataFile:
    dataPath = folderPath+file
    signals.append(np.loadtxt(dataPath))
signals = np.transpose(np.array(signals), (1, 2, 0))
print('数据集：', signals.shape)

labelFilePath = folderPath+labelFile[0]
dataLabel = np.loadtxt(labelFilePath)
anomalyType = list(dataLabel[:, 0])

dic = {}
temp = Counter(anomalyType)
for key in temp.keys():
    dic[typeDescription[key]] = temp[key]
print(dic)

数据集： (4088, 64, 3)
{'normal': 3601, 'pothole': 474, 'transverse': 13}


## 小波特征提取

In [64]:
def calculate_entropy(list_values):
    counter_values = Counter(list_values).most_common()
    # print(counter_values)
    probabilities = [elem[1]/len(list_values) for elem in counter_values]
    # print(probabilities)
    entropy = scipy.stats.entropy(probabilities)
    return entropy


def calculate_statistics(list_values):
    n5 = np.nanpercentile(list_values, 5)
    n25 = np.nanpercentile(list_values, 25)
    n75 = np.nanpercentile(list_values, 75)
    n95 = np.nanpercentile(list_values, 95)
    median = np.nanpercentile(list_values, 50)
    mean = np.nanmean(list_values)
    std = np.nanstd(list_values)
    var = np.nanvar(list_values)
    rms = np.nanmean(np.sqrt(list_values**2))
    return [n5, n25, n75, n95, median, mean, std, var, rms]


def calculate_crossings(list_values):
    zero_crossing_indices = np.nonzero(np.diff(np.array(list_values) > 0))[0]
    no_zero_crossings = len(zero_crossing_indices)
    mean_crossing_indices = np.nonzero(
        np.diff(np.array(list_values) > np.nanmean(list_values)))[0]
    no_mean_crossings = len(mean_crossing_indices)
    return [no_zero_crossings, no_mean_crossings]


def get_features(list_values):
    entropy = calculate_entropy(list_values)
    crossings = calculate_crossings(list_values)
    statistics = calculate_statistics(list_values)
    return [entropy] + crossings + statistics


def extract_features(dataset, waveletname):
    uci_har_features = []
    for signal_no in range(0, len(dataset)):
        features = []
        for signal_comp in range(0, dataset.shape[2]):
            signal = dataset[signal_no, :, signal_comp]
            # 小波变换，返回长度为5，元素为单列array的list
            list_coeff = pywt.wavedec(signal, waveletname)
            # print(len(list_coeff))
            for coeff in list_coeff:
                # 对小波变换后的系数取其特征：entropy+statistics+crossings
                features += get_features(coeff)
                # print(len(features))
        uci_har_features.append(features)
    X = np.array(uci_har_features)
    return X

In [65]:
#waveletname = 'rbio3.1'
#waveletname = 'haar'
#waveletname = 'sym5'
#waveletname = 'db6'
waveletname = 'db10'
features = extract_features(
    signals, waveletname)

In [66]:
print(features.shape)
labels = np.array(anomalyType)

(4088, 72)


In [67]:
StandardScaler().fit_transform(features)
Normalizer().fit_transform(features)

array([[0.03912839, 0.18965866, 0.18965866, ..., 0.00657882, 0.00410767,
        0.00472577],
       [0.04118145, 0.16634165, 0.18852054, ..., 0.00728029, 0.00477956,
        0.00461288],
       [0.03958348, 0.1598871 , 0.1598871 , ..., 0.00918575, 0.00791603,
        0.0065184 ],
       ...,
       [0.04765539, 0.12832764, 0.12832764, ..., 0.01141782, 0.01015889,
        0.00868853],
       [0.04598632, 0.14859974, 0.17336636, ..., 0.00687021, 0.00381156,
        0.00530571],
       [0.05058995, 0.16347586, 0.16347586, ..., 0.00576659, 0.00244098,
        0.00386836]])

## 信号分类

In [68]:
def randomize(features, labels):
    permutation = np.random.permutation(labels.shape[0])
    shuffled_features = features[permutation, :]
    shuffled_labels = labels[permutation]
    return shuffled_features, shuffled_labels

iteration = 100
warnings.filterwarnings('ignore')

### Logistic Regression

In [69]:
yTest, yPredict = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    X, Y = randomize(features, labels)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, Y_train)
    Y_test_pred = clf.predict(X_test)
    yTest.extend(Y_test)
    yPredict.extend(Y_test_pred)
    trainingScore += clf.score(X_train, Y_train)
    testingScore += clf.score(X_test, Y_test)
print("Results of Logistic Regression")
print("Accuracy on training set is : {}".format(trainingScore/iteration))
print("Accuracy on test set is : {}".format(testingScore/iteration))
print(classification_report(yTest, yPredict, digits=3))

Results of Logistic Regression
Accuracy on training set is : 0.959968542467669
Accuracy on test set is : 0.9551263243683781
              precision    recall  f1-score   support

         0.0      0.963     0.988     0.975    108139
         1.0      0.882     0.731     0.800     14175
         2.0      0.188     0.016     0.029       386

    accuracy                          0.955    122700
   macro avg      0.677     0.578     0.601    122700
weighted avg      0.951     0.955     0.952    122700



### Support Vector Machine

In [70]:
yTest, yPredict = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    X, Y = randomize(features, labels)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = svm.SVC()
    clf.fit(X_train, Y_train)
    Y_test_pred = clf.predict(X_test)
    yTest.extend(Y_test)
    yPredict.extend(Y_test_pred)
    trainingScore += clf.score(X_train, Y_train)
    testingScore += clf.score(X_test, Y_test)
print("Results of Support Vector Machine")
print("Accuracy on training set is : {}".format(trainingScore/iteration))
print("Accuracy on test set is : {}".format(testingScore/iteration))
print(classification_report(yTest, yPredict, digits=3))

Results of Support Vector Machine
Accuracy on training set is : 0.9372492135616918
Accuracy on test set is : 0.9358190709046457
              precision    recall  f1-score   support

         0.0      0.933     0.999     0.965    108110
         1.0      0.982     0.483     0.647     14171
         2.0      0.000     0.000     0.000       419

    accuracy                          0.936    122700
   macro avg      0.638     0.494     0.537    122700
weighted avg      0.936     0.936     0.925    122700



### Random Forest

In [71]:
yTest, yPredict = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    X, Y = randomize(features, labels)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, Y_train)
    Y_test_pred = clf.predict(X_test)
    yTest.extend(Y_test)
    yPredict.extend(Y_test_pred)
    trainingScore += clf.score(X_train, Y_train)
    testingScore += clf.score(X_test, Y_test)
print("Results of RandomForest")
print("Accuracy on training set is : {}".format(trainingScore/iteration))
print("Accuracy on test set is : {}".format(testingScore/iteration))
print(classification_report(yTest, yPredict, digits=3))

Results of RandomForest
Accuracy on training set is : 0.999982523593149
Accuracy on test set is : 0.954588427057865
              precision    recall  f1-score   support

         0.0      0.963     0.987     0.975    107973
         1.0      0.879     0.735     0.801     14326
         2.0      0.000     0.000     0.000       401

    accuracy                          0.955    122700
   macro avg      0.614     0.574     0.592    122700
weighted avg      0.950     0.955     0.951    122700

