In [66]:
from siml.sk_utils import *
from siml.signal_analysis_utils import *
import numpy as np
import pywt
import scipy.stats
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn import svm

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split    

## 加载数据文件

In [67]:
typeDescription = {
    0: 'normal',
    1: 'pothole',
    2: 'transverse',
}


def readFile(filename):
    return np.loadtxt(filename)

In [68]:
folderPath = '../../data/Final_Version/all/datasets/'
dataFile = ['dataX.txt', 'dataY.txt', 'dataZ.txt']
labelFile = ['dataLabel.txt']

signals = []
for file in dataFile:
    dataPath = folderPath+file
    signals.append(np.loadtxt(dataPath))
signals = np.transpose(np.array(signals), (1, 2, 0))
print('数据集：', signals.shape)

labelFilePath = folderPath+labelFile[0]
dataLabel = np.loadtxt(labelFilePath)
anomalyType = list(dataLabel[:, 0])

dic = {}
temp = Counter(anomalyType)
for key in temp.keys():
    dic[typeDescription[key]] = temp[key]
print(dic)

数据集： (4088, 64, 3)
{'normal': 3601, 'pothole': 474, 'transverse': 13}


## 特征提取

### 时域特征提取

In [69]:
def calculate_entropy(list_values):
    counter_values = Counter(list_values).most_common()
    # print(counter_values)
    probabilities = [elem[1]/len(list_values) for elem in counter_values]
    # print(probabilities)
    entropy = scipy.stats.entropy(probabilities)
    return entropy


def calculate_statistics(list_values):
    n5 = np.nanpercentile(list_values, 5)
    n25 = np.nanpercentile(list_values, 25)
    n75 = np.nanpercentile(list_values, 75)
    n95 = np.nanpercentile(list_values, 95)
    median = np.nanpercentile(list_values, 50)
    mean = np.nanmean(list_values)
    std = np.nanstd(list_values)
    var = np.nanvar(list_values)
    rms = np.nanmean(np.sqrt(list_values**2))
    return [n5, n25, n75, n95, median, mean, std, var, rms]


def calculate_crossings(list_values):
    zero_crossing_indices = np.nonzero(np.diff(np.array(list_values) > 0))[0]
    no_zero_crossings = len(zero_crossing_indices)
    mean_crossing_indices = np.nonzero(
        np.diff(np.array(list_values) > np.nanmean(list_values)))[0]
    no_mean_crossings = len(mean_crossing_indices)
    return [no_zero_crossings, no_mean_crossings]


def get_features(list_values):
    entropy = calculate_entropy(list_values)
    crossings = calculate_crossings(list_values)
    statistics = calculate_statistics(list_values)
    return [entropy] + crossings + statistics


def extract_features1(dataset):
    uci_har_features = []
    for signal_no in range(0, len(dataset)):
        features = []
        for signal_comp in range(0, dataset.shape[2]):
            signal = dataset[signal_no, :, signal_comp]
            features += get_features(signal)
        uci_har_features.append(features)
    X = np.array(uci_har_features)
    return X

In [70]:
features1 = extract_features1(signals)

### 频域特征提取

In [71]:
#FFT
def get_fft_values(y_values, N, f_s):
    f_values = np.linspace(0.0, f_s/2.0, N//2)
    fft_values_ = fft(y_values)
    fft_values = 2.0/N * np.abs(fft_values_[0:N//2])
    return fft_values

#PSD
def get_psd_values(y_values, N, f_s):
    f_values, psd_values = welch(y_values, fs=f_s)
    return  psd_values

#Autocorrelation
def autocorr(x):
    result = np.correlate(x, x, mode='full')
    return result[len(result)//2:]
 
def get_autocorr_values(y_values, N, f_s):
    autocorr_values = autocorr(y_values)
    x_values = np.array([ 1.0*jj/f_s for jj in range(0, N)])
    return autocorr_values


In [72]:
def extract_features2(dataset, N, f_s):
    uci_har_features = []
    for signal_no in range(0, len(dataset)):
        features = []
        for signal_comp in range(0, dataset.shape[2]):
            signal = dataset[signal_no, :, signal_comp]
            features += get_features(get_fft_values(signal, N, f_s))
            features += get_features(get_psd_values(signal, N, f_s))
            features += get_features(get_autocorr_values(signal, N, f_s))
        uci_har_features.append(features)
    X = np.array(uci_har_features)
    return X

In [73]:
N = 64 #样本数
f_s = 50 #采样频率
features2 = extract_features2(
    signals, N, f_s)

In [74]:
print(features1.shape)
print(features2.shape)
features = np.hstack((features1,features2))
print(features.shape)
labels = np.array(anomalyType)

(4088, 36)
(4088, 108)
(4088, 144)


In [75]:
StandardScaler().fit_transform(features)
Normalizer().fit_transform(features)

array([[0.03575385, 0.19112452, 0.19112452, ..., 0.07460356, 0.64065675,
        0.04425083],
       [0.03739319, 0.15365001, 0.15365001, ..., 0.06892976, 0.52569022,
        0.03518355],
       [0.04312016, 0.19802731, 0.19802731, ..., 0.05407852, 0.28059378,
        0.02499524],
       ...,
       [0.0051629 , 0.0137993 , 0.0137993 , ..., 0.03522252, 0.9889549 ,
        0.02238954],
       [0.02768039, 0.11433838, 0.11433838, ..., 0.07276565, 0.78724462,
        0.03910679],
       [0.04581059, 0.14546949, 0.14546949, ..., 0.08898191, 0.70757898,
        0.05596641]])

## 信号分类

In [76]:
def randomize(features, labels):
    permutation = np.random.permutation(labels.shape[0])
    shuffled_features = features[permutation, :]
    shuffled_labels = labels[permutation]
    return shuffled_features, shuffled_labels

iteration = 100
warnings.filterwarnings('ignore')

### Logistic Regression

In [77]:
yTest, yPredict = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    X, Y = randomize(features, labels)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, Y_train)
    Y_test_pred = clf.predict(X_test)
    yTest.extend(Y_test)
    yPredict.extend(Y_test_pred)
    trainingScore += clf.score(X_train, Y_train)
    testingScore += clf.score(X_test, Y_test)
print("Results of Logistic Regression")
print("Accuracy on training set is : {}".format(trainingScore/iteration))
print("Accuracy on test set is : {}".format(testingScore/iteration))
print(classification_report(yTest, yPredict, digits=3))

Results of Logistic Regression
Accuracy on training set is : 0.9566340440405453
Accuracy on test set is : 0.9536430317848418
              precision    recall  f1-score   support

         0.0      0.963     0.986     0.975    108016
         1.0      0.871     0.732     0.795     14293
         2.0      0.000     0.000     0.000       391

    accuracy                          0.954    122700
   macro avg      0.611     0.573     0.590    122700
weighted avg      0.949     0.954     0.951    122700



### Support Vector Machine

In [78]:
yTest, yPredict = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    X, Y = randomize(features, labels)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = svm.SVC()
    clf.fit(X_train, Y_train)
    Y_test_pred = clf.predict(X_test)
    yTest.extend(Y_test)
    yPredict.extend(Y_test_pred)
    trainingScore += clf.score(X_train, Y_train)
    testingScore += clf.score(X_test, Y_test)
print("Results of Support Vector Machine")
print("Accuracy on training set is : {}".format(trainingScore/iteration))
print("Accuracy on test set is : {}".format(testingScore/iteration))
print(classification_report(yTest, yPredict, digits=3))

Results of Support Vector Machine
Accuracy on training set is : 0.9467458930443907
Accuracy on test set is : 0.9438549307253464
              precision    recall  f1-score   support

         0.0      0.945     0.994     0.969    107922
         1.0      0.929     0.594     0.724     14334
         2.0      0.000     0.000     0.000       444

    accuracy                          0.944    122700
   macro avg      0.625     0.529     0.564    122700
weighted avg      0.940     0.944     0.937    122700



### Random Forest

In [82]:
yTest, yPredict = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    X, Y = randomize(features, labels)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, Y_train)
    Y_test_pred = clf.predict(X_test)
    yTest.extend(Y_test)
    yPredict.extend(Y_test_pred)
    trainingScore += clf.score(X_train, Y_train)
    testingScore += clf.score(X_test, Y_test)
print("Results of RandomForest")
print("Accuracy on training set is : {}".format(trainingScore/iteration))
print("Accuracy on test set is : {}".format(testingScore/iteration))
print(classification_report(yTest, yPredict, digits=3))

Results of RandomForest
Accuracy on training set is : 0.9999720377490386
Accuracy on test set is : 0.9557946210268947
              precision    recall  f1-score   support

         0.0      0.965     0.987     0.976    108062
         1.0      0.874     0.749     0.807     14239
         2.0      0.000     0.000     0.000       399

    accuracy                          0.956    122700
   macro avg      0.613     0.579     0.594    122700
weighted avg      0.951     0.956     0.953    122700



## 跨数据集

### Logistic Regression

In [86]:
iteration = 100
yTestPoor, yPredictPoor = [], []
yTestBad, yPredictBad = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    permutation = np.random.permutation(features.shape[0])
    X = features[permutation, :]
    Y = dataLabel[permutation, :]

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, Y_train[:, 0])
    # 分割测试集为分别来自bad和poor的测试集
    X_test_poor, Y_test_poor, X_test_bad, Y_test_bad = [], [], [], []
    for idx in range(X_test.shape[0]):
        if Y_test[idx][2] == 1:
            X_test_poor.append(X_test[idx, :])
            Y_test_poor.append(Y_test[idx, :])
        else:
            X_test_bad.append(X_test[idx, :])
            Y_test_bad.append(Y_test[idx, :])
    # poor
    X_test_poor = np.array(X_test_poor)
    Y_test_poor = np.array(Y_test_poor)
    Y_test_poor_pred = clf.predict(X_test_poor)
    yTestPoor.extend(Y_test_poor[:, 0])
    yPredictPoor.extend(Y_test_poor_pred)
    # bad
    X_test_bad = np.array(X_test_bad)
    Y_test_bad = np.array(Y_test_bad)
    Y_test_bad_pred = clf.predict(X_test_bad)
    yTestBad.extend(Y_test_bad[:, 0])
    yPredictBad.extend(Y_test_bad_pred)

print("Results of Logistic Reg")
print("Test Results of Poor Road")
print(classification_report(yTestPoor, yPredictPoor, digits=3))
print("Test Results of Bad Road")
print(classification_report(yTestBad, yPredictBad, digits=3))

Results of Logistic Reg
Test Results of Poor Road
              precision    recall  f1-score   support

         0.0      0.936     0.998     0.966     24615
         1.0      0.825     0.331     0.472      2088
         2.0      0.000     0.000     0.000       374

    accuracy                          0.933     27077
   macro avg      0.587     0.443     0.479     27077
weighted avg      0.915     0.933     0.915     27077

Test Results of Bad Road
              precision    recall  f1-score   support

         0.0      0.971     0.983     0.977     83458
         1.0      0.872     0.799     0.834     12165
         2.0      0.000     0.000     0.000         0

    accuracy                          0.959     95623
   macro avg      0.614     0.594     0.604     95623
weighted avg      0.959     0.959     0.959     95623



### Support Vector Machine

In [87]:
iteration = 100
yTestPoor, yPredictPoor = [], []
yTestBad, yPredictBad = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    permutation = np.random.permutation(features.shape[0])
    X = features[permutation, :]
    Y = dataLabel[permutation, :]

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = svm.SVC()
    clf.fit(X_train, Y_train[:, 0])
    # 分割测试集为分别来自bad和poor的测试集
    X_test_poor, Y_test_poor, X_test_bad, Y_test_bad = [], [], [], []
    for idx in range(X_test.shape[0]):
        if Y_test[idx][2] == 1:
            X_test_poor.append(X_test[idx, :])
            Y_test_poor.append(Y_test[idx, :])
        else:
            X_test_bad.append(X_test[idx, :])
            Y_test_bad.append(Y_test[idx, :])
    # poor
    X_test_poor = np.array(X_test_poor)
    Y_test_poor = np.array(Y_test_poor)
    Y_test_poor_pred = clf.predict(X_test_poor)
    yTestPoor.extend(Y_test_poor[:, 0])
    yPredictPoor.extend(Y_test_poor_pred)
    # bad
    X_test_bad = np.array(X_test_bad)
    Y_test_bad = np.array(Y_test_bad)
    Y_test_bad_pred = clf.predict(X_test_bad)
    yTestBad.extend(Y_test_bad[:, 0])
    yPredictBad.extend(Y_test_bad_pred)

print("Results of Support Vector Machine")
print("Test Results of Poor Road")
print(classification_report(yTestPoor, yPredictPoor, digits=3))
print("Test Results of Bad Road")
print(classification_report(yTestBad, yPredictBad, digits=3))

Results of Support Vector Machine
Test Results of Poor Road
              precision    recall  f1-score   support

         0.0      0.915     1.000     0.956     24535
         1.0      0.867     0.093     0.168      2103
         2.0      0.000     0.000     0.000       390

    accuracy                          0.915     27028
   macro avg      0.594     0.364     0.375     27028
weighted avg      0.898     0.915     0.881     27028

Test Results of Bad Road
              precision    recall  f1-score   support

         0.0      0.956     0.993     0.974     83610
         1.0      0.934     0.687     0.791     12062

    accuracy                          0.954     95672
   macro avg      0.945     0.840     0.883     95672
weighted avg      0.954     0.954     0.951     95672



### Random Forest

In [85]:
iteration = 100
yTestPoor, yPredictPoor = [], []
yTestBad, yPredictBad = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    permutation = np.random.permutation(features.shape[0])
    X = features[permutation, :]
    Y = dataLabel[permutation, :]

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf =RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, Y_train[:, 0])
    # 分割测试集为分别来自bad和poor的测试集
    X_test_poor, Y_test_poor, X_test_bad, Y_test_bad = [], [], [], []
    for idx in range(X_test.shape[0]):
        if Y_test[idx][2] == 1:
            X_test_poor.append(X_test[idx, :])
            Y_test_poor.append(Y_test[idx, :])
        else:
            X_test_bad.append(X_test[idx, :])
            Y_test_bad.append(Y_test[idx, :])
    # poor
    X_test_poor = np.array(X_test_poor)
    Y_test_poor = np.array(Y_test_poor)
    Y_test_poor_pred = clf.predict(X_test_poor)
    yTestPoor.extend(Y_test_poor[:, 0])
    yPredictPoor.extend(Y_test_poor_pred)
    # bad
    X_test_bad = np.array(X_test_bad)
    Y_test_bad = np.array(Y_test_bad)
    Y_test_bad_pred = clf.predict(X_test_bad)
    yTestBad.extend(Y_test_bad[:, 0])
    yPredictBad.extend(Y_test_bad_pred)

print("Results of RandomForest")
print("Test Results of Poor Road")
print(classification_report(yTestPoor, yPredictPoor, digits=3))
print("Test Results of Bad Road")
print(classification_report(yTestBad, yPredictBad, digits=3))

Results of RandomForest
Test Results of Poor Road
              precision    recall  f1-score   support

         0.0      0.931     0.998     0.963     24679
         1.0      0.819     0.266     0.402      2044
         2.0      0.000     0.000     0.000       399

    accuracy                          0.928     27122
   macro avg      0.583     0.421     0.455     27122
weighted avg      0.909     0.928     0.907     27122

Test Results of Bad Road
              precision    recall  f1-score   support

         0.0      0.975     0.984     0.980     83298
         1.0      0.885     0.828     0.856     12280

    accuracy                          0.964     95578
   macro avg      0.930     0.906     0.918     95578
weighted avg      0.963     0.964     0.964     95578

