In [37]:
from siml.sk_utils import *
from siml.signal_analysis_utils import *
import numpy as np
import pywt
import scipy.stats
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn import svm

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split    

## 加载数据文件

In [38]:
typeDescription = {
    0: 'normal',
    1: 'pothole',
    2: 'transverse',
}


def readFile(filename):
    return np.loadtxt(filename)

In [39]:
folderPath = '../../data/Final_Version/all/datasets/'
dataFile = ['dataX.txt', 'dataY.txt', 'dataZ.txt']
labelFile = ['dataLabel.txt']

signals = []
for file in dataFile:
    dataPath = folderPath+file
    signals.append(np.loadtxt(dataPath))
signals = np.transpose(np.array(signals), (1, 2, 0))
print('数据集：', signals.shape)

labelFilePath = folderPath+labelFile[0]
dataLabel = np.loadtxt(labelFilePath)
anomalyType = list(dataLabel[:, 0])

dic = {}
temp = Counter(anomalyType)
for key in temp.keys():
    dic[typeDescription[key]] = temp[key]
print(dic)

数据集： (4437, 64, 3)
{'normal': 3848, 'pothole': 563, 'transverse': 26}


## 时域特征提取

In [40]:
def calculate_entropy(list_values):
    counter_values = Counter(list_values).most_common()
    # print(counter_values)
    probabilities = [elem[1]/len(list_values) for elem in counter_values]
    # print(probabilities)
    entropy = scipy.stats.entropy(probabilities)
    return entropy


def calculate_statistics(list_values):
    n5 = np.nanpercentile(list_values, 5)
    n25 = np.nanpercentile(list_values, 25)
    n75 = np.nanpercentile(list_values, 75)
    n95 = np.nanpercentile(list_values, 95)
    median = np.nanpercentile(list_values, 50)
    mean = np.nanmean(list_values)
    std = np.nanstd(list_values)
    var = np.nanvar(list_values)
    rms = np.nanmean(np.sqrt(list_values**2))
    return [n5, n25, n75, n95, median, mean, std, var, rms]


def calculate_crossings(list_values):
    zero_crossing_indices = np.nonzero(np.diff(np.array(list_values) > 0))[0]
    no_zero_crossings = len(zero_crossing_indices)
    mean_crossing_indices = np.nonzero(
        np.diff(np.array(list_values) > np.nanmean(list_values)))[0]
    no_mean_crossings = len(mean_crossing_indices)
    return [no_zero_crossings, no_mean_crossings]


def get_features(list_values):
    entropy = calculate_entropy(list_values)
    crossings = calculate_crossings(list_values)
    statistics = calculate_statistics(list_values)
    return [entropy] + crossings + statistics


def extract_features(dataset):
    uci_har_features = []
    for signal_no in range(0, len(dataset)):
        features = []
        for signal_comp in range(0, dataset.shape[2]):
            signal = dataset[signal_no, :, signal_comp]
            features += get_features(signal)
        uci_har_features.append(features)
    X = np.array(uci_har_features)
    return X

In [41]:
features = extract_features(signals)

In [42]:
print(features.shape)
labels = np.array(anomalyType)

(4437, 36)


In [43]:
StandardScaler().fit_transform(features)
Normalizer().fit_transform(features)

array([[0.09125097, 0.35475489, 0.39909925, ..., 0.0609016 , 0.16728191,
        0.03384791],
       [0.06537597, 0.33781342, 0.33781342, ..., 0.01601879, 0.01595151,
        0.01034629],
       [0.06601799, 0.35290347, 0.35290347, ..., 0.01404617, 0.01229937,
        0.0115611 ],
       ...,
       [0.07345631, 0.19633274, 0.19633274, ..., 0.02439802, 0.03335103,
        0.02030705],
       [0.06544915, 0.27034842, 0.27034842, ..., 0.01589514, 0.01588743,
        0.01265916],
       [0.08932298, 0.28364118, 0.28364118, ..., 0.01656266, 0.01257286,
        0.01245703]])

## 信号分类

In [44]:
def randomize(features, labels):
    permutation = np.random.permutation(labels.shape[0])
    shuffled_features = features[permutation, :]
    shuffled_labels = labels[permutation]
    return shuffled_features, shuffled_labels

iteration = 100
warnings.filterwarnings('ignore')

### Logistic Regression

In [45]:
yTest, yPredict = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    X, Y = randomize(features, labels)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, Y_train)
    Y_test_pred = clf.predict(X_test)
    yTest.extend(Y_test)
    yPredict.extend(Y_test_pred)
    trainingScore += clf.score(X_train, Y_train)
    testingScore += clf.score(X_test, Y_test)
print("Results of Logistic Regression")
print("Accuracy on training set is : {}".format(trainingScore/iteration))
print("Accuracy on test set is : {}".format(testingScore/iteration))
print(classification_report(yTest, yPredict, digits=3))

Results of Logistic Regression
Accuracy on training set is : 0.9522608695652167
Accuracy on test set is : 0.9484534534534537
              precision    recall  f1-score   support

         0.0      0.958     0.986     0.972    115387
         1.0      0.871     0.740     0.800     16999
         2.0      0.500     0.012     0.024       814

    accuracy                          0.948    133200
   macro avg      0.776     0.579     0.599    133200
weighted avg      0.944     0.948     0.944    133200



### Support Vector Machine

In [46]:
yTest, yPredict = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    X, Y = randomize(features, labels)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = svm.SVC()
    clf.fit(X_train, Y_train)
    Y_test_pred = clf.predict(X_test)
    yTest.extend(Y_test)
    yPredict.extend(Y_test_pred)
    trainingScore += clf.score(X_train, Y_train)
    testingScore += clf.score(X_test, Y_test)
print("Results of Support Vector Machine")
print("Accuracy on training set is : {}".format(trainingScore/iteration))
print("Accuracy on test set is : {}".format(testingScore/iteration))
print(classification_report(yTest, yPredict, digits=3))

Results of Support Vector Machine
Accuracy on training set is : 0.92997423510467
Accuracy on test set is : 0.9290765765765768
              precision    recall  f1-score   support

         0.0      0.926     0.998     0.961    115495
         1.0      0.975     0.499     0.660     16908
         2.0      0.000     0.000     0.000       797

    accuracy                          0.929    133200
   macro avg      0.634     0.499     0.540    133200
weighted avg      0.927     0.929     0.917    133200



### Random Forest

In [47]:
yTest, yPredict = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    X, Y = randomize(features, labels)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, Y_train)
    Y_test_pred = clf.predict(X_test)
    yTest.extend(Y_test)
    yPredict.extend(Y_test_pred)
    trainingScore += clf.score(X_train, Y_train)
    testingScore += clf.score(X_test, Y_test)
print("Results of RandomForest")
print("Accuracy on training set is : {}".format(trainingScore/iteration))
print("Accuracy on test set is : {}".format(testingScore/iteration))
print(classification_report(yTest, yPredict, digits=3))

Results of RandomForest
Accuracy on training set is : 0.999987117552335
Accuracy on test set is : 0.9491516516516519
              precision    recall  f1-score   support

         0.0      0.959     0.984     0.972    115593
         1.0      0.866     0.747     0.802     16854
         2.0      1.000     0.058     0.110       753

    accuracy                          0.949    133200
   macro avg      0.942     0.597     0.628    133200
weighted avg      0.948     0.949     0.945    133200



## 跨数据集

### LR

In [48]:
iteration = 100
yTestPoor, yPredictPoor = [], []
yTestBad, yPredictBad = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    permutation = np.random.permutation(features.shape[0])
    X = features[permutation, :]
    Y = dataLabel[permutation, :]

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, Y_train[:, 0])
    # 分割测试集为分别来自bad和poor的测试集
    X_test_poor, Y_test_poor, X_test_bad, Y_test_bad = [], [], [], []
    for idx in range(X_test.shape[0]):
        if Y_test[idx][2] == 1:
            X_test_poor.append(X_test[idx, :])
            Y_test_poor.append(Y_test[idx, :])
        else:
            X_test_bad.append(X_test[idx, :])
            Y_test_bad.append(Y_test[idx, :])
    # poor
    X_test_poor = np.array(X_test_poor)
    Y_test_poor = np.array(Y_test_poor)
    Y_test_poor_pred = clf.predict(X_test_poor)
    yTestPoor.extend(Y_test_poor[:, 0])
    yPredictPoor.extend(Y_test_poor_pred)
    # bad
    X_test_bad = np.array(X_test_bad)
    Y_test_bad = np.array(Y_test_bad)
    Y_test_bad_pred = clf.predict(X_test_bad)
    yTestBad.extend(Y_test_bad[:, 0])
    yPredictBad.extend(Y_test_bad_pred)

print("Results of Logistic Reg")
print("Test Results of Poor Road")
print(classification_report(yTestPoor, yPredictPoor, digits=3))
print("Test Results of Bad Road")
print(classification_report(yTestBad, yPredictBad, digits=3))

Results of Logistic Reg
Test Results of Poor Road
              precision    recall  f1-score   support

         0.0      0.929     0.998     0.962     29138
         1.0      0.795     0.324     0.460      2540
         2.0      0.500     0.013     0.026       670

    accuracy                          0.924     32348
   macro avg      0.741     0.445     0.483     32348
weighted avg      0.910     0.924     0.903     32348

Test Results of Bad Road
              precision    recall  f1-score   support

         0.0      0.971     0.982     0.976     86468
         1.0      0.879     0.821     0.849     14326
         2.0      0.000     0.000     0.000        58

    accuracy                          0.958    100852
   macro avg      0.617     0.601     0.608    100852
weighted avg      0.957     0.958     0.958    100852



### SVM

In [49]:
iteration = 100
yTestPoor, yPredictPoor = [], []
yTestBad, yPredictBad = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    permutation = np.random.permutation(features.shape[0])
    X = features[permutation, :]
    Y = dataLabel[permutation, :]

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = svm.SVC()
    clf.fit(X_train, Y_train[:, 0])
    # 分割测试集为分别来自bad和poor的测试集
    X_test_poor, Y_test_poor, X_test_bad, Y_test_bad = [], [], [], []
    for idx in range(X_test.shape[0]):
        if Y_test[idx][2] == 1:
            X_test_poor.append(X_test[idx, :])
            Y_test_poor.append(Y_test[idx, :])
        else:
            X_test_bad.append(X_test[idx, :])
            Y_test_bad.append(Y_test[idx, :])
    # poor
    X_test_poor = np.array(X_test_poor)
    Y_test_poor = np.array(Y_test_poor)
    Y_test_poor_pred = clf.predict(X_test_poor)
    yTestPoor.extend(Y_test_poor[:, 0])
    yPredictPoor.extend(Y_test_poor_pred)
    # bad
    X_test_bad = np.array(X_test_bad)
    Y_test_bad = np.array(Y_test_bad)
    Y_test_bad_pred = clf.predict(X_test_bad)
    yTestBad.extend(Y_test_bad[:, 0])
    yPredictBad.extend(Y_test_bad_pred)

print("Results of Support Vector Machine")
print("Test Results of Poor Road")
print(classification_report(yTestPoor, yPredictPoor, digits=3))
print("Test Results of Bad Road")
print(classification_report(yTestBad, yPredictBad, digits=3))

Results of Support Vector Machine
Test Results of Poor Road
              precision    recall  f1-score   support

         0.0      0.912     0.999     0.953     29242
         1.0      0.822     0.114     0.200      2443
         2.0      0.000     0.000     0.000       704

    accuracy                          0.911     32389
   macro avg      0.578     0.371     0.384     32389
weighted avg      0.885     0.911     0.876     32389

Test Results of Bad Road
              precision    recall  f1-score   support

         0.0      0.933     0.998     0.964     86448
         1.0      0.982     0.568     0.719     14306
         2.0      0.000     0.000     0.000        57

    accuracy                          0.937    100811
   macro avg      0.638     0.522     0.561    100811
weighted avg      0.939     0.937     0.929    100811



### RF

In [50]:
iteration = 100
yTestPoor, yPredictPoor = [], []
yTestBad, yPredictBad = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    permutation = np.random.permutation(features.shape[0])
    X = features[permutation, :]
    Y = dataLabel[permutation, :]

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf =RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, Y_train[:, 0])
    # 分割测试集为分别来自bad和poor的测试集
    X_test_poor, Y_test_poor, X_test_bad, Y_test_bad = [], [], [], []
    for idx in range(X_test.shape[0]):
        if Y_test[idx][2] == 1:
            X_test_poor.append(X_test[idx, :])
            Y_test_poor.append(Y_test[idx, :])
        else:
            X_test_bad.append(X_test[idx, :])
            Y_test_bad.append(Y_test[idx, :])
    # poor
    X_test_poor = np.array(X_test_poor)
    Y_test_poor = np.array(Y_test_poor)
    Y_test_poor_pred = clf.predict(X_test_poor)
    yTestPoor.extend(Y_test_poor[:, 0])
    yPredictPoor.extend(Y_test_poor_pred)
    # bad
    X_test_bad = np.array(X_test_bad)
    Y_test_bad = np.array(Y_test_bad)
    Y_test_bad_pred = clf.predict(X_test_bad)
    yTestBad.extend(Y_test_bad[:, 0])
    yPredictBad.extend(Y_test_bad_pred)

print("Results of RandomForest")
print("Test Results of Poor Road")
print(classification_report(yTestPoor, yPredictPoor, digits=3))
print("Test Results of Bad Road")
print(classification_report(yTestBad, yPredictBad, digits=3))

Results of RandomForest
Test Results of Poor Road
              precision    recall  f1-score   support

         0.0      0.925     0.998     0.960     28988
         1.0      0.806     0.293     0.429      2505
         2.0      1.000     0.083     0.153       722

    accuracy                          0.922     32215
   macro avg      0.911     0.458     0.514     32215
weighted avg      0.918     0.922     0.901     32215

Test Results of Bad Road
              precision    recall  f1-score   support

         0.0      0.971     0.981     0.976     86624
         1.0      0.876     0.828     0.851     14305
         2.0      0.000     0.000     0.000        56

    accuracy                          0.959    100985
   macro avg      0.616     0.603     0.609    100985
weighted avg      0.957     0.959     0.958    100985

