In [65]:
from siml.sk_utils import *
from siml.signal_analysis_utils import *
import numpy as np
import pywt
import scipy.stats
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn import svm

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split    

## 加载数据文件

In [66]:
typeDescription = {
    0: 'normal',
    1: 'pothole',
    2: 'transverse',
}


def readFile(filename):
    return np.loadtxt(filename)

In [67]:
folderPath = '../../data/Final_Version/all/datasets/'
dataFile = ['dataX.txt', 'dataY.txt', 'dataZ.txt']
labelFile = ['dataLabel.txt']

signals = []
for file in dataFile:
    dataPath = folderPath+file
    signals.append(np.loadtxt(dataPath))
signals = np.transpose(np.array(signals), (1, 2, 0))
print('数据集：', signals.shape)

labelFilePath = folderPath+labelFile[0]
dataLabel = np.loadtxt(labelFilePath)
anomalyType = list(dataLabel[:, 0])

dic = {}
temp = Counter(anomalyType)
for key in temp.keys():
    dic[typeDescription[key]] = temp[key]
print(dic)

数据集： (4088, 64, 3)
{'normal': 3601, 'pothole': 474, 'transverse': 13}


## 特征提取

### 时域特征提取

In [68]:
def calculate_entropy(list_values):
    counter_values = Counter(list_values).most_common()
    # print(counter_values)
    probabilities = [elem[1]/len(list_values) for elem in counter_values]
    # print(probabilities)
    entropy = scipy.stats.entropy(probabilities)
    return entropy


def calculate_statistics(list_values):
    n5 = np.nanpercentile(list_values, 5)
    n25 = np.nanpercentile(list_values, 25)
    n75 = np.nanpercentile(list_values, 75)
    n95 = np.nanpercentile(list_values, 95)
    median = np.nanpercentile(list_values, 50)
    mean = np.nanmean(list_values)
    std = np.nanstd(list_values)
    var = np.nanvar(list_values)
    rms = np.nanmean(np.sqrt(list_values**2))
    return [n5, n25, n75, n95, median, mean, std, var, rms]


def calculate_crossings(list_values):
    zero_crossing_indices = np.nonzero(np.diff(np.array(list_values) > 0))[0]
    no_zero_crossings = len(zero_crossing_indices)
    mean_crossing_indices = np.nonzero(
        np.diff(np.array(list_values) > np.nanmean(list_values)))[0]
    no_mean_crossings = len(mean_crossing_indices)
    return [no_zero_crossings, no_mean_crossings]


def get_features1(list_values):
    entropy = calculate_entropy(list_values)
    crossings = calculate_crossings(list_values)
    statistics = calculate_statistics(list_values)
    return [entropy] + crossings + statistics


def extract_features1(dataset):
    uci_har_features = []
    for signal_no in range(0, len(dataset)):
        features = []
        for signal_comp in range(0, dataset.shape[2]):
            signal = dataset[signal_no, :, signal_comp]
            features += get_features1(signal)
        uci_har_features.append(features)
    X = np.array(uci_har_features)
    return X

In [69]:
features1 = extract_features1(signals)

### 频域特征提取

In [70]:
#FFT
def get_fft_values(y_values, N, f_s):
    f_values = np.linspace(0.0, f_s/2.0, N//2)
    fft_values_ = fft(y_values)
    fft_values = 2.0/N * np.abs(fft_values_[0:N//2])
    return f_values, fft_values

#PSD
def get_psd_values(y_values, N, f_s):
    f_values, psd_values = welch(y_values, fs=f_s)
    return f_values, psd_values

#Autocorrelation
def autocorr(x):
    result = np.correlate(x, x, mode='full')
    return result[len(result)//2:]
 
def get_autocorr_values(y_values, N, f_s):
    autocorr_values = autocorr(y_values)
    x_values = np.array([ 1.0*jj/f_s for jj in range(0, N)])
    return x_values, autocorr_values

In [71]:
def get_first_n_peaks(x, y, no_peaks=5):
    x_, y_ = list(x), list(y)
    if len(x_) >= no_peaks:
        ans = np.argsort(y)[-5:]
        ans = ans[np.argsort(ans)]
        return list(x[ans]), list(y[ans])
        # return x_[:no_peaks], y_[:no_peaks]
    else:  # 少于5个peaks，以0填充
        missing_no_peaks = no_peaks-len(x_)
        return x_ + [0]*missing_no_peaks, y_ + [0]*missing_no_peaks


def get_features2(x_values, y_values, mph):
    indices_peaks = detect_peaks(y_values, mph=mph)
    peaks_x, peaks_y = get_first_n_peaks(
        x_values[indices_peaks], y_values[indices_peaks])
    return peaks_x + peaks_y


def extract_features2(dataset, N, f_s, denominator):
    percentile = 5
    list_of_features = []

    for signal_no in range(0, len(dataset)):
        features = []  # 5*2*3*3

        for signal_comp in range(0, dataset.shape[2]):
            signal = dataset[signal_no, :, signal_comp]

            signal_min = np.nanpercentile(signal, percentile)
            signal_max = np.nanpercentile(signal, 100-percentile)
            #ijk = (100 - 2*percentile)/10
            # set minimum peak height
            #mph = signal_min + (signal_max - signal_min)/denominator
            mph = signal_min

            features += get_features2(*get_psd_values(signal, N, f_s), mph)
            features += get_features2(*get_fft_values(signal, N, f_s), mph)
            features += get_features2(*
                                     get_autocorr_values(signal, N, f_s), mph)
        list_of_features.append(features)
    return np.array(list_of_features)

In [72]:
N = 64 #样本数
f_s = 50 #采样频率
denominator = 10
features2 = extract_features2(
    signals, N, f_s, denominator)

In [73]:
print(features1.shape)
print(features2.shape)
features = np.hstack((features1,features2))
print(features.shape)
labels = np.array(anomalyType)

(4088, 36)
(4088, 90)
(4088, 126)


In [74]:
StandardScaler().fit_transform(features)
Normalizer().fit_transform(features)

array([[0.04683592, 0.25036444, 0.25036444, ..., 0.11743982, 0.07147435,
        0.08730024],
       [0.04524433, 0.18591063, 0.18591063, ..., 0.07092869, 0.07071514,
        0.05796451],
       [0.04647946, 0.21345474, 0.21345474, ..., 0.06976539, 0.03048049,
        0.02446915],
       ...,
       [0.03630003, 0.09702208, 0.09702208, ..., 0.51388521, 0.12363135,
        0.11689472],
       [0.0431912 , 0.17840832, 0.17840832, ..., 0.13557564, 0.07576014,
        0.0422648 ],
       [0.05485486, 0.17418917, 0.17418917, ..., 0.07023594, 0.01174558,
        0.00096844]])

## 信号分类

In [75]:
def randomize(features, labels):
    permutation = np.random.permutation(labels.shape[0])
    shuffled_features = features[permutation, :]
    shuffled_labels = labels[permutation]
    return shuffled_features, shuffled_labels

iteration = 100
warnings.filterwarnings('ignore')

### Logistic Regression

In [76]:
yTest, yPredict = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    X, Y = randomize(features, labels)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, Y_train)
    Y_test_pred = clf.predict(X_test)
    yTest.extend(Y_test)
    yPredict.extend(Y_test_pred)
    trainingScore += clf.score(X_train, Y_train)
    testingScore += clf.score(X_test, Y_test)
print("Results of Logistic Regression")
print("Accuracy on training set is : {}".format(trainingScore/iteration))
print("Accuracy on test set is : {}".format(testingScore/iteration))
print(classification_report(yTest, yPredict, digits=3))

Results of Logistic Regression
Accuracy on training set is : 0.9609821740650125
Accuracy on test set is : 0.9516136919315406
              precision    recall  f1-score   support

         0.0      0.965     0.984     0.974    108057
         1.0      0.849     0.732     0.787     14261
         2.0      0.120     0.068     0.087       382

    accuracy                          0.952    122700
   macro avg      0.645     0.595     0.616    122700
weighted avg      0.949     0.952     0.950    122700



### Support Vector Machine

In [77]:
yTest, yPredict = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    X, Y = randomize(features, labels)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = svm.SVC()
    clf.fit(X_train, Y_train)
    Y_test_pred = clf.predict(X_test)
    yTest.extend(Y_test)
    yPredict.extend(Y_test_pred)
    trainingScore += clf.score(X_train, Y_train)
    testingScore += clf.score(X_test, Y_test)
print("Results of Support Vector Machine")
print("Accuracy on training set is : {}".format(trainingScore/iteration))
print("Accuracy on test set is : {}".format(testingScore/iteration))
print(classification_report(yTest, yPredict, digits=3))

Results of Support Vector Machine
Accuracy on training set is : 0.951237329605033
Accuracy on test set is : 0.9480603096984512
              precision    recall  f1-score   support

         0.0      0.952     0.992     0.971    108056
         1.0      0.906     0.641     0.751     14274
         2.0      0.000     0.000     0.000       370

    accuracy                          0.948    122700
   macro avg      0.619     0.544     0.574    122700
weighted avg      0.944     0.948     0.943    122700



### Random Forest

In [87]:
yTest, yPredict = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    X, Y = randomize(features, labels)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, Y_train)
    Y_test_pred = clf.predict(X_test)
    yTest.extend(Y_test)
    yPredict.extend(Y_test_pred)
    trainingScore += clf.score(X_train, Y_train)
    testingScore += clf.score(X_test, Y_test)
print("Results of RandomForest")
print("Accuracy on training set is : {}".format(trainingScore/iteration))
print("Accuracy on test set is : {}".format(testingScore/iteration))
print(classification_report(yTest, yPredict, digits=3))

Results of RandomForest
Accuracy on training set is : 0.999982523593149
Accuracy on test set is : 0.9560146699266506
              precision    recall  f1-score   support

         0.0      0.964     0.988     0.976    108002
         1.0      0.886     0.741     0.807     14288
         2.0      0.000     0.000     0.000       410

    accuracy                          0.956    122700
   macro avg      0.616     0.576     0.594    122700
weighted avg      0.951     0.956     0.953    122700



## 跨数据集

### LR

In [79]:
iteration = 100
yTestPoor, yPredictPoor = [], []
yTestBad, yPredictBad = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    permutation = np.random.permutation(features.shape[0])
    X = features[permutation, :]
    Y = dataLabel[permutation, :]

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, Y_train[:, 0])
    # 分割测试集为分别来自bad和poor的测试集
    X_test_poor, Y_test_poor, X_test_bad, Y_test_bad = [], [], [], []
    for idx in range(X_test.shape[0]):
        if Y_test[idx][2] == 1:
            X_test_poor.append(X_test[idx, :])
            Y_test_poor.append(Y_test[idx, :])
        else:
            X_test_bad.append(X_test[idx, :])
            Y_test_bad.append(Y_test[idx, :])
    # poor
    X_test_poor = np.array(X_test_poor)
    Y_test_poor = np.array(Y_test_poor)
    Y_test_poor_pred = clf.predict(X_test_poor)
    yTestPoor.extend(Y_test_poor[:, 0])
    yPredictPoor.extend(Y_test_poor_pred)
    # bad
    X_test_bad = np.array(X_test_bad)
    Y_test_bad = np.array(Y_test_bad)
    Y_test_bad_pred = clf.predict(X_test_bad)
    yTestBad.extend(Y_test_bad[:, 0])
    yPredictBad.extend(Y_test_bad_pred)

print("Results of Logistic Reg")
print("Test Results of Poor Road")
print(classification_report(yTestPoor, yPredictPoor, digits=3))
print("Test Results of Bad Road")
print(classification_report(yTestBad, yPredictBad, digits=3))

Results of Logistic Reg
Test Results of Poor Road
              precision    recall  f1-score   support

         0.0      0.942     0.995     0.968     24644
         1.0      0.762     0.360     0.489      2042
         2.0      0.174     0.041     0.067       389

    accuracy                          0.933     27075
   macro avg      0.626     0.465     0.508     27075
weighted avg      0.918     0.933     0.919     27075

Test Results of Bad Road
              precision    recall  f1-score   support

         0.0      0.973     0.980     0.977     83568
         1.0      0.856     0.807     0.831     12057
         2.0      0.000     0.000     0.000         0

    accuracy                          0.958     95625
   macro avg      0.610     0.596     0.603     95625
weighted avg      0.959     0.958     0.958     95625



### SVM

In [80]:
iteration = 100
yTestPoor, yPredictPoor = [], []
yTestBad, yPredictBad = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    permutation = np.random.permutation(features.shape[0])
    X = features[permutation, :]
    Y = dataLabel[permutation, :]

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf = svm.SVC()
    clf.fit(X_train, Y_train[:, 0])
    # 分割测试集为分别来自bad和poor的测试集
    X_test_poor, Y_test_poor, X_test_bad, Y_test_bad = [], [], [], []
    for idx in range(X_test.shape[0]):
        if Y_test[idx][2] == 1:
            X_test_poor.append(X_test[idx, :])
            Y_test_poor.append(Y_test[idx, :])
        else:
            X_test_bad.append(X_test[idx, :])
            Y_test_bad.append(Y_test[idx, :])
    # poor
    X_test_poor = np.array(X_test_poor)
    Y_test_poor = np.array(Y_test_poor)
    Y_test_poor_pred = clf.predict(X_test_poor)
    yTestPoor.extend(Y_test_poor[:, 0])
    yPredictPoor.extend(Y_test_poor_pred)
    # bad
    X_test_bad = np.array(X_test_bad)
    Y_test_bad = np.array(Y_test_bad)
    Y_test_bad_pred = clf.predict(X_test_bad)
    yTestBad.extend(Y_test_bad[:, 0])
    yPredictBad.extend(Y_test_bad_pred)

print("Results of Support Vector Machine")
print("Test Results of Poor Road")
print(classification_report(yTestPoor, yPredictPoor, digits=3))
print("Test Results of Bad Road")
print(classification_report(yTestBad, yPredictBad, digits=3))

Results of Support Vector Machine
Test Results of Poor Road
              precision    recall  f1-score   support

         0.0      0.922     0.999     0.959     24257
         1.0      0.791     0.167     0.275      2088
         2.0      0.000     0.000     0.000       377

    accuracy                          0.920     26722
   macro avg      0.571     0.388     0.411     26722
weighted avg      0.899     0.920     0.892     26722

Test Results of Bad Road
              precision    recall  f1-score   support

         0.0      0.960     0.989     0.975     83701
         1.0      0.909     0.722     0.805     12277

    accuracy                          0.955     95978
   macro avg      0.934     0.856     0.890     95978
weighted avg      0.954     0.955     0.953     95978



### RF

In [81]:
iteration = 100
yTestPoor, yPredictPoor = [], []
yTestBad, yPredictBad = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    permutation = np.random.permutation(features.shape[0])
    X = features[permutation, :]
    Y = dataLabel[permutation, :]

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)
    clf =RandomForestClassifier(n_estimators=10)
    clf.fit(X_train, Y_train[:, 0])
    # 分割测试集为分别来自bad和poor的测试集
    X_test_poor, Y_test_poor, X_test_bad, Y_test_bad = [], [], [], []
    for idx in range(X_test.shape[0]):
        if Y_test[idx][2] == 1:
            X_test_poor.append(X_test[idx, :])
            Y_test_poor.append(Y_test[idx, :])
        else:
            X_test_bad.append(X_test[idx, :])
            Y_test_bad.append(Y_test[idx, :])
    # poor
    X_test_poor = np.array(X_test_poor)
    Y_test_poor = np.array(Y_test_poor)
    Y_test_poor_pred = clf.predict(X_test_poor)
    yTestPoor.extend(Y_test_poor[:, 0])
    yPredictPoor.extend(Y_test_poor_pred)
    # bad
    X_test_bad = np.array(X_test_bad)
    Y_test_bad = np.array(Y_test_bad)
    Y_test_bad_pred = clf.predict(X_test_bad)
    yTestBad.extend(Y_test_bad[:, 0])
    yPredictBad.extend(Y_test_bad_pred)

print("Results of RandomForest")
print("Test Results of Poor Road")
print(classification_report(yTestPoor, yPredictPoor, digits=3))
print("Test Results of Bad Road")
print(classification_report(yTestBad, yPredictBad, digits=3))

Results of RandomForest
Test Results of Poor Road
              precision    recall  f1-score   support

         0.0      0.928     0.998     0.962     24540
         1.0      0.788     0.240     0.368      2096
         2.0      0.333     0.003     0.005       383

    accuracy                          0.925     27019
   macro avg      0.683     0.413     0.445     27019
weighted avg      0.909     0.925     0.902     27019

Test Results of Bad Road
              precision    recall  f1-score   support

         0.0      0.971     0.984     0.977     83446
         1.0      0.881     0.797     0.837     12235
         2.0      0.000     0.000     0.000         0

    accuracy                          0.960     95681
   macro avg      0.617     0.594     0.605     95681
weighted avg      0.959     0.960     0.959     95681

