In [38]:
from siml.sk_utils import *
from siml.signal_analysis_utils import *
import numpy as np
import matplotlib.pyplot as plt
import warnings
import scipy.stats
import pandas as pd
from collections import defaultdict, Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import IsolationForest
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

from sklearn.metrics import classification_report
from detecta import detect_peaks
from scipy.fftpack import fft
from scipy.signal import welch
from sklearn.model_selection import train_test_split

## 加载数据文件

In [39]:
typeDescription = {
    0: 'normal',
    1: 'pothole',
    2: 'transverse',
}


def readFile(filename):
    return np.loadtxt(filename)

In [40]:
folderPath = '../../data/Final_Version/poor/datasets/'
dataFile = ['dataX.txt', 'dataY.txt', 'dataZ.txt']
labelFile = ['dataLabel.txt']

signals = []
for file in dataFile:
    dataPath = folderPath+file
    signals.append(np.loadtxt(dataPath))
signals = np.transpose(np.array(signals), (1, 2, 0))
print('数据集：', signals.shape)

labelFilePath = folderPath+labelFile[0]
dataLabel = np.loadtxt(labelFilePath)
anomalyType = list(dataLabel[:, 0])

dic = {}
temp = Counter(anomalyType)
for key in temp.keys():
    dic[typeDescription[key]] = temp[key]
print(dic)

数据集： (899, 64, 3)
{'normal': 817, 'pothole': 69, 'transverse': 13}


## 特征提取

In [41]:
N = 64 #样本数
f_s = 50 #采样频率
denominator = 10

In [42]:
#FFT
def get_fft_values(y_values, N, f_s):
    f_values = np.linspace(0.0, f_s/2.0, N//2)
    fft_values_ = fft(y_values)
    fft_values = 2.0/N * np.abs(fft_values_[0:N//2])
    return fft_values

#PSD
def get_psd_values(y_values, N, f_s):
    f_values, psd_values = welch(y_values, fs=f_s)
    return  psd_values

#Autocorrelation
def autocorr(x):
    result = np.correlate(x, x, mode='full')
    return result[len(result)//2:]
 
def get_autocorr_values(y_values, N, f_s):
    autocorr_values = autocorr(y_values)
    x_values = np.array([ 1.0*jj/f_s for jj in range(0, N)])
    return autocorr_values


In [43]:
def calculate_entropy(list_values):
    counter_values = Counter(list_values).most_common()
    # print(counter_values)
    probabilities = [elem[1]/len(list_values) for elem in counter_values]
    # print(probabilities)
    entropy = scipy.stats.entropy(probabilities)
    return entropy


def calculate_statistics(list_values):
    n5 = np.nanpercentile(list_values, 5)
    n25 = np.nanpercentile(list_values, 25)
    n75 = np.nanpercentile(list_values, 75)
    n95 = np.nanpercentile(list_values, 95)
    median = np.nanpercentile(list_values, 50)
    mean = np.nanmean(list_values)
    std = np.nanstd(list_values)
    var = np.nanvar(list_values)
    rms = np.nanmean(np.sqrt(list_values**2))
    return [n5, n25, n75, n95, median, mean, std, var, rms]


def calculate_crossings(list_values):
    zero_crossing_indices = np.nonzero(np.diff(np.array(list_values) > 0))[0]
    no_zero_crossings = len(zero_crossing_indices)
    mean_crossing_indices = np.nonzero(
        np.diff(np.array(list_values) > np.nanmean(list_values)))[0]
    no_mean_crossings = len(mean_crossing_indices)
    return [no_zero_crossings, no_mean_crossings]


def get_features(list_values):
    entropy = calculate_entropy(list_values)
    crossings = calculate_crossings(list_values)
    statistics = calculate_statistics(list_values)
    return [entropy] + crossings + statistics


def extract_features(dataset, N, f_s):
    uci_har_features = []
    for signal_no in range(0, len(dataset)):
        features = []
        for signal_comp in range(0, dataset.shape[2]):
            signal = dataset[signal_no, :, signal_comp]
            features += get_features(get_fft_values(signal, N, f_s))
            features += get_features(get_psd_values(signal, N, f_s))
            features += get_features(get_autocorr_values(signal, N, f_s))
        uci_har_features.append(features)
    X = np.array(uci_har_features)
    return X

In [44]:
features = extract_features(
    signals, N, f_s)

In [45]:
print(features.shape)
labels = np.array(anomalyType)

(899, 108)


In [46]:
StandardScaler().fit_transform(features)
Normalizer().fit_transform(features)

array([[0.0358157 , 0.        , 0.12401073, ..., 0.08874499, 0.76209604,
        0.05263877],
       [0.03766885, 0.        , 0.10868932, ..., 0.0828915 , 0.63216888,
        0.04230999],
       [0.04766855, 0.        , 0.11003388, ..., 0.07136573, 0.37029087,
        0.03298544],
       ...,
       [0.0196663 , 0.        , 0.04539596, ..., 0.07192155, 0.91157172,
        0.03447181],
       [0.04231993, 0.        , 0.1709533 , ..., 0.08888996, 0.64707703,
        0.06150929],
       [0.04368873, 0.        , 0.12605902, ..., 0.08366314, 0.55525747,
        0.04820655]])

## 信号分类

In [47]:
def randomize(features, labels):
    permutation = np.random.permutation(labels.shape[0])
    shuffled_features = features[permutation, :]
    shuffled_labels = labels[permutation]
    return shuffled_features, shuffled_labels

iteration = 100
warnings.filterwarnings('ignore')

### Logistic Regression

In [48]:
yTest, yPredict = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    X, Y = randomize(features,labels)
    X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, Y_train)
    Y_test_pred = clf.predict(X_test)
    yTest.extend(Y_test)
    yPredict.extend(Y_test_pred)
    trainingScore += clf.score(X_train, Y_train)
    testingScore += clf.score(X_test, Y_test)
print("Results of Logistic Regression")
print("Accuracy on training set is : {}".format(trainingScore/iteration))
print("Accuracy on test set is : {}".format(testingScore/iteration))
print(classification_report(yTest, yPredict,digits=3))

Results of Logistic Regression
Accuracy on training set is : 0.943370429252782
Accuracy on test set is : 0.9267777777777779
              precision    recall  f1-score   support

         0.0      0.950     0.982     0.966     24524
         1.0      0.611     0.435     0.509      2099
         2.0      0.176     0.069     0.099       377

    accuracy                          0.927     27000
   macro avg      0.579     0.495     0.524     27000
weighted avg      0.913     0.927     0.918     27000



### Support Vector Machine

In [49]:
yTest, yPredict = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    X, Y = randomize(features,labels)
    X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=0)
    clf = svm.SVC()
    clf.fit(X_train, Y_train)
    Y_test_pred = clf.predict(X_test)
    yTest.extend(Y_test)
    yPredict.extend(Y_test_pred)
    trainingScore += clf.score(X_train, Y_train)
    testingScore += clf.score(X_test, Y_test)
print("Results of Support Vector Machine")  
print("Accuracy on training set is : {}".format(trainingScore/iteration))
print("Accuracy on test set is : {}".format(testingScore/iteration))
print(classification_report(yTest, yPredict,digits=3))

Results of Support Vector Machine
Accuracy on training set is : 0.9315739268680442
Accuracy on test set is : 0.9307407407407411
              precision    recall  f1-score   support

         0.0      0.937     0.997     0.966     24557
         1.0      0.753     0.320     0.449      2053
         2.0      0.000     0.000     0.000       390

    accuracy                          0.931     27000
   macro avg      0.563     0.439     0.472     27000
weighted avg      0.909     0.931     0.912     27000



### Random Forest

In [50]:
yTest, yPredict = [], []
trainingScore, testingScore = 0, 0
for i in range(iteration):
    X, Y = randomize(features,labels)
    X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=0)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, Y_train)
    Y_test_pred = clf.predict(X_test)
    yTest.extend(Y_test)
    yPredict.extend(Y_test_pred)
    trainingScore += clf.score(X_train, Y_train)
    testingScore += clf.score(X_test, Y_test)
print("Results of RandomForest")
print("Accuracy on training set is : {}".format(trainingScore/iteration))
print("Accuracy on test set is : {}".format(testingScore/iteration))
print(classification_report(yTest, yPredict,digits=3))

Results of RandomForest
Accuracy on training set is : 0.9999523052464228
Accuracy on test set is : 0.9363703703703709
              precision    recall  f1-score   support

         0.0      0.952     0.988     0.970     24553
         1.0      0.673     0.505     0.577      2023
         2.0      0.000     0.000     0.000       424

    accuracy                          0.936     27000
   macro avg      0.542     0.498     0.516     27000
weighted avg      0.917     0.936     0.925     27000

