使用滑动标准差作为置信边带，找到并填充异常值为滑动均值

In [1]:
import joblib
import pandas as pd
import numpy as np
import os
import collections
from itertools import zip_longest
from joblib import Parallel, delayed
# import matplotlib.pyplot as plt
# %matplotlib inline

In [2]:
def moving_average(data, window_size, window_type='rectang'):
    '''
    Description:
        Compute a moving-average sequence of `data` using the selected
        window type.
    Parameters:
        data: one-dimensional numeric sequence to smooth.
        window_size: number of samples in the window. The 'rectang' branch
            uses int(window_size) as the window length but divides by the
            untruncated value.
        window_type: 'triang' or 'rectang'.
            'rectang' weights every sample in the window equally
            (1/window_size each); 'triang' uses linearly increasing weights
            that sum to 1.
    Returns:
        numpy array produced by np.convolve in 'same' mode.
    Raises:
        ValueError: if window_type is neither 'triang' nor 'rectang'.
    '''
    # Reject unknown window names up front.
    if window_type not in ('triang', 'rectang'):
        raise ValueError('window_type error!')

    if window_type == 'rectang':
        kernel = np.ones(int(window_size)) / float(window_size)
    else:
        # Drop the leading 0 of the ramp so exactly window_size weights remain.
        ramp = np.linspace(0, 2 / (window_size + 1), window_size + 1)
        kernel = ramp[1:]

    return np.convolve(data, kernel, mode='same')

def df_moving_average(df, window_size, window_type='rectang', columns=None):
    '''
    Description:
        Smooth the selected columns of `df` with a moving average. Each
        column is overwritten in place with its smoothed values.
    Parameters:
        df: DataFrame containing the columns to smooth.
        window_size: window length passed on to moving_average.
        window_type: window type passed on to moving_average
            ('triang' or 'rectang').
        columns: column names to smooth; when None, defaults to
            ['Current_1', 'Current_2', 'Current_3'].
    Returns:
        The same `df` object, with the selected columns replaced.
    '''
    if columns is None:
        target_cols = ['Current_1', 'Current_2', 'Current_3']
    else:
        target_cols = columns

    for name in target_cols:
        smoothed = moving_average(df[name], window_size, window_type)
        df[name] = smoothed

    return df

def impute_anomalies_rolling_std(y, window_size, sigma=1.0):
    '''
    Description:
        Detect anomalous points with a rolling-standard-deviation band and
        replace them with the moving average.

        The band is centred on the 'triang' moving average of `y`. Its
        half-width is `sigma` times the rolling standard deviation of the
        residual (`y` minus the 'rectang' moving average). Every point
        outside the band is replaced by the 'triang' moving average at the
        same position.
    Parameters:
        y: one-dimensional numeric data (pandas Series or array-like).
            It is not modified; a copy is imputed and returned.
        window_size: integer window length shared by both moving averages
            and the rolling standard deviation. `y` needs at least
            window_size samples, because the standard deviation at position
            window_size-1 is looked up (IndexError otherwise).
        sigma: band half-width, in multiples of the rolling standard
            deviation.
    Returns:
        pandas Series holding the imputed values. When `y` is a Series its
        index is preserved.
    '''
    # Work on a private copy so the caller's data is never mutated in place;
    # wrapping in pd.Series also lets plain arrays and lists be passed.
    series = pd.Series(y).copy()

    avg = moving_average(series, window_size, window_type='triang')
    residual = series - moving_average(series, window_size, 'rectang')

    # Spread of the residual over a trailing window (center=False).
    testing_std = residual.rolling(window=window_size, min_periods=1, center=False).std()
    # With min_periods=1 the leading entry is computed from a single sample,
    # so its sample standard deviation is NaN. Every NaN is filled with the
    # first value that was computed from a full window.
    rolling_std = testing_std.replace(np.nan, testing_std.iloc[window_size-1]).values

    up_bound = avg + (sigma * rolling_std)
    down_bound = avg - (sigma * rolling_std)

    # Positional boolean mask of the points lying outside the band.
    outliers = ((series > up_bound) | (series < down_bound)).values
    series[outliers] = avg[outliers]
    return series

def df_impute_anomalies_rolling_std(df, window_size, sigma, columns=None):
    '''
    Description:
        Apply impute_anomalies_rolling_std to the selected columns of `df`.
        Each column is overwritten in place with the imputed result.
    Parameters:
        df: DataFrame containing the columns to process.
        window_size: window length passed on to impute_anomalies_rolling_std.
        sigma: band width multiplier passed on to
            impute_anomalies_rolling_std.
        columns: column names to process; when None, defaults to
            ['Vibration_1', 'Vibration_2', 'Vibration_3'].
    Returns:
        The same `df` object, with the selected columns replaced.
    '''
    target_cols = ['Vibration_1', 'Vibration_2', 'Vibration_3'] if columns is None else columns

    for name in target_cols:
        imputed = impute_anomalies_rolling_std(df[name], window_size, sigma)
        df[name] = imputed

    return df

In [3]:
def optfunc_parallel(data_no, csv_nos, opt_func_list):
    '''
    Description:
        Process the sensor files of one PLC in parallel. Each file
        './sensors_clean/0<data_no>/<idx>.lz4' is loaded, passed through the
        two functions in `opt_func_list`, and the result is written to
        './sensors_ad/0<data_no>/<idx>.lz4' with lz4 compression.
    Parameters:
        data_no: number of the PLC; selects the input/output sub-directory.
        csv_nos: number of sensor files for that PLC; files 1..csv_nos
            (inclusive) are processed.
        opt_func_list: two callables.
            opt_func_list[0] is called with keyword arguments df,
            window_size, window_type and columns (smoothing of 'Current_1').
            opt_func_list[1] is called with keyword arguments df,
            window_size, sigma and columns (the three 'Vibration_*' columns).
            Both must return the processed frame.
    Returns:
        None. Results are written to disk.
    '''

    input_dir = './sensors_clean/0%d/'%data_no
    output_dir = './sensors_ad/0%d/'%data_no

    # Create './sensors_ad/' and the per-PLC sub-directory in one call.
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test followed by os.mkdir().
    os.makedirs(output_dir, exist_ok=True)

    def basis_func(idx):
        # NOTE: joblib.load unpickles the file; only load trusted inputs.
        sensor = joblib.load(input_dir + '%d.lz4'%idx)
        # NOTE(review): a disabled line here used to rename the columns to
        # ['Vibration_1', 'Vibration_2', 'Vibration_3', 'Current_1',
        # 'Current_2', 'Current_3', 'id', 'sort_col']; presumably the loaded
        # frame already carries these names -- confirm against the producer.
        df_1 = opt_func_list[0](df = sensor, window_size = 15, window_type='rectang', columns = ['Current_1'])
        df_2 = opt_func_list[1](df = df_1, window_size = 15, sigma=2.0, columns=['Vibration_1', 'Vibration_2', 'Vibration_3'])
        joblib.dump(df_2, output_dir+'%d.lz4'%idx, compress='lz4')

    # One task per file index; the worker count is fixed at 48.
    Parallel(n_jobs=48,verbose=10)(delayed(basis_func)(i) for i in range(1,csv_nos+1))

In [4]:
# Run the pipeline for PLC 1 (data_no=1), processing sensor files 1..48.
optfunc_parallel(1, 48, [df_moving_average, df_impute_anomalies_rolling_std])

[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done   3 out of  48 | elapsed:   38.3s remaining:  9.6min
[Parallel(n_jobs=48)]: Done   8 out of  48 | elapsed:   53.1s remaining:  4.4min
[Parallel(n_jobs=48)]: Done  13 out of  48 | elapsed:  1.4min remaining:  3.7min
[Parallel(n_jobs=48)]: Done  18 out of  48 | elapsed:  1.5min remaining:  2.5min
[Parallel(n_jobs=48)]: Done  23 out of  48 | elapsed:  1.5min remaining:  1.6min
[Parallel(n_jobs=48)]: Done  28 out of  48 | elapsed:  1.5min remaining:  1.1min
[Parallel(n_jobs=48)]: Done  33 out of  48 | elapsed:  1.5min remaining:   41.9s
[Parallel(n_jobs=48)]: Done  38 out of  48 | elapsed:  1.6min remaining:   24.5s
[Parallel(n_jobs=48)]: Done  43 out of  48 | elapsed:  1.6min remaining:   10.9s
[Parallel(n_jobs=48)]: Done  48 out of  48 | elapsed:  1.6min remaining:    0.0s
[Parallel(n_jobs=48)]: Done  48 out of  48 | elapsed:  1.6min finished


In [5]:
# Run the pipeline for PLC 2 (data_no=2), processing sensor files 1..48.
optfunc_parallel(2, 48, [df_moving_average, df_impute_anomalies_rolling_std])

[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done   3 out of  48 | elapsed:   15.2s remaining:  3.8min
[Parallel(n_jobs=48)]: Done   8 out of  48 | elapsed:   17.0s remaining:  1.4min
[Parallel(n_jobs=48)]: Done  13 out of  48 | elapsed:   17.9s remaining:   48.1s
[Parallel(n_jobs=48)]: Done  18 out of  48 | elapsed:   22.6s remaining:   37.6s
[Parallel(n_jobs=48)]: Done  23 out of  48 | elapsed:   26.4s remaining:   28.6s
[Parallel(n_jobs=48)]: Done  28 out of  48 | elapsed:   55.4s remaining:   39.6s
[Parallel(n_jobs=48)]: Done  33 out of  48 | elapsed:  1.1min remaining:   28.8s
[Parallel(n_jobs=48)]: Done  38 out of  48 | elapsed:  1.1min remaining:   17.2s
[Parallel(n_jobs=48)]: Done  43 out of  48 | elapsed:  1.1min remaining:    7.6s
[Parallel(n_jobs=48)]: Done  48 out of  48 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=48)]: Done  48 out of  48 | elapsed:  1.1min finished


In [6]:
# Run the pipeline for PLC 3 (data_no=3), processing sensor files 1..37.
optfunc_parallel(3, 37, [df_moving_average, df_impute_anomalies_rolling_std])

[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done   2 out of  37 | elapsed:   28.4s remaining:  8.3min
[Parallel(n_jobs=48)]: Done   6 out of  37 | elapsed:   57.5s remaining:  5.0min
[Parallel(n_jobs=48)]: Done  10 out of  37 | elapsed:  1.1min remaining:  3.0min
[Parallel(n_jobs=48)]: Done  14 out of  37 | elapsed:  1.2min remaining:  1.9min
[Parallel(n_jobs=48)]: Done  18 out of  37 | elapsed:  1.2min remaining:  1.3min
[Parallel(n_jobs=48)]: Done  22 out of  37 | elapsed:  1.2min remaining:   48.9s
[Parallel(n_jobs=48)]: Done  26 out of  37 | elapsed:  1.2min remaining:   30.6s
[Parallel(n_jobs=48)]: Done  30 out of  37 | elapsed:  1.2min remaining:   17.0s
[Parallel(n_jobs=48)]: Done  34 out of  37 | elapsed:  1.2min remaining:    6.5s
[Parallel(n_jobs=48)]: Done  37 out of  37 | elapsed:  1.2min finished
