填充nans、infs和naive_outlier

In [1]:
import pandas as pd
import numpy as np
import joblib
from tsfresh.utilities.dataframe_functions import impute
import os
from joblib import Parallel, delayed

In [2]:
# def load_sensor(data_no, idx):
#     sensor = joblib.load('./sensors_id_sort/0%d/%d.lz4'%(data_no, idx))
#     return sensor

def gen_boxplot_bound(ser):
    '''
    Description:
        Compute wide box-plot style outlier bounds for a series and return
        them together with the forward-filled series.

        The 0.1th and 99.9th percentiles are used in place of the usual
        quartiles, and the bounds are placed 10 times their spread beyond
        them, so only extreme values fall outside.
    Parameters:
        ser: pandas object holding numeric values (used as a Series by the
             caller in this file).
    Returns:
        (up_bound, down_bound, ser), where ser is the forward-filled input.
        Both bounds are NaN when the input has no non-NaN values.
    '''
    # ffill() is the supported spelling of fillna(method='pad'); the
    # `method` argument of fillna is deprecated in recent pandas.
    ser = ser.ffill()
    tmp = ser.values.ravel()
    # Forward fill cannot fill NaNs that precede the first valid value.
    # NaN-aware percentiles keep such leftovers from turning both bounds
    # into NaN (which would silently disable outlier clipping).
    q1 = np.nanpercentile(tmp, 0.1)
    q3 = np.nanpercentile(tmp, 99.9)
    iqr = q3 - q1
    up_bound = q3 + 10 * iqr
    down_bound = q1 - 10 * iqr
    return up_bound, down_bound, ser
    
def naive_outliers_impute(df):
    '''
    Description:
        Clamp extreme values in every column of df to the upper and lower
        bounds reported by gen_boxplot_bound. df is modified in place and
        also returned.
    '''
    for name in df.columns:
        upper, lower, filled = gen_boxplot_bound(df[name])

        # Both masks come from the filled series returned by
        # gen_boxplot_bound, not from the raw column.
        too_low = filled < lower
        too_high = filled > upper

        df.loc[too_low, name] = lower
        df.loc[too_high, name] = upper

    return df

In [3]:
def clean_sensor(df):
    '''
    Description:
        Clean one sensor DataFrame in two steps:
        1. impute (tsfresh): NaNs -> median, infs -> min/max
        2. naive_outliers_impute: naive outliers -> up_bound / down_bound
    '''
    return naive_outliers_impute(impute(df))

def clean_sensor_parallel(train_no, csv_nos, opt_func, n_jobs=48):
    '''
    Description:
        Clean the sensor files of one PLC in parallel.

        Files 1.lz4 .. <csv_nos>.lz4 are loaded from
        ./sensors_id_sort/0<train_no>/, passed through opt_func, and the
        result is dumped (lz4-compressed) under the same file name in
        ./sensors_clean/0<train_no>/.
    Parameters:
        train_no: number of the PLC to process
        csv_nos: number of sensor files belonging to that PLC
        opt_func: callable applied to each loaded sensor object; its return
                  value is what gets written out
        n_jobs: number of joblib workers (default 48, the value that was
                previously hard-coded)
    '''

    input_dir = './sensors_id_sort/0%d/'%train_no
    output_dir = './sensors_clean/0%d/'%train_no

    # Creates ./sensors_clean and the per-PLC directory in one call;
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(output_dir, exist_ok=True)

    def basis_func(idx):
        # Load one sensor file, clean it, and write the result.
        sensor = joblib.load(input_dir + '%d.lz4'%idx)
        tmp = opt_func(sensor)
        joblib.dump(tmp, output_dir+'%d.lz4'%idx, compress='lz4')

    # Sensor files are numbered from 1, hence range(1, csv_nos+1).
    Parallel(n_jobs=n_jobs,verbose=1)(delayed(basis_func)(i) for i in range(1,csv_nos+1))

In [4]:
# Clean the 10 sensor files of each of PLCs 1 through 5, one PLC at a time.
for plc_no in range(1, 6):
    clean_sensor_parallel(plc_no, 10, clean_sensor)

[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done   3 out of  10 | elapsed:   21.0s remaining:   48.9s
[Parallel(n_jobs=48)]: Done  10 out of  10 | elapsed:   26.1s finished
[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done   3 out of  10 | elapsed:   24.7s remaining:   57.7s
[Parallel(n_jobs=48)]: Done  10 out of  10 | elapsed:   27.8s finished
[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done   3 out of  10 | elapsed:   22.5s remaining:   52.5s
[Parallel(n_jobs=48)]: Done  10 out of  10 | elapsed:   26.5s finished
[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done   3 out of  10 | elapsed:   22.6s remaining:   52.7s
[Parallel(n_jobs=48)]: Done  10 out of  10 | elapsed:   28.4s finished
[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.
[Parall