只保留plc的['RULR', 'csv_no', 'CL', 'CLI', 'spindle_load']与sensor进行拼接

In [1]:
import pandas as pd
import numpy as np
import joblib
from joblib import Parallel,delayed
from sklearn.preprocessing import scale
import os
import matplotlib.pyplot as plt

In [2]:
def concat_sensors(train_no, csv_nos):
    '''
    Description:
        Load every per-csv sensor file of one training set and stack them
        row-wise into a single DataFrame.

    Args:
        train_no: identifier substituted into the input directory path
            './sensors_tsfresh_minimal/<train_no>/'.
        csv_nos: number of sensor files; files '1.lz4' .. '<csv_nos>.lz4'
            are loaded, in that order.

    Returns:
        The concatenation (axis=0) of all loaded objects with a fresh
        0..n-1 index.
    '''
    sensor_dir = './sensors_tsfresh_minimal/%s/' % train_no
    # File numbering is 1-based and inclusive of csv_nos.
    paths = [sensor_dir + '%d.lz4' % file_no for file_no in range(1, csv_nos + 1)]
    # Parallel returns results in submission order, so row order follows
    # the file numbering.
    loader = Parallel(n_jobs=24, verbose=10)
    frames = loader(delayed(joblib.load)(path) for path in paths)
    return pd.concat(frames, axis=0).reset_index(drop=True)

def concat_sensors_plc(train_no, csv_nos):
    '''
    Description:
        Join selected PLC columns with the concatenated sensor features,
        side by side.

    Args:
        train_no: identifier used for both the PLC file name
            ('./train_plc_RULR/train_<train_no>_plc.lz4') and the sensor
            directory.
        csv_nos: number of sensor files to load (forwarded to
            concat_sensors).

    Returns:
        A DataFrame whose first columns are the PLC columns
        'RULR', 'csv_no', 'CL', 'CLI', 'spindle_load', followed by all
        sensor columns.

    Note:
        pd.concat(axis=1) aligns rows on the index. The sensor frame has a
        0..n-1 index; NOTE(review): this presumably relies on the PLC frame
        having the same default index and row count - confirm against the
        code that writes the PLC file.
    '''
    plc_path = './train_plc_RULR/' + 'train_%s_plc.lz4' % train_no
    plc_frame = joblib.load(plc_path)
    sensor_frame = concat_sensors(train_no, csv_nos)

    # Only these PLC columns are carried over into the merged table.
    kept_plc = plc_frame[['RULR', 'csv_no', 'CL', 'CLI', 'spindle_load']]
    return pd.concat([kept_plc, sensor_frame], axis=1)
    

def filter_inf(df):
    '''
    Description:
        Drop every row (sample) that contains an infinite value, positive
        or negative, and report how many rows were dropped.

    Args:
        df: input DataFrame; it is not modified.

    Returns:
        A new DataFrame holding only the rows without +inf / -inf, with a
        fresh 0..n-1 index.
    '''
    values = df.values
    # Equality comparison (rather than np.isinf) also works on object-dtype
    # arrays produced by frames with mixed column types.
    has_inf = ((values == np.inf) | (values == -np.inf)).any(axis=1)
    # Non-inplace reset_index avoids mutating a slice of the caller's frame.
    kept = df[~has_inf].reset_index(drop=True)
    # Count affected rows, not individual inf cells, to match the message.
    print('有%d行inf值' % has_inf.sum())
    return kept

def filter_nan(df):
    '''
    Description:
        Drop every row (sample) that contains a NaN / missing value and
        report how many rows were dropped.

    Note:
        np.nan == np.nan evaluates to False, so missing values are located
        with the pandas null check instead of an equality comparison.

    Args:
        df: input DataFrame.

    Returns:
        A DataFrame holding only the rows without missing values, with a
        fresh 0..n-1 index.
    '''
    # True for each row that has at least one missing cell.
    row_has_nan = df.isnull().any(axis=1)
    cleaned = df[~row_has_nan].reset_index(drop=True)
    print('有%d行nan值' % row_has_nan.sum())
    return cleaned


def write_file(train_no, csv_nos):
    '''
    Description:
        Build the merged PLC + sensor table for one training set, drop rows
        containing inf or NaN values, and dump the result to
        './concats/concat_<train_no>.lz4' (lz4-compressed joblib file).

    Args:
        train_no: training-set identifier used in the input and output
            file names.
        csv_nos: number of sensor files belonging to this training set.

    Returns:
        None. The result is written to disk.
    '''
    output_dir = './concats/'
    # exist_ok=True creates the directory if needed without the
    # check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(output_dir, exist_ok=True)
    dev = concat_sensors_plc(train_no, csv_nos)
    dev = filter_inf(dev)
    dev = filter_nan(dev)
    joblib.dump(dev, output_dir + 'concat_%s.lz4' % train_no, compress='lz4')


In [3]:
# Build and save ./concats/concat_01.lz4 from training set '01' (sensor files 1..48).
write_file('01', 48)

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   6 out of  48 | elapsed:    1.2s remaining:    8.3s
[Parallel(n_jobs=24)]: Done  11 out of  48 | elapsed:    1.3s remaining:    4.2s
[Parallel(n_jobs=24)]: Done  16 out of  48 | elapsed:    1.3s remaining:    2.6s
[Parallel(n_jobs=24)]: Done  21 out of  48 | elapsed:    1.3s remaining:    1.7s
[Parallel(n_jobs=24)]: Done  26 out of  48 | elapsed:    1.3s remaining:    1.1s
[Parallel(n_jobs=24)]: Done  31 out of  48 | elapsed:    1.4s remaining:    0.8s
[Parallel(n_jobs=24)]: Done  36 out of  48 | elapsed:    1.4s remaining:    0.5s
[Parallel(n_jobs=24)]: Done  41 out of  48 | elapsed:    1.4s remaining:    0.2s
[Parallel(n_jobs=24)]: Done  46 out of  48 | elapsed:    1.5s remaining:    0.1s
[Parallel(n_jobs=24)]: Done  48 out of  48 | elapsed:    1.7s finished


有0行inf值
有0行nan值


In [4]:
# Build and save ./concats/concat_02.lz4 from training set '02' (sensor files 1..48).
write_file('02', 48)

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Batch computation too fast (0.0612s.) Setting batch_size=6.
[Parallel(n_jobs=24)]: Done   6 out of  48 | elapsed:    0.1s remaining:    1.0s
[Parallel(n_jobs=24)]: Done  11 out of  48 | elapsed:    0.1s remaining:    0.5s
[Parallel(n_jobs=24)]: Done  16 out of  48 | elapsed:    0.2s remaining:    0.3s
[Parallel(n_jobs=24)]: Done  21 out of  48 | elapsed:    0.2s remaining:    0.2s
[Parallel(n_jobs=24)]: Done  26 out of  48 | elapsed:    0.2s remaining:    0.2s
[Parallel(n_jobs=24)]: Done  31 out of  48 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=24)]: Done  36 out of  48 | elapsed:    0.3s remaining:    0.1s
[Parallel(n_jobs=24)]: Done  41 out of  48 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=24)]: Done  46 out of  48 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=24)]: Done  48 out of  48 | elapsed:    0.3s finished


有0行inf值
有0行nan值


In [5]:
# Build and save ./concats/concat_03.lz4 from training set '03' (sensor files 1..37).
write_file('03', 37)

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Batch computation too fast (0.0175s.) Setting batch_size=22.
[Parallel(n_jobs=24)]: Done   2 out of  37 | elapsed:    0.0s remaining:    0.4s
[Parallel(n_jobs=24)]: Done   6 out of  37 | elapsed:    0.1s remaining:    0.3s
[Parallel(n_jobs=24)]: Done  10 out of  37 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=24)]: Done  14 out of  37 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=24)]: Done  18 out of  37 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=24)]: Done  22 out of  37 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=24)]: Done  26 out of  37 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=24)]: Done  30 out of  37 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=24)]: Done  34 out of  37 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=24)]: Done  37 out of  37 | elapsed:    0.2s finished


有0行inf值
有0行nan值


In [4]:
# debug code
# import joblib
# train = joblib.load('./concats/concat_01.lz4')
# train.shape

# plc_01 = joblib.load('./plc_features/train_01.lz4')
# plc_01.head()