给 train_plc 添加 id 和 sort_col，因为 tsfresh 提取特征时需要使用 id 和 sort_col。

In [1]:
import pandas as pd
import numpy as np
import joblib
import datetime as dt
import os

from joblib import Parallel,delayed

In [2]:
def assign_id_single_sensor(sensor_seg_num, sensor):
    """Add an ``id`` column that splits ``sensor`` into contiguous segments.

    Rows are labelled, in their existing order, with segment ids running
    from 1 to ``sensor_seg_num``. Segment sizes differ by at most one row:
    when the row count is not evenly divisible, the first
    ``len(sensor) % sensor_seg_num`` segments each receive one extra row.
    If there are more segments than rows, only the first ``len(sensor)``
    ids are used (one row each).

    Args:
        sensor_seg_num (int): Number of segments to split the samples into.
        sensor (DataFrame): Sampled data; it is modified in place.

    Returns:
        DataFrame: The same ``sensor`` object, now with an integer ``id``
        column.

    Raises:
        ValueError: If ``sensor_seg_num`` is smaller than 1.
    """
    if sensor_seg_num < 1:
        raise ValueError(
            'sensor_seg_num must be >= 1, got %s' % sensor_seg_num)

    base_len, extra = divmod(sensor.shape[0], sensor_seg_num)
    # Every segment gets base_len rows; the leading `extra` segments absorb
    # the remainder, one additional row each.
    seg_sizes = np.full(sensor_seg_num, base_len)
    seg_sizes[:extra] += 1
    sensor['id'] = np.repeat(np.arange(1, sensor_seg_num + 1), seg_sizes)
    return sensor

def assign_sort_col(df):
    """Add a ``sort_col`` column holding the frame's current index values.

    The frame is modified in place and also returned, so the call can be
    chained.

    Args:
        df (DataFrame): Frame to receive the ``sort_col`` column.

    Returns:
        DataFrame: The same ``df`` object with ``sort_col`` set.
    """
    row_order = df.index
    df['sort_col'] = row_order
    return df

In [3]:
def assign_id_all_sensor(data_no):
    """Add ``id`` and ``sort_col`` columns to every sensor file of a data set.

    Loads the PLC table for ``data_no`` and counts its rows per ``csv_no``
    value. For each ``csv_no`` the sensor file with that number is loaded,
    split into as many ``id`` segments as the PLC table has rows for it,
    given a ``sort_col`` column, and written to the output folder as an
    lz4-compressed joblib dump. The files are processed in parallel.

    Args:
        data_no (int): Data set number, e.g. 1.
    """
    plc_path = './train_plc_RULR/train_0%s_plc.lz4'%data_no
    sensor_input_dir = './sensors/0%s/'%data_no
    sensor_output_folder = './sensors_id_sort/0%s'%data_no

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(sensor_output_folder, exist_ok=True)

    # Read the PLC table to learn how many PLC rows belong to each csv_no;
    # the index of the resulting Series is the sensor file number.
    data_plc = joblib.load(plc_path)
    plc_sample_points = data_plc['csv_no'].value_counts().sort_index()

    if plc_sample_points.empty:
        # Nothing to process; joblib also rejects n_jobs=0.
        return

    def assign_id_basis_func(idx):
        """Load sensor file ``idx``, add both columns, and dump the result."""
        input_path = os.path.join(sensor_input_dir, '%d.lz4'%idx)
        sensor = joblib.load(input_path)
        tmp = assign_id_single_sensor(plc_sample_points[idx], sensor)
        tmp = assign_sort_col(tmp)
        output_path = os.path.join(sensor_output_folder, '%d.lz4'%idx)
        joblib.dump(tmp, output_path, compress='lz4')

    # One worker per sensor file, as in the original notebook.
    # NOTE(review): this may exceed the number of CPU cores — confirm that
    # oversubscription is intended.
    Parallel(n_jobs=len(plc_sample_points), verbose=10)(
        delayed(assign_id_basis_func)(idx) for idx in plc_sample_points.index)


In [4]:
# Process data sets 1, 2 and 3 one after another.
for data_no in (1, 2, 3):
    assign_id_all_sensor(data_no)

[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done   3 out of  48 | elapsed:   30.6s remaining:  7.6min
[Parallel(n_jobs=48)]: Done   8 out of  48 | elapsed:   41.5s remaining:  3.5min
[Parallel(n_jobs=48)]: Done  13 out of  48 | elapsed:   44.1s remaining:  2.0min
[Parallel(n_jobs=48)]: Done  18 out of  48 | elapsed:   46.0s remaining:  1.3min
[Parallel(n_jobs=48)]: Done  23 out of  48 | elapsed:   48.0s remaining:   52.2s
[Parallel(n_jobs=48)]: Done  28 out of  48 | elapsed:   48.5s remaining:   34.7s
[Parallel(n_jobs=48)]: Done  33 out of  48 | elapsed:   49.1s remaining:   22.3s
[Parallel(n_jobs=48)]: Done  38 out of  48 | elapsed:   49.6s remaining:   13.0s
[Parallel(n_jobs=48)]: Done  43 out of  48 | elapsed:   50.4s remaining:    5.9s
[Parallel(n_jobs=48)]: Done  48 out of  48 | elapsed:   52.5s remaining:    0.0s
[Parallel(n_jobs=48)]: Done  48 out of  48 | elapsed:   52.5s finished
[Parallel(n_jobs=48)]: Using backend LokyB