In [1]:
import joblib 
from joblib import Parallel, delayed
from pyentrp import entropy as ent
import pandas as pd
import numpy as np
import os 

In [2]:
def calc_ent_basis(ser, col):
    """
    Compute the Shannon entropy (in bits) of the values in one column.

    Parameters
    ----------
    ser : pandas.Series
        Column whose empirical value distribution is measured.
    col : hashable
        Label used as the key of the returned dict.

    Returns
    -------
    dict
        ``{col: entropy}`` where ``entropy`` is the float sum of
        ``-p * log2(p)`` over the distinct non-missing values, and
        ``0.0`` when there are no such values.
    """
    # value_counts() skips missing values by default, so normalise by the
    # number of counted observations rather than len(ser); otherwise the
    # probabilities would not sum to 1 for a column that contains NaN.
    counts = ser.value_counts()
    total = counts.sum()
    if total == 0:
        return {col: 0.0}

    probs = counts / total
    # Every counted value has p > 0, so log2 is finite for each term.
    # Writing "0.0 - ..." keeps a constant column at +0.0 instead of -0.0.
    entropy = 0.0 - float((probs * np.log2(probs)).sum())
    return {col: entropy}
def calc_ent_parallel(df, n_jobs=24):
    '''
    Compute the entropy of every column of ``df`` with joblib workers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are each passed to ``calc_ent_basis``.
    n_jobs : int, default 24
        Number of joblib workers. The default is the value that was
        previously hard-coded, so existing callers behave as before.

    Returns
    -------
    dict
        The per-column dicts returned by ``calc_ent_basis`` merged into
        one mapping of column label to entropy.
    '''
    per_column = Parallel(n_jobs=n_jobs, verbose=1)(
        delayed(calc_ent_basis)(df.loc[:, col], col) for col in df.columns
    )
    merged = {}
    for item in per_column:
        merged.update(item)
    return merged

def ent_filter(df, ent_thred=10):
    '''
    Remove the low-entropy columns of ``df``.

    A column is dropped when its entropy, as reported by
    ``calc_ent_parallel``, is less than or equal to ``ent_thred``.

    Returns
    -------
    tuple
        ``(filtered_df, ent_dict)``: ``df`` restricted to the surviving
        columns (original order kept) and the full column-to-entropy dict.
    '''
    ent_dict = calc_ent_parallel(df)
    entropies = pd.Series(ent_dict)
    is_low = entropies <= ent_thred
    dropped = list(entropies.loc[is_low].index)

    kept = []
    for name in df.columns:
        if name not in dropped:
            kept.append(name)
    return df[kept], ent_dict


def calc_permutation_ent_basis(order, ser, col, delay=20000):
    '''
    Compute the permutation entropy of one column via pyentrp.

    Parameters
    ----------
    order : int
        Order forwarded to ``ent.permutation_entropy``.
    ser : pandas.Series
        Column to analyse; its ``.values`` array is what gets passed on.
    col : hashable
        Label used as the key of the returned dict.
    delay : int, default 20000
        Delay forwarded to ``ent.permutation_entropy``. The default is the
        value that was previously hard-coded.
        NOTE(review): presumably the series must be long enough for this
        delay; confirm against the pyentrp documentation.

    Returns
    -------
    dict
        ``{col: pe}`` where ``pe`` is the value returned by pyentrp with
        ``normalize=True``.
    '''
    pe = ent.permutation_entropy(ser.values, order=order, delay=delay, normalize=True)
    return {col: pe}

def calc_permutation_ent_parallel(df, order=2, n_jobs=48):
    '''
    Compute the permutation entropy of every column of ``df`` in parallel.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are each passed to
        ``calc_permutation_ent_basis``.
    order : int, default 2
        Permutation-entropy order used for every column. The default is the
        value that was previously hard-coded.
    n_jobs : int, default 48
        Number of joblib workers. The default is the value that was
        previously hard-coded.

    Returns
    -------
    dict
        The per-column dicts returned by ``calc_permutation_ent_basis``
        merged into one mapping of column label to permutation entropy.
    '''
    per_column = Parallel(n_jobs=n_jobs, verbose=1)(
        delayed(calc_permutation_ent_basis)(order, df.loc[:, col], col)
        for col in df.columns
    )
    merged = {}
    for item in per_column:
        merged.update(item)
    return merged

def pe_filter(df, pe_thred=0.999):
    '''
    Remove the columns of ``df`` whose permutation entropy is too high.

    A column is dropped when its permutation entropy, as reported by
    ``calc_permutation_ent_parallel``, is greater than or equal to
    ``pe_thred``.

    Returns
    -------
    tuple
        ``(filtered_df, pe_dict)``: ``df`` restricted to the surviving
        columns (original order kept) and the full column-to-entropy dict.
    '''
    pe_dict = calc_permutation_ent_parallel(df)
    pe_values = pd.Series(pe_dict)
    is_high = pe_values >= pe_thred
    rejected = list(pe_values.loc[is_high].index)

    retained = []
    for name in df.columns:
        if name not in rejected:
            retained.append(name)
    return df[retained], pe_dict

In [3]:
def load_train(idx):
    '''
    Load one training split and separate the target from the features.

    The file ``../concats_cut/concat_0<idx>.lz4`` is read with
    ``joblib.load``; the loaded object is assumed to be a DataFrame that
    contains an ``RULR`` column.

    Returns
    -------
    tuple
        ``(x_train, y_train)``: the frame without ``RULR`` and a
        single-column frame holding ``RULR``.
    '''
    target_cols = ['RULR']
    frame = joblib.load('../concats_cut/' + 'concat_0%d.lz4' % idx)
    target = frame[target_cols]
    features = frame.drop(columns=target_cols)
    return features, target
   

In [4]:
# Create the output directory; exist_ok=True replaces the separate
# exists() check, so there is no window between checking and creating.
os.makedirs('./train_data', exist_ok=True)

# Split 1: drop columns with entropy <= 1, then drop columns whose
# permutation entropy is >= 1.
x_train_01, y_train_01 = load_train(1)
x_train_01_ent, ent_dict_01 = ent_filter(x_train_01, 1)
x_train_01_pe, pe_dict_01 = pe_filter(x_train_01_ent, 1)
# The filtered frame is not saved here; a later cell writes train_01
# restricted to use_cols.

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done  32 out of  39 | elapsed:    3.5s remaining:    0.8s
[Parallel(n_jobs=24)]: Done  39 out of  39 | elapsed:    3.7s finished
[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done  24 out of  39 | elapsed:    0.4s remaining:    0.3s
[Parallel(n_jobs=48)]: Done  39 out of  39 | elapsed:    0.6s finished


In [6]:
# Show the feature columns of split 1 as loaded, before any filtering.
x_train_01.columns

Index(['CL', 'CLI', 'spindle_load', 'Current_1__abs_energy',
       'Current_1__kurtosis', 'Current_1__length', 'Current_1__mean',
       'Current_1__mean_abs_change', 'Current_1__mean_change',
       'Current_1__skewness', 'Current_1__variance', 'Vibration_1__abs_energy',
       'Vibration_1__kurtosis', 'Vibration_1__length', 'Vibration_1__mean',
       'Vibration_1__mean_abs_change', 'Vibration_1__mean_change',
       'Vibration_1__skewness', 'Vibration_1__variance',
       'Vibration_2__abs_energy', 'Vibration_2__kurtosis',
       'Vibration_2__length', 'Vibration_2__mean',
       'Vibration_2__mean_abs_change', 'Vibration_2__mean_change',
       'Vibration_2__skewness', 'Vibration_2__variance',
       'Vibration_3__abs_energy', 'Vibration_3__kurtosis',
       'Vibration_3__length', 'Vibration_3__mean',
       'Vibration_3__mean_abs_change', 'Vibration_3__mean_change',
       'Vibration_3__skewness', 'Vibration_3__variance',
       'Current_1__abs_energy_scale', 'Vibration_1__abs_en

In [7]:
# Split 2: drop columns with entropy <= 1, then drop columns whose
# permutation entropy is >= 1.
x_train_02, y_train_02 = load_train(2)
x_train_02_ent, ent_dict_02 = ent_filter(x_train_02, 1)
x_train_02_pe, pe_dict_02 = pe_filter(x_train_02_ent, 1)
# The filtered frame is not saved here; a later cell writes train_02
# restricted to use_cols.

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done  32 out of  39 | elapsed:    2.1s remaining:    0.5s
[Parallel(n_jobs=24)]: Done  39 out of  39 | elapsed:    2.4s finished
[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done  24 out of  39 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=48)]: Done  39 out of  39 | elapsed:    0.5s finished


In [8]:
# Split 3: drop columns with entropy <= 1, then drop columns whose
# permutation entropy is >= 1.
x_train_03, y_train_03 = load_train(3)
x_train_03_ent, ent_dict_03 = ent_filter(x_train_03, 1)
x_train_03_pe, pe_dict_03 = pe_filter(x_train_03_ent, 1)
# The filtered frame is not saved here; a later cell writes train_03
# restricted to use_cols.

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done  32 out of  39 | elapsed:    2.3s remaining:    0.5s
[Parallel(n_jobs=24)]: Done  39 out of  39 | elapsed:    2.5s finished
[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done  24 out of  39 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=48)]: Done  39 out of  39 | elapsed:    0.4s finished


In [9]:
# Sanity check: after both filters the three splits keep the same columns
# in the same order.
list(x_train_01_pe.columns) == list(x_train_02_pe.columns) == list(x_train_03_pe.columns)

True

In [10]:
# Union of the columns kept for the three splits. dict.fromkeys removes
# duplicates while preserving first-seen order, so the saved column order is
# the same on every kernel restart (a set of strings iterates in an order
# that depends on per-process hash randomisation).
use_cols = list(dict.fromkeys(
    list(x_train_01_pe.columns)
    + list(x_train_02_pe.columns)
    + list(x_train_03_pe.columns)
))

In [11]:
# Reload split 1, keep only the selected feature columns, append the RULR
# target as the last column, and write the result with lz4 compression.
x_train_01, y_train_01 = load_train(1)
train_01 = pd.concat([x_train_01[use_cols],y_train_01], axis=1)
joblib.dump(train_01, './train_data/train_01.lz4', compress='lz4')

['./train_data/train_01.lz4']

In [12]:
# Reload split 2, keep only the selected feature columns, append the RULR
# target as the last column, and write the result with lz4 compression.
x_train_02, y_train_02 = load_train(2)
train_02 = pd.concat([x_train_02[use_cols],y_train_02], axis=1)
joblib.dump(train_02, './train_data/train_02.lz4', compress='lz4')

['./train_data/train_02.lz4']

In [13]:
# Reload split 3, keep only the selected feature columns, append the RULR
# target as the last column, and write the result with lz4 compression.
x_train_03, y_train_03 = load_train(3)
train_03 = pd.concat([x_train_03[use_cols],y_train_03], axis=1)
joblib.dump(train_03, './train_data/train_03.lz4', compress='lz4')

['./train_data/train_03.lz4']

In [14]:
# Sanity check: the three saved frames have the same columns in the same
# order.
list(train_02.columns) == list(train_01.columns) == list(train_03.columns)

True

In [15]:
# Show (rows, columns) of the saved split-3 frame; the column count
# includes the RULR target.
train_03.shape

(75039, 40)