In [1]:
import numpy as np
import pandas as pd
import joblib
import warnings
import os
import collections
from itertools import zip_longest

from scipy.optimize import curve_fit
import GPy
import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings('ignore')

from utils import cl_curve_smooth, curve_derivative

In [4]:
# Load the five test runs and the three training runs from their lz4 archives.
# NOTE: joblib.load unpickles its input, so these files must come from a
# trusted source.
test_01, test_02, test_03, test_04, test_05 = [
    joblib.load(archive)
    for archive in (
        './test_HA/test_HA_01.lz4',
        './test_HA/test_HA_02.lz4',
        './test_HA/test_HA_03.lz4',
        './test_HA/test_HA_04.lz4',
        './test_HA/test_HA_05.lz4',
    )
]

train_01, train_02, train_03 = [
    joblib.load(archive)
    for archive in (
        './train_HA/train_HA_01.lz4',
        './train_HA/train_HA_02.lz4',
        './train_HA/train_HA_03.lz4',
    )
]

# Trim two training runs by their 'CLI' column: train_02 keeps rows with
# CLI above 45, train_03 keeps rows with CLI below 140.
# NOTE(review): the cut-off values are not explained in this notebook -- confirm.
keep_02 = train_02['CLI'] > 45
train_02 = train_02[keep_02]
keep_03 = train_03['CLI'] < 140
train_03 = train_03[keep_03]

In [5]:

def CL_mean_initial(df, apply_col, seg_col, up_thred = 30, down_thred= 0, show_anomaly=True):
    """
    Summarise one column per segment using the median.

    The statistic was originally a mean (hence the function name) and was
    switched to the median because it is more robust.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.
    apply_col : column label
        Column whose values are summarised.
    seg_col : column label
        Column whose distinct values define the segments.
    up_thred, down_thred : number
        Bounds used only when ``show_anomaly`` is true.
    show_anomaly : bool
        When true, every median strictly greater than ``up_thred`` or
        strictly smaller than ``down_thred`` is replaced by ``-1``.

    Returns
    -------
    (seg_cl, mean_list)
        ``seg_cl`` holds the sorted distinct values of ``df[seg_col]``;
        ``mean_list`` is a list with the median of ``apply_col`` for each of
        them, in the same order.
    """
    values = df[apply_col]
    segment_ids = df[seg_col].unique()
    segment_ids.sort()

    # One median per segment, in ascending segment order.
    medians = [values[df[seg_col] == seg].median() for seg in segment_ids]

    if show_anomaly:
        # Out-of-range medians are flagged with the sentinel -1.
        medians = [
            -1 if (m > up_thred) or (m < down_thred) else m
            for m in medians
        ]
    return segment_ids, medians

def CL_percent_delta_initial(df, apply_col, seg_col, up_box=90, down_box=10, up_thred=10, down_thred=0, show_anomaly = False):
    """
    Compute, per segment, the spread of one column between two percentiles.

    For every distinct value of ``df[seg_col]`` the result is the
    ``up_box``-th percentile minus the ``down_box``-th percentile of
    ``df[apply_col]`` restricted to that segment. NaN samples are ignored,
    matching the NaN-skipping median used by ``CL_mean_initial``; previously
    a single NaN in a segment turned that segment's delta into NaN.

    Note: if the result is going to be smoothed further with
    ``cl_curve_smooth``, ``show_anomaly`` should be left as False.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.
    apply_col : column label
        Column whose spread is measured.
    seg_col : column label
        Column whose distinct values define the segments.
    up_box, down_box : number in [0, 100]
        Upper and lower percentile of the spread.
    up_thred, down_thred : number
        Bounds used only when ``show_anomaly`` is true.
    show_anomaly : bool
        When true, every delta strictly greater than ``up_thred`` or strictly
        smaller than ``down_thred`` is replaced by ``-1``.

    Returns
    -------
    (seg_cl, delta_list)
        ``seg_cl`` holds the sorted distinct values of ``df[seg_col]``;
        ``delta_list`` is a list with the percentile spread for each of them,
        in the same order.
    """
    data = df[apply_col]
    seg_cl = df[seg_col].unique()
    seg_cl.sort()

    delta_list = []
    for cl in seg_cl:
        segment = data[df[seg_col] == cl]
        # Both percentiles in one call; the nan-aware variant keeps one
        # missing sample from poisoning the whole segment.
        down_bound, up_bound = np.nanpercentile(segment, [down_box, up_box])
        delta_list.append(up_bound - down_bound)

    if show_anomaly:
        # Out-of-range deltas are flagged with the sentinel -1.
        delta_list = [
            -1 if (delta > up_thred) or (delta < down_thred) else delta
            for delta in delta_list
        ]

    return seg_cl, delta_list

def gen_feature(df, has_label=False, feature_list=['PCA_T2__mean', 'PCA_T2__mean_diff', 'PCA_T2__delta', 'PCA_T2__delta_diff']):
    """
    Build the per-'CLI' feature table of one run from its 'PCA_T2' column.

    Four feature series are produced: the smoothed per-segment median, the
    derivative of its fitted curve, the smoothed 95th-5th percentile spread,
    and the derivative of that fitted curve. Smoothing and differentiation
    are delegated to ``cl_curve_smooth`` / ``curve_derivative`` from
    ``utils``; their exact behaviour is not visible in this notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'PCA_T2' and 'CLI', plus 'RULR' when ``has_label`` is true.
    has_label : bool
        When true, the per-'CLI' mean of 'RULR' is appended as a 'RULR' column.
    feature_list : list of str
        Names for the four feature columns, in the order listed above.
        The default list is never mutated.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature_list + ['CL']``, followed by 'RULR' when
        ``has_label`` is true.
    """
    # Level: per-segment median -> robust moving average -> curve fit,
    # then the first derivative of the fitted curve.
    seg_ids, level = CL_mean_initial(df, 'PCA_T2', 'CLI', 30, 0, show_anomaly=False)
    seg_ids, level, _ = cl_curve_smooth(
        level, seg_ids,
        up_thred=30, down_thred=0, confidence=0.5, fit_type='moving_robust_avg',
    )
    seg_ids, level, level_fit = cl_curve_smooth(
        level, seg_ids,
        up_thred=30, down_thred=0, confidence=0.0, fit_type='scipy_curve_fit',
        return_cureve_func=True,
    )
    level_slope = curve_derivative(level_fit, seg_ids, 1)

    # Spread: 95th-5th percentile delta -> robust moving average ->
    # trailing 4-point rolling mean -> curve fit, then its first derivative.
    seg_ids, spread = CL_percent_delta_initial(df, 'PCA_T2', 'CLI', up_box=95, down_box=5, show_anomaly=False)
    seg_ids, spread, _ = cl_curve_smooth(
        spread, seg_ids,
        up_thred=30, down_thred=0, confidence=0.5, fit_type='moving_robust_avg',
    )
    rolling_spread = pd.Series(spread).rolling(window=4, center=False, min_periods=1)
    spread = rolling_spread.mean().tolist()
    seg_ids, spread, spread_fit = cl_curve_smooth(
        spread, seg_ids,
        up_thred=10, down_thred=0, confidence=0.0, fit_type='scipy_curve_fit',
        return_cureve_func=True,
    )
    spread_slope = curve_derivative(spread_fit, seg_ids, 1)

    # Assemble the table column by column; the label, if any, goes last.
    series = [level, level_slope, spread, spread_slope, seg_ids]
    extra_names = ['CL']
    if has_label:
        series.append(df.groupby(by='CLI')['RULR'].mean().tolist())
        extra_names.append('RULR')

    table = np.array(series).T
    return pd.DataFrame(data=table, columns=feature_list + extra_names)

In [6]:
# Generate the feature table of every run and store each one as an
# lz4-compressed joblib file, numbered from 1 in list order.
train_list = [train_01, train_02, train_03]
test_list = [test_01, test_02, test_03, test_04, test_05]

# train
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
# check-then-create race and is a no-op when the directory already exists.
os.makedirs('./train_HA_feature', exist_ok=True)
for i, train in enumerate(train_list):
    tmp = gen_feature(df=train, has_label=True)
    joblib.dump(tmp, './train_HA_feature/train_0%d.lz4'%(i+1), compress='lz4')

# test
os.makedirs('./test_HA_feature', exist_ok=True)
for i, test in enumerate(test_list):
    tmp = gen_feature(df=test, has_label=False)
    joblib.dump(tmp, './test_HA_feature/test_0%d.lz4'%(i+1), compress='lz4')