In [3]:
import gc
import os
import sys
import numpy as np
import pandas as pd
from IPython.display import display

In [4]:
# Rows per window: the loaders below cut every CSV into chunks of this many
# rows and call the number of chunks `seconds`.
frequency = 25600
# Root folders handed to the preprocessing functions. Each one is expected to
# contain two-digit tool sub-folders ("01", "02", ...) with a "Sensor"
# directory of numbered CSV files inside.
train_01 = "/home/shuzhilian/Notebook/Industry_Data/01-TrainingData-additional/"
test_01 = "/home/shuzhilian/Notebook/Industry_Data/02-TestingData-additional/"
# NOTE(review): unlike the two roots above this is a relative path, so it
# resolves against the kernel's working directory -- confirm that is intended.
final_dir = "03-FinalData-r1el"
# Destination for the generated .npz archives and the scratch "data.dat" file.
output_dir = "/home/shuzhilian/Notebook/Industry_Data/Output/"

In [5]:
def outlier_value_deal(df, threshold=100):
    """Replace out-of-range readings with the last in-range value above them.

    A column is cleaned unless its largest absolute value is strictly below
    ``threshold``. In a cleaned column every entry whose absolute value is not
    strictly below ``threshold`` is blanked to NaN; afterwards the whole frame
    is forward-filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame to clean. It is never modified by this function.
    threshold : float, default 100
        Exclusive bound on the absolute value of an acceptable reading.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when no column needs cleaning, otherwise a cleaned copy.
        Blanked entries that have no acceptable value above them (i.e. at the
        top of the frame) remain NaN, because a forward fill has nothing to
        copy into them.
    """
    # ``~(max < threshold)`` rather than ``max >= threshold`` so that a column
    # whose maximum is NaN (all values missing) still triggers the clean-up.
    needs_cleaning = ~(df.abs().max() < threshold)
    if not needs_cleaning.any():
        return df

    # Work on a copy: callers pass slices of larger frames, and writing into
    # those would modify (or warn about modifying) the caller's data.
    cleaned = df.copy()
    for column in needs_cleaning.index[needs_cleaning]:
        values = cleaned[column]
        # Vectorised equivalent of the element-wise "keep if abs(x) < threshold".
        cleaned[column] = values.where(values.abs() < threshold)

    # ``ffill`` is the supported spelling of the deprecated
    # ``fillna(method="pad")``.
    return cleaned.ffill()

In [6]:
def sensor_data_preprocessing(train_dir_root, output_dir, output_name, name_range, start_time, total_life=None):
    """Stack fixed-size sensor windows from every tool's CSV files into one ``.npz``.

    For each zero-based tool index ``i`` in ``name_range`` the files
    ``<train_dir_root>/<i+1 as two digits>/Sensor/1.csv`` .. ``N.csv`` are
    read, where ``N`` is the number of entries in that ``Sensor`` directory.
    Every file is truncated to a whole number of ``frequency``-row windows
    (the global ``frequency``), cleaned with ``outlier_value_deal`` and
    reshaped to ``(windows, frequency, columns)``.

    Parameters
    ----------
    train_dir_root : str
        Folder holding the two-digit tool sub-folders.
    output_dir : str
        Folder that receives the archive and a scratch file ``data.dat``.
        The scratch file is overwritten on every call and is not removed.
    output_name : str
        File name of the ``.npz`` archive written inside ``output_dir``.
    name_range : iterable of int
        Zero-based tool indices; index ``i`` maps to folder ``i + 1``.
    start_time : sequence
        ``start_time[i]`` is added to ``5 * file_number`` to form the
        used-time label of every window in that file.
    total_life : sequence, optional
        When given and non-empty, each label gets a second entry
        ``total_life[i] - used_time``.

    Returns
    -------
    None
        The archive stores ``data`` (float32, one row per window) and
        ``label`` (one row per window: ``[used_time]`` or
        ``[used_time, total_life[i] - used_time]``).
    """
    # Explicit test instead of ``if total_life:`` so that a NumPy array of
    # lifetimes works too (its truth value is ambiguous). An empty sequence
    # still means "no remaining-life column", exactly as before.
    with_remaining_life = total_life is not None and len(total_life) > 0

    array_list = []
    lifetime_list = []

    for i in name_range:
        sensor_dir = os.path.join(train_dir_root, "%.2d" % (i + 1), "Sensor")
        files_num = len(os.listdir(sensor_dir))
        for csv_num in range(1, files_num + 1):
            data = pd.read_csv(os.path.join(sensor_dir, "%d.csv" % csv_num))
            seconds = data.shape[0] // frequency
            # Drop the trailing partial window before cleaning.
            data = outlier_value_deal(data.iloc[:frequency * seconds])
            # Spell out every dimension: the column count comes from the data
            # instead of a hard-coded 4, and a file with zero complete windows
            # yields an empty block rather than an ambiguous ``-1`` reshape.
            array_list.append(data.values.reshape(seconds, frequency, data.shape[1]))

            used_time = start_time[i] + 5 * csv_num
            if with_remaining_life:
                label = [used_time, total_life[i] - used_time]
            else:
                label = [used_time]
            lifetime_list.extend([label] * seconds)

    gc.collect()
    length_list = [item.shape[0] for item in array_list]
    # Fall back to the historical 4-column layout when nothing was read.
    channels = array_list[0].shape[2] if array_list else 4
    # Disk-backed float32 buffer that receives all windows back to back.
    features = np.memmap(os.path.join(output_dir, "data.dat"), dtype=np.float32, mode="w+",
                         shape=(sum(length_list), frequency, channels))
    index = 0
    for array in array_list:
        features[index:index + array.shape[0]] = array
        index += array.shape[0]
    del array_list
    gc.collect()

    labels = np.array(lifetime_list)
    output_path = os.path.join(output_dir, output_name)
    np.savez(file=output_path, data=features, label=labels)

Preprocess data for **RNN/LSTM**

In [25]:
def sensor_data_preprocessing_seq(train_dir_root, output_dir, output_name, seq_len=5, seq_deal_method=None,
                                  tool_nums=(1, 2, 3)):
    """Build fixed-length sequences of sensor windows, with a remaining-life label, and save them.

    For every tool number in ``tool_nums`` the files
    ``<train_dir_root>/<two-digit tool>/Sensor/1.csv`` .. ``N.csv`` are read
    (``N`` = number of entries in that directory). The columns
    ``vibration_2``, ``vibration_3`` and ``current`` are dropped; the reshape
    below requires exactly one column to remain. Each file is cut into
    ``samples`` sequences of ``seq_len`` windows of ``frequency`` rows (the
    global ``frequency``); leftover rows are discarded.

    Parameters
    ----------
    train_dir_root : str
        Folder holding the two-digit tool sub-folders.
    output_dir : str
        Folder that receives the archive and a scratch file ``data.dat``.
        The scratch file is overwritten on every call and is not removed.
    output_name : str
        File name of the ``.npz`` archive written inside ``output_dir``.
    seq_len : int, default 5
        Number of consecutive windows per sequence.
    seq_deal_method : callable, optional
        Applied to every 1-D window; must return ``frequency`` values.
    tool_nums : iterable of int, default (1, 2, 3)
        One-based tool folder numbers to process.

    Returns
    -------
    None
        The archive stores ``data`` with shape
        ``(total_samples, seq_len, frequency)`` as float32 and ``label`` with
        one value per sample: ``5 * (N - file_number)``.
    """
    array_list = []
    lifetime_list = []

    for tool_num in tool_nums:
        sensor_dir = os.path.join(train_dir_root, "%.2d" % tool_num, "Sensor")
        files_num = len(os.listdir(sensor_dir))
        total_lifetime = files_num * 5
        for csv_num in range(1, files_num + 1):
            data = pd.read_csv(os.path.join(sensor_dir, "%d.csv" % csv_num))
            seconds = data.shape[0] // frequency
            samples = seconds // seq_len
            data = data.drop(columns=["vibration_2", "vibration_3", "current"])
            # Keep only rows that fill complete sequences, then clean them.
            data = outlier_value_deal(data.iloc[:frequency * seq_len * samples])
            reshape_data = data.values.reshape(samples, seq_len, frequency)

            if seq_deal_method:
                # Write into a fresh array so the DataFrame's own buffer is
                # never modified through the reshaped view.
                transformed = np.empty_like(reshape_data)
                for m in range(samples):
                    for n in range(seq_len):
                        # NOTE(review): the target array is real-valued, so a
                        # complex result (e.g. from np.fft.fft) keeps only its
                        # real part. np.real makes that explicit; confirm the
                        # real part (rather than e.g. the magnitude) is wanted.
                        transformed[m, n] = np.real(seq_deal_method(reshape_data[m, n]))
                reshape_data = transformed

            array_list.append(reshape_data)
            lifetime_list.extend([total_lifetime - 5 * csv_num] * samples)

    gc.collect()
    length_list = [item.shape[0] for item in array_list]
    # Disk-backed float32 buffer that receives all sequences back to back.
    features = np.memmap(os.path.join(output_dir, "data.dat"), dtype=np.float32, mode="w+",
                         shape=(sum(length_list), seq_len, frequency))
    index = 0
    for array in array_list:
        features[index:index + array.shape[0]] = array
        index += array.shape[0]
    del array_list
    gc.collect()

    labels = np.array(lifetime_list)
    output_path = os.path.join(output_dir, output_name)
    np.savez(file=output_path, data=features, label=labels)
    
def test_sensor_data_preprocessing_seq(test_dir_root, output_dir, output_name, seq_len=5, seq_deal_method=None,
                                       tool_nums=(1, 2, 3, 4, 5)):
    """Build fixed-length sequences of sensor windows (no labels) and save them.

    For every tool number in ``tool_nums`` the files
    ``<test_dir_root>/<two-digit tool>/Sensor/1.csv`` .. ``N.csv`` are read
    (``N`` = number of entries in that directory). The columns
    ``vibration_2``, ``vibration_3`` and ``current`` are dropped; the reshape
    below requires exactly one column to remain. Each file is cut into
    ``samples`` sequences of ``seq_len`` windows of ``frequency`` rows (the
    global ``frequency``); leftover rows are discarded.

    Parameters
    ----------
    test_dir_root : str
        Folder holding the two-digit tool sub-folders.
    output_dir : str
        Folder that receives the archive.
    output_name : str
        File name of the ``.npz`` archive written inside ``output_dir``.
    seq_len : int, default 5
        Number of consecutive windows per sequence.
    seq_deal_method : callable, optional
        Applied to every 1-D window; must return ``frequency`` values.
    tool_nums : iterable of int, default (1, 2, 3, 4, 5)
        One-based tool folder numbers to process.

    Returns
    -------
    None
        The archive stores a single array ``data`` of shape
        ``(total_samples, seq_len, frequency)``. No dtype conversion is
        applied here, so it keeps the dtype of the parsed CSV values.
    """
    array_list = []

    for tool_num in tool_nums:
        sensor_dir = os.path.join(test_dir_root, "%.2d" % tool_num, "Sensor")
        files_num = len(os.listdir(sensor_dir))
        for csv_num in range(1, files_num + 1):
            data = pd.read_csv(os.path.join(sensor_dir, "%d.csv" % csv_num))
            seconds = data.shape[0] // frequency
            samples = seconds // seq_len
            data = data.drop(columns=["vibration_2", "vibration_3", "current"])
            # Keep only rows that fill complete sequences, then clean them.
            data = outlier_value_deal(data.iloc[:frequency * seq_len * samples])
            reshape_data = data.values.reshape(samples, seq_len, frequency)

            if seq_deal_method:
                # Write into a fresh array so the DataFrame's own buffer is
                # never modified through the reshaped view.
                transformed = np.empty_like(reshape_data)
                for m in range(samples):
                    for n in range(seq_len):
                        # NOTE(review): the target array is real-valued, so a
                        # complex result (e.g. from np.fft.fft) keeps only its
                        # real part. np.real makes that explicit; confirm the
                        # real part (rather than e.g. the magnitude) is wanted.
                        transformed[m, n] = np.real(seq_deal_method(reshape_data[m, n]))
                reshape_data = transformed

            array_list.append(reshape_data)

    gc.collect()
    array = np.vstack(array_list)
    output_path = os.path.join(output_dir, output_name)
    np.savez(file=output_path, data=array)

In [7]:
%%time
# Training set: tool indices 0-2 (folders 01-03), every tool starting at time 0.
sensor_data_preprocessing(
    train_dir_root=train_01,
    output_dir=output_dir,
    output_name="01_train_a.npz",
    name_range=range(3),
    start_time=[0, 0, 0],
    total_life=[240, 240, 185],
)
gc.collect()

CPU times: user 12min 44s, sys: 20.5 s, total: 13min 4s
Wall time: 5min 19s


In [8]:
%%time 
# Test set: tool indices 0-4 (folders 01-05), each with its own start offset.
sensor_data_preprocessing(
    train_dir_root=test_01,
    output_dir=output_dir,
    output_name="02_test_a.npz",
    name_range=range(5),
    start_time=[40, 70, 50, 70, 120],
    total_life=[194, 172, 290, 185, 210],
)

CPU times: user 4min 24s, sys: 7.87 s, total: 4min 32s
Wall time: 1min 29s


In [None]:
%%time 
# Final set: tool indices 0-3 (folders 01-04); no total life, so labels hold
# the used time only.
# NOTE(review): start_time lists five offsets but only indices 0-3 are read,
# and the values match the test-set call -- confirm they apply to this data.
# NOTE(review): final_dir is a relative path, so this depends on the kernel's
# working directory.
sensor_data_preprocessing(
    train_dir_root=final_dir,
    output_dir=output_dir,
    output_name="03_final.npz",
    name_range=range(4),
    start_time=[40, 70, 50, 70, 120],
    total_life=None,
)

In [13]:
%%time
# Sequence features for the training set, each window passed through np.fft.fft.
sensor_data_preprocessing_seq(
    train_dir_root=train_01,
    output_dir=output_dir,
    output_name="01_l.npz",
    seq_deal_method=np.fft.fft,
)



CPU times: user 11min 2s, sys: 13.5 s, total: 11min 16s
Wall time: 5min 51s


In [26]:
%%time
# Sequence features for the test set, each window passed through np.fft.fft.
test_sensor_data_preprocessing_seq(
    test_dir_root=test_01,
    output_dir=output_dir,
    output_name="02_l.npz",
    seq_deal_method=np.fft.fft,
)



CPU times: user 3min 42s, sys: 3.49 s, total: 3min 46s
Wall time: 1min 17s
