<a href="https://colab.research.google.com/github/yrrSelena/RUL/blob/master/LSTM_30.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import keras
import keras.backend as K
from keras.layers.core import Activation
from keras.models import Sequential,load_model
from keras.layers import Dense, Dropout, LSTM

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn import preprocessing

Using TensorFlow backend.


In [0]:
# Seed NumPy's global RNG so every random draw in the notebook is repeatable.
np.random.seed(1234)
# A bare `PYTHONHASHSEED = 0` assignment only binds a Python name and never
# reaches the interpreter. Export the value as an environment variable too.
# Hash randomisation of the *running* kernel is fixed at interpreter start-up,
# so this only takes effect for processes launched from here on (for example
# parallel workers); set it before starting the kernel to cover the kernel itself.
PYTHONHASHSEED = 0
os.environ['PYTHONHASHSEED'] = str(PYTHONHASHSEED)

# 读取数据

## 设置常量参数

In [0]:
# Names of the raw FD001 train / test / test-RUL text files.
train_file, test_file, test_rul_file = 'train_FD001.txt', 'test_FD001.txt', 'RUL_FD001.txt'

# Number of raw columns (matches len(col_name)) and number of units.
n_ftrs = 26
n_units = 100

# Raw column layout: two identifiers, three operating settings, 21 sensors.
col_name = (['id', 'cycle']
            + ['setting%d' % idx for idx in range(1, 4)]
            + ['s%d' % idx for idx in range(1, 22)])

In [0]:
# Load the train and test tables from CSV files in the working directory.
# NOTE(review): the file names suggest these are already pre-processed
# (ewm-smoothed) versions of the raw FD001 files named above -- confirm.
# The raw-file constants (train_file, test_file, test_rul_file) are not used here.
train_df = pd.read_csv('TrainData_ewm.csv')
test_df = pd.read_csv('TestData_ewm.csv')

In [6]:
# Preview the first rows of the training table to check the loaded columns.
train_df.head()

Unnamed: 0,RUL,cycle,cycle_norm,id,s11,s12,s13,s14,s15,s17,s2,s20,s21,s3,s4,s6,s7,s8,s9,setting1,setting2
0,191,1,0.0,1,0.425668,0.640978,0.231062,0.200888,0.446382,0.287032,0.152747,0.767506,0.768934,0.495953,0.307035,1.0,0.766807,0.361307,0.091366,0.45977,0.166667
1,190,2,0.00277,1,0.434339,0.741327,0.301665,0.177575,0.480236,0.287032,0.227676,0.730972,0.773978,0.530622,0.339481,1.0,0.689345,0.332918,0.085096,0.609195,0.25
2,189,3,0.00554,1,0.367477,0.790895,0.283604,0.174066,0.463207,0.171647,0.285511,0.696703,0.712003,0.496246,0.360049,1.0,0.711203,0.365079,0.102209,0.252874,0.75
3,188,4,0.00831,1,0.29835,0.857846,0.317044,0.173521,0.373086,0.209661,0.313856,0.654375,0.701137,0.428461,0.352301,1.0,0.736368,0.406349,0.104466,0.54023,0.5
4,187,5,0.01108,1,0.291904,0.840364,0.307101,0.173151,0.408349,0.273072,0.332788,0.636001,0.712244,0.389367,0.376776,1.0,0.721245,0.393418,0.114538,0.390805,0.333333


# 生成样本特征

In [0]:
# Size of the sliding time window: number of consecutive rows per sample.
sequence_length = 30
# Feature columns = every column of train_df except the target ('RUL') and the
# two identifier columns ('cycle', 'id'); 18 columns remain for this data.
sequence_cols = train_df.columns.difference(['RUL', 'cycle', 'id']).tolist()

In [16]:
# Number of feature columns used at each time step of a window.
len(sequence_cols)

18

In [0]:
# Build the feature samples: sliding windows of shape (time steps, features),
# later stacked into (samples, time steps, features).
def gen_sequence(id_df, seq_length, seq_cols):
    """Yield consecutive fixed-length windows of feature rows for one id.

    Parameters
    ----------
    id_df : pandas.DataFrame
        Rows belonging to a single id; windows follow the frame's row order.
    seq_length : int
        Number of rows in each window.
    seq_cols : list
        Column labels to extract as features.

    Yields
    ------
    numpy.ndarray
        Slice of shape (seq_length, len(seq_cols)) covering rows
        ``start .. start + seq_length - 1``.

    Notes
    -----
    No padding is applied: a frame with ``seq_length`` rows or fewer yields
    nothing, so short test series are silently dropped (padding would be the
    alternative). Exactly ``rows - seq_length`` windows are produced, so the
    window ending on the very last row is not emitted; ``gen_labels`` drops
    the first ``seq_length`` labels to give the same count.
    """
    # All feature rows of this id as one 2-D matrix.
    feature_rows = id_df[seq_cols].values
    window_count = feature_rows.shape[0] - seq_length
    # Example: 192 rows with seq_length=50 gives start rows 0..141, i.e.
    # windows [0:50], [1:51], ..., [141:191].
    for first_row in range(window_count):
        yield feature_rows[first_row:first_row + seq_length, :]
        

In [0]:
# TODO: debug-only sanity check on a single engine (id 1).
# val holds (rows of that engine - sequence_length) windows, each a 2-D array
# of sequence_length rows x len(sequence_cols) columns.
val = list(gen_sequence(train_df.loc[train_df['id'] == 1], sequence_length, sequence_cols))
print(len(val))

142


In [0]:
# Build the sliding windows of every engine: one list of windows per id, in
# order of first appearance of the id in train_df.
# This is materialised as a list (like label_gen) rather than a generator
# expression. A generator is exhausted the first time the following cell
# consumes it, so re-running that cell on its own would pass np.concatenate an
# empty list and fail.
seq_gen = [list(gen_sequence(train_df[train_df['id'] == engine_id], sequence_length, sequence_cols))
           for engine_id in train_df['id'].unique()]

In [23]:
# Stack the per-engine window lists along the sample axis into one
# (samples, time steps, features) array and cast it to float32.
seq_array = np.concatenate(list(seq_gen), axis=0).astype(np.float32)
print(seq_array.shape)

(17631, 30, 18)


In [24]:
# Flatten every (time steps, features) window into a single row vector so the
# array becomes 2-D: (samples, time steps * features).
seq_array = seq_array.reshape(seq_array.shape[0], -1)
seq_array.shape

(17631, 540)

# 生成样本标签

In [0]:
# Build the label array that pairs with the windows produced by gen_sequence.
def gen_labels(id_df, seq_length, label):
    """Return the labels of one id with the first ``seq_length`` rows dropped.

    Parameters
    ----------
    id_df : pandas.DataFrame
        Rows belonging to a single id.
    seq_length : int
        Window length; the labels of the first ``seq_length`` rows are discarded.
    label : list
        List of label column names (a list, so the result is 2-D).

    Returns
    -------
    numpy.ndarray
        Array of shape (rows - seq_length, len(label)); empty along the first
        axis when the frame has ``seq_length`` rows or fewer.

    Notes
    -----
    Each window of ``seq_length`` rows gets exactly one target, so the labels
    of the rows that only serve to fill the first window are removed and the
    remaining labels follow one per step.
    """
    # Label column(s) of this id as an (n_rows, n_labels) matrix.
    label_values = id_df[label].values
    return label_values[seq_length:, :]

In [26]:
# One label block per engine, visiting the ids in the same order as the
# feature windows, then stacked into a single float32 column vector.
label_gen = [gen_labels(train_df.loc[train_df['id'] == engine_id], sequence_length, ['RUL'])
             for engine_id in train_df['id'].unique()]

label_array = np.concatenate(label_gen, axis=0).astype(np.float32)
label_array.shape

(17631, 1)

In [0]:
# For every test engine with at least `sequence_length` rows, keep its final
# window of features (flattened to one row) and the RUL of its last row.
#
# Features, mask and labels are all derived from a single groupby pass, so a
# window and its label always come from the same engine even when the ids in
# test_df are not sorted. sort=False keeps the engines in order of first
# appearance, i.e. the order of test_df['id'].unique(). This also avoids
# re-filtering the whole frame several times per id.
test_engines = [engine_df for _, engine_df in test_df.groupby('id', sort=False)]

# True for engines long enough to provide one full window (one entry per id).
y_mask = [len(engine_df) >= sequence_length for engine_df in test_engines]
kept_engines = [engine_df for engine_df, keep in zip(test_engines, y_mask) if keep]

# Last `sequence_length` rows of each kept engine, flattened per sample. The
# row width is spelled out so that an empty selection still reshapes cleanly.
seq_array_test_last = np.asarray(
    [engine_df[sequence_cols].values[-sequence_length:] for engine_df in kept_engines]
).astype(np.float32)
seq_array_test_last = seq_array_test_last.reshape(
    (len(kept_engines), sequence_length * len(sequence_cols)))

# Matching labels: RUL of the final row of each kept engine, as an (n, 1) column.
label_array_test_last = np.asarray(
    [engine_df['RUL'].values[-1] for engine_df in kept_engines]
).astype(np.float32)
label_array_test_last = label_array_test_last.reshape(len(kept_engines), 1)

In [0]:
from sklearn.model_selection import cross_validate,learning_curve,GridSearchCV
import time

In [0]:
# Final model inputs: flattened windows as features, and the (n, 1) label
# columns flattened to 1-D target vectors.
X_train, y_train = seq_array, label_array.ravel()
X_test, y_test = seq_array_test_last, label_array_test_last.ravel()

{'C': [0.1, 1, 5, 10], 'gamma': [0.1, 0.05, 0.01]}
