<a href="https://colab.research.google.com/github/yrrSelena/RUL/blob/master/RUL_LSTM_CMAPSS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 载入库

In [1]:
import keras
import keras.backend as K
from keras.layers.core import Activation
from keras.models import Sequential,load_model
from keras.layers import Dense, Dropout, LSTM

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn import preprocessing

Using TensorFlow backend.


In [0]:
# Setting seed for reproducibility
np.random.seed(1234)
# PYTHONHASHSEED is an environment variable: a bare Python assignment does not
# influence hashing at all. The name is kept for backward compatibility and the
# value is additionally exported so that child processes started from this
# kernel inherit it. The hash seed of the already-running interpreter is fixed
# at startup and cannot be changed from here.
PYTHONHASHSEED = 0
os.environ['PYTHONHASHSEED'] = str(PYTHONHASHSEED)

# define path to save model
model_path = 'LSTM_cmpass_model.h5'

# 读取数据

## 设置常量参数

In [0]:
# Input files: training data, test data and the remaining-life values for the test engines.
train_file = 'train_FD001.txt'
test_file = 'test_FD001.txt'
test_rul_file = 'RUL_FD001.txt'

n_ftrs = 26    # equals the number of entries in col_name
n_units = 100  # NOTE(review): meaning not visible in this cell - confirm against its users

# Column layout of the raw files: engine id, cycle counter,
# three operational settings (setting1..setting3) and 21 sensors (s1..s21).
# The sensor columns alone are col_name[5:].
col_name = (
    ['id', 'cycle']
    + ['setting%d' % k for k in range(1, 4)]
    + ['s%d' % k for k in range(1, 22)]
)

## 载入数据

In [0]:
# read training data - It is the aircraft engine run-to-failure data.
# The files have no header row, so the column names are supplied explicitly.
# Fields are separated by runs of whitespace: sep=r'\s+' is the supported
# spelling of the deprecated delim_whitespace=True and parses identically.
train_df = pd.read_csv(train_file, names=col_name, header=None, sep=r'\s+')
test_df = pd.read_csv(test_file, names=col_name, header=None, sep=r'\s+')
# One value per row; the single column is named 'RUL' here.
rul_df = pd.read_csv(test_rul_file, names=['RUL'], header=None, sep=r'\s+')

# 数据预处理

## 训练数据

### 计算RUL

In [0]:
# Last recorded cycle of every engine; reset_index turns the 'id' group key
# back into a regular column so the result can be merged on it.
rul = (
    train_df.groupby('id')['cycle']
    .max()
    .reset_index()
    .rename(columns={'cycle': 'max'})
)

# RUL of a row = last cycle of its engine - current cycle.
# The helper column 'max' is only needed for the subtraction and is dropped again.
train_df = (
    train_df.merge(rul, on=['id'], how='left')
    .assign(RUL=lambda df: df['max'] - df['cycle'])
    .drop(columns='max')
)

### MinMax归一化处理

In [0]:
# MinMax normalization (from 0 to 1)
# 'cycle' itself stays unscaled; a copy named 'cycle_norm' is scaled instead.
train_df['cycle_norm'] = train_df['cycle']

# Every column except the identifiers and the target gets scaled.
cols_normalize = train_df.columns.difference(['id', 'cycle', 'RUL'])

# Fit the scaler on the training columns and wrap the scaled array back into
# a frame that shares the training frame's index.
min_max_scaler = preprocessing.MinMaxScaler()
scaled_train_values = min_max_scaler.fit_transform(train_df[cols_normalize])
norm_train_df = pd.DataFrame(
    scaled_train_values,
    columns=cols_normalize,
    index=train_df.index,
)

# Re-attach the untouched columns, then restore the original column order.
kept_train_cols = train_df.columns.difference(cols_normalize)
join_df = train_df[kept_train_cols].join(norm_train_df)
train_df = join_df.reindex(columns=train_df.columns)

### 特征筛选

In [42]:
# Flag every column whose maximum is 0; for the min-max scaled columns this
# means the whole column is 0 and carries no information.
t = train_df.max() == 0
rebundant_ftr_name = t[t].index.tolist()

# Everything that was not flagged is kept (Index.difference returns the names sorted).
selected_ftr_name = train_df.columns.difference(rebundant_ftr_name)
selected_ftr_name

Index(['RUL', 'cycle', 'cycle_norm', 'id', 's11', 's12', 's13', 's14', 's15',
       's17', 's2', 's20', 's21', 's3', 's4', 's6', 's7', 's8', 's9',
       'setting1', 'setting2'],
      dtype='object')

In [0]:
# Keep only the selected columns of the training frame; the resulting column
# order follows selected_ftr_name.
train_df =train_df.loc[:,selected_ftr_name]

## 测试数据

### MinMax归一化处理

In [0]:
# Same preparation as for the training frame: scale a copy of 'cycle'.
test_df['cycle_norm'] = test_df['cycle']

# Only transform() is called here, so the scaler keeps the parameters it
# learned from the training data.
scaled_test_values = min_max_scaler.transform(test_df[cols_normalize])
norm_test_df = pd.DataFrame(
    scaled_test_values,
    columns=cols_normalize,
    index=test_df.index,
)

# Re-attach the unscaled columns, restore the original column order and
# renumber the rows from 0.
kept_test_cols = test_df.columns.difference(cols_normalize)
test_join_df = test_df[kept_test_cols].join(norm_test_df)
test_df = test_join_df.reindex(columns=test_df.columns).reset_index(drop=True)

### 计算测试数据的RUL

In [0]:
# Last observed cycle of every test engine, one row per engine.
rul = (
    test_df.groupby('id')['cycle']
    .max()
    .reset_index()
    .rename(columns={'cycle': 'max'})
)

# rul_df is modified in place: it ends up with the columns ['id', 'max'].
rul_df.columns = ['more']
# Engine number of each row, derived from the row position.
# NOTE(review): assumes row i of the RUL file belongs to engine id i + 1 and
# that ids run 1..N without gaps - confirm.
rul_df['id'] = rul_df.index + 1
# Total life of a test engine = cycles it has run + its remaining life.
# The two series are added by index; pop() removes the 'more' column in place.
rul_df['max'] = rul['max'] + rul_df.pop('more')

# generate RUL for test data
test_df = (
    test_df.merge(rul_df, on=['id'], how='left')
    .assign(RUL=lambda df: df['max'] - df['cycle'])
    .drop(columns='max')
)


In [46]:
# Preview the first rows of the test frame, which now also carries the
# 'cycle_norm' and 'RUL' columns.
test_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21,cycle_norm,RUL
0,1,1,0.632184,0.75,0.0,0.0,0.545181,0.310661,0.269413,0.0,1.0,0.652174,0.212121,0.127614,0.0,0.208333,0.646055,0.220588,0.13216,0.308965,0.0,0.333333,0.0,0.0,0.55814,0.661834,0.0,142
1,1,2,0.344828,0.25,0.0,0.0,0.150602,0.379551,0.222316,0.0,1.0,0.805153,0.166667,0.146684,0.0,0.386905,0.739872,0.264706,0.204768,0.213159,0.0,0.416667,0.0,0.0,0.682171,0.686827,0.00277,141
2,1,3,0.517241,0.583333,0.0,0.0,0.376506,0.346632,0.322248,0.0,1.0,0.68599,0.227273,0.158081,0.0,0.386905,0.69936,0.220588,0.15564,0.458638,0.0,0.416667,0.0,0.0,0.728682,0.721348,0.00554,140
3,1,4,0.741379,0.5,0.0,0.0,0.370482,0.285154,0.408001,0.0,1.0,0.679549,0.19697,0.105717,0.0,0.255952,0.573561,0.25,0.17009,0.257022,0.0,0.25,0.0,0.0,0.666667,0.66211,0.00831,139
4,1,5,0.58046,0.5,0.0,0.0,0.391566,0.352082,0.332039,0.0,1.0,0.694042,0.166667,0.102396,0.0,0.27381,0.73774,0.220588,0.152751,0.300885,0.0,0.166667,0.0,0.0,0.658915,0.716377,0.01108,138


In [0]:
# Keep the same columns as in the training frame; the resulting column order
# follows selected_ftr_name.
test_df =test_df.loc[:,selected_ftr_name]

# 生成样本特征

In [0]:
sequence_length = 50 # time-window size (in cycles)

In [0]:
# A large window size of 50 cycles is picked for this notebook.

# Helper used to reshape features into (samples, time steps, features):
# it produces the individual feature windows.
def gen_sequence(id_df, seq_length, seq_cols):
    """Yield sliding windows of ``seq_length`` consecutive rows of ``id_df``.

    No padding is used: a frame with ``seq_length`` rows or fewer yields
    nothing, so engines whose history is shorter than the window are dropped
    (padding short sequences would be the alternative).

    Parameters
    ----------
    id_df : pandas.DataFrame
        Rows to cut the windows from.
        NOTE(review): presumably the rows of a single engine id in cycle
        order - confirm against the caller.
    seq_length : int
        Number of rows per window.
    seq_cols : list
        Labels of the columns that make up the features.

    Yields
    ------
    numpy.ndarray
        Array of shape ``(seq_length, len(seq_cols))`` holding rows
        ``start`` .. ``start + seq_length - 1``.

    Notes
    -----
    Exactly ``num_rows - seq_length`` windows are produced, i.e. the window
    that would end on the very last row is not emitted. Example: 192 rows and
    ``seq_length`` 50 give 142 windows, covering rows 0-49, 1-50, ...,
    141-190.
    NOTE(review): the row right after each window is presumably used as its
    label elsewhere - confirm before changing the window count.
    """
    # All rows of the requested feature columns as one 2-D array.
    feature_rows = id_df[seq_cols].values
    window_count = feature_rows.shape[0] - seq_length
    # range() is empty for a non-positive count, so short inputs yield nothing.
    for first in range(window_count):
        yield feature_rows[first:first + seq_length, :]
        