# Loading the data

Data source: <a href="https://c3.nasa.gov/dashlink/resources/139/">NASA DASHlink, resource 139</a>.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
# Fix the seed of NumPy's global random generator for this session.
SEED = 8
np.random.seed(SEED)

In [4]:
# Load the training data: a space-separated text file without a header row.
# Column names are supplied here (see the dataset documentation for what each feature means).
# 's26' and 's27' are two extra trailing names; they come out as empty (NaN) columns in the preview.
id_cols = ['unit', 'cycle']
os_cols = ['os1', 'os2', 'os3']
sensor_cols = list(map('s{}'.format, range(1, 22)))  # 's1' ... 's21'
cols = id_cols + os_cols + sensor_cols + ['s26', 's27']

data = pd.read_csv('data/data.txt', sep=" ", header=None, names=cols)
data.head(5)

Unnamed: 0,unit,cycle,os1,os2,os3,s1,s2,s3,s4,s5,...,s14,s15,s16,s17,s18,s19,s20,s21,s26,s27
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,,
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,,
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,,
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,,
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,,


In [5]:
# Remove the two placeholder columns 's26' and 's27' (they hold no values in the preview above).
# Reassigning instead of dropping in place, together with errors='ignore', keeps this cell
# safe to re-run: a second execution is a no-op instead of raising KeyError.
data = data.drop(columns=['s26', 's27'], errors='ignore')

# Label construction: Remaining Useful Life

In [6]:
# Data labeling, step 1: for every `unit`, find the largest `cycle` value it reaches.
last_cycle_per_unit = data.groupby('unit')['cycle'].max()
rul = last_cycle_per_unit.reset_index()  # back to a two-column frame: unit, cycle
rul.head(5)

Unnamed: 0,unit,cycle
0,1,192
1,2,287
2,3,179
3,4,189
4,5,269


In [7]:
# The per-unit maximum is still called 'cycle'; rename it to 'max_cycle' so it does not
# clash with the per-row 'cycle' column once it is merged into `data`.
rul = rul.rename(columns={'cycle': 'max_cycle'})

In [8]:
# Attach each unit's max_cycle to every one of its rows (left join keeps all rows of `data`).
data = pd.merge(data, rul, how='left', on='unit')
data.head(5)

Unnamed: 0,unit,cycle,os1,os2,os3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,max_cycle
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,192
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,192
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,192
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,192
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,192


In [9]:
# RUL for a row = the unit's last recorded cycle minus the row's current cycle.
cycles_left = data['max_cycle'].sub(data['cycle'])
data['RUL'] = cycles_left
data.head()

Unnamed: 0,unit,cycle,os1,os2,os3,s1,s2,s3,s4,s5,...,s14,s15,s16,s17,s18,s19,s20,s21,max_cycle,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,192,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,192,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,192,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,192,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,192,187


In [10]:
# 'max_cycle' was only needed to derive RUL; remove it so it is not used as a feature.
# Reassigning with errors='ignore' (instead of an in-place drop) makes the cell re-runnable:
# executing it a second time leaves `data` unchanged instead of raising KeyError.
data = data.drop(columns='max_cycle', errors='ignore')

# Data normalization: MinMax

In [11]:
# Keep the raw `cycle` column and add a duplicate, `cycle_norm`, that will be scaled.
data['cycle_norm'] = data['cycle']

# Every column except `unit`, the raw `cycle` and the `RUL` target is selected for scaling.
not_scaled = ['unit', 'cycle', 'RUL']
cols_normalize = data.columns.difference(not_scaled)

In [12]:
# Scaler that maps each column linearly onto [0, 1]; the range is spelled out here
# but is the same as the library default.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

In [13]:
# Fit the scaler on the selected columns and rebuild a frame with the same labels and index.
# NOTE(review): the scaler is fit on every row of `data`, i.e. before the train/test split
# further down, so the test rows influence the min/max used for scaling — confirm this is acceptable.
scaled_values = min_max_scaler.fit_transform(data[cols_normalize])
norm_train = pd.DataFrame(scaled_values, columns=cols_normalize, index=data.index)

# Re-assemble: untouched columns + scaled columns, then restore the original column order.
untouched_cols = data.columns.difference(cols_normalize)
join = data[untouched_cols].join(norm_train)
data = join.reindex(columns=data.columns)
data.head()

Unnamed: 0,unit,cycle,os1,os2,os3,s1,s2,s3,s4,s5,...,s14,s15,s16,s17,s18,s19,s20,s21,RUL,cycle_norm
0,1,1,0.45977,0.166667,0.0,0.0,0.183735,0.406802,0.309757,0.0,...,0.199608,0.363986,0.0,0.333333,0.0,0.0,0.713178,0.724662,191,0.0
1,1,2,0.609195,0.25,0.0,0.0,0.283133,0.453019,0.352633,0.0,...,0.162813,0.411312,0.0,0.333333,0.0,0.0,0.666667,0.731014,190,0.00277
2,1,3,0.252874,0.75,0.0,0.0,0.343373,0.369523,0.370527,0.0,...,0.171793,0.357445,0.0,0.166667,0.0,0.0,0.627907,0.621375,189,0.00554
3,1,4,0.54023,0.5,0.0,0.0,0.343373,0.256159,0.331195,0.0,...,0.174889,0.166603,0.0,0.333333,0.0,0.0,0.573643,0.662386,188,0.00831
4,1,5,0.390805,0.333333,0.0,0.0,0.349398,0.257467,0.404625,0.0,...,0.174734,0.402078,0.0,0.416667,0.0,0.0,0.589147,0.704502,187,0.01108


# Modeling

In [14]:
# Target is the RUL column; the feature matrix is every other column of `data`.
# NOTE(review): X therefore still contains `unit` as well as both `cycle` and `cycle_norm` —
# confirm that all three are meant to be model inputs.
target_col = "RUL"
y = data[target_col]
X = data.drop(target_col, axis=1)

In [15]:
# Sanity check: features and target must have the same number of rows.
(X.shape, y.shape)

((20631, 27), (20631,))

In [16]:
# Random forest regressor created with the library's default hyperparameters.
# NOTE(review): no random_state is passed here; reproducibility presumably relies on the
# np.random.seed(8) call in the import cell — confirm.
rg = RandomForestRegressor()

In [17]:
# Hold-out split by row position: the first 70% of rows are used for training, the rest for testing.
# Nothing is shuffled, so the split follows the existing row order of `X` / `y`.
# NOTE(review): the cut is made on a row count, not on `unit` boundaries, so one unit's rows
# may end up on both sides of the split — confirm this is intended.
n = int(len(X) * 0.7)

X_values = np.asarray(X)  # convert to an array once, then slice it
X_train = X_values[:n, :]
X_test = X_values[n:, :]

# .iloc makes the positional slicing of the target explicit.
y_train = y.iloc[:n]
y_test = y.iloc[n:]

# print() with a pre-formatted string: the old `print a, b` statement is a SyntaxError on
# Python 3, while this form emits the same "(rows, cols) (rows, cols)" text on Python 2 and 3.
print("{} {}".format(X_train.shape, X_test.shape))

(14441, 27) (6190, 27)


In [18]:
# Train on the first part of the data, then predict RUL for the held-out rows.
# fit() returns the estimator itself, so the two calls can be chained.
y_pred = rg.fit(X_train, y_train).predict(X_test)

In [19]:
# Mean absolute error expressed as a fraction of the average true RUL in the test set.
mae = mean_absolute_error(y_test, y_pred)
mean_true_rul = np.mean(y_test)
mae / mean_true_rul

0.28502673796791445