# Notebook 1: Organizing the dataset

In [1]:
from pathlib import Path
import pandas as pd

In [2]:
def load_data(data_path):
    """Read one space-separated, header-less recordings file into a DataFrame.

    Parameters
    ----------
    data_path : str or path-like
        Location of the text file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``engine_no``, ``time_in_cycles``, ``op_setting_1..3`` and
        ``sensor_meas_1..21``, plus two generated columns:
        ``index`` (a copy of the row index, which is also named ``index``) and
        ``time`` (synthetic timestamps starting 2000-01-01, one every 600 s).
    """
    operational_settings = ['op_setting_{}'.format(i + 1) for i in range(3)]
    sensor_columns = ['sensor_meas_{}'.format(i + 1) for i in range(26)]
    cols = ['engine_no', 'time_in_cycles'] + operational_settings + sensor_columns

    # The file has no header row. ``header=None`` is the supported way to say so;
    # negative values such as ``header=-1`` are rejected by current pandas.
    data = pd.read_csv(data_path, sep=' ', header=None, names=cols)
    # More names are supplied than are kept: the last five (sensor_meas_22..26)
    # are discarded here, leaving 21 sensor columns.
    # NOTE(review): presumably these absorb empty fields produced by trailing
    # delimiters in the raw files -- confirm against the data.
    data = data.drop(cols[-5:], axis=1)
    # Keep the row number both as the index and as a regular 'index' column.
    data['index'] = data.index
    data.index = data['index']
    # Synthetic timestamp per row (10-minute spacing); every file starts at the same date.
    data['time'] = pd.date_range('1/1/2000', periods=data.shape[0], freq='600s')
    return data

In [3]:
def load_target(data_path):
    """Read a header-less file of RUL values into a one-column DataFrame.

    Parameters
    ----------
    data_path : str or path-like
        Location of the text file; one value per line.

    Returns
    -------
    pandas.DataFrame
        A single column named ``RUL`` with a default 0..n-1 row index.
    """
    cols = ['RUL']
    # ``header=None`` marks the file as header-less (negative header values such
    # as -1 are rejected by current pandas). ``index_col=False`` stops pandas
    # from using the first column as the index.
    data = pd.read_csv(data_path, sep=' ', header=None, names=cols, index_col=False)
    return data

In [4]:
def make_list(targetx):
    """Return the first-column value of every row of ``targetx`` as a list."""
    return [row[0] for row in targetx.values]

In [5]:
# Directory holding the raw dataset text files, relative to the notebook.
PATH = Path('..') / 'data' / 'CMAPSSData'

In [6]:
# Load the training recordings and the RUL targets for the four subsets.
# NOTE(review): the RUL_FD00x files are paired here with the *training* files;
# the engine counts printed further down do not match for FD002/FD004, so
# confirm these targets really belong to the training engines.
train01, train02, train03, train04 = [
    load_data(PATH / name)
    for name in ('train_FD001.txt', 'train_FD002.txt', 'train_FD003.txt', 'train_FD004.txt')
]
target01, target02, target03, target04 = [
    load_target(PATH / name)
    for name in ('RUL_FD001.txt', 'RUL_FD002.txt', 'RUL_FD003.txt', 'RUL_FD004.txt')
]

In [7]:
# Preview the first five RUL values loaded for FD002.
target02.head(5)

Unnamed: 0,RUL
0,18
1,79
2,106
3,110
4,15


In [8]:
# Make engine_no zero-based so it can be used as a position in the target lists.
# Note: re-running this cell shifts the numbers again.
for frame in (train01, train02, train03, train04):
    frame['engine_no'] = frame['engine_no'] - 1

In [9]:
# Turn each target DataFrame into a plain list of RUL values.
target_01, target_02, target_03, target_04 = [
    make_list(target) for target in (target01, target02, target03, target04)
]

In [10]:
# Check which datasets need their last engine dropped:
# compare the number of distinct engines with the number of targets.
for label, frame, rul in (
    ("01: ", train01, target_01),
    ("02: ", train02, target_02),
    ("03: ", train03, target_03),
    ("04: ", train04, target_04),
):
    print(label, frame['engine_no'].unique().shape, " ", len(rul))

01:  (100,)   100
02:  (260,)   259
03:  (100,)   100
04:  (249,)   248


In [11]:
# Drop every row of the last engine in dataset 02: target_02 has one entry fewer
# than there are engines, so the highest (zero-based) engine_no has no target.
# A single boolean mask replaces the row-by-row drop loop, which copied the
# whole frame once per removed row.
train02 = train02.loc[train02['engine_no'] < len(target_02)].copy()

In [12]:
# Drop every row of the last engine in dataset 04: target_04 has one entry fewer
# than there are engines, so the highest (zero-based) engine_no has no target.
# A single boolean mask replaces the row-by-row drop loop, which copied the
# whole frame once per removed row.
train04 = train04.loc[train04['engine_no'] < len(target_04)].copy()

In [13]:
# Re-check the sizes: distinct engines per dataset vs. number of targets.
labels = ("01: ", "02: ", "03: ", "04: ")
frames = (train01, train02, train03, train04)
targets = (target_01, target_02, target_03, target_04)
for label, frame, rul in zip(labels, frames, targets):
    print(label, frame['engine_no'].unique().shape, " ", len(rul))

01:  (100,)   100
02:  (259,)   259
03:  (100,)   100
04:  (248,)   248


In [14]:
# Annotate every row with the target of its engine (engine_no is the list position).
for frame, rul in (
    (train01, target_01),
    (train02, target_02),
    (train03, target_03),
    (train04, target_04),
):
    frame['target'] = frame['engine_no'].apply(lambda engine, rul=rul: rul[engine])

# Create eng_fam (engine number + engine family key)

In [15]:
# Add a column holding the engine family (1-4, one per source dataset).
for family, frame in enumerate((train01, train02, train03, train04), start=1):
    frame['engine_family'] = family

In [16]:
def make_eng_fam(df):
    """Add an ``eng_fam`` column of the form ``'<engine_no>_<engine_family>'``.

    Both parts are converted to integers before formatting, so ``2.0`` and ``2``
    both render as ``'2'``. The column is built in one vectorised step, so it
    works for any row index (not only labels 0..n-1).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``engine_no`` and ``engine_family`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with the new column.
    """
    engine = df['engine_no'].astype(int).astype(str)
    family = df['engine_family'].astype(int).astype(str)
    df['eng_fam'] = engine + '_' + family
    return df

In [17]:
# Add the eng_fam key column to the first dataset.
train01 = make_eng_fam(train01)

In [18]:
# Add the eng_fam key column to the remaining datasets.
train02, train03, train04 = [
    make_eng_fam(frame) for frame in (train02, train03, train04)
]

In [19]:
# Preview the first five rows of the first dataset with the added columns.
train01.head(5)

Unnamed: 0_level_0,engine_no,time_in_cycles,op_setting_1,op_setting_2,op_setting_3,sensor_meas_1,sensor_meas_2,sensor_meas_3,sensor_meas_4,sensor_meas_5,...,sensor_meas_17,sensor_meas_18,sensor_meas_19,sensor_meas_20,sensor_meas_21,index,time,target,engine_family,eng_fam
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,392,2388,100.0,39.06,23.419,0,2000-01-01 00:00:00,112,1,0_1
1,0,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,392,2388,100.0,39.0,23.4236,1,2000-01-01 00:10:00,112,1,0_1
2,0,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,390,2388,100.0,38.95,23.3442,2,2000-01-01 00:20:00,112,1,0_1
3,0,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,392,2388,100.0,38.88,23.3739,3,2000-01-01 00:30:00,112,1,0_1
4,0,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,393,2388,100.0,38.9,23.4044,4,2000-01-01 00:40:00,112,1,0_1


# sort by time

In [20]:
# Order the rows of each dataset by their timestamp.
train01, train02, train03, train04 = [
    frame.sort_values(by='time') for frame in (train01, train02, train03, train04)
]

# Concat all 4 datasets

In [21]:
# Stack the four datasets row-wise. ignore_index is not used, so each dataset
# keeps its own row labels and label values repeat in the combined frame.
data = pd.concat([train01, train02, train03, train04], axis = 0, sort=False)

In [22]:
# Count missing values per column of the combined frame.
data.isnull().sum()

engine_no         0
time_in_cycles    0
op_setting_1      0
op_setting_2      0
op_setting_3      0
sensor_meas_1     0
sensor_meas_2     0
sensor_meas_3     0
sensor_meas_4     0
sensor_meas_5     0
sensor_meas_6     0
sensor_meas_7     0
sensor_meas_8     0
sensor_meas_9     0
sensor_meas_10    0
sensor_meas_11    0
sensor_meas_12    0
sensor_meas_13    0
sensor_meas_14    0
sensor_meas_15    0
sensor_meas_16    0
sensor_meas_17    0
sensor_meas_18    0
sensor_meas_19    0
sensor_meas_20    0
sensor_meas_21    0
index             0
time              0
target            0
engine_family     0
eng_fam           0
dtype: int64

In [23]:
# Shapes of the four datasets, followed by the combined frame.
for frame in (train01, train02, train03, train04, data):
    print(frame.shape)

(20631, 31)
(53443, 31)
(24720, 31)
(60994, 31)
(159788, 31)


In [24]:
# Convert 'time' to datetime; the column is removed and re-appended,
# so it ends up as the last column.
time = pd.to_datetime(data.pop('time'))
data = pd.concat([data, time], axis=1)

In [25]:
# Cast the listed columns to float64; each one is removed and re-appended,
# so they end up at the end of the frame in this order.
for col in ('engine_no', 'time_in_cycles', 'sensor_meas_17', 'sensor_meas_18', 'engine_family'):
    ints = data.pop(col).astype('float64')
    data = pd.concat([data, ints], axis=1)

In [28]:
# Persist the combined DataFrame to 'data1.h5' under the key 'data'.
# mode='w' writes a fresh file, replacing an existing file of the same name.
data.to_hdf('data1.h5', key='data', mode='w')

end =================================================================