In [1]:
import os
import sys
import json
from pprint import pprint
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Run from the project root so the relative paths used in later cells
# ('references/...', 'data/raw/...') resolve.
# NOTE(review): this absolute path is specific to one machine.
project_root = '/Users/syahrulhamdani/Desktop/thesis/predictive-maintenance-of-aircraft-engine/'
os.chdir(project_root)
print('Working directory: ', os.getcwd())

Working directory:  /Users/syahrulhamdani/Desktop/thesis/predictive-maintenance-of-aircraft-engine


In [2]:
!ls

README.md  data       notebooks  references reports    src


In [3]:
# The JSON file holds a mapping whose values are used as the column names
# of the raw data; only those values are kept, as a list.
with open('references/col_to_feat.json') as f:
    feature_names = list(json.load(f).values())

# The raw file has no header row (header=None) and is whitespace-delimited.
# The separator is a raw string because '\s' in a plain literal is an
# invalid escape sequence.
dataset = pd.read_csv('data/raw/train_FD001.txt', header=None, names=feature_names, sep=r'\s+')
pprint(dataset.head(5), compact=True, width=50)
pprint(dataset.shape)

   EngineID  Cycle  OpSetting1  OpSetting2  OpSetting3      T2     T24  \
0         1      1     -0.0007     -0.0004       100.0  518.67  641.82   
1         1      2      0.0019     -0.0003       100.0  518.67  642.15   
2         1      3     -0.0043      0.0003       100.0  518.67  642.35   
3         1      4      0.0007      0.0000       100.0  518.67  642.35   
4         1      5     -0.0019     -0.0002       100.0  518.67  642.37   

       T30      T50     P2   ...        phi      NRf      NRc     BPR  farB  \
0  1589.70  1400.60  14.62   ...     521.66  2388.02  8138.62  8.4195  0.03   
1  1591.82  1403.14  14.62   ...     522.28  2388.07  8131.49  8.4318  0.03   
2  1587.99  1404.20  14.62   ...     522.42  2388.03  8133.23  8.4178  0.03   
3  1582.79  1401.87  14.62   ...     522.86  2388.08  8133.83  8.3682  0.03   
4  1582.85  1406.22  14.62   ...     522.19  2388.04  8133.80  8.4294  0.03   

   htBleed  Nf_dmd  PCNfR_dmd    W31      W32  
0      392    2388      100.0  3

In [4]:
# Show every column name on a single comma-separated line.
str.join(',', feature_names)

'EngineID,Cycle,OpSetting1,OpSetting2,OpSetting3,T2,T24,T30,T50,P2,P15,P30,Nf,Nc,epr,Ps30,phi,NRf,NRc,BPR,farB,htBleed,Nf_dmd,PCNfR_dmd,W31,W32'

Create a data class that loads a particular dataset, given its filename (`filename.format`). The steps are:

1. Create a dict that maps each key to an existing dataset file in the directory.
2. Initialization must be passed the `filename` and the `directory`.

In [5]:
!ls data/raw

RUL_FD001.txt   RUL_FD004.txt   test_FD003.txt  train_FD002.txt
RUL_FD002.txt   test_FD001.txt  test_FD004.txt  train_FD003.txt
RUL_FD003.txt   test_FD002.txt  train_FD001.txt train_FD004.txt


Create a dict mapping each key to its dataset filename:

```python
key_to_file = dict()
for directory, _, files in os.walk('data/raw/'):
    for file in files:
        if file.endswith('.txt'):
            key_to_file[file.split('.')[0]] = file
print(key_to_file)
```

In [6]:
def list_dataset(path='data/raw'):
    """Map dataset keys to the ``.txt`` files found under `path`.

    The directory tree rooted at `path` is walked recursively and every
    file whose name ends with ``.txt`` is recorded.

    parameters
    ----------
    path (str): directory to search for dataset files

    returns
    -------
    dict: maps the text before the first ``.`` of each filename
        (e.g. ``'train_FD001'``) to the filename itself
        (e.g. ``'train_FD001.txt'``). Empty when `path` does not exist
        or contains no ``.txt`` files.
    """
    key_to_file = dict()
    # Walk the requested `path`; the argument used to be ignored in favour
    # of a hard-coded 'data/raw/'.
    for _, _, files in os.walk(path):
        for file in files:
            if file.endswith('.txt'):
                key_to_file[file.split('.')[0]] = file

    return key_to_file

class LoadData:
    """Load one delimited dataset file and derive its RUL target.

    class attributes
    ----------------
    key_to_file (dict): filename stem -> filename for every ``.txt`` file
        found under ``data/raw/`` (relative to the working directory) at
        the moment the class is defined.
    """
    # A comprehension keeps the loop variables out of the class namespace;
    # a plain `for` loop here would leave them behind as class attributes.
    key_to_file = {
        file.split('.')[0]: file
        for _, _, files in os.walk('data/raw/')
        for file in files
        if file.endswith('.txt')
    }

    def __init__(self, dict_to_file, folder='data/raw', names=None, sep=r'\s+'):
        """Load the dataset file `dict_to_file` from `folder`.

        parameters
        ----------
        dict_to_file (str): filename of the dataset inside `folder`
            (for example a value of `key_to_file`)
        folder (str): directory where the data exist
        names (sequence of str, optional): column names handed to
            `pandas.read_csv`. When omitted, pandas takes the column
            names from the first row of the file.
        sep (str): column delimiter handed to `pandas.read_csv`

        attributes
        ----------
        features (numpy.ndarray): all loaded columns, one row per record
        target (numpy.ndarray): remaining useful life of each row, in the
            same row order as `features`
        """
        # load the data
        file = os.path.join(folder, dict_to_file)
        dataset = pd.read_csv(file, sep=sep, names=names)
        self.features = dataset.values
        self.target = self.__get_rul(dataset, names)

    def __get_rul(self, data, names):
        """Return the remaining useful life (RUL) of every row of `data`.

        The engine identifier is read from column ``names[0]`` and the
        cycle counter from ``names[1]``; when `names` is None the first
        and second columns are used by position. The RUL of a row is the
        highest cycle recorded for its engine minus the row's own cycle.

        returns
        -------
        numpy.ndarray: float array with one entry per row of `data`,
            aligned with the row order of `data`.
        """
        if names is not None:
            engine_ids = data[names[0]].values
            cycles = data[names[1]].values
        else:
            # No names supplied: fall back to positional columns instead
            # of failing on `names[0]`.
            engine_ids = data.iloc[:, 0].values
            cycles = data.iloc[:, 1].values

        # Broadcast each engine's last cycle back onto its rows. Grouping
        # (rather than looping over 1..N) keeps the result row-aligned for
        # any engine IDs and any row order, and avoids repeated np.append.
        last_cycle = pd.Series(cycles).groupby(engine_ids).transform('max').values

        return (last_cycle - cycles).astype(float)

In [7]:
# Index the raw dataset files by key, then display the mapping.
feature_mapping = list_dataset(path='data/raw')
feature_mapping

{'train_FD001': 'train_FD001.txt',
 'train_FD003': 'train_FD003.txt',
 'RUL_FD004': 'RUL_FD004.txt',
 'train_FD002': 'train_FD002.txt',
 'RUL_FD001': 'RUL_FD001.txt',
 'RUL_FD003': 'RUL_FD003.txt',
 'RUL_FD002': 'RUL_FD002.txt',
 'train_FD004': 'train_FD004.txt',
 'test_FD003': 'test_FD003.txt',
 'test_FD002': 'test_FD002.txt',
 'test_FD001': 'test_FD001.txt',
 'test_FD004': 'test_FD004.txt'}

In [8]:
# Load the FD001 training set and preview the first five rows of the
# feature matrix and of the RUL target.
data = LoadData(dict_to_file=feature_mapping['train_FD001'], names=feature_names)
print(type(data.features), type(data.target))
print(data.features[:5], data.target[:5], sep='\n')

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
[[ 1.00000e+00  1.00000e+00 -7.00000e-04 -4.00000e-04  1.00000e+02
   5.18670e+02  6.41820e+02  1.58970e+03  1.40060e+03  1.46200e+01
   2.16100e+01  5.54360e+02  2.38806e+03  9.04619e+03  1.30000e+00
   4.74700e+01  5.21660e+02  2.38802e+03  8.13862e+03  8.41950e+00
   3.00000e-02  3.92000e+02  2.38800e+03  1.00000e+02  3.90600e+01
   2.34190e+01]
 [ 1.00000e+00  2.00000e+00  1.90000e-03 -3.00000e-04  1.00000e+02
   5.18670e+02  6.42150e+02  1.59182e+03  1.40314e+03  1.46200e+01
   2.16100e+01  5.53750e+02  2.38804e+03  9.04407e+03  1.30000e+00
   4.74900e+01  5.22280e+02  2.38807e+03  8.13149e+03  8.43180e+00
   3.00000e-02  3.92000e+02  2.38800e+03  1.00000e+02  3.90000e+01
   2.34236e+01]
 [ 1.00000e+00  3.00000e+00 -4.30000e-03  3.00000e-04  1.00000e+02
   5.18670e+02  6.42350e+02  1.58799e+03  1.40420e+03  1.46200e+01
   2.16100e+01  5.54260e+02  2.38808e+03  9.05294e+03  1.30000e+00
   4.72700e+01  5.22420e+02  2.38803e+03  8.13323

In [9]:
# Turn the 1-D target into a single column and attach it to the right of
# the feature matrix.
target_column = data.target[:, np.newaxis]
processed_data = np.concatenate((data.features, target_column), axis=1)
print(processed_data.shape)

(20631, 27)


Save processed data into `data/processed`

In [12]:
# Header row: the original column names followed by the appended RUL column.
feature_title = np.append(feature_names, 'RUL')
# '%.10g' rather than '%.3f': the raw values carry four decimals
# (e.g. -0.0007, 8.4195), which three fixed decimals would round away.
# comments='' stops numpy from prefixing the header line with '# '.
np.savetxt('data/processed/processed.csv', processed_data, delimiter=',', header=','.join(feature_title),
           comments='', fmt='%.10g')