# Preprocessing

## Loading Imports and Data

#### Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

#### Loading Data

In [32]:
# Project root is the parent of the directory the notebook is executed from.
project_root = Path().resolve().parent


def _load_raw(relative_path):
    """Read one raw data file (single-space separated, no header row) located under the project root."""
    return pd.read_csv(project_root/relative_path, sep=' ', header=None)


# FD001
train_FD001_data = _load_raw('data/raw/CMAPSSData/train_FD001.txt')
test_FD001_data = _load_raw('data/raw/CMAPSSData/test_FD001.txt')

# FD002
train_FD002_data = _load_raw('data/raw/CMAPSSData/train_FD002.txt')
test_FD002_data = _load_raw('data/raw/CMAPSSData/test_FD002.txt')

# FD003
train_FD003_data = _load_raw('data/raw/CMAPSSData/train_FD003.txt')
test_FD003_data = _load_raw('data/raw/CMAPSSData/test_FD003.txt')

# FD004
train_FD004_data = _load_raw('data/raw/CMAPSSData/train_FD004.txt')
test_FD004_data = _load_raw('data/raw/CMAPSSData/test_FD004.txt')

## Data Cleaning

Dropping Empty Columns

In [33]:
def _without_empty_columns(frame):
    """Return `frame` minus every column whose values are all missing."""
    return frame.dropna(axis=1, how='all')


train_FD001_data = _without_empty_columns(train_FD001_data)
test_FD001_data = _without_empty_columns(test_FD001_data)
train_FD002_data = _without_empty_columns(train_FD002_data)
test_FD002_data = _without_empty_columns(test_FD002_data)
train_FD003_data = _without_empty_columns(train_FD003_data)
test_FD003_data = _without_empty_columns(test_FD003_data)
train_FD004_data = _without_empty_columns(train_FD004_data)
test_FD004_data = _without_empty_columns(test_FD004_data)

# Report how many missing values remain in each column of every frame
# (train/test pairs, FD001 through FD004, in that order).
for cleaned_frame in (
    train_FD001_data, test_FD001_data,
    train_FD002_data, test_FD002_data,
    train_FD003_data, test_FD003_data,
    train_FD004_data, test_FD004_data,
):
    print(cleaned_frame.isnull().sum())

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
dtype: int64
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
dtype: int64
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
dtype: int64
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
dtype: int64
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14  

## Data Processing

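Let's add an extra column to each training dataset holding the RUL (remaining useful life) value of each row: the unit's final recorded cycle minus the row's current cycle.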

In [34]:
datasets = [train_FD001_data,train_FD002_data,train_FD003_data,train_FD004_data]
# NOTE(review): sensor_start / sensor_end are not read by any visible cell;
# kept in case code outside this view relies on them - confirm before removing.
sensor_start = 5
sensor_end = 25

# Add a 'RUL' column to every training frame, in place.
# Column 0 holds the unit id and column 1 the cycle counter; the RUL of a row is
# the largest cycle recorded for its unit minus the row's own cycle.
for dataset in datasets:
    # groupby/transform broadcasts each unit's maximum cycle back onto its rows
    # in a single vectorized pass, instead of re-filtering the frame once per unit.
    last_cycle = dataset.groupby(0)[1].transform('max')
    dataset['RUL'] = last_cycle - dataset[1]

print(train_FD001_data.head())

   0  1       2       3      4       5       6        7        8      9  ...  \
0  1  1 -0.0007 -0.0004  100.0  518.67  641.82  1589.70  1400.60  14.62  ...   
1  1  2  0.0019 -0.0003  100.0  518.67  642.15  1591.82  1403.14  14.62  ...   
2  1  3 -0.0043  0.0003  100.0  518.67  642.35  1587.99  1404.20  14.62  ...   
3  1  4  0.0007  0.0000  100.0  518.67  642.35  1582.79  1401.87  14.62  ...   
4  1  5 -0.0019 -0.0002  100.0  518.67  642.37  1582.85  1406.22  14.62  ...   

        17       18      19    20   21    22     23     24       25  RUL  
0  2388.02  8138.62  8.4195  0.03  392  2388  100.0  39.06  23.4190  191  
1  2388.07  8131.49  8.4318  0.03  392  2388  100.0  39.00  23.4236  190  
2  2388.03  8133.23  8.4178  0.03  390  2388  100.0  38.95  23.3442  189  
3  2388.08  8133.83  8.3682  0.03  392  2388  100.0  38.88  23.3739  188  
4  2388.04  8133.80  8.4294  0.03  393  2388  100.0  38.90  23.4044  187  

[5 rows x 27 columns]


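Now, for train_FD001 and train_FD003, let's drop every operational-setting or sensor column whose absolute correlation with the RUL is below 0.1 or undefined (NaN), and drop the same columns from the matching test sets. Let's keep all columns for train_FD002 and train_FD004.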

In [35]:
test_dataset = [test_FD001_data, test_FD003_data]

# Walk the (train, test) pairs together so both frames lose the same columns.
for train_frame, test_frame in zip([train_FD001_data, train_FD003_data], test_dataset):
    # Candidate columns: everything between the first two columns and the trailing RUL column.
    candidates = train_frame.iloc[:, 2:-1]
    abs_corr = candidates.corrwith(train_frame['RUL']).abs()

    # A column is dropped when its |correlation| with RUL is below 0.1 or is NaN.
    is_weak = (abs_corr < 0.1) | abs_corr.isna()
    weak_columns = abs_corr[is_weak].index.tolist()

    train_frame.drop(columns=weak_columns, inplace=True)
    test_frame.drop(columns=weak_columns, inplace=True)

Now that we have that out of the way, let's standardize the remaining operational-setting and sensor columns, fitting each scaler on the training set and reusing it for the matching test set.

We will scale the RUL data in the modeling notebook for convenience as we will need the scaler instance when inverse scaling the predicted RUL values.

In [36]:
# Create separate scalers for each dataset
FD001_scaler = StandardScaler()
FD002_scaler = StandardScaler()
FD003_scaler = StandardScaler()
FD004_scaler = StandardScaler()

# Map datasets to their respective (train frame, test frame, scaler) triple
dataset_scaler_map = {
    "FD001": (train_FD001_data, test_FD001_data, FD001_scaler),
    "FD002": (train_FD002_data, test_FD002_data, FD002_scaler),
    "FD003": (train_FD003_data, test_FD003_data, FD003_scaler),
    "FD004": (train_FD004_data, test_FD004_data, FD004_scaler),
}

# Scale each dataset separately: fit on the training frame, reuse the fit on the test frame
for dataset_name, (train_data, test_data, scaler) in dataset_scaler_map.items():
    # Feature columns = everything except the first two columns (engine ID, time step)
    # and the last column (RUL), which is left unscaled.
    feature_columns = list(train_data.columns[2:-1])

    # The test frame has no RUL column; its features must be the same columns in the same order,
    # otherwise the fitted scaler would be applied to the wrong columns.
    assert list(test_data.columns[2:]) == feature_columns, (
        f"{dataset_name}: train and test feature columns differ"
    )

    # Replace whole columns by label rather than writing through .iloc: some feature
    # columns are integer-typed, and writing floats into them in place relies on a
    # silent upcast that newer pandas versions deprecate.
    train_data[feature_columns] = scaler.fit_transform(
        train_data[feature_columns].to_numpy(dtype=float)
    )
    test_data[feature_columns] = scaler.transform(
        test_data[feature_columns].to_numpy(dtype=float)
    )

    # Verify scaling
    print(f"{dataset_name} - Scaled Training Data:")
    print(train_data.head())
    print(f"{dataset_name} - Scaled Test Data:")
    print(test_data.head())

FD001 - Scaled Training Data:
   0  1         6         7         8        10        11        12        13  \
0  1  1 -1.721725 -0.134255 -0.925936  0.141683  1.121141 -0.516338 -0.862813   
1  1  2 -1.061780  0.211528 -0.643726  0.141683  0.431930 -0.798093 -0.958818   
2  1  3 -0.661813 -0.413166 -0.525953  0.141683  1.008155 -0.234584 -0.557139   
3  1  4 -0.661813 -1.261314 -0.784831  0.141683  1.222827  0.188048 -0.713826   
4  1  5 -0.621816 -1.251528 -0.301518  0.141683  0.714393 -0.516338 -0.457059   

         15        16        17        18        19        21        24  \
0 -0.266467  0.334262 -1.058890 -0.269071 -0.603816 -0.781710  1.348493   
1 -0.191583  1.174899 -0.363646 -0.642845 -0.275852 -0.781710  1.016528   
2 -1.015303  1.364721 -0.919841 -0.551629 -0.649144 -2.073094  0.739891   
3 -1.539489  1.961302 -0.224597 -0.520176 -1.971665 -0.781710  0.352598   
4 -0.977861  1.052871 -0.780793 -0.521748 -0.339845 -0.136018  0.463253   

         25  RUL  
0  1.194427  

## Output Cleaned Data to CSV

In [None]:
# Make sure the output folder exists before writing; to_csv does not create
# missing parent directories and would otherwise fail on a fresh checkout.
processed_dir = project_root/'data/processed'
processed_dir.mkdir(parents=True, exist_ok=True)

# Output file name -> processed frame to write there
processed_frames = {
    'train_FD001.csv': train_FD001_data,
    'test_FD001.csv': test_FD001_data,
    'train_FD002.csv': train_FD002_data,
    'test_FD002.csv': test_FD002_data,
    'train_FD003.csv': train_FD003_data,
    'test_FD003.csv': test_FD003_data,
    'train_FD004.csv': train_FD004_data,
    'test_FD004.csv': test_FD004_data,
}

# Write every frame without the row index, keeping the column labels as the header row
for file_name, frame in processed_frames.items():
    frame.to_csv(processed_dir/file_name, index=False)