In [1]:
# 02_Feature_Engineering.ipynb
# Time-series feature engineering + save to CSV

import os
import numpy as np
import pandas as pd

# Change if your folder is different
DATA_DIR = "../data/raw"
OUT_DIR = "../data/processed"
os.makedirs(OUT_DIR, exist_ok=True)

train_path = os.path.join(DATA_DIR, "train_FD001.txt")
test_path  = os.path.join(DATA_DIR, "test_FD001.txt")
rul_path   = os.path.join(DATA_DIR, "RUL_FD001.txt")

# Column names for CMAPSS FD001 (has 21 sensors)
col_names = (
    ["unit", "time",
     "op_set_1", "op_set_2", "op_set_3"] +
    [f"sensor_{i}" for i in range(1, 22)]
)

# Load raw data
train_raw = pd.read_csv(train_path, sep=r"\s+", header=None,
                        names=col_names, engine="python")
test_raw = pd.read_csv(test_path, sep=r"\s+", header=None,
                       names=col_names, engine="python")
rul_raw = pd.read_csv(rul_path, sep=r"\s+", header=None, names=["RUL"])

print("Train raw shape:", train_raw.shape)
print("Test raw shape :", test_raw.shape)
print("RUL shape      :", rul_raw.shape)
train_raw.head()


Train raw shape: (20631, 26)
Test raw shape : (13096, 26)
RUL shape      : (100, 1)


Unnamed: 0,unit,time,op_set_1,op_set_2,op_set_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [2]:
# ----- Add RUL and binary label (failure within N cycles) -----

PRED_WINDOW = 30   # predict failure in next 30 cycles
print("Prediction window (cycles):", PRED_WINDOW)

def add_rul_and_label(df, window=PRED_WINDOW):
    """
    For each engine (unit) compute RUL at each time step and
    label = 1 if RUL <= window else 0.
    """
    df = df.copy()
    df = df.sort_values(["unit", "time"])

    rul_list = []
    label_list = []

    for u, g in df.groupby("unit"):
        g = g.sort_values("time")
        max_time = g["time"].max()
        # RUL is remaining cycles until failure
        rul = max_time - g["time"]
        label = (rul <= window).astype(int)

        rul_list.append(rul)
        label_list.append(label)

    df["RUL"] = pd.concat(rul_list, axis=0).sort_index()
    df["label"] = pd.concat(label_list, axis=0).sort_index()

    return df

train_labeled = add_rul_and_label(train_raw, window=PRED_WINDOW)

print("Labeled train shape:", train_labeled.shape)
train_labeled[["unit", "time", "RUL", "label"]].head()


Prediction window (cycles): 30
Labeled train shape: (20631, 28)


Unnamed: 0,unit,time,RUL,label
0,1,1,191,0
1,1,2,190,0
2,1,3,189,0
3,1,4,188,0
4,1,5,187,0


In [3]:
# ----- Time-series features: rolling mean/min/max -----

# Sensors to engineer features for
sensor_cols = [c for c in train_labeled.columns if c.startswith("sensor_")]

# Windows (in cycles)
windows_ma = [5, 10, 20]      # moving averages
windows_extremes = [10, 20]   # min / max windows

def add_rolling_features(df):
    """
    For each engine (unit), compute rolling stats using ONLY current
    and past time steps => no data leakage.
    """
    df = df.copy()
    df = df.sort_values(["unit", "time"])

    feat_dfs = []

    for u, g in df.groupby("unit"):
        g = g.sort_values("time").copy()

        # Moving averages
        for w in windows_ma:
            roll = g[sensor_cols].rolling(window=w, min_periods=1)
            g[[f"{c}_ma_{w}" for c in sensor_cols]] = roll.mean().values

        # Rolling min / max
        for w in windows_extremes:
            roll = g[sensor_cols].rolling(window=w, min_periods=1)
            g[[f"{c}_min_{w}" for c in sensor_cols]] = roll.min().values
            g[[f"{c}_max_{w}" for c in sensor_cols]] = roll.max().values

        feat_dfs.append(g)

    out = pd.concat(feat_dfs, axis=0).sort_values(["unit", "time"])
    out.reset_index(drop=True, inplace=True)
    return out

train_features = add_rolling_features(train_labeled)

print("Feature dataframe shape:", train_features.shape)
train_features.head()


  g[[f"{c}_max_{w}" for c in sensor_cols]] = roll.max().values
  g[[f"{c}_max_{w}" for c in sensor_cols]] = roll.max().values
  g[[f"{c}_max_{w}" for c in sensor_cols]] = roll.max().values
  g[[f"{c}_max_{w}" for c in sensor_cols]] = roll.max().values
  g[[f"{c}_max_{w}" for c in sensor_cols]] = roll.max().values
  g[[f"{c}_max_{w}" for c in sensor_cols]] = roll.max().values
  g[[f"{c}_max_{w}" for c in sensor_cols]] = roll.max().values
  g[[f"{c}_min_{w}" for c in sensor_cols]] = roll.min().values
  g[[f"{c}_min_{w}" for c in sensor_cols]] = roll.min().values
  g[[f"{c}_min_{w}" for c in sensor_cols]] = roll.min().values
  g[[f"{c}_min_{w}" for c in sensor_cols]] = roll.min().values
  g[[f"{c}_min_{w}" for c in sensor_cols]] = roll.min().values
  g[[f"{c}_min_{w}" for c in sensor_cols]] = roll.min().values
  g[[f"{c}_min_{w}" for c in sensor_cols]] = roll.min().values
  g[[f"{c}_min_{w}" for c in sensor_cols]] = roll.min().values
  g[[f"{c}_min_{w}" for c in sensor_cols]] = roll.min()

Feature dataframe shape: (20631, 175)


Unnamed: 0,unit,time,op_set_1,op_set_2,op_set_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12_max_20,sensor_13_max_20,sensor_14_max_20,sensor_15_max_20,sensor_16_max_20,sensor_17_max_20,sensor_18_max_20,sensor_19_max_20,sensor_20_max_20,sensor_21_max_20
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392.0,2388.0,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8138.62,8.4318,0.03,392.0,2388.0,100.0,39.06,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.07,8138.62,8.4318,0.03,392.0,2388.0,100.0,39.06,23.4236
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8138.62,8.4318,0.03,392.0,2388.0,100.0,39.06,23.4236
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.86,2388.08,8138.62,8.4318,0.03,393.0,2388.0,100.0,39.06,23.4236


In [6]:
# ----- Keep only columns needed for modeling & save to CSV -----

# Columns we do NOT want as model inputs:
drop_cols = ["RUL", "time"]  # we keep 'unit' for possible grouping

train_features_model = train_features.drop(columns=drop_cols)

print("Final feature set shape:", train_features_model.shape)

out_path = os.path.join(OUT_DIR, "train_features_FD001_no_leak.csv")
train_features_model.to_csv(out_path, index=False)
print("Saved processed features to:", out_path)


Final feature set shape: (20631, 173)
Saved processed features to: ../data/processed\train_features_FD001_no_leak.csv
