In [None]:
# om gann ganpataye namah

# IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
# Silence every warning category for the whole session
warnings.simplefilter('ignore')

# Global plotting defaults: seaborn grid theme and a wide default canvas
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (14, 6)})


In [7]:
# STEP 0: LOAD NORMALIZED DATASETS

NORMALIZED_DIR = "data/normalised-clean-data"

# Every CSV is stored under the key "<split>_<dataset>", train before test
normalized_dfs = {}

for dataset in ['FD001', 'FD002', 'FD003', 'FD004']:
    for split in ('train', 'test'):
        csv_path = os.path.join(NORMALIZED_DIR, f'{split}_{dataset}_normalized.csv')
        normalized_dfs[f'{split}_{dataset}'] = pd.read_csv(csv_path)

print("✅ All normalized datasets loaded\n")

# Report the (rows, columns) shape of each loaded frame
for dataset in ['FD001', 'FD002', 'FD003', 'FD004']:
    train_frame = normalized_dfs[f'train_{dataset}']
    test_frame = normalized_dfs[f'test_{dataset}']
    print(f"{dataset}:")
    print(f"  Train: {train_frame.shape}")
    print(f"  Test: {test_frame.shape}\n")

✅ All normalized datasets loaded

FD001:
  Train: (20631, 22)
  Test: (13096, 20)

FD002:
  Train: (53759, 23)
  Test: (33991, 21)

FD003:
  Train: (24720, 22)
  Test: (16596, 20)

FD004:
  Train: (61249, 19)
  Test: (41214, 17)



In [8]:

# STEP 0B: INSPECT NORMALIZED DATA STRUCTURE

banner = "=" * 80
print(banner)
print("NORMALIZED DATA STRUCTURE")
print(banner)

# FD001 training frame is used as the reference for the inspection below
train_fd001 = normalized_dfs['train_FD001']
column_names = list(train_fd001.columns)
print(f"\nColumns: {column_names}")
print("\nFirst 5 rows:")
print(train_fd001.head())
print(f"\nData types:\n{train_fd001.dtypes}")

# Sensor columns are the ones whose name carries the 'sensor_' prefix
sensor_cols = list(filter(lambda col: col.startswith('sensor_'), train_fd001.columns))
print(f"\n✅ Sensor columns identified: {len(sensor_cols)}")
print(f"   {sensor_cols}")

NORMALIZED DATA STRUCTURE

Columns: ['unit_id', 'time', 'op_setting_1', 'op_setting_2', 'op_setting_3', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_7', 'sensor_8', 'sensor_9', 'sensor_11', 'sensor_12', 'sensor_13', 'sensor_14', 'sensor_15', 'sensor_17', 'sensor_20', 'sensor_21', 'RUL', 'regime', 'regime_id']

First 5 rows:
   unit_id  time  op_setting_1  op_setting_2  op_setting_3  sensor_2  \
0        1     1       -0.0007       -0.0004         100.0 -1.721725   
1        1     2        0.0019       -0.0003         100.0 -1.061780   
2        1     3       -0.0043        0.0003         100.0 -0.661813   
3        1     4        0.0007        0.0000         100.0 -0.661813   
4        1     5       -0.0019       -0.0002         100.0 -0.621816   

   sensor_3  sensor_4  sensor_7  sensor_8  ...  sensor_12  sensor_13  \
0 -0.134255 -0.925936  1.121141 -0.516338  ...   0.334262  -1.058890   
1  0.211528 -0.643726  0.431930 -0.798093  ...   1.174899  -0.363646   
2 -0.413166 -0.525953  1.0

In [None]:

# STEP 1: ROLLING STATISTICS (ROLLING MEAN & STD)


def add_rolling_features(df, sensors, window):
    """Return a new frame: `df` plus per-engine rolling mean and std columns.

    For every column named in `sensors`, two columns are appended:
    `<sensor>_rolling_mean_<window>` and `<sensor>_rolling_std_<window>`.
    All mean columns come first, followed by all std columns. Windows are
    computed inside each `unit_id` group, so a window never spans two engines.

    Because `min_periods=1` is used, the rolling mean is available from the
    first row of every engine. The rolling std is NaN on the first row of each
    engine, where only a single observation is in the window.

    NOTE(review): rows are assumed to already be in cycle order within each
    engine; nothing here sorts by `time` - confirm against the upstream export.

    Args:
        df: frame containing a `unit_id` column and the sensor columns.
        sensors: names of the sensor columns to summarise.
        window: rolling window length, in rows.

    Returns:
        A new DataFrame; `df` itself is not modified.

    Raises:
        KeyError: if any requested sensor column is absent from `df`.
    """
    sensors = list(sensors)
    missing = [col for col in sensors if col not in df.columns]
    if missing:
        raise KeyError(f"Sensor columns missing from dataframe: {missing}")
    if not sensors:
        return df.copy()

    grouped = df.groupby('unit_id')[sensors]
    rolling_mean = grouped.transform(
        lambda x: x.rolling(window=window, min_periods=1).mean()
    ).add_suffix(f'_rolling_mean_{window}')
    rolling_std = grouped.transform(
        lambda x: x.rolling(window=window, min_periods=1).std()
    ).add_suffix(f'_rolling_std_{window}')

    # One concat instead of inserting columns one at a time (avoids a fragmented frame)
    return pd.concat([df, rolling_mean, rolling_std], axis=1)


print("STEP 1: TEMPORAL WINDOWING - ROLLING STATISTICS")
print("="*80)

# Window size for rolling statistics
WINDOW_SIZE = 10

print(f"\nWindow size: {WINDOW_SIZE} cycles")
print("\nStrategy:")
print("  1. Group by unit_id to prevent data leakage between engines")
print("  2. Calculate rolling mean for each sensor (smooths noise)")
print("  3. Calculate rolling std for each sensor (measures volatility)")
print("  4. min_periods=1: rolling mean is defined from the first cycle; "
      "rolling std is NaN only on each engine's first cycle\n")

# Dictionary to store feature-engineered datasets
fe_dfs = {}

for dataset in ['FD001', 'FD002', 'FD003', 'FD004']:
    print(f"\n{dataset}:")
    print(".................")

    raw_train = normalized_dfs[f'train_{dataset}']
    raw_test = normalized_dfs[f'test_{dataset}']

    # The retained sensors differ between datasets (the loaded frames have
    # different column counts), so the list is derived per dataset instead of
    # reusing the FD001-only `sensor_cols`.
    dataset_sensor_cols = [col for col in raw_train.columns if col.startswith('sensor_')]

    # Process training data
    print(f"  Processing training data...")
    train_df = add_rolling_features(raw_train, dataset_sensor_cols, WINDOW_SIZE)

    print(f"    ✅ Rolling mean features added: {len(dataset_sensor_cols)}")
    print(f"    ✅ Rolling std features added: {len(dataset_sensor_cols)}")
    print(f"    Total features created: {len(dataset_sensor_cols) * 2}")
    print(f"    Train data shape: {train_df.shape}")

    # Process test data with the same sensor list so train/test features line up
    print(f"\n  Processing test data...")
    test_df = add_rolling_features(raw_test, dataset_sensor_cols, WINDOW_SIZE)

    print(f"    ✅ Rolling mean features added: {len(dataset_sensor_cols)}")
    print(f"    ✅ Rolling std features added: {len(dataset_sensor_cols)}")
    print(f"    Total features created: {len(dataset_sensor_cols) * 2}")
    print(f"    Test data shape: {test_df.shape}")

    # Store back in dictionary
    fe_dfs[f'train_{dataset}'] = train_df
    fe_dfs[f'test_{dataset}'] = test_df

    print(f"\n  ✅ {dataset} rolling features complete")
    print(f"     New columns (sample): {[col for col in train_df.columns if 'rolling' in col][:4]}...")

print(".............")
print("✅ STEP 1 COMPLETE: Rolling statistics calculated for all datasets")
print(".............")


In [None]:
# STEP 1B: VERIFY ROLLING FEATURES

print("\n................")
print("VERIFICATION: ROLLING FEATURES")
print("\n................")

# Check one dataset in detail
train_fd001_fe = fe_dfs['train_FD001']

# Column bookkeeping is derived from the frame itself rather than hard-coded,
# so the printed totals always match the data.
mean_suffix = f'_rolling_mean_{WINDOW_SIZE}'
std_suffix = f'_rolling_std_{WINDOW_SIZE}'
all_cols = train_fd001_fe.columns.tolist()
rolling_mean_cols = [col for col in all_cols if col.endswith(mean_suffix)]
rolling_std_cols = [col for col in all_cols if col.endswith(std_suffix)]
raw_sensor_cols = [col for col in sensor_cols if col in all_cols]
accounted_cols = set(raw_sensor_cols) | set(rolling_mean_cols) | set(rolling_std_cols)
other_cols = [col for col in all_cols if col not in accounted_cols]

# Every sensor must have received both of its rolling features
missing_features = [
    f'{sensor}{suffix}'
    for sensor in sensor_cols
    for suffix in (mean_suffix, std_suffix)
    if f'{sensor}{suffix}' not in all_cols
]
assert not missing_features, f"Missing rolling features: {missing_features}"

print(f"\nFD001 Training Data - After Rolling Features:")
print(f"  Total columns: {len(all_cols)}")
print(f"  Original sensors: {len(raw_sensor_cols)}")
print(f"  Rolling mean features: {len(rolling_mean_cols)}")
print(f"  Rolling std features: {len(rolling_std_cols)}")
print(f"  Other columns ({', '.join(other_cols)}): {len(other_cols)}")
print(f"  Total: {len(raw_sensor_cols) + len(rolling_mean_cols) + len(rolling_std_cols) + len(other_cols)}")

# Show sample columns
print(f"\nFirst 10 columns:")
print(all_cols[:10])

print(f"\nLast 10 columns:")
print(all_cols[-10:])

# Check for NaN values
first_sensor = sensor_cols[0]
mean_col = f'{first_sensor}{mean_suffix}'
std_col = f'{first_sensor}{std_suffix}'

print(f"\nNaN check (first engine):")
engine_1_data = train_fd001_fe[train_fd001_fe['unit_id'] == 1]
print(f"  Engine 1 rows: {len(engine_1_data)}")
print(f"  NaN in rolling_mean (first 5 rows): {engine_1_data.iloc[:5][mean_col].isna().sum()}")
print(f"  NaN in rolling_mean (after row 10): {engine_1_data.iloc[10:][mean_col].isna().sum()}")
print(f"  NaN in rolling_std (first row): {engine_1_data.iloc[:1][std_col].isna().sum()}")
print(f"  NaN in rolling_std (after first row): {engine_1_data.iloc[1:][std_col].isna().sum()}")

# With min_periods=1 the windowing itself never introduces NaN into the mean,
# and only the very first row of an engine lacks a std. These checks are
# skipped when the raw sensor already contains NaN.
if engine_1_data[first_sensor].notna().all():
    assert engine_1_data[mean_col].notna().all(), "rolling mean contains unexpected NaN"
    assert engine_1_data.iloc[1:][std_col].notna().all(), "rolling std contains unexpected NaN"

# Verify no inter-engine bleed
print(f"\nNo inter-engine bleed check:")
print(f"  Last value of Engine 1 rolling_mean: {engine_1_data.iloc[-1][mean_col]:.4f}")
engine_2_data = train_fd001_fe[train_fd001_fe['unit_id'] == 2]
engine_2_first_mean = engine_2_data.iloc[0][mean_col]
engine_2_first_raw = engine_2_data.iloc[0][first_sensor]
print(f"  First value of Engine 2 rolling_mean: {engine_2_first_mean:.4f}")

# The first window of an engine holds exactly one observation, so its mean must
# equal the raw reading; any other value means the window reached into Engine 1.
if pd.notna(engine_2_first_raw):
    assert abs(engine_2_first_mean - engine_2_first_raw) < 1e-9, (
        "Engine 2's first rolling mean differs from its first raw value: "
        "the window leaked across engines"
    )
print(f"  ✅ Values are independent (no bleed)")

print("\n" + "="*80)
print("✅ ALL VERIFICATIONS PASSED")
print("="*80)

In [4]:
# Strip the regime_id column, when present, from every normalized train/test frame.
# NOTE(review): the drop is done in place, so any other variable that points at
# the same frame objects loses the column too, while copies made earlier keep it.
for dataset in ['FD001', 'FD002', 'FD003', 'FD004']:
    for split in ('train', 'test'):
        frame = normalized_dfs[f'{split}_{dataset}']
        if 'regime_id' not in frame.columns:
            continue
        frame.drop(columns='regime_id', inplace=True)

print("regime_id column removed from all normalized datasets")

regime_id column removed from all normalized datasets


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

def visualize_rolling_comparison(dataset_list, input_dir, output_dir,
                                 sensor='sensor_11', windows=(10, 15)):
    """Save one rolling-mean smoothing comparison plot per dataset.

    For each dataset name, `train_<name>_normalized.csv` is read from
    `input_dir`, the engine with the smallest `unit_id` is selected and sorted
    by `time`, and the chosen sensor is plotted raw together with one rolling
    mean (`min_periods=1`) per entry in `windows`. Each figure is written to
    `<output_dir>/<name>_rolling_comp.png`.

    Args:
        dataset_list: iterable of dataset names, e.g. ['FD001', 'FD002'].
        input_dir: directory holding the normalized training CSV files.
        output_dir: directory for the PNG files; created if it does not exist.
        sensor: sensor column to plot. Defaults to 'sensor_11'.
        windows: rolling window sizes to overlay. Defaults to (10, 15).

    Datasets whose CSV is missing, is empty, or lacks the `sensor` column are
    skipped with a printed message instead of aborting the whole run.
    """
    os.makedirs(output_dir, exist_ok=True)
    sns.set_style("whitegrid")

    # Fixed colours for the first two windows; any further window uses
    # matplotlib's default colour cycle.
    window_colors = ['blue', 'red']

    for ds in dataset_list:
        path = os.path.join(input_dir, f"train_{ds}_normalized.csv")
        if not os.path.exists(path):
            print(f"⚠️ Skipped {ds}: file not found ({path})")
            continue

        df = pd.read_csv(path)
        if df.empty or sensor not in df.columns:
            print(f"⚠️ Skipped {ds}: no data for column '{sensor}' in {path}")
            continue

        # Select the first engine (smallest unit_id) in cycle order
        unit_id = df['unit_id'].min()
        sample = df[df['unit_id'] == unit_id].sort_values('time')

        fig, ax = plt.subplots(figsize=(12, 6))
        try:
            ax.plot(sample['time'], sample[sensor], alpha=0.3, label='Raw Normalized', color='gray')
            for idx, window in enumerate(windows):
                smoothed = sample[sensor].rolling(window=window, min_periods=1).mean()
                line_style = {'color': window_colors[idx]} if idx < len(window_colors) else {}
                ax.plot(sample['time'], smoothed, label=f'Rolling Mean (W={window})',
                        linewidth=2, **line_style)

            ax.set_title(f"{ds} - {sensor} Smoothing Comparison (Engine {unit_id})")
            ax.set_xlabel("Cycle")
            ax.set_ylabel("Value")
            ax.legend()

            save_path = os.path.join(output_dir, f"{ds}_rolling_comp.png")
            fig.savefig(save_path)
        finally:
            # Always release the figure, even when plotting or saving fails
            plt.close(fig)
        print(f"✅ Saved comparison for {ds}")

# Execution: render the smoothing comparison for the four datasets
rolling_comparison_args = {
    'dataset_list': ['FD001', 'FD002', 'FD003', 'FD004'],
    'input_dir': "data/normalised-clean-data",
    'output_dir': "feature_plots/rolling_visuals",
}
visualize_rolling_comparison(**rolling_comparison_args)

✅ Saved comparison for FD001
✅ Saved comparison for FD002
✅ Saved comparison for FD003
✅ Saved comparison for FD004
