# ONSET-13: Onset Detection Analysis

This notebook analyzes degradation onset detection across the XJTU-SY bearing dataset:
- 15 bearings across 3 operating conditions (35Hz/12kN, 37.5Hz/11kN, 40Hz/10kN)
- 3 detection algorithms: Threshold, CUSUM, EWMA
- Manual vs automated onset label comparison

## Analysis Sections
1. Load features and compute health indicators
2. Compare detector algorithms on sample bearings
3. Visualize onset detection results for all 15 bearings
4. Onset timing distribution analysis
5. Detector performance and parameter sensitivity
6. Findings and recommendations

In [None]:
import sys
import os
# Resolve the project root once so this cell is safe to re-run.
# An unconditional os.chdir('..') climbs one directory further on every
# execution, and a relative '..' entry in sys.path changes meaning after the
# working directory moves. Using an absolute path avoids both problems.
# NOTE(review): assumes the project root is the directory containing `src/`
# (the package imported below) -- confirm against the repository layout.
PROJECT_ROOT = os.getcwd() if os.path.isdir('src') else os.path.abspath('..')
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
os.chdir(PROJECT_ROOT)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

from src.onset import (
    load_all_bearings_health_series,
    load_bearing_health_series,
    compute_composite_hi,
    smooth_health_indicator,
    ThresholdOnsetDetector,
    CUSUMOnsetDetector,
    EWMAOnsetDetector,
    EnsembleOnsetDetector,
    load_onset_labels,
    add_onset_column,
    plot_bearing_onset,
    plot_onset_comparison,
    plot_all_bearings_onset,
)

# Notebook-wide matplotlib defaults: style sheet first, then rcParams overrides.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': [12, 6],
    'figure.dpi': 100,
    'font.size': 10,
})

print('Imports OK')

## 1. Load Features and Compute Health Indicators

In [None]:
# Feature table: read once and reused by every later section.
df = pd.read_csv('outputs/features/features_v2.csv')
print(
    f'Dataset shape: {df.shape}',
    f'Bearings: {df["bearing_id"].nunique()}',
    f'Conditions: {sorted(df["condition"].unique())}',
    sep='\n',
)

# Manual onset labels, keyed by bearing id (entries expose .onset_file_idx,
# .condition and .confidence, as used further down).
onset_labels = load_onset_labels()
print(f'\nManual onset labels loaded: {len(onset_labels)} bearings')

# Automated onset labels: bearing_id -> onset_file_idx lookup built from the CSV.
auto_df = pd.read_csv('outputs/onset/onset_labels_auto.csv')
auto_labels = auto_df.set_index('bearing_id')['onset_file_idx'].to_dict()
print(f'Automated onset labels loaded: {len(auto_labels)} bearings')

In [None]:
# Health-indicator series per bearing, in two flavours: raw, and smoothed with
# smooth_window=11 (the smoothed set feeds the detectors in later sections).
all_hi = load_all_bearings_health_series(df, smooth=False)
all_hi_smooth = load_all_bearings_health_series(df, smooth=True, smooth_window=11)

print(f'Health indicators computed for {len(all_hi)} bearings')

# Spot-check a single bearing from the unsmoothed set.
hi = all_hi['Bearing1_1']
kurt_h_min, kurt_h_max = hi.kurtosis_h.min(), hi.kurtosis_h.max()
composite_min, composite_max = hi.composite.min(), hi.composite.max()
print(
    f'\nSample (Bearing1_1):',
    f'  Samples: {len(hi.file_indices)}',
    f'  Kurtosis_H range: [{kurt_h_min:.2f}, {kurt_h_max:.2f}]',
    f'  Composite HI range: [{composite_min:.4f}, {composite_max:.4f}]',
    sep='\n',
)

## 2. Compare Detector Algorithms on Sample Bearings

Run all three detectors (Threshold, CUSUM, EWMA) on representative bearings — one per condition.

In [None]:
# Bearings used for the side-by-side detector comparison.
sample_bearings = ['Bearing1_1', 'Bearing2_3', 'Bearing3_1']
# Passed to every fit_detect call (this cell and the later evaluation cells).
# NOTE(review): presumably the leading share of each series treated as the
# healthy baseline -- confirm against src.onset.
healthy_fraction = 0.2

# Detector configurations: two parameterizations per algorithm family.
detectors = {
    'Threshold (sigma=3)': ThresholdOnsetDetector(threshold_sigma=3.0, min_consecutive=3),
    'Threshold (sigma=2)': ThresholdOnsetDetector(threshold_sigma=2.0, min_consecutive=5),
    'CUSUM (drift=0.5)': CUSUMOnsetDetector(drift=0.5, threshold=5.0),
    'CUSUM (drift=0.3)': CUSUMOnsetDetector(drift=0.3, threshold=5.0),
    'EWMA (lambda=0.2)': EWMAOnsetDetector(lambda_=0.2, L=3.0),
    'EWMA (lambda=0.1)': EWMAOnsetDetector(lambda_=0.1, L=3.0),
}

# Run every detector on the smoothed horizontal/vertical kurtosis average.
results = {}
for bid in sample_bearings:
    hi = all_hi_smooth[bid]
    kurtosis_avg = (hi.kurtosis_h + hi.kurtosis_v) / 2.0
    results[bid] = {}
    for name, det in detectors.items():
        results[bid][name] = det.fit_detect(kurtosis_avg, healthy_fraction=healthy_fraction)

# Comparison table. Header cells, data cells and the separator all derive from
# one column width so the columns actually line up.
COL_WIDTH = 22
header = f'{"Bearing":<14} {"Manual":>8}' + ''.join(
    f' {name:>{COL_WIDTH}}' for name in detectors
)
print(header)
print('-' * len(header))

for bid in sample_bearings:
    manual = onset_labels[bid].onset_file_idx
    line = f'{bid:<14} {manual:>8}'
    for name in detectors:
        r = results[bid][name]
        # A detector that never fires is shown as "None (0.00)".
        idx = r.onset_idx if r.onset_idx is not None else 'None'
        conf = f'{r.confidence:.2f}' if r.onset_idx is not None else '0.00'
        cell = f'{idx} ({conf})'
        line += f' {cell:>{COL_WIDTH}}'
    print(line)

In [None]:
# Visual comparison for sample bearings: one stacked panel per bearing.
# squeeze=False keeps `axes` an array even when sample_bearings has a single
# entry (plt.subplots would otherwise return a bare Axes and indexing fails).
fig, axes = plt.subplots(
    len(sample_bearings), 1,
    figsize=(14, 4 * len(sample_bearings)),
    squeeze=False,
)
axes = axes[:, 0]

# One colour per detector configuration; keys must match `detectors`.
colors_det = {
    'Threshold (sigma=3)': '#e74c3c',
    'Threshold (sigma=2)': '#c0392b',
    'CUSUM (drift=0.5)': '#3498db',
    'CUSUM (drift=0.3)': '#2980b9',
    'EWMA (lambda=0.2)': '#2ecc71',
    'EWMA (lambda=0.1)': '#27ae60',
}

for ax, bid in zip(axes, sample_bearings):
    hi = all_hi_smooth[bid]
    kurtosis_avg = (hi.kurtosis_h + hi.kurtosis_v) / 2.0
    manual_idx = onset_labels[bid].onset_file_idx

    ax.plot(hi.file_indices, kurtosis_avg, 'k-', alpha=0.6, linewidth=0.8, label='Kurtosis (avg)')
    ax.axvline(manual_idx, color='black', linestyle='-', linewidth=2, alpha=0.8, label=f'Manual ({manual_idx})')

    # Only detectors that fired get a marker.
    # NOTE(review): r.onset_idx is drawn on the file-index axis as-is; this
    # assumes detector indices and file indices share the same base -- confirm.
    for name in detectors:
        r = results[bid][name]
        if r.onset_idx is not None:
            ax.axvline(r.onset_idx, color=colors_det[name], linestyle='--', linewidth=1.2,
                       alpha=0.7, label=f'{name}: {r.onset_idx}')

    ax.set_title(f'{bid} ({onset_labels[bid].condition})', fontsize=12)
    ax.set_xlabel('File Index')
    ax.set_ylabel('Kurtosis (avg H+V)')
    ax.legend(fontsize=7, loc='upper left', ncol=2)

plt.suptitle('Detector Algorithm Comparison on Sample Bearings', fontsize=14, y=1.01)
plt.tight_layout()
plt.show()

## 3. Visualize Onset Detection Results for All 15 Bearings

In [None]:
# Reduce each manual label entry to its onset file index, which is the mapping
# handed to the grid plot below.
manual_labels_dict = {}
for bid, entry in onset_labels.items():
    manual_labels_dict[bid] = entry.onset_file_idx

fig = plot_all_bearings_onset(df, manual_labels_dict)
plt.suptitle('Manual Onset Labels - All 15 Bearings', fontsize=14, y=1.01)
plt.show()

In [None]:
# Manual vs automated onset labels: one column per operating condition,
# one row per bearing within that condition (5 x 3 grid).
fig, axes = plt.subplots(5, 3, figsize=(18, 15))

conditions = ['35Hz12kN', '37.5Hz11kN', '40Hz10kN']
for col, cond in enumerate(conditions):
    cond_bearings = sorted(b for b, e in onset_labels.items() if e.condition == cond)
    for row, bid in enumerate(cond_bearings):
        ax = axes[row, col]

        # Unsmoothed H/V kurtosis average straight from the feature table.
        bearing_df = df.loc[df['bearing_id'] == bid].sort_values('file_idx')
        file_indices = bearing_df['file_idx'].to_numpy()
        kurtosis_avg = (
            bearing_df['h_kurtosis'].to_numpy() + bearing_df['v_kurtosis'].to_numpy()
        ) / 2.0

        manual_idx = onset_labels[bid].onset_file_idx
        auto_idx = auto_labels.get(bid)

        ax.plot(file_indices, kurtosis_avg, 'k-', linewidth=0.5, alpha=0.7)
        ax.axvline(manual_idx, color='blue', linestyle='--', linewidth=1.2, label=f'Manual={manual_idx}')

        # Title shows the signed auto-minus-manual offset when an automated
        # label exists for this bearing.
        if auto_idx is None:
            title = bid
        else:
            ax.axvline(auto_idx, color='red', linestyle='-.', linewidth=1.2, label=f'Auto={auto_idx}')
            title = f'{bid} (diff={auto_idx - manual_idx:+d})'

        # The top row doubles as the column header for its condition.
        if row == 0:
            title = f'{cond}\n{title}'
        ax.set_title(title, fontsize=9)

        ax.legend(fontsize=6, loc='upper left')
        ax.tick_params(labelsize=7)

plt.suptitle('Manual vs Automated Onset Labels', fontsize=14, y=1.01)
plt.tight_layout()
plt.show()

## 4. Onset Timing Distribution Analysis

How early or late in the bearing lifecycle does degradation onset occur?

In [None]:
# Express each manual onset as a percentage of the bearing's total file count.
timing_data = []
for bid, entry in onset_labels.items():
    total_files = df.loc[df['bearing_id'] == bid, 'total_files'].iloc[0]
    onset_pct = entry.onset_file_idx / total_files * 100
    timing_data.append(dict(
        bearing_id=bid,
        condition=entry.condition,
        total_files=total_files,
        onset_file_idx=entry.onset_file_idx,
        onset_pct=onset_pct,
        degraded_pct=100.0 - onset_pct,  # remainder of life after onset
        confidence=entry.confidence,
    ))

# Earliest onset first.
timing_df = pd.DataFrame(timing_data).sort_values('onset_pct')

print(
    'Onset Timing Summary (% of total life):',
    '=' * 70,
    f'{"Bearing":<14} {"Condition":<14} {"Total":>6} {"Onset":>6} {"Onset%":>8} {"Conf":<8}',
    '-' * 70,
    sep='\n',
)
for _, rec in timing_df.iterrows():
    left = f'{rec["bearing_id"]:<14} {rec["condition"]:<14} {rec["total_files"]:>6} '
    right = f'{rec["onset_file_idx"]:>6} {rec["onset_pct"]:>7.1f}% {rec["confidence"]:<8}'
    print(left + right)

onset_pct_col = timing_df['onset_pct']
print(f'\nMean onset at {onset_pct_col.mean():.1f}% of life')
print(f'Median onset at {onset_pct_col.median():.1f}% of life')
print(f'Range: {onset_pct_col.min():.1f}% - {onset_pct_col.max():.1f}%')

In [None]:
# Three views of onset timing: distribution, per-bearing by condition, and the
# healthy/degraded split of each bearing's life.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

mean_onset = timing_df['onset_pct'].mean()
median_onset = timing_df['onset_pct'].median()

# Panel 1: histogram with mean and median markers.
ax = axes[0]
ax.hist(timing_df['onset_pct'], bins=10, edgecolor='black', alpha=0.7, color='#3498db')
ax.axvline(mean_onset, color='red', linestyle='--', label=f'Mean: {mean_onset:.1f}%')
ax.axvline(median_onset, color='orange', linestyle='--', label=f'Median: {median_onset:.1f}%')
ax.set_xlabel('Onset Point (% of Total Life)')
ax.set_ylabel('Number of Bearings')
ax.set_title('Onset Timing Distribution')
ax.legend()

# Panel 2: one horizontal bar per bearing, coloured by operating condition.
ax = axes[1]
cond_colors = {'35Hz12kN': '#3498db', '37.5Hz11kN': '#2ecc71', '40Hz10kN': '#e74c3c'}
for cond in conditions:
    in_cond = timing_df['condition'] == cond
    ax.barh(
        timing_df.loc[in_cond, 'bearing_id'],
        timing_df.loc[in_cond, 'onset_pct'],
        color=cond_colors[cond], alpha=0.8, label=cond,
    )
ax.set_xlabel('Onset Point (% of Total Life)')
ax.set_title('Onset Timing by Bearing')
ax.legend(fontsize=8)
# Mid-life reference line, drawn after the legend and carrying no label.
ax.axvline(50, color='gray', linestyle=':', alpha=0.5)

# Panel 3: stacked bars, healthy share first with the degraded share on top.
ax = axes[2]
sorted_df = timing_df.sort_values('bearing_id')
bearing_axis = sorted_df['bearing_id']
healthy_share = sorted_df['onset_pct']
ax.barh(bearing_axis, healthy_share, color='#2ecc71', alpha=0.8, label='Healthy')
ax.barh(bearing_axis, sorted_df['degraded_pct'], left=healthy_share,
        color='#e74c3c', alpha=0.8, label='Degraded')
ax.set_xlabel('% of Total Life')
ax.set_title('Healthy vs Degraded Proportion')
ax.legend(fontsize=8)

plt.tight_layout()
plt.show()

In [None]:
# Summary statistics of onset timing, one group per operating condition.
print('\nOnset Timing by Condition:')
print('=' * 50)
for cond in conditions:
    pct = timing_df.loc[timing_df['condition'] == cond, 'onset_pct']
    print(
        f'\n{cond}:',
        f'  Mean onset:   {pct.mean():.1f}% of life',
        f'  Median onset: {pct.median():.1f}% of life',
        f'  Range:        {pct.min():.1f}% - {pct.max():.1f}%',
        f'  Std:          {pct.std():.1f}%',
        sep='\n',
    )

## 5. Detector Performance and Parameter Sensitivity

In [None]:
# Run one configuration per detector family on every bearing.
detector_configs = {
    'Threshold': ThresholdOnsetDetector(threshold_sigma=2.0, min_consecutive=5),
    'CUSUM': CUSUMOnsetDetector(drift=0.5, threshold=5.0),
    'EWMA': EWMAOnsetDetector(lambda_=0.2, L=3.0),
}

all_results = {}
for bid in sorted(all_hi_smooth.keys()):
    hi = all_hi_smooth[bid]
    kurtosis_avg = (hi.kurtosis_h + hi.kurtosis_v) / 2.0
    all_results[bid] = {}
    for det_name, det in detector_configs.items():
        all_results[bid][det_name] = det.fit_detect(kurtosis_avg, healthy_fraction=healthy_fraction)

# Performance comparison table
tolerance = 5  # samples
# Denominator for the summary rates: derived from the data, not hard-coded.
n_bearings = len(all_results)

print(f'Detector Performance (tolerance={tolerance} samples):')
print('=' * 90)
print(f'{"Bearing":<14} {"Manual":>7}', end='')
for det_name in detector_configs:
    # Each detector column is 22 characters wide; the data rows use the same layout.
    print(f'  {det_name:>10} {"(err)":>8} ', end='')
print()
print('-' * 90)

det_errors = {name: [] for name in detector_configs}      # absolute errors of detected onsets
det_within_tol = {name: 0 for name in detector_configs}   # detections within `tolerance`

for bid in sorted(all_results.keys()):
    manual = onset_labels[bid].onset_file_idx
    print(f'{bid:<14} {manual:>7}', end='')
    for det_name in detector_configs:
        r = all_results[bid][det_name]
        if r.onset_idx is None:
            # No detection: blank out the error and marker slots to keep alignment.
            print(f'  {"None":>10}{"":10}', end='')
            continue
        # NOTE(review): the detector index is compared directly with the manual
        # file index; this assumes both use the same base -- confirm.
        err = r.onset_idx - manual
        hit = abs(err) <= tolerance
        det_errors[det_name].append(abs(err))
        if hit:
            det_within_tol[det_name] += 1
        err_txt = f'({err:+d})'
        mark = '*' if hit else ' '
        print(f'  {r.onset_idx:>10} {err_txt:>8}{mark}', end='')
    print()

print('\n* = within tolerance')
print(f'\nSummary:')
for det_name in detector_configs:
    errors = det_errors[det_name]
    n_detected = len(errors)
    n_within = det_within_tol[det_name]
    # MAE/MedAE are undefined when the detector never fired.
    mae = np.mean(errors) if errors else float('nan')
    med_ae = np.median(errors) if errors else float('nan')
    within_pct = n_within / n_bearings * 100 if n_bearings else 0.0
    print(f'  {det_name:>12}: detected={n_detected}/{n_bearings}, '
          f'within_tol={n_within}/{n_bearings} ({within_pct:.0f}%), '
          f'MAE={mae:.1f}, MedAE={med_ae:.1f}')

In [None]:
# Parameter sensitivity: Threshold sigma
sigma_values = [1.5, 2.0, 2.5, 3.0, 4.0, 5.0]
# Per-sigma accumulator. 'mae' holds the list of absolute errors (averaged only
# when reported); 'detected' and 'within_tol' are counts over bearings.
sigma_results = {s: {'mae': [], 'detected': 0, 'within_tol': 0} for s in sigma_values}

bearing_ids = sorted(all_hi_smooth.keys())
# Denominator for the table below: derived from the data, not hard-coded.
n_bearings = len(bearing_ids)

for sigma in sigma_values:
    det = ThresholdOnsetDetector(threshold_sigma=sigma, min_consecutive=5)
    for bid in bearing_ids:
        hi = all_hi_smooth[bid]
        kurtosis_avg = (hi.kurtosis_h + hi.kurtosis_v) / 2.0
        result = det.fit_detect(kurtosis_avg, healthy_fraction=healthy_fraction)
        manual = onset_labels[bid].onset_file_idx
        # Bearings with no detection contribute to neither the errors nor the counts.
        if result.onset_idx is not None:
            err = abs(result.onset_idx - manual)
            sigma_results[sigma]['mae'].append(err)
            sigma_results[sigma]['detected'] += 1
            if err <= tolerance:
                sigma_results[sigma]['within_tol'] += 1

print('Threshold Detector - Sigma Parameter Sensitivity:')
print(f'{"Sigma":>8} {"Detected":>10} {"Within Tol":>12} {"MAE":>8} {"MedAE":>8}')
print('-' * 50)
for sigma in sigma_values:
    r = sigma_results[sigma]
    n = r['detected']
    # Undefined (NaN) when this sigma produced no detections at all.
    mae = np.mean(r['mae']) if r['mae'] else float('nan')
    med = np.median(r['mae']) if r['mae'] else float('nan')
    print(f'{sigma:>8.1f} {n:>7}/{n_bearings}  {r["within_tol"]:>9}/{n_bearings}  {mae:>8.1f} {med:>8.1f}')

In [None]:
# Parameter sensitivity: CUSUM drift
drift_values = [0.1, 0.3, 0.5, 0.8, 1.0, 1.5]
# Per-drift accumulator. 'mae' holds the list of absolute errors (averaged only
# when reported); 'detected' and 'within_tol' are counts over bearings.
drift_results = {d: {'mae': [], 'detected': 0, 'within_tol': 0} for d in drift_values}

bearing_ids = sorted(all_hi_smooth.keys())
# Denominator for the table below: derived from the data, not hard-coded.
n_bearings = len(bearing_ids)

for drift in drift_values:
    det = CUSUMOnsetDetector(drift=drift, threshold=5.0)
    for bid in bearing_ids:
        hi = all_hi_smooth[bid]
        kurtosis_avg = (hi.kurtosis_h + hi.kurtosis_v) / 2.0
        result = det.fit_detect(kurtosis_avg, healthy_fraction=healthy_fraction)
        manual = onset_labels[bid].onset_file_idx
        # Bearings with no detection contribute to neither the errors nor the counts.
        if result.onset_idx is not None:
            err = abs(result.onset_idx - manual)
            drift_results[drift]['mae'].append(err)
            drift_results[drift]['detected'] += 1
            if err <= tolerance:
                drift_results[drift]['within_tol'] += 1

print('CUSUM Detector - Drift Parameter Sensitivity:')
print(f'{"Drift":>8} {"Detected":>10} {"Within Tol":>12} {"MAE":>8} {"MedAE":>8}')
print('-' * 50)
for drift in drift_values:
    r = drift_results[drift]
    n = r['detected']
    # Undefined (NaN) when this drift produced no detections at all.
    mae = np.mean(r['mae']) if r['mae'] else float('nan')
    med = np.median(r['mae']) if r['mae'] else float('nan')
    print(f'{drift:>8.1f} {n:>7}/{n_bearings}  {r["within_tol"]:>9}/{n_bearings}  {mae:>8.1f} {med:>8.1f}')

In [None]:
# Sensitivity curves for both sweeps: within-tolerance rate on the left axis,
# MAE on a twin right axis.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Both sweeps iterate over all_hi_smooth, so its size is the rate denominator.
n_bearings = len(all_hi_smooth)

# (parameter values, sweep results, x-axis label, panel title)
panels = [
    (sigma_values, sigma_results, 'Threshold Sigma', 'Threshold Detector Sensitivity'),
    (drift_values, drift_results, 'CUSUM Drift', 'CUSUM Detector Sensitivity'),
]

for ax, (param_values, sweep, xlabel, title) in zip(axes, panels):
    within_pcts = [
        sweep[p]['within_tol'] / n_bearings * 100 if n_bearings else 0.0
        for p in param_values
    ]
    # NaN, not 0, when a setting produced no detections: a zero would read as a
    # perfect score, whereas NaN leaves a gap and matches the printed tables.
    maes = [np.mean(sweep[p]['mae']) if sweep[p]['mae'] else np.nan for p in param_values]

    ax.plot(param_values, within_pcts, 'o-', color='#3498db', label='Within tolerance (%)')
    ax2 = ax.twinx()
    ax2.plot(param_values, maes, 's--', color='#e74c3c', label='MAE')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Within Tolerance (%)', color='#3498db')
    ax2.set_ylabel('MAE (samples)', color='#e74c3c')
    ax.set_title(title)
    ax.legend(loc='upper left', fontsize=8)
    ax2.legend(loc='upper right', fontsize=8)

plt.tight_layout()
plt.show()

## 6. Findings and Recommendations

In [None]:
# Final text summary. Counts and denominators are derived from the objects
# computed earlier rather than hard-coded, so the report stays correct if the
# bearing set, tolerance, or labels change.
n_scored = len(all_results)      # bearings evaluated in the detector performance table
n_timed = len(timing_df)         # bearings with onset timing data
n_labeled = len(onset_labels)    # bearings with manual labels
n_swept = len(all_hi_smooth)     # bearings covered by the parameter sweeps

print('=' * 70)
print('ONSET DETECTION ANALYSIS - SUMMARY')
print('=' * 70)

print(f'\n1. DETECTOR PERFORMANCE (on kurtosis avg, tolerance={tolerance} samples)')
print('-' * 50)
for det_name in detector_configs:
    errors = det_errors[det_name]
    w = det_within_tol[det_name]
    mae = np.mean(errors) if errors else float('nan')
    w_pct = w / n_scored * 100 if n_scored else 0.0
    print(f'  {det_name:>12}: {w}/{n_scored} within tolerance ({w_pct:.0f}%), MAE={mae:.1f}')

print('\n2. ONSET TIMING DISTRIBUTION')
print('-' * 50)
print(f'  Mean onset at {timing_df["onset_pct"].mean():.1f}% of bearing life')
print(f'  Range: {timing_df["onset_pct"].min():.1f}% to {timing_df["onset_pct"].max():.1f}%')
early = (timing_df['onset_pct'] < 50).sum()
late = (timing_df['onset_pct'] >= 50).sum()
print(f'  Early onset (<50% life): {early}/{n_timed} bearings')
print(f'  Late onset  (>=50% life): {late}/{n_timed} bearings')

print('\n3. AUTOMATED vs MANUAL AGREEMENT')
print('-' * 50)
agree = 0
disagreements = []  # bearings with no automated label or an offset beyond tolerance
for bid, entry in onset_labels.items():
    auto = auto_labels.get(bid)
    if auto is not None and abs(auto - entry.onset_file_idx) <= tolerance:
        agree += 1
    else:
        disagreements.append(bid)
agree_pct = agree / n_labeled * 100 if n_labeled else 0.0
print(f'  Agreement: {agree}/{n_labeled} ({agree_pct:.1f}%) within {tolerance} samples')

print('\n4. PARAMETER SENSITIVITY FINDINGS')
print('-' * 50)
# max() returns the first maximal element, so ties go to the smallest value.
best_sigma = max(sigma_values, key=lambda s: sigma_results[s]['within_tol'])
best_drift = max(drift_values, key=lambda d: drift_results[d]['within_tol'])
print(f'  Best Threshold sigma: {best_sigma} ({sigma_results[best_sigma]["within_tol"]}/{n_swept} within tol)')
print(f'  Best CUSUM drift: {best_drift} ({drift_results[best_drift]["within_tol"]}/{n_swept} within tol)')

print('\n5. RECOMMENDATIONS')
print('-' * 50)
# NOTE(review): the next five lines are static narrative conclusions; re-check
# them against sections 1 and 4 whenever the data or parameters change.
print('  - Threshold detector with sigma=2.0 provides best balance')
print('  - CUSUM is more sensitive but triggers earlier (more false positives)')
print('  - EWMA provides smooth detection but may miss sharp onsets')
print('  - For two-stage RUL pipeline: use Threshold (sigma=2.0) as primary detector')
print('  - Use RMS as fallback when kurtosis-based detection fails')
# The disagreement list is computed in section 3 so it cannot go stale.
if disagreements:
    detail = ', '.join(
        f'{bid} (manual confidence: {onset_labels[bid].confidence})' for bid in disagreements
    )
    print(f'  - {len(disagreements)} manual/auto disagreement bearing(s): {detail}')
else:
    print('  - No manual/auto disagreements beyond tolerance')