In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import (
    f1_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix
)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

import optuna


In [2]:
df = pd.read_parquet("df_preprocessed.parquet")

train_cycles = ["A1", "B1", "B3"]
val_cycle    = "A2"
test_cycle   = "B2"

df_train = df[df["cycle_id"].isin(train_cycles)].copy()
df_val   = df[df["cycle_id"] == val_cycle].copy()
df_test  = df[df["cycle_id"] == test_cycle].copy()

# reduced feature set
feature_cols_all = [
    
    "delta_p_mean", 
    "delta_p_std",
    "delta_p_trend",
    "output_mean",
    "output_std",
    "output_trend",
    "active_pressure_fraction"

]

X_train_full = df_train[feature_cols_all]
X_val_full   = df_val[feature_cols_all]
X_test_full  = df_test[feature_cols_all]

y_train_full = df_train["label"]
y_val_full   = df_val["label"]
y_test_full  = df_test["label"]



In [3]:
# One class svm prep


# Select only no-failure windows for training
df_train_ano = df_train[df_train["label"] == 0]

X_train_ano = df_train_ano[feature_cols_all]
X_val_ano   = df_val[feature_cols_all]
X_test_ano  = df_test[feature_cols_all]

y_val_ano = df_val["label"]
y_test_ano = df_test["label"]

# Median imputation (torque )
imputer_ano = SimpleImputer(strategy="median")

X_train_ano_imp = pd.DataFrame(imputer_ano.fit_transform(X_train_ano), columns=feature_cols_all)
X_val_ano_imp   = pd.DataFrame(imputer_ano.transform(X_val_ano), columns=feature_cols_all)
X_test_ano_imp  = pd.DataFrame(imputer_ano.transform(X_test_ano), columns=feature_cols_all)



# one class svm hyperparameter tuning

def ocsvm_objective(trial):

    params = {
        "kernel": trial.suggest_categorical("kernel", ["rbf", "poly"]),
        "nu": trial.suggest_float("nu", 0.01, 0.20),
        "gamma": trial.suggest_categorical("gamma", ["scale", "auto"]),
    }

    pipe = Pipeline([
        ("scale", StandardScaler()),
        ("ocsvm", OneClassSVM(**params))
    ])

    pipe.fit(X_train_ano_imp)

    scores = pipe.decision_function(X_val_ano_imp)
    thresholds = np.linspace(np.percentile(scores, 1),
                             np.percentile(scores, 40), 50)

    best_f1 = 0
    for t in thresholds:
        preds = (scores < t).astype(int)
        best_f1 = max(best_f1, f1_score(y_val_ano, preds, zero_division=0))

    return best_f1




In [4]:
# one class svm running optuna
study_ocsvm = optuna.create_study(direction="maximize")
study_ocsvm.optimize(ocsvm_objective, n_trials=40)

best_params_ocsvm = study_ocsvm.best_params
best_params_ocsvm




[I 2025-12-01 13:23:11,615] A new study created in memory with name: no-name-72af9f91-3c7a-4f3c-b342-5b9fe4698dd2
[I 2025-12-01 13:23:12,666] Trial 0 finished with value: 0.0 and parameters: {'kernel': 'poly', 'nu': 0.12523428326261285, 'gamma': 'auto'}. Best is trial 0 with value: 0.0.
[I 2025-12-01 13:23:12,764] Trial 1 finished with value: 0.4692737430167598 and parameters: {'kernel': 'rbf', 'nu': 0.014060084654766256, 'gamma': 'auto'}. Best is trial 1 with value: 0.4692737430167598.
[I 2025-12-01 13:23:13,721] Trial 2 finished with value: 0.0 and parameters: {'kernel': 'poly', 'nu': 0.17790546550356734, 'gamma': 'auto'}. Best is trial 1 with value: 0.4692737430167598.
[I 2025-12-01 13:23:14,294] Trial 3 finished with value: 0.0 and parameters: {'kernel': 'poly', 'nu': 0.02268322691459712, 'gamma': 'scale'}. Best is trial 1 with value: 0.4692737430167598.
[I 2025-12-01 13:23:15,181] Trial 4 finished with value: 0.0 and parameters: {'kernel': 'poly', 'nu': 0.04933156156449897, 'gamma

{'kernel': 'rbf', 'nu': 0.010470896573940014, 'gamma': 'auto'}

In [6]:
#evaluate on val and test

def eval_anomaly(model, X, y, threshold):
    scores = model.decision_function(X)
    preds = (scores < threshold).astype(int)

    return {
        "Precision": precision_score(y, preds, zero_division=0),
        "Recall": recall_score(y, preds, zero_division=0),
        "F1": f1_score(y, preds, zero_division=0),
        "ROC-AUC": roc_auc_score(y, scores)
    }


oc_val_metrics = eval_anomaly(ocsvm_final, X_val_ano_imp, y_val_ano, best_thresh)
oc_test_metrics = eval_anomaly(ocsvm_final, X_test_ano_imp, y_test_ano, best_thresh)

print("OC-SVM Validation:", oc_val_metrics)
print("OC-SVM Test:", oc_test_metrics)



OC-SVM Validation: {'Precision': 0.3071297989031079, 'Recall': 1.0, 'F1': 0.4699300699300699, 'ROC-AUC': np.float64(0.09117743822196228)}
OC-SVM Test: {'Precision': 0.0, 'Recall': 0.0, 'F1': 0.0, 'ROC-AUC': np.float64(0.5165220786896318)}


In [7]:

# Isolation forest prep

# -only no failure windows:
healthy_train_if = df_train[df_train["label"] == 0]

X_train_if_raw = healthy_train_if[feature_cols_all]
X_val_if_raw   = X_val_full
X_test_if_raw  = X_test_full

y_val_if  = y_val_full
y_test_if = y_test_full

# Impute torque
imp_if = SimpleImputer(strategy="median")
X_train_if = imp_if.fit_transform(X_train_if_raw)
X_val_if   = imp_if.transform(X_val_if_raw)
X_test_if  = imp_if.transform(X_test_if_raw)


# objective:
def iforest_objective(trial):

    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 400),
        "max_samples": trial.suggest_float("max_samples", 0.6, 1.0),
        "contamination": trial.suggest_float("contamination", 0.01, 0.10),
        "random_state": 42
    }

    model = IsolationForest(**params)
    model.fit(X_train_if)

    scores = model.decision_function(X_val_if)
    ths = np.linspace(scores.min(), scores.max(), 200)

    best_f1 = 0
    for th in ths:
        preds = (scores < th).astype(int)
        best_f1 = max(best_f1, f1_score(y_val_if, preds, zero_division=0))

    return best_f1






In [8]:
# isolation forest running optuna:
study_if = optuna.create_study(direction="maximize")
study_if.optimize(iforest_objective, n_trials=40)

best_params_if = study_if.best_params
best_params_if



[I 2025-12-01 13:23:26,498] A new study created in memory with name: no-name-50442dfe-63b6-4287-8255-ab0e36729904
[I 2025-12-01 13:23:26,927] Trial 0 finished with value: 0.12197483059051308 and parameters: {'n_estimators': 110, 'max_samples': 0.954429677086815, 'contamination': 0.026148661770486286}. Best is trial 0 with value: 0.12197483059051308.
[I 2025-12-01 13:23:28,013] Trial 1 finished with value: 0.11403209286445885 and parameters: {'n_estimators': 371, 'max_samples': 0.9937184343094453, 'contamination': 0.01162842049340589}. Best is trial 0 with value: 0.12197483059051308.
[I 2025-12-01 13:23:29,012] Trial 2 finished with value: 0.11843496651392316 and parameters: {'n_estimators': 385, 'max_samples': 0.7919120786859198, 'contamination': 0.04937825286583539}. Best is trial 0 with value: 0.12197483059051308.
[I 2025-12-01 13:23:29,556] Trial 3 finished with value: 0.12417437252311757 and parameters: {'n_estimators': 164, 'max_samples': 0.8363866552890038, 'contamination': 0.074

{'n_estimators': 221,
 'max_samples': 0.8938584321269593,
 'contamination': 0.03731256415914801}

In [9]:
# Isolation forest fit model
if_best = IsolationForest(**best_params_if, random_state=42)
if_best.fit(X_train_if)

# Scores
val_scores_if = if_best.decision_function(X_val_if)
test_scores_if = if_best.decision_function(X_test_if)

# Threshold search
ths = np.linspace(val_scores_if.min(), val_scores_if.max(), 300)
best_f1 = 0
best_th = None

for th in ths:
    preds = (val_scores_if < th).astype(int)
    f1 = f1_score(y_val_if, preds, zero_division=0)
    if f1 > best_f1:
        best_f1 = f1
        best_th = th

print("Best IF threshold:", best_th)
print("Best IF validation F1:", best_f1)


#isolation forest val + test
def eval_if(scores, y_true, threshold):
    preds = (scores < threshold).astype(int)
    return {
        "Precision": precision_score(y_true, preds, zero_division=0),
        "Recall": recall_score(y_true, preds, zero_division=0),
        "F1": f1_score(y_true, preds, zero_division=0),
        "ROC-AUC": roc_auc_score(y_true, -scores)  # invert for consistency
    }

metrics_val_if = eval_if(val_scores_if, y_val_if, best_th)
metrics_test_if = eval_if(test_scores_if, y_test_if, best_th)

print("IF Validation:", metrics_val_if)
print("IF Test:", metrics_test_if)



Best IF threshold: -0.01133582793238258
Best IF validation F1: 0.1398834304746045
IF Validation: {'Precision': 0.07520143240823635, 'Recall': 1.0, 'F1': 0.1398834304746045, 'ROC-AUC': np.float64(0.5590135619242579)}
IF Test: {'Precision': 0.0, 'Recall': 0.0, 'F1': 0.0, 'ROC-AUC': np.float64(0.4484566728582686)}
