In [50]:
import pandas as pd
import numpy as np

from sklearn.metrics import (
    f1_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix
)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

import optuna


In [51]:
df = pd.read_parquet("df_preprocessed.parquet")

train_cycles = ["A1", "B1", "B3"]
val_cycle    = "A2"
test_cycle   = "B2"

df_train = df[df["cycle_id"].isin(train_cycles)].copy()
df_val   = df[df["cycle_id"] == val_cycle].copy()
df_test  = df[df["cycle_id"] == test_cycle].copy()

# Full feature set
feature_cols_all = [
    c for c in df_train.columns
    if c not in ["pump_id", "cycle_id", "end_time", "label"]
]

X_train_full = df_train[feature_cols_all]
X_val_full   = df_val[feature_cols_all]
X_test_full  = df_test[feature_cols_all]

y_train_full = df_train["label"]
y_val_full   = df_val["label"]
y_test_full  = df_test["label"]



In [52]:
# One class svm prep


# Select only no-failure windows for training
df_train_ano = df_train[df_train["label"] == 0]

X_train_ano = df_train_ano[feature_cols_all]
X_val_ano   = df_val[feature_cols_all]
X_test_ano  = df_test[feature_cols_all]

y_val_ano = df_val["label"]
y_test_ano = df_test["label"]

# Median imputation (torque )
imputer_ano = SimpleImputer(strategy="median")

X_train_ano_imp = pd.DataFrame(imputer_ano.fit_transform(X_train_ano), columns=feature_cols_all)
X_val_ano_imp   = pd.DataFrame(imputer_ano.transform(X_val_ano), columns=feature_cols_all)
X_test_ano_imp  = pd.DataFrame(imputer_ano.transform(X_test_ano), columns=feature_cols_all)



# one class svm hyperparameter tuning

def ocsvm_objective(trial):

    params = {
        "kernel": trial.suggest_categorical("kernel", ["rbf", "poly"]),
        "nu": trial.suggest_float("nu", 0.01, 0.20),
        "gamma": trial.suggest_categorical("gamma", ["scale", "auto"]),
    }

    pipe = Pipeline([
        ("scale", StandardScaler()),
        ("ocsvm", OneClassSVM(**params))
    ])

    pipe.fit(X_train_ano_imp)

    scores = pipe.decision_function(X_val_ano_imp)
    thresholds = np.linspace(np.percentile(scores, 1),
                             np.percentile(scores, 40), 50)

    best_f1 = 0
    for t in thresholds:
        preds = (scores < t).astype(int)
        best_f1 = max(best_f1, f1_score(y_val_ano, preds, zero_division=0))

    return best_f1




In [53]:
# one class svm running optuna
study_ocsvm = optuna.create_study(direction="maximize")
study_ocsvm.optimize(ocsvm_objective, n_trials=40)

best_params_ocsvm = study_ocsvm.best_params
best_params_ocsvm




[I 2025-12-01 13:17:30,570] A new study created in memory with name: no-name-2c108c68-f7fd-42b5-b67d-f854c4af04da
[I 2025-12-01 13:17:31,079] Trial 0 finished with value: 0.11659192825112108 and parameters: {'kernel': 'rbf', 'nu': 0.07538782260486931, 'gamma': 'auto'}. Best is trial 0 with value: 0.11659192825112108.
[I 2025-12-01 13:17:31,645] Trial 1 finished with value: 0.11325611325611326 and parameters: {'kernel': 'rbf', 'nu': 0.09257407890107919, 'gamma': 'auto'}. Best is trial 0 with value: 0.11659192825112108.
[I 2025-12-01 13:17:31,790] Trial 2 finished with value: 0.12263535551206783 and parameters: {'kernel': 'rbf', 'nu': 0.013567368269417363, 'gamma': 'scale'}. Best is trial 2 with value: 0.12263535551206783.
[I 2025-12-01 13:17:32,257] Trial 3 finished with value: 0.11702127659574468 and parameters: {'kernel': 'rbf', 'nu': 0.07356696755933731, 'gamma': 'auto'}. Best is trial 2 with value: 0.12263535551206783.
[I 2025-12-01 13:17:32,878] Trial 4 finished with value: 0.11568

{'kernel': 'poly', 'nu': 0.1968809599578709, 'gamma': 'auto'}

In [54]:
# one class fit best params
ocsvm_final = Pipeline([
    ("scale", StandardScaler()),
    ("ocsvm", OneClassSVM(**best_params_ocsvm))
])
ocsvm_final.fit(X_train_ano_imp)

# ---- Threshold search on validation 
scores_val = ocsvm_final.decision_function(X_val_ano_imp)
thresholds = np.linspace(np.percentile(scores_val, 1),
                         np.percentile(scores_val, 40), 50)

best_thresh = None
best_f1_val = -1

for t in thresholds:
    preds = (scores_val < t).astype(int)
    f1 = f1_score(y_val_ano, preds, zero_division=0)
    if f1 > best_f1_val:
        best_thresh = t
        best_f1_val = f1

print("Best OC-SVM threshold:", best_thresh)
print("Best OC-SVM validation F1:", best_f1_val)


Best OC-SVM threshold: -3.329323217128773
Best OC-SVM validation F1: 0.12791991101223582


In [55]:
#evaluate on val and test

def eval_anomaly(model, X, y, threshold):
    scores = model.decision_function(X)
    preds = (scores < threshold).astype(int)

    return {
        "Precision": precision_score(y, preds, zero_division=0),
        "Recall": recall_score(y, preds, zero_division=0),
        "F1": f1_score(y, preds, zero_division=0),
        "ROC-AUC": roc_auc_score(y, scores)
    }


oc_val_metrics = eval_anomaly(ocsvm_final, X_val_ano_imp, y_val_ano, best_thresh)
oc_test_metrics = eval_anomaly(ocsvm_final, X_test_ano_imp, y_test_ano, best_thresh)

print("OC-SVM Validation:", oc_val_metrics)
print("OC-SVM Test:", oc_test_metrics)



OC-SVM Validation: {'Precision': 0.0705521472392638, 'Recall': 0.6845238095238095, 'F1': 0.12791991101223582, 'ROC-AUC': np.float64(0.3427493054540137)}
OC-SVM Test: {'Precision': 0.0, 'Recall': 0.0, 'F1': 0.0, 'ROC-AUC': np.float64(0.744450424969042)}


In [56]:

# Isolation forest prep

# -only no failure windows:
healthy_train_if = df_train[df_train["label"] == 0]

X_train_if_raw = healthy_train_if[feature_cols_all]
X_val_if_raw   = X_val_full
X_test_if_raw  = X_test_full

y_val_if  = y_val_full
y_test_if = y_test_full

# Impute torque
imp_if = SimpleImputer(strategy="median")
X_train_if = imp_if.fit_transform(X_train_if_raw)
X_val_if   = imp_if.transform(X_val_if_raw)
X_test_if  = imp_if.transform(X_test_if_raw)


# objective:
def iforest_objective(trial):

    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 400),
        "max_samples": trial.suggest_float("max_samples", 0.6, 1.0),
        "contamination": trial.suggest_float("contamination", 0.01, 0.10),
        "random_state": 42
    }

    model = IsolationForest(**params)
    model.fit(X_train_if)

    scores = model.decision_function(X_val_if)
    ths = np.linspace(scores.min(), scores.max(), 200)

    best_f1 = 0
    for th in ths:
        preds = (scores < th).astype(int)
        best_f1 = max(best_f1, f1_score(y_val_if, preds, zero_division=0))

    return best_f1






In [57]:
# isolation forest running optuna:
study_if = optuna.create_study(direction="maximize")
study_if.optimize(iforest_objective, n_trials=40)

best_params_if = study_if.best_params
best_params_if



[I 2025-12-01 13:19:59,722] A new study created in memory with name: no-name-6d2f897a-753c-4099-95fb-65174a4de744
[I 2025-12-01 13:20:00,403] Trial 0 finished with value: 0.07871788828658968 and parameters: {'n_estimators': 219, 'max_samples': 0.7955949716730366, 'contamination': 0.021612528983026777}. Best is trial 0 with value: 0.07871788828658968.
[I 2025-12-01 13:20:01,217] Trial 1 finished with value: 0.07871788828658968 and parameters: {'n_estimators': 252, 'max_samples': 0.9804308637364729, 'contamination': 0.08234111558975715}. Best is trial 0 with value: 0.07871788828658968.
[I 2025-12-01 13:20:01,788] Trial 2 finished with value: 0.07871788828658968 and parameters: {'n_estimators': 177, 'max_samples': 0.9126258169099815, 'contamination': 0.09006421902210708}. Best is trial 0 with value: 0.07871788828658968.
[I 2025-12-01 13:20:02,856] Trial 3 finished with value: 0.07965860597439545 and parameters: {'n_estimators': 397, 'max_samples': 0.8779410715949818, 'contamination': 0.05

{'n_estimators': 243,
 'max_samples': 0.6249081071367667,
 'contamination': 0.08957483860572622}

In [58]:
# Isolation forest fit model
if_best = IsolationForest(**best_params_if, random_state=42)
if_best.fit(X_train_if)

# Scores
val_scores_if = if_best.decision_function(X_val_if)
test_scores_if = if_best.decision_function(X_test_if)

# Threshold search
ths = np.linspace(val_scores_if.min(), val_scores_if.max(), 300)
best_f1 = 0
best_th = None

for th in ths:
    preds = (val_scores_if < th).astype(int)
    f1 = f1_score(y_val_if, preds, zero_division=0)
    if f1 > best_f1:
        best_f1 = f1
        best_th = th

print("Best IF threshold:", best_th)
print("Best IF validation F1:", best_f1)


#isolation forest val + test
def eval_if(scores, y_true, threshold):
    preds = (scores < threshold).astype(int)
    return {
        "Precision": precision_score(y_true, preds, zero_division=0),
        "Recall": recall_score(y_true, preds, zero_division=0),
        "F1": f1_score(y_true, preds, zero_division=0),
        "ROC-AUC": roc_auc_score(y_true, -scores)  # invert for consistency
    }

metrics_val_if = eval_if(val_scores_if, y_val_if, best_th)
metrics_test_if = eval_if(test_scores_if, y_test_if, best_th)

print("IF Validation:", metrics_val_if)
print("IF Test:", metrics_test_if)



Best IF threshold: 0.008067007890468991
Best IF validation F1: 0.0819833087874325
IF Validation: {'Precision': 0.0427547363031234, 'Recall': 0.9940476190476191, 'F1': 0.0819833087874325, 'ROC-AUC': np.float64(0.28245022420431837)}
IF Test: {'Precision': 0.06575342465753424, 'Recall': 1.0, 'F1': 0.12339331619537275, 'ROC-AUC': np.float64(0.467460669255882)}
