Skip to content

Commit acd5d8a

Browse files
artemisTurintech authored and paulsbrookes committed
feat(anomaly-detection): add n_anomalies output and performance warnings to One-Class SVM
1 parent d078f4a commit acd5d8a

File tree

1 file changed

+24
-4
lines changed

1 file changed

+24
-4
lines changed

src/anomaly_detection.py

Lines changed: 24 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515

1616
import pandas as pd
1717
import numpy as np
18+
import warnings
1819
from sklearn.ensemble import IsolationForest
1920
from sklearn.svm import OneClassSVM
2021
from typing import Union, Dict
@@ -269,7 +270,7 @@ def detect_anomalies_one_class_svm(
269270
Returns
270271
-------
271272
dict
272-
Dictionary containing three keys:
273+
Dictionary containing four keys:
273274
- "predictions" : np.ndarray
274275
Array of predictions where -1 indicates anomaly and 1 indicates normal.
275276
Shape: (n_samples,)
@@ -281,6 +282,8 @@ def detect_anomalies_one_class_svm(
281282
- "model" : OneClassSVM
282283
The trained OneClassSVM model object, which can be reused for
283284
predictions on new data using model.predict() or model.decision_function().
285+
- "n_anomalies" : int
286+
Count of detected anomalies (number of predictions equal to -1).
284287
285288
Raises
286289
------
@@ -324,10 +327,12 @@ def detect_anomalies_one_class_svm(
324327
Notes
325328
-----
326329
- Input data MUST be scaled for One-Class SVM to work properly
327-
- Training can be slow for large datasets (>50k rows)
328-
- RBF kernel usually performs best for anomaly detection
330+
- Training can be slow for large datasets (>50k rows) with non-linear kernels
331+
- For large datasets (100,000+ rows), prefer 'linear' kernel for speed
332+
- RBF kernel usually performs best for anomaly detection but is slower
329333
- The nu parameter acts as an upper bound, actual anomaly rate may be lower
330334
- Decision scores are not probabilities, just distances from the boundary
335+
- A performance warning will be issued for datasets with 100,000+ rows using non-linear kernels
331336
332337
References
333338
----------
@@ -378,6 +383,17 @@ def detect_anomalies_one_class_svm(
378383
if not isinstance(random_state, int):
379384
raise TypeError(f"random_state must be an integer, got {type(random_state).__name__}")
380385

386+
# Performance warning for large datasets with non-linear kernels
387+
n_samples = len(data)
388+
if n_samples >= 100000 and kernel != 'linear':
389+
warnings.warn(
390+
f"Training One-Class SVM with {n_samples} samples and '{kernel}' kernel may be slow. "
391+
f"For large datasets (100,000+ rows), consider using kernel='linear' for better performance. "
392+
f"Expected training time with '{kernel}' kernel may be significantly longer.",
393+
UserWarning,
394+
stacklevel=2
395+
)
396+
381397
# Initialize One-Class SVM model
382398
model = OneClassSVM(
383399
nu=nu,
@@ -394,9 +410,13 @@ def detect_anomalies_one_class_svm(
394410
# Get decision function scores: more negative = stronger anomaly
395411
scores = model.decision_function(data)
396412

413+
# Count the number of detected anomalies
414+
n_anomalies = int((predictions == -1).sum())
415+
397416
# Return results as a dictionary
398417
return {
399418
"predictions": predictions,
400419
"scores": scores,
401-
"model": model
420+
"model": model,
421+
"n_anomalies": n_anomalies
402422
}

0 commit comments

Comments
 (0)