feat(anomaly-detection): add n_anomalies output and performance warnings to One-Class SVM

artemisTurintech · paulsbrookes · commit acd5d8ada334 · 2025-10-23T16:00:55.000+01:00
diff --git a/src/anomaly_detection.py b/src/anomaly_detection.py
@@ -15,6 +15,7 @@
 
 import pandas as pd
 import numpy as np
+import warnings
 from sklearn.ensemble import IsolationForest
 from sklearn.svm import OneClassSVM
 from typing import Union, Dict
@@ -269,7 +270,7 @@ def detect_anomalies_one_class_svm(
     Returns
     -------
     dict
-        Dictionary containing three keys:
+        Dictionary containing four keys:
         - "predictions" : np.ndarray
             Array of predictions where -1 indicates anomaly and 1 indicates normal.
             Shape: (n_samples,)
@@ -281,6 +282,8 @@ def detect_anomalies_one_class_svm(
         - "model" : OneClassSVM
             The trained OneClassSVM model object, which can be reused for
             predictions on new data using model.predict() or model.decision_function().
+        - "n_anomalies" : int
+            Count of detected anomalies (number of predictions equal to -1).
     
     Raises
     ------
@@ -324,10 +327,12 @@ def detect_anomalies_one_class_svm(
     Notes
     -----
     - Input data MUST be scaled for One-Class SVM to work properly
-    - Training can be slow for large datasets (>50k rows)
-    - RBF kernel usually performs best for anomaly detection
+    - Training can be slow for large datasets (>50k rows) with non-linear kernels
+    - For large datasets (100,000+ rows), prefer the 'linear' kernel for speed
+    - RBF kernel usually performs best for anomaly detection but is slower
     - The nu parameter acts as an upper bound, actual anomaly rate may be lower
     - Decision scores are not probabilities, just distances from the boundary
+    - A UserWarning is issued when the data has 100,000+ rows and the kernel is not 'linear'
     
     References
     ----------
@@ -378,6 +383,17 @@ def detect_anomalies_one_class_svm(
     if not isinstance(random_state, int):
         raise TypeError(f"random_state must be an integer, got {type(random_state).__name__}")
     
+    # Performance warning for large datasets with non-linear kernels
+    n_samples = len(data)
+    if n_samples >= 100000 and kernel != 'linear':
+        warnings.warn(
+            f"Training One-Class SVM with {n_samples} samples and '{kernel}' kernel may be slow. "
+            f"For large datasets (100,000+ rows), consider using kernel='linear' for better performance. "
+            f"Expected training time with '{kernel}' kernel may be significantly longer.",
+            UserWarning,
+            stacklevel=2
+        )
+    
     # Initialize One-Class SVM model
     model = OneClassSVM(
         nu=nu,
@@ -394,9 +410,13 @@ def detect_anomalies_one_class_svm(
     # Get decision function scores: more negative = stronger anomaly
     scores = model.decision_function(data)
     
+    # Count the number of detected anomalies
+    n_anomalies = int((predictions == -1).sum())
+    
     # Return results as a dictionary
     return {
         "predictions": predictions,
         "scores": scores,
-        "model": model
+        "model": model,
+        "n_anomalies": n_anomalies
     }