1515
1616import  pandas  as  pd 
1717import  numpy  as  np 
18+ import  warnings 
1819from  sklearn .ensemble  import  IsolationForest 
1920from  sklearn .svm  import  OneClassSVM 
2021from  typing  import  Union , Dict 
@@ -269,7 +270,7 @@ def detect_anomalies_one_class_svm(
269270    Returns 
270271    ------- 
271272    dict 
272-         Dictionary containing three  keys: 
273+         Dictionary containing four  keys: 
273274        - "predictions" : np.ndarray 
274275            Array of predictions where -1 indicates anomaly and 1 indicates normal. 
275276            Shape: (n_samples,) 
@@ -281,6 +282,8 @@ def detect_anomalies_one_class_svm(
281282        - "model" : OneClassSVM 
282283            The trained OneClassSVM model object, which can be reused for 
283284            predictions on new data using model.predict() or model.decision_function(). 
285+         - "n_anomalies" : int 
286+             Count of detected anomalies (number of predictions equal to -1). 
284287     
285288    Raises 
286289    ------ 
@@ -324,10 +327,12 @@ def detect_anomalies_one_class_svm(
324327    Notes 
325328    ----- 
326329    - Input data MUST be scaled for One-Class SVM to work properly 
327-     - Training can be slow for large datasets (>50k rows) 
328-     - RBF kernel usually performs best for anomaly detection 
330+     - Training can be slow for large datasets (>50k rows) with non-linear kernels 
331+     - For large datasets (100,000+ rows), prefer 'linear' kernel for speed 
332+     - RBF kernel usually performs best for anomaly detection but is slower 
329333    - The nu parameter acts as an upper bound, actual anomaly rate may be lower 
330334    - Decision scores are not probabilities, just distances from the boundary 
335+     - A performance warning will be issued for datasets with 100,000+ rows using non-linear kernels 
331336     
332337    References 
333338    ---------- 
@@ -378,6 +383,17 @@ def detect_anomalies_one_class_svm(
378383    if  not  isinstance (random_state , int ):
379384        raise  TypeError (f"random_state must be an integer, got { type (random_state ).__name__ }")
380385
386+     # Performance warning for large datasets with non-linear kernels 
387+     n_samples  =  len (data )
388+     if  n_samples  >=  100000  and  kernel  !=  'linear' :
389+         warnings .warn (
390+             f"Training One-Class SVM with { n_samples } { kernel }  
391+             f"For large datasets (100,000+ rows), consider using kernel='linear' for better performance. " 
392+             f"Expected training time with '{ kernel }  ,
393+             UserWarning ,
394+             stacklevel = 2 
395+         )
396+     
381397    # Initialize One-Class SVM model 
382398    model  =  OneClassSVM (
383399        nu = nu ,
@@ -394,9 +410,13 @@ def detect_anomalies_one_class_svm(
394410    # Get decision function scores: more negative = stronger anomaly 
395411    scores  =  model .decision_function (data )
396412
413+     # Count the number of detected anomalies 
414+     n_anomalies  =  int ((predictions  ==  - 1 ).sum ())
415+     
397416    # Return results as a dictionary 
398417    return  {
399418        "predictions" : predictions ,
400419        "scores" : scores ,
401-         "model" : model 
420+         "model" : model ,
421+         "n_anomalies" : n_anomalies 
402422    }
0 commit comments