In [11]:
import pandas as pd
import numpy as np
from pyod.models.knn import KNN
from pyod.models.iforest import IForest
from pyod.models.cblof import CBLOF
from pyod.models.hbos import HBOS
from pyod.models.lof import LOF
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.feature_bagging import FeatureBagging

# Load datasets
dataset = pd.read_csv('test_num1.csv')
tracker = pd.read_csv('test_num1_tracker.csv')

# List of models to apply
models = [KNN(), IForest(), CBLOF(), HBOS(), LOF(), OCSVM(), PCA(), MCD(), FeatureBagging(base_estimator=LOF(), n_estimators=10)]

def evaluate_model(model, df, tracker, col):
    # Check if column is contaminated
    if tracker[col].eq(1).all():
        print(f"Column {col} has not been contaminated, skipping detection.")
        return

    # Fit the model on non-null data
    valid_data = df[col].dropna().to_frame()  # Drop NaNs and convert to DataFrame for fitting
    model.fit(valid_data)

    # Get non-null indices to align predictions with tracker
    non_null_indices = valid_data.index

    # Predict the outliers: 0 for normal, 1 for outlier
    predictions = np.zeros(df.shape[0])  # Initialize predictions array with zeros (consider NaN as normal)
    predictions[non_null_indices] = model.labels_  # Only update non-NaN indices

    # Tracker comparison
    actual_outliers = tracker[col] == 5
    predicted_outliers = predictions == 1

    true_positives = np.sum(predicted_outliers & actual_outliers)
    false_negatives = actual_outliers & ~predicted_outliers

    precision = true_positives / np.sum(predicted_outliers) if np.sum(predicted_outliers) > 0 else 0
    recall = true_positives / np.sum(actual_outliers) if np.sum(actual_outliers) > 0 else 0

    # Output results
    print(f"{model.__class__.__name__} on column {col}:")
    print(f" Precision: {precision:.2f}")
    print(f" Recall: {recall:.2f}")
    if np.any(false_negatives):
        print(f" Missed outliers in {col}: {df.loc[false_negatives, col].tolist()}\n")
    else:
        print(" No outliers missed.\n")

# Applying models to each numeric column
for column in dataset.columns:
    if dataset[column].dtype == np.int64 or dataset[column].dtype == np.float64:
        print(f"Processing column {column}...")
        num_nulls = dataset[column].isnull().sum()
        if num_nulls > 0:
            print(f" Column {column} has {num_nulls} null values which will be ignored for model fitting.")
        for model in models:
            evaluate_model(model, dataset, tracker, column)

Processing column X...
 Column X has 154 null values which will be ignored for model fitting.
KNN on column X:
 Precision: 0.52
 Recall: 1.00
 No outliers missed.

IForest on column X:
 Precision: 0.52
 Recall: 1.00
 No outliers missed.

CBLOF on column X:
 Precision: 0.52
 Recall: 1.00
 No outliers missed.

HBOS on column X:
 Precision: 1.00
 Recall: 0.80
 Missed outliers in X: [-43.4007710471978, -45.533692497905434, -49.719862867110606, -62.671891741751054, -59.7480675337795, -23.408138799205418, -76.96717882302401, -80.75006847463621, -28.01028369325615, -23.650805828245097, -48.64783564641969, -40.37882083926337, -41.13387117788644, -18.647646169780067, -43.20483249764212, -50.684293394257395, -70.13545704026929, -22.66444095445233, -41.54190622143173, -75.70139115264591, -42.29650804494018, -59.887571163197194, -81.18041812222665, -77.3114674335703, -32.39192141389017, -32.358231978526334, -54.1536266072315, -45.066194488684694, -22.053273976996728]

LOF on column X:
 Precision: 

  super()._check_params_vs_input(X, default_n_init=10)


OCSVM on column X:
 Precision: 0.52
 Recall: 1.00
 No outliers missed.

PCA on column X:
 Precision: 0.45
 Recall: 0.86
 Missed outliers in X: [-126.99028743780822, -92.14294719578903, -99.5684596396643, -102.3033204167784, -118.77877900168158, -129.6702774780878, -82.70945568892705, -100.03057428600846, -76.96717882302401, -80.75006847463621, -98.8264620587111, -85.77455695045373, -116.75633348519682, -93.9720747761451, -81.55043300824808, -114.15906085404411, -136.9893843066052, -75.70139115264591, -81.18041812222665, -77.3114674335703]

MCD on column X:
 Precision: 0.52
 Recall: 1.00
 No outliers missed.


ValueError: n_features is set to 1. Not in the range of [2, 2147483647).