# Anomaly Detection in Practice

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.tree import  DecisionTreeClassifier
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.compose import make_column_selector
from sklearn.preprocessing import LabelEncoder, StandardScaler

np.random.seed(42)

# import the outlier detection toolkit
# install it with 
# ! pip install --upgrade pyod 
import pyod

In [2]:
# load the data set 
# This is a 10% stratified subsample of the data from the 1999 ACM KDD Cup
# For more info, please see https://www.openml.org/d/1113

url = 'https://datahub.io/machine-learning/kddcup99/r/kddcup99.csv'

kdd = pd.read_csv(url)


In [3]:
ds = kdd[kdd.service == 'smtp'].sample(frac=1).reset_index(drop=True)

label_dict = {
    'normal': 0,
    'neptune': 1,
    'satan': 1,
    'portsweep': 1,
    'ipsweep': 1
}
ds['label'] = [label_dict[item] for item in ds['label']]

X, y = ds.drop('label', axis = 1), ds.label


# summarize class distribution
counter = Counter(y)
print(counter)

Counter({0: 9598, 1: 125})


In [4]:
numerical_columns_selector = make_column_selector(dtype_exclude=object)
num_features = numerical_columns_selector(X)

categorical_columns_selector = make_column_selector(dtype_include=object)
cat_features = categorical_columns_selector(X)



for feat in num_features:
        scaler = StandardScaler()
        X[feat] = scaler.fit_transform(np.array(X[feat]).reshape(-1, 1))
for feat in cat_features:
        encoder = LabelEncoder()
        X[feat] = encoder.fit_transform(np.array(X[feat]).reshape(-1, 1))

  return f(*args, **kwargs)


In [5]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
# fit the model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_test)
# evaluate predictions
f1 = f1_score(y_test, yhat)
auc = roc_auc_score(y_test, yhat)
print('F-1: {}\nROC_AUC: {}'.format(f1, auc))

(6514, 41) (6514,)
F-1: 0.9583333333333334
ROC_AUC: 0.9788503110829907


# Histogram-based Outlier Detection (HBOS)
*(from pyod)*

In [6]:
from pyod.models.hbos import HBOS

contamination = 0.1
hbos = HBOS(contamination=contamination)

# fit the data to HBOS
hbos.fit(X_train)

y_hat = hbos.predict(X_train)

# filter out predictions values = 0
# as they are considered as anomalies
mask = y_hat != 0

out_hbos, in_hbos = Counter(mask)[0],Counter(mask)[1]

print('Removed {} outliers, kept {} inliers'.format(out_hbos, in_hbos))

X_masked, y_masked = X_train[mask], y_train[mask]

model = DecisionTreeClassifier()
# fit the model
model.fit(X_masked, y_masked)
# evaluate the model
y_pred = model.predict(X_test)
# evaluate predictions
f1_hbos = f1_score(y_test, y_pred)
auc_hbos = roc_auc_score(y_test, y_pred)
print('F-1: {}\nROC_AUC: {}'.format(f1_hbos, auc_hbos))

Removed 5862 outliers, kept 652 inliers
F-1: 0.9565217391304348
ROC_AUC: 0.9583333333333333


# Density-Based Spatial Clustering of Applications with Noise(DBSCAN)
*(from sklearn)*

In [7]:
from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=0.1, min_samples=2, metric='cosine')

# fit the data to IF
y_hat = dbscan.fit_predict(X_train, y_train)

# filter out predictions values = -1 
# as they are considered as anomalies
mask = y_hat != -1

out_dbscan, in_dbscan = Counter(mask)[0],Counter(mask)[1]

print('Removed {} outliers, kept {} inliers'.format(out_dbscan, in_dbscan))

X_masked, y_masked = X_train[mask], y_train[mask]

model = DecisionTreeClassifier()
# fit the model
model.fit(X_masked, y_masked)
# evaluate the model
y_pred = model.predict(X_test)
# evaluate predictions
f1_dbscan = f1_score(y_test, y_pred)
auc_dbscan = roc_auc_score(y_test, y_pred)
print('F-1: {}\nROC_AUC: {}'.format(f1_dbscan, auc_dbscan))


Removed 16 outliers, kept 6498 inliers
F-1: 0.9484536082474228
ROC_AUC: 0.9786921332911527


# One-Class Support Vector Machine (OCSVM)
*(from scikit-learn)*

In [8]:
from sklearn.svm import OneClassSVM as OCSVM

ocsvm = OCSVM(gamma='auto', kernel='linear')


# fit the data to OCSVM
y_hat = ocsvm.fit_predict(X_train, y_train)

# filter out predictions values = -1 
# as they are considered as anomalies
mask = y_hat != -1

out_ocsvm, in_ocsvm = Counter(mask)[0],Counter(mask)[1]

print('Removed {} outliers, kept {} inliers'.format(out_ocsvm, in_ocsvm))

X_masked, y_masked = X_train[mask], y_train[mask]


model = DecisionTreeClassifier()
# fit the model
model.fit(X_masked, y_masked)
# evaluate the model
y_pred = model.predict(X_test)
# evaluate predictions
f1_ocsvm = f1_score(y_test, y_pred)
auc_ocsvm = roc_auc_score(y_test, y_pred)
print('F-1: {}\nROC_AUC: {}'.format(f1_ocsvm, auc_ocsvm))

Removed 4476 outliers, kept 2038 inliers
F-1: 0.9320388349514563
ROC_AUC: 0.9988927554571337


# IsolationForest Outlier Detector 
*(from pyod also on scikit-learn)*

In [9]:
from pyod.models.iforest import IForest

random_state = np.random.RandomState(42)
contamination = 0.1
iso = IForest(contamination=contamination, random_state=random_state)

# fit the data to IF
iso.fit(X_train)

y_hat = iso.predict(X_train)

# filter out predictions values = 0
# as they are considered as anomalies
mask = y_hat != 0

out_iso, in_iso = Counter(mask)[0],Counter(mask)[1]

print('Removed {} outliers, kept {} inliers'.format(out_iso, in_iso))


X_masked, y_masked = X_train[mask], y_train[mask]

model = DecisionTreeClassifier()
# fit the model
model.fit(X_masked, y_masked)
# evaluate the model
y_pred = model.predict(X_test)
# evaluate predictions

f1_iso = f1_score(y_test, y_pred)
auc_iso = roc_auc_score(y_test, y_pred)
print('F-1: {}\nROC_AUC: {}'.format(f1_iso, auc_iso))

Removed 5862 outliers, kept 652 inliers
F-1: 0.9565217391304348
ROC_AUC: 0.9583333333333333


# Local Outlier Factor (LOF) 
*(from pyod also on scikit-learn)*

In [10]:
from pyod.models.lof import LOF

contamination = 0.1
lof = LOF(n_neighbors=20, algorithm='auto', leaf_size=30, metric='minkowski', contamination = contamination)

# fit the data to LOF
lof.fit(X_train)

y_hat = lof.predict(X_train)

# filter out predictions values = 0
# as they are considered as anomalies
mask = y_hat != 0

out_lof, in_lof = Counter(mask)[0],Counter(mask)[1]

print('Removed {} outliers, kept {} inliers'.format(out_lof, in_lof))


X_masked, y_masked = X_train[mask], y_train[mask]

model = DecisionTreeClassifier()
# fit the model
model.fit(X_masked, y_masked)
# evaluate the model
y_pred = model.predict(X_test)
# evaluate predictions
f1_lof = f1_score(y_test, y_pred)
auc_lof = roc_auc_score(y_test, y_pred)
print('F-1: {}\nROC_AUC: {}'.format(f1_lof, auc_lof))

Removed 5948 outliers, kept 566 inliers
F-1: 0.888888888888889
ROC_AUC: 0.9981018664979437


# Clustering Based Local Outlier Factor (CBLOF) 
*(from pyod)*

In [11]:
from pyod.models.cblof import CBLOF

random_state = np.random.RandomState(42)
contamination = 0.1
cblof = CBLOF(contamination=contamination, check_estimator=False, random_state=random_state)

# fit the data to CBLOF
cblof.fit(X_train)

y_hat = cblof.predict(X_train)

# filter out predictions values = 0
# as they are considered as anomalies
mask = y_hat != 0

out_cblof, in_cblof = Counter(mask)[0],Counter(mask)[1]

print('Removed {} outliers, kept {} inliers'.format(out_cblof, in_cblof))


X_masked, y_masked = X_train[mask], y_train[mask]

model = DecisionTreeClassifier()
# fit the model
model.fit(X_masked, y_masked)
# evaluate the model
y_pred = model.predict(X_test)
# evaluate predictions
f1_cblof = f1_score(y_test, y_pred)
auc_cblof = roc_auc_score(y_test, y_pred)
print('F-1: {}\nROC_AUC: {}'.format(f1_cblof, auc_cblof))

Removed 5862 outliers, kept 652 inliers
F-1: 0.9565217391304348
ROC_AUC: 0.9583333333333333


# ABOD
*(from pyod)*

In [12]:
from pyod.models.abod import ABOD

contamination = 0.1
abod = ABOD(contamination=contamination)

# fit the data to ABOD
abod.fit(X_train)

y_hat = abod.predict(X_train)

# filter out predictions values = 0
# as they are considered as anomalies
mask = y_hat != 0

out_abod, in_abod = Counter(mask)[0],Counter(mask)[1]

print('Removed {} outliers, kept {} inliers'.format(out_abod, in_abod))


X_masked, y_masked = X_train[mask], y_train[mask]

model = DecisionTreeClassifier()
# fit the model
model.fit(X_masked, y_masked)
# evaluate the model
y_pred = model.predict(X_test)
# evaluate predictions
f1_abod = f1_score(y_test, y_pred)
auc_abod = roc_auc_score(y_test, y_pred)
print('F-1: {}\nROC_AUC: {}'.format(f1_abod, auc_abod))

Removed 5828 outliers, kept 686 inliers
F-1: 0.9896907216494846
ROC_AUC: 0.999841822208162


# Feaure Bagging 
*(from pyod)*

In [13]:
from pyod.models.feature_bagging import FeatureBagging

random_state = np.random.RandomState(42)
contamination = 0.1
fbd = FeatureBagging(LOF(n_neighbors=20),contamination=contamination,
                    check_estimator=False,random_state=random_state)

# fit the data to FB
fbd.fit(X_train)

y_hat = fbd.predict(X_train)

# filter out predictions values = 0
# as they are considered as anomalies
mask = y_hat != 0

out_fb, in_fb = Counter(mask)[0],Counter(mask)[1]

print('Removed {} outliers, kept {} inliers'.format(out_fb, in_fb))


X_masked, y_masked = X_train[mask], y_train[mask]

model = DecisionTreeClassifier()
# fit the model
model.fit(X_masked, y_masked)
# evaluate the model
y_pred = model.predict(X_test)
# evaluate predictions
f1_fb = f1_score(y_test, y_pred)
auc_fb = roc_auc_score(y_test, y_pred)
print('F-1: {}\nROC_AUC: {}'.format(f1_fb, auc_fb))

Removed 5950 outliers, kept 564 inliers
F-1: 0.888888888888889
ROC_AUC: 0.9981018664979437


# Summarize results

In [14]:
cols = ['Detector', 'Outliers', 'Inliers', 'F1', 'ROC_AUC']

df = pd.DataFrame(columns=cols)

detectors = ['None', 'OCSVM', 'ABOD', 'CBLOF', 'DBSCAN', 'FB', 'IF', 'HBOS', 'LOF']

aucs = [auc, auc_ocsvm, auc_abod, auc_cblof, auc_dbscan, auc_fb, auc_iso, auc_hbos, auc_lof]

f1s = [f1, f1_ocsvm, f1_abod, f1_cblof, f1_dbscan, f1_fb, f1_iso, f1_hbos, f1_lof]

inliers = [np.NaN, in_ocsvm, in_abod, in_cblof, in_dbscan, in_fb, in_iso, in_hbos, in_lof]

outliers = [np.NaN, out_ocsvm, out_abod, out_cblof, out_dbscan, out_fb, out_iso, out_hbos, out_lof]

df.Detector = detectors
df.Outliers = outliers
df.Inliers = inliers
df.F1 = f1s
df.ROC_AUC = aucs


In [15]:
df

Unnamed: 0,Detector,Outliers,Inliers,F1,ROC_AUC
0,,,,0.958333,0.97885
1,OCSVM,4476.0,2038.0,0.932039,0.998893
2,ABOD,5828.0,686.0,0.989691,0.999842
3,CBLOF,5862.0,652.0,0.956522,0.958333
4,DBSCAN,16.0,6498.0,0.948454,0.978692
5,FB,5950.0,564.0,0.888889,0.998102
6,IF,5862.0,652.0,0.956522,0.958333
7,HBOS,5862.0,652.0,0.956522,0.958333
8,LOF,5948.0,566.0,0.888889,0.998102
