# Anomaly Detection in Practice

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.tree import  DecisionTreeClassifier
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.compose import make_column_selector
from sklearn.preprocessing import LabelEncoder, StandardScaler
import time
np.random.seed(42)

# import the outlier detection toolkit
# install it with 
# ! pip install --upgrade pyod 
import pyod

In [2]:
# load data set
content = open('../../../../../Desktop/kdd-cup-1999-data/kddcup.names', 'r').readlines()

buf, *features = content

features_types_dict = {f.split(':')[0]: f.split(':')[1][1:-2] for f in features}
features = list(features_types_dict.keys())

content = open('../../../../../Desktop/kdd-cup-1999-data/training_attack_types', 'r').readlines()

buf = content[:-1]

attack_types_dict = {line.split()[0]: line.split()[1] for line in buf}
attack_types_dict['normal'] = 'normal'

# Load data into df
data_file = '../../../../../Desktop/kdd-cup-1999-data/kddcup.data.gz'

data = pd.read_csv(data_file, 
                      header=None, 
                      names=features + ['label'])
data['label'] = [i[:-1] for i in data['label'].values]

# select 'smtp' as service

ds = data[data.service == 'smtp'].sample(frac=1).reset_index(drop=True)

label_dict = {
    'normal': 0,
    'neptune': 1,
    'satan': 1,
    'nmap':1,
    'portsweep': 1,
    'ipsweep': 1
}

ds['label'] = [label_dict[item] for item in ds['label']]

X, y = ds.drop('label', axis = 1), ds.label


# summarize class distribution
counter = Counter(y)
print(counter)

Counter({0: 95371, 1: 1183})


In [3]:
numerical_columns_selector = make_column_selector(dtype_exclude=object)
num_features = numerical_columns_selector(X)

categorical_columns_selector = make_column_selector(dtype_include=object)
cat_features = categorical_columns_selector(X)



for feat in num_features:
        scaler = StandardScaler()
        X[feat] = scaler.fit_transform(np.array(X[feat]).reshape(-1, 1))
for feat in cat_features:
        encoder = LabelEncoder()
        X[feat] = encoder.fit_transform(np.array(X[feat]).reshape(-1, 1))

  return f(*args, **kwargs)


In [4]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
# fit the model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_test)
# evaluate predictions
f1 = f1_score(y_test, yhat)
auc = roc_auc_score(y_test, yhat)
print('F-1: {}\nROC_AUC: {}'.format(f1, auc))

(64691, 41) (64691,)
F-1: 0.9922077922077923
ROC_AUC: 0.9973322989135613


# Histogram-based Outlier Detection (HBOS)
*(from pyod)*

In [5]:
from pyod.models.hbos import HBOS

contamination = 0.1
hbos = HBOS(contamination=contamination)

# fit the data to HBOS
start = time.time()
hbos.fit(X_train)
end = time.time()
y_hat = hbos.predict(X_train)

# filter out predictions values = 0
# as they are considered as anomalies
mask = y_hat != 0

out_hbos, in_hbos = Counter(mask)[0],Counter(mask)[1]

print('Removed {} outliers, kept {} inliers'.format(out_hbos, in_hbos))

X_masked, y_masked = X_train[mask], y_train[mask]

model = DecisionTreeClassifier()
# fit the model
model.fit(X_masked, y_masked)
# evaluate the model
y_pred = model.predict(X_test)
# evaluate predictions
f1_hbos = f1_score(y_test, y_pred)
auc_hbos = roc_auc_score(y_test, y_pred)
time_hbos = end-start
print('F-1: {}\nROC_AUC: {}\nTime (s): {}'.format(f1_hbos, auc_hbos, time_hbos))

Removed 58222 outliers, kept 6469 inliers
F-1: 0.9883570504527813
ROC_AUC: 0.9972846480987324
Time (s): 2.154905080795288


# Density-Based Spatial Clustering of Applications with Noise(DBSCAN)
*(from scikit-learn)*

In [6]:
from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=0.1, min_samples=2, metric='cosine')

# fit the data to DBSCAN
start = time.time()
y_hat = dbscan.fit_predict(X_train, y_train)
end = time.time()

# filter out predictions values = -1 
# as they are considered as anomalies
mask = y_hat != -1

out_dbscan, in_dbscan = Counter(mask)[0],Counter(mask)[1]

print('Removed {} outliers, kept {} inliers'.format(out_dbscan, in_dbscan))

X_masked, y_masked = X_train[mask], y_train[mask]

model = DecisionTreeClassifier()
# fit the model
model.fit(X_masked, y_masked)
# evaluate the model
y_pred = model.predict(X_test)
# evaluate predictions
f1_dbscan = f1_score(y_test, y_pred)
auc_dbscan = roc_auc_score(y_test, y_pred)
time_dbscan = end-start
print('F-1: {}\nROC_AUC: {}\nTime (s): {}'.format(f1_dbscan, auc_dbscan, time_dbscan))


Removed 13 outliers, kept 64678 inliers
F-1: 0.9934810951760104
ROC_AUC: 0.996061982790114
Time (s): 169.66550707817078


# One-Class Support Vector Machine (OCSVM)
*(from scikit-learn)*

In [7]:
from sklearn.svm import OneClassSVM as OCSVM

ocsvm = OCSVM(gamma='auto', kernel='linear')


# fit the data to OCSVM
start = time.time()
y_hat = ocsvm.fit_predict(X_train, y_train)
end = time.time()
# filter out predictions values = -1 
# as they are considered as anomalies
mask = y_hat != -1

out_ocsvm, in_ocsvm = Counter(mask)[0],Counter(mask)[1]

print('Removed {} outliers, kept {} inliers'.format(out_ocsvm, in_ocsvm))

X_masked, y_masked = X_train[mask], y_train[mask]


model = DecisionTreeClassifier()
# fit the model
model.fit(X_masked, y_masked)
# evaluate the model
y_pred = model.predict(X_test)
# evaluate predictions
f1_ocsvm = f1_score(y_test, y_pred)
auc_ocsvm = roc_auc_score(y_test, y_pred)
time_ocsvm = end-start
print('F-1: {}\nROC_AUC: {}\nTime (s): {}'.format(f1_ocsvm, auc_ocsvm, time_ocsvm))

Removed 32355 outliers, kept 32336 inliers
F-1: 0.896797153024911
ROC_AUC: 0.9909009279996188
Time (s): 536.9208903312683


# IsolationForest Outlier Detector 
*(from pyod also on scikit-learn)*

In [8]:
from pyod.models.iforest import IForest

random_state = np.random.RandomState(42)
contamination = 0.1
iso = IForest(contamination=contamination, random_state=random_state)

# fit the data to IF
start = time.time()
iso.fit(X_train)
end = time.time()

y_hat = iso.predict(X_train)

# filter out predictions values = 0
# as they are considered as anomalies
mask = y_hat != 0

out_iso, in_iso = Counter(mask)[0],Counter(mask)[1]

print('Removed {} outliers, kept {} inliers'.format(out_iso, in_iso))


X_masked, y_masked = X_train[mask], y_train[mask]

model = DecisionTreeClassifier()
# fit the model
model.fit(X_masked, y_masked)
# evaluate the model
y_pred = model.predict(X_test)
# evaluate predictions

f1_iso = f1_score(y_test, y_pred)
auc_iso = roc_auc_score(y_test, y_pred)
time_iso = end-start
print('F-1: {}\nROC_AUC: {}\nTime (s): {}'.format(f1_iso, auc_iso, time_iso))

Removed 58222 outliers, kept 6469 inliers
F-1: 0.9869451697127937
ROC_AUC: 0.9921239655802281
Time (s): 7.078562021255493


# Local Outlier Factor (LOF) 
*(from pyod also on scikit-learn)*

In [9]:
from pyod.models.lof import LOF

contamination = 0.1
lof = LOF(n_neighbors=20, algorithm='auto', leaf_size=30, metric='minkowski', contamination = contamination)

# fit the data to LOF
start = time.time()
lof.fit(X_train)
end = time.time()

y_hat = lof.predict(X_train)

# filter out predictions values = 0
# as they are considered as anomalies
mask = y_hat != 0

out_lof, in_lof = Counter(mask)[0],Counter(mask)[1]

print('Removed {} outliers, kept {} inliers'.format(out_lof, in_lof))


X_masked, y_masked = X_train[mask], y_train[mask]

model = DecisionTreeClassifier()
# fit the model
model.fit(X_masked, y_masked)
# evaluate the model
y_pred = model.predict(X_test)
# evaluate predictions
f1_lof = f1_score(y_test, y_pred)
auc_lof = roc_auc_score(y_test, y_pred)
time_lof = end-start
print('F-1: {}\nROC_AUC: {}\nTime (s): {}'.format(f1_lof, auc_lof, time_lof))

Removed 58903 outliers, kept 5788 inliers
F-1: 0.9750982961992136
ROC_AUC: 0.9842638147653991
Time (s): 90.03914904594421


# Clustering Based Local Outlier Factor (CBLOF) 
*(from pyod)*

In [10]:
from pyod.models.cblof import CBLOF

random_state = np.random.RandomState(42)
contamination = 0.1
cblof = CBLOF(contamination=contamination, check_estimator=False, random_state=random_state)

# fit the data to CBLOF
start = time.time()
cblof.fit(X_train)
end = time.time()
y_hat = cblof.predict(X_train)

# filter out predictions values = 0
# as they are considered as anomalies
mask = y_hat != 0

out_cblof, in_cblof = Counter(mask)[0],Counter(mask)[1]

print('Removed {} outliers, kept {} inliers'.format(out_cblof, in_cblof))


X_masked, y_masked = X_train[mask], y_train[mask]

model = DecisionTreeClassifier()
# fit the model
model.fit(X_masked, y_masked)
# evaluate the model
y_pred = model.predict(X_test)
# evaluate predictions
f1_cblof = f1_score(y_test, y_pred)
auc_cblof = roc_auc_score(y_test, y_pred)
time_cblof = end-start
print('F-1: {}\nROC_AUC: {}\nTime (s): {}'.format(f1_cblof, auc_cblof, time_cblof))

Removed 58222 outliers, kept 6469 inliers
F-1: 0.9869451697127937
ROC_AUC: 0.9921239655802281
Time (s): 2.391855001449585


# ABOD
*(from pyod)*

In [11]:
from pyod.models.abod import ABOD

contamination = 0.1
abod = ABOD(contamination=contamination)

# fit the data to ABOD
start = time.time()
abod.fit(X_train)
end = time.time()

y_hat = abod.predict(X_train)

# filter out predictions values = 0
# as they are considered as anomalies
mask = y_hat != 0

out_abod, in_abod = Counter(mask)[0],Counter(mask)[1]

print('Removed {} outliers, kept {} inliers'.format(out_abod, in_abod))


X_masked, y_masked = X_train[mask], y_train[mask]

model = DecisionTreeClassifier()
# fit the model
model.fit(X_masked, y_masked)
# evaluate the model
y_pred = model.predict(X_test)
# evaluate predictions
f1_abod = f1_score(y_test, y_pred)
auc_abod = roc_auc_score(y_test, y_pred)
time_abod = end-start
print('F-1: {}\nROC_AUC: {}\nTime (s): {}'.format(f1_abod, auc_abod, time_abod))

Removed 58034 outliers, kept 6657 inliers
F-1: 0.9869451697127937
ROC_AUC: 0.9921239655802281
Time (s): 112.96911573410034


# Feaure Bagging 
*(from pyod)*

In [12]:
from pyod.models.feature_bagging import FeatureBagging

random_state = np.random.RandomState(42)
contamination = 0.1
fbd = FeatureBagging(LOF(n_neighbors=20),contamination=contamination,
                    check_estimator=False,random_state=random_state)

# fit the data to FB
start = time.time()
fbd.fit(X_train)
end = time.time()

y_hat = fbd.predict(X_train)

# filter out predictions values = 0
# as they are considered as anomalies
mask = y_hat != 0

out_fb, in_fb = Counter(mask)[0],Counter(mask)[1]

print('Removed {} outliers, kept {} inliers'.format(out_fb, in_fb))


X_masked, y_masked = X_train[mask], y_train[mask]

model = DecisionTreeClassifier()
# fit the model
model.fit(X_masked, y_masked)
# evaluate the model
y_pred = model.predict(X_test)
# evaluate predictions
f1_fb = f1_score(y_test, y_pred)
auc_fb = roc_auc_score(y_test, y_pred)
time_fb = end-start
print('F-1: {}\nROC_AUC: {}\nTime (s): {}'.format(f1_fb, auc_fb, time_fb))

Removed 59093 outliers, kept 5598 inliers
F-1: 0.9736147757255936
ROC_AUC: 0.9803893319752851
Time (s): 930.2121160030365


# Summarize results

In [13]:
cols = ['Detector', 'Outliers', 'Inliers', 'Time (s)', 'F1', 'ROC_AUC']

detectors = ['None', 'OCSVM', 'ABOD', 'CBLOF', 'DBSCAN', 'FB', 'IF', 'HBOS', 'LOF']

aucs = [auc, auc_ocsvm, auc_abod, auc_cblof, auc_dbscan, auc_fb, auc_iso, auc_hbos, auc_lof]

f1s = [f1, f1_ocsvm, f1_abod, f1_cblof, f1_dbscan, f1_fb, f1_iso, f1_hbos, f1_lof]

times = [np.NaN, time_ocsvm, time_abod, time_cblof, time_dbscan, time_fb, time_iso, time_hbos, time_lof]

inliers = [np.NaN, in_ocsvm, in_abod, in_cblof, in_dbscan, in_fb, in_iso, in_hbos, in_lof]

outliers = [np.NaN, out_ocsvm, out_abod, out_cblof, out_dbscan, out_fb, out_iso, out_hbos, out_lof]

df = pd.DataFrame(columns=cols)
df.Detector = detectors
df.Outliers = outliers
df.Inliers = inliers
df.F1 = f1s
df.ROC_AUC = aucs
df['Time (s)'] = times


In [14]:
df

Unnamed: 0,Detector,Outliers,Inliers,Time (s),F1,ROC_AUC
0,,,,,0.992208,0.997332
1,OCSVM,32355.0,32336.0,536.92089,0.896797,0.990901
2,ABOD,58034.0,6657.0,112.969116,0.986945,0.992124
3,CBLOF,58222.0,6469.0,2.391855,0.986945,0.992124
4,DBSCAN,13.0,64678.0,169.665507,0.993481,0.996062
5,FB,59093.0,5598.0,930.212116,0.973615,0.980389
6,IF,58222.0,6469.0,7.078562,0.986945,0.992124
7,HBOS,58222.0,6469.0,2.154905,0.988357,0.997285
8,LOF,58903.0,5788.0,90.039149,0.975098,0.984264
