In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import KMeans
from scipy import stats
from scipy.spatial import distance
import random
# Seed Python's built-in `random` module.
# NOTE(review): nothing visible in this notebook calls `random` directly, and
# iForest passes its own random_state to the estimator -- confirm whether this
# seed is still needed.
random.seed(8888)
def iForest(df, contamination=0.05, max_samples=1000, random_state=5):
    """Return the rows of ``df`` that an Isolation Forest labels as outliers.

    The model is fit on ``df`` and then used to label those same rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to fit on and to label.
    contamination : float, default 0.05
        Passed to ``IsolationForest(contamination=...)``.
    max_samples : int, default 1000
        Passed to ``IsolationForest(max_samples=...)``.
    random_state : int, default 5
        Passed to ``IsolationForest(random_state=...)`` so repeated runs on
        the same data use the same seed.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` whose predicted label is -1.
    """
    forest = IsolationForest(
        max_samples=max_samples,
        contamination=contamination,
        random_state=random_state,
        bootstrap=False,
    )
    forest.fit(df)
    labels = forest.predict(df)
    # predict() marks outliers with -1; keep only those rows.
    return df[labels == -1]

def LOF(df, n_neighbors=20, contamination=0.05):
    """Return the rows of ``df`` that Local Outlier Factor labels as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to fit on and to label.
    n_neighbors : int, default 20
        Passed to ``LocalOutlierFactor(n_neighbors=...)``.
    contamination : float, default 0.05
        Passed to ``LocalOutlierFactor(contamination=...)``.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` whose predicted label is -1.
    """
    detector = LocalOutlierFactor(
        n_neighbors=n_neighbors,
        contamination=contamination,
    )
    labels = detector.fit_predict(df)
    # fit_predict() marks outliers with -1; keep only those rows.
    return df[labels == -1]

In [3]:
## Normalized input data
# The directory, file-name pattern and feature columns are identical for every
# day, so they are written once here rather than repeated in each read_csv call.
NORMALIZED_DIR = '~/Desktop/data/normalized'
FEATURE_COLS = ["bytes", "pkts", "dur", "rate"]


def load_normalized(date_tag):
    """Read one normalized CSV and keep only the columns in FEATURE_COLS.

    ``date_tag`` is the part of the file name between ``data`` and
    ``_morning_Pages_normalized`` (for example ``'02_13'``).
    """
    path = '{}/data{}_morning_Pages_normalized'.format(NORMALIZED_DIR, date_tag)
    return pd.read_csv(path, sep=',')[FEATURE_COLS]


mon_normalized = load_normalized('02_13')
tue_normalized = load_normalized('02_14')
wed_normalized = load_normalized('02_15')
thur_normalized = load_normalized('02_16')
fri_normalized = load_normalized('02_17')

## Anomalous data - iForest
mon_norm_anom = iForest(mon_normalized)
tue_norm_anom = iForest(tue_normalized)
wed_norm_anom = iForest(wed_normalized)
thur_norm_anom = iForest(thur_normalized)
fri_norm_anom = iForest(fri_normalized)

## Anomalous data - LOF
# Same inputs as above; the trailing underscore marks the LOF result.
mon_norm_anom_ = LOF(mon_normalized)
tue_norm_anom_ = LOF(tue_normalized)
wed_norm_anom_ = LOF(wed_normalized)
thur_norm_anom_ = LOF(thur_normalized)
fri_norm_anom_ = LOF(fri_normalized)

In [4]:
## KS-tests
def ks_by_feature(iforest_anom, lof_anom, columns=("bytes", "pkts", "dur", "rate")):
    """Two-sample KS statistic per column between two anomaly tables.

    For each name in ``columns`` this runs ``stats.ks_2samp`` on that column of
    ``iforest_anom`` and of ``lof_anom`` and keeps element 0 of the result
    (the KS statistic). The values are returned as a tuple in the same order
    as ``columns``.

    Columns are selected with ``frame[col]`` rather than ``frame.col`` so a
    column name can never be shadowed by a DataFrame attribute or method.
    """
    return tuple(
        stats.ks_2samp(iforest_anom[col], lof_anom[col])[0] for col in columns
    )


# One call per day: iForest anomalies versus LOF anomalies for that day.
ks_mon_bytes, ks_mon_pkts, ks_mon_dur, ks_mon_rate = ks_by_feature(mon_norm_anom, mon_norm_anom_)

ks_tue_bytes, ks_tue_pkts, ks_tue_dur, ks_tue_rate = ks_by_feature(tue_norm_anom, tue_norm_anom_)

ks_wed_bytes, ks_wed_pkts, ks_wed_dur, ks_wed_rate = ks_by_feature(wed_norm_anom, wed_norm_anom_)

ks_thur_bytes, ks_thur_pkts, ks_thur_dur, ks_thur_rate = ks_by_feature(thur_norm_anom, thur_norm_anom_)

ks_fri_bytes, ks_fri_pkts, ks_fri_dur, ks_fri_rate = ks_by_feature(fri_norm_anom, fri_norm_anom_)

In [5]:
# KS statistic between the iForest and LOF anomaly sets of the `mon` data, column `bytes`.
ks_mon_bytes

0.63636363636363635

In [6]:
# KS statistic between the iForest and LOF anomaly sets of the `mon` data, column `pkts`.
ks_mon_pkts

0.60287081339712922

In [7]:
# KS statistic between the iForest and LOF anomaly sets of the `mon` data, column `dur`.
ks_mon_dur

0.53110047846889952

In [8]:
# KS statistic between the iForest and LOF anomaly sets of the `mon` data, column `rate`.
ks_mon_rate

0.63636363636363624

In [9]:
# KS statistic between the iForest and LOF anomaly sets of the `tue` data, column `bytes`.
ks_tue_bytes

0.76901408450704223

In [10]:
# KS statistic between the iForest and LOF anomaly sets of the `tue` data, column `pkts`.
ks_tue_pkts

0.74366197183098592

In [11]:
# KS statistic between the iForest and LOF anomaly sets of the `tue` data, column `dur`.
ks_tue_dur

0.52394366197183095

In [12]:
# KS statistic between the iForest and LOF anomaly sets of the `tue` data, column `rate`.
ks_tue_rate

0.76901408450704223

In [13]:
# KS statistic between the iForest and LOF anomaly sets of the `wed` data, column `bytes`.
ks_wed_bytes

0.74535809018567645

In [14]:
# KS statistic between the iForest and LOF anomaly sets of the `wed` data, column `pkts`.
ks_wed_pkts

0.70822281167108758

In [15]:
# KS statistic between the iForest and LOF anomaly sets of the `wed` data, column `dur`.
ks_wed_dur

0.43766578249336874

In [16]:
# KS statistic between the iForest and LOF anomaly sets of the `wed` data, column `rate`.
ks_wed_rate

0.74535809018567645

In [17]:
# KS statistic between the iForest and LOF anomaly sets of the `thur` data, column `bytes`.
ks_thur_bytes

0.87846481876332627

In [18]:
# KS statistic between the iForest and LOF anomaly sets of the `thur` data, column `pkts`.
ks_thur_pkts

0.74200426439232414

In [19]:
# KS statistic between the iForest and LOF anomaly sets of the `thur` data, column `dur`.
ks_thur_dur

0.41791044776119413

In [20]:
# KS statistic between the iForest and LOF anomaly sets of the `thur` data, column `rate`.
ks_thur_rate

0.87846481876332627

In [21]:
# KS statistic between the iForest and LOF anomaly sets of the `fri` data, column `bytes`.
ks_fri_bytes

0.75899280575539563

In [22]:
# KS statistic between the iForest and LOF anomaly sets of the `fri` data, column `pkts`.
ks_fri_pkts

0.7158273381294965

In [23]:
# KS statistic between the iForest and LOF anomaly sets of the `fri` data, column `dur`.
ks_fri_dur

0.59712230215827344

In [24]:
# KS statistic between the iForest and LOF anomaly sets of the `fri` data, column `rate`.
ks_fri_rate

0.75899280575539563