In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import KMeans
from scipy import stats
from scipy.spatial import distance
import random
# Seed every RNG the notebook relies on. pandas' DataFrame.sample draws from
# NumPy's global generator when no random_state is passed, so seeding only the
# stdlib `random` module would leave that sampling unseeded.
random.seed(8888)
np.random.seed(8888)

In [2]:
# KSE 
def euc_dis(p1, p2):
    """Return the Euclidean distance between two points as a Python float.

    Parameters
    ----------
    p1, p2 : array-like
        One-dimensional numeric vectors of equal length, e.g. DataFrame rows
        obtained with ``.iloc``, lists, or NumPy arrays.

    Returns
    -------
    float
        Euclidean distance between ``p1`` and ``p2``.

    Raises
    ------
    ValueError
        If the two vectors have different lengths and cannot be stacked.
    TypeError
        If the inputs do not describe exactly two points.
    """
    # np.asarray handles Series, lists and arrays alike (the old `.values`
    # access only worked for pandas objects).
    first = np.asarray(p1)
    second = np.asarray(p2)
    pair = np.vstack((first, second))
    dist = distance.pdist(pair)
    if dist.size != 1:
        raise TypeError("euc_dis expects two one-dimensional points")
    # Pick the single element explicitly: calling float() on a 1-element
    # ndarray is deprecated in recent NumPy releases.
    return float(dist[0])
def KSE_Test(df, sample_size, random_state=None):
    """Score every row of ``df`` with an averaged two-sample KS statistic.

    Two random samples of ``sample_size`` rows are drawn from ``df`` without
    replacement. Each row of the first sample gets a "distance profile": its
    Euclidean distances to every row of the second sample. For every row of
    ``df`` the same kind of profile is computed and compared, with
    ``scipy.stats.ks_2samp``, against each of the first sample's profiles;
    the row's score is the mean of those KS statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature table; one row per instance.
    sample_size : int
        Number of rows in each of the two random samples. Must be at least 1
        when ``df`` is non-empty and at most ``len(df)``.
    random_state : int, numpy.random.RandomState or None, optional
        Seed or generator for the two draws. ``None`` (the default) uses
        NumPy's global generator, as before this parameter existed.

    Returns
    -------
    list of (int, float)
        ``(position, score)`` for every row of ``df``, where ``position`` is
        the row's positional index (suitable for ``df.iloc``).

    Raises
    ------
    ValueError
        If ``sample_size`` is smaller than 1 for a non-empty ``df``, or if
        pandas cannot draw ``sample_size`` rows without replacement.
    """
    n_rows = len(df)
    if n_rows > 0 and sample_size < 1:
        # Without this guard the averaging step would divide by zero.
        raise ValueError("sample_size must be at least 1 for a non-empty df")

    # Share ONE generator between both draws: passing the same integer seed
    # to each call would make the two samples identical.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)
    sample1 = df.sample(n=sample_size, replace=False, random_state=rng)
    sample2 = df.sample(n=sample_size, replace=False, random_state=rng)

    kse_score = []
    if n_rows > 0:
        reference = np.asarray(sample2.values, dtype=float)
        # Row k: distances from sample1[k] to every row of sample2.
        sample_profiles = distance.cdist(
            np.asarray(sample1.values, dtype=float), reference
        )
        # Row i: distances from df row i to every row of sample2.
        instance_profiles = distance.cdist(
            np.asarray(df.values, dtype=float), reference
        )
        for position, profile in enumerate(instance_profiles):
            total = 0.0
            for sample_profile in sample_profiles:
                total += stats.ks_2samp(profile, sample_profile)[0]
            kse_score.append((position, total / sample_size))

    print("done")
    return kse_score

def iForest(df, max_samples=1000, contamination=0.05, random_state=5):
    """Return the rows of ``df`` that an Isolation Forest labels as anomalous.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature table the forest is both fitted on and scored on.
    max_samples : int or float, optional
        Passed to ``IsolationForest``; defaults to the notebook's original
        value of 1000.
    contamination : float, optional
        Passed to ``IsolationForest``; defaults to the original 0.05.
    random_state : int, optional
        Seed for the forest; defaults to the original value 5.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index preserved) whose predicted
        label is -1.
    """
    clf = IsolationForest(
        max_samples=max_samples,
        contamination=contamination,
        random_state=random_state,
        bootstrap=False,
    )
    clf.fit(df)
    labels = clf.predict(df)
    # predict() marks outliers with -1; keep only those rows.
    return df[labels == -1]

def final_result(anom_df, sample_fraction=0.5, threshold=0.5):
    """Keep only the rows of ``anom_df`` whose average KS score is high.

    Runs ``KSE_Test`` on ``anom_df`` and returns the rows whose score is
    strictly greater than ``threshold`` ("the anomalies of the anomalies").

    Parameters
    ----------
    anom_df : pandas.DataFrame
        Candidate anomalies, one row per instance.
    sample_fraction : float, optional
        Fraction of ``anom_df`` used as the sample size passed to
        ``KSE_Test``; defaults to the original 0.5.
    threshold : float, optional
        Score cut-off; defaults to the original 0.5.

    Returns
    -------
    pandas.DataFrame
        Rows of ``anom_df`` (original index preserved) scoring above
        ``threshold``.
    """
    n_rows = len(anom_df)
    sample_size = int(sample_fraction * n_rows)
    if n_rows > 0 and sample_size < 1:
        # A tiny frame would otherwise ask for a sample of 0 rows, which makes
        # KSE_Test divide by zero when averaging.
        sample_size = 1
    kse_result = KSE_Test(anom_df, sample_size)
    # KSE_Test reports positional indices, hence .iloc below.
    index_anom = [position for position, score in kse_result if score > threshold]
    return anom_df.iloc[index_anom]

In [6]:
# Load the raw and the normalized Monday-morning "Pages" data, keeping only the
# four numeric features, so the iForest results on both can be compared.
mon = pd.read_csv(
    '~/Desktop/data/data02_13_morning_Pages',
    sep=',',
)[["bytes", "pkts", "dur", "rate"]]
mon_normalized = pd.read_csv(
    '~/Desktop/data/normalized_data/data02_13_morning_Pages_normalized',
    sep=',',
)[["bytes", "pkts", "dur", "rate"]]

In [9]:
# Stage 1: run the Isolation Forest helper on the raw and on the normalized
# features; each call returns the rows the model labelled -1 (anomalous).
mon_anom = iForest(mon)
mon_normalized_anom = iForest(mon_normalized)

In [15]:
# Row labels flagged in the raw data (s) and in the normalized data (t);
# the cell's value is how many rows were flagged in both.
s = set(mon_anom.index.values)
t = set(mon_normalized_anom.index.values)
len(s & t)

40

In [16]:
# Stage 2: re-score the stage-1 anomalies with the KS-based test and keep only
# those whose average KS statistic exceeds 0.5. Each call prints "done".
# NOTE(review): KSE_Test samples rows without a fixed random_state, so these
# results presumably vary from run to run — confirm before comparing runs.
mon_final = final_result(mon_anom)
mon_normalized_final = final_result(mon_normalized_anom)

done
done


In [17]:
# Row labels surviving the second stage for the raw data (x) and for the
# normalized data (y); the cell's value is the size of their overlap.
x = set(mon_final.index.values)
y = set(mon_normalized_final.index.values)
len(x & y)

0

In [20]:
# Only the last expression of a cell is displayed, so the row count has to be
# printed explicitly; a bare len(...) on the first line is silently discarded.
print(len(mon_final))
mon_final

Unnamed: 0,bytes,pkts,dur,rate
474,2694404,1812,0.748,28817149.5
1758,401340,280,0.136,23608208.14
1783,622972,428,0.178,27998745.77
2235,14114580,9432,5.045,22381890.64
2590,700644,484,0.283,19806191.35
3896,622988,428,0.238,20940780.49


In [21]:
# Only the last expression of a cell is displayed, so the row count has to be
# printed explicitly; a bare len(...) on the first line is silently discarded.
print(len(mon_normalized_final))
mon_normalized_final

Unnamed: 0,bytes,pkts,dur,rate
544,0.239904,0.005579,0.000689,0.97078
640,0.239904,0.005452,0.000898,0.970781
744,0.178343,0.004053,0.00049,0.98396
