In [1]:
"""Anomaly Detection Example"""
from __future__ import print_function
from IPython.display import display, HTML

from matplotlib import pyplot as plt
%matplotlib inline
import os
import sys
import argparse
import math
from collections import Counter
import ipaddress
import seaborn as sns
# Third Party Imports
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.externals import joblib
# Local imports
from zat import log_to_dataframe
from zat import dataframe_to_matrix



## IESCO 'DNS' data is being loaded
##### total length of data is 5380286 rows

In [2]:
pd.set_option('display.width', 1000)
bro_log = 'http'
bro_df = pd.read_csv('../IESCO_dec_feb/dns.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
bro_df=bro_df[['id.orig_h','id.resp_h','ts','Z','id.orig_p','id.resp_p', 'proto', 'qtype_name','query','answers']].dropna()
bro_df.head(10)

Unnamed: 0,id.orig_h,id.resp_h,ts,Z,id.orig_p,id.resp_p,proto,qtype_name,query,answers
49,fe80::d501:66f7:e7df:1ae4,ff02::fb,2019-12-24 08:39:44.642102003,0,5353,5353,udp,*,shakilkhan.local,"fe80::d501:66f7:e7df:1ae4,172.16.4.56"
50,172.16.4.56,224.0.0.251,2019-12-24 08:39:44.638959885,0,5353,5353,udp,*,shakilkhan.local,"fe80::d501:66f7:e7df:1ae4,172.16.4.56"
51,fe80::d501:66f7:e7df:1ae4,ff02::fb,2019-12-24 08:39:44.651731968,0,5353,5353,udp,*,shakilkhan.local,"fe80::d501:66f7:e7df:1ae4,172.16.4.56"
52,172.16.4.56,224.0.0.251,2019-12-24 08:39:44.651254892,0,5353,5353,udp,*,shakilkhan.local,"fe80::d501:66f7:e7df:1ae4,172.16.4.56"
346,172.16.4.50,224.0.0.251,2019-12-24 08:46:39.509027958,0,5353,5353,udp,*,desktop-g99het0.local,"fe80::440c:299f:2ffa:335b,172.16.4.50"
347,fe80::440c:299f:2ffa:335b,ff02::fb,2019-12-24 08:46:39.511996031,0,5353,5353,udp,*,desktop-g99het0.local,"fe80::440c:299f:2ffa:335b,172.16.4.50"
348,fe80::440c:299f:2ffa:335b,ff02::fb,2019-12-24 08:46:39.516772032,0,5353,5353,udp,*,desktop-g99het0.local,"fe80::440c:299f:2ffa:335b,172.16.4.50"
349,172.16.4.50,224.0.0.251,2019-12-24 08:46:39.515398025,0,5353,5353,udp,*,desktop-g99het0.local,"fe80::440c:299f:2ffa:335b,172.16.4.50"
631,fe80::d501:66f7:e7df:1ae4,ff02::fb,2019-12-24 08:54:44.705049992,0,5353,5353,udp,*,shakilkhan.local,"fe80::d501:66f7:e7df:1ae4,172.16.4.56"
632,172.16.4.56,224.0.0.251,2019-12-24 08:54:44.703833103,0,5353,5353,udp,*,shakilkhan.local,"fe80::d501:66f7:e7df:1ae4,172.16.4.56"


In [4]:
def entropy(string):
    """Compute entropy on the string"""
    try:
        p, lns = Counter(string), float(len(string))
        return -sum(count/lns * math.log(count/lns, 2) for count in p.values())
    except Exception as e:
        return pd.np.nan
def ip_embedding_ext(ip):
    transformed = None
    try:
        transformed = int(ipaddress.ip_address(ip))
        return transformed
    except:
        return transformed

In [5]:
bro_df=bro_df[bro_df['proto'].isin(['tcp','udp','icmp'])]
bro_df=bro_df[bro_df['Z'].isin([0,1])]

bro_df['id.orig_h_e']=bro_df['id.orig_h'].apply(lambda x : ip_embedding_ext(x))
bro_df['id.resp_p'] = pd.to_numeric(bro_df['id.resp_p'], errors='coerce')
bro_df['id.resp_p'] = bro_df['id.resp_p'].fillna(0)

bro_df['id.orig_p'] = pd.to_numeric(bro_df['id.orig_p'], errors='coerce')
bro_df['id.orig_p'] = bro_df['id.orig_p'].fillna(0)

bro_df['id.orig_h_e'] = pd.to_numeric(bro_df['id.orig_h_e'], errors='coerce')
bro_df['id.orig_h_e'] = bro_df['id.orig_h_e'].fillna(0)

In [6]:
bro_log ='dns'

In [7]:
bro_df['Z'].value_counts()

0    8044
Name: Z, dtype: int64

In [8]:
# File may have a tilde in it
if bro_log:
    bro_log = os.path.expanduser(bro_log)
    # Sanity check either http or dns log
    if 'http' in bro_log:
        log_type = 'http'
        features = ['id.orig_h_e','id.orig_p','id.resp_p', 'method', 'resp_mime_types', 'request_body_len']
    elif 'dns' in bro_log:
        log_type = 'dns'
        features = ['id.orig_h_e','Z', 'proto', 'qtype_name','id.orig_p','id.resp_p', 'query_length', 'answer_length', 'entropy']
    else:
        print('This example only works with Zeek with http.log or dns.log files..')
        sys.exit(1)

    # Create a Pandas dataframe from a Zeek log
    try:
        log_to_df = log_to_dataframe.LogToDataFrame()
        #bro_df = pd.read_csv('http_Farrukh-Naveed-Anjum.csv')#log_to_df.create_dataframe(bro_log)
        #print(bro_df.head())
    except IOError:
        print('Could not open or parse the specified logfile: %s' % bro_log)
        sys.exit(1)
    print('Read in {:d} Rows...'.format(len(bro_df)))

    # Using Pandas we can easily and efficiently compute additional data metrics
    # Here we use the vectorized operations of Pandas/Numpy to compute query length
    # We'll also compute entropy of the query
    if log_type == 'dns':
        bro_df['query_length'] = bro_df['query'].str.len()
        bro_df['answer_length'] = bro_df['answers'].str.len()
        bro_df['entropy'] = bro_df['query'].map(lambda x: entropy(x))
    # Use the zat DataframeToMatrix class
    to_matrix = dataframe_to_matrix.DataFrameToMatrix()

Read in 8044 Rows...


In [9]:
    bro_matrix = to_matrix.fit_transform(bro_df[features])
    print(bro_matrix.shape)

Changing column Z to category...
Changing column proto to category...
Changing column qtype_name to category...
Normalizing column id.orig_h_e...
Normalizing column id.orig_p...
Normalizing column id.resp_p...
Normalizing column query_length...
Normalizing column answer_length...
Normalizing column entropy...
(8044, 12)


In [10]:
# Train/fit and Predict anomalous instances using the Isolation Forest model
odd_clf = IsolationForest(behaviour='new',contamination=0.005,n_jobs=-1,verbose=0)  # Marking 20% as odd
odd_clf.fit(bro_matrix)

IsolationForest(behaviour='new', bootstrap=False, contamination=0.005,
                max_features=1.0, max_samples='auto', n_estimators=100,
                n_jobs=-1, random_state=None, verbose=0, warm_start=False)

In [11]:
dns_pickle = {'model':odd_clf,'tranformer':to_matrix}

In [12]:
dns_pickle

{'model': IsolationForest(behaviour='new', bootstrap=False, contamination=0.005,
                 max_features=1.0, max_samples='auto', n_estimators=100,
                 n_jobs=-1, random_state=None, verbose=0, warm_start=False),
 'tranformer': <zat.dataframe_to_matrix.DataFrameToMatrix at 0x24077cfde48>}

In [13]:
with open('iiesco_dns_iforest_v1.pkl', 'wb') as pickle_file:
    joblib.dump(dns_pickle, pickle_file)

In [14]:
#odd_clf = joblib.load('Farukh.pkl')

In [15]:
# Now we create a new dataframe using the prediction from our classifier
predictions = odd_clf.predict(bro_matrix)

In [16]:
odd_df = bro_df[features][predictions == -1]
display_df = bro_df[predictions == -1]

In [17]:
# Now we're going to explore our odd observations with help from KMeans
odd_matrix = to_matrix.fit_transform(odd_df)
num_clusters = min(len(odd_df), 4)  # 4 clusters unless we have less than 4 observations
display_df['cluster'] = KMeans(n_clusters=num_clusters).fit_predict(odd_matrix)
print(odd_matrix.shape)
# display(HTML(display_df.head(50).to_html()));

Changing column Z to category...
Changing column proto to category...
Changing column qtype_name to category...
Normalizing column id.orig_h_e...
Normalizing column id.orig_p...
Normalizing column id.resp_p...
Normalizing column query_length...
Normalizing column answer_length...
Normalizing column entropy...
(40, 11)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [18]:
pd.set_option('display.max_rows', 100)
outputdf = pd.DataFrame()
group = pd.DataFrame()
# Now group the dataframe by cluster
if log_type == 'dns':
    features += ['query']
    features+=['ts']  
    features += ['id.orig_h']
    features += ['answers']
else:
    features += ['host']
    features += ['uri']
    features+=['ts']
    
cluster_groups = display_df[features+['cluster']].groupby('cluster')

# Now print out the details for each cluster
print('<<< Outliers Detected! >>>')
for key, group in cluster_groups:
    print('\nCluster {:d}: {:d} observations'.format(key, len(group)))
    group = group[['ts','id.orig_h','Z', 'proto', 'qtype_name','id.orig_p','id.resp_p','query_length','answer_length','entropy','query','answers','cluster']]
    outputdf = pd.concat([outputdf,group],axis=0)
display(HTML(outputdf.to_html()))

<<< Outliers Detected! >>>

Cluster 0: 7 observations

Cluster 1: 25 observations

Cluster 2: 5 observations

Cluster 3: 3 observations


Unnamed: 0,ts,id.orig_h,Z,proto,qtype_name,id.orig_p,id.resp_p,query_length,answer_length,entropy,query,answers,cluster
26829,2020-01-01 10:06:52.847625017,fe80::aecc:97dd:3f89:a08d,0,udp,PTR,5353,5353,15,55,3.056565,_ipp._tcp.local,"cogitopython-virtualbox.local,fe80::aecc:97dd:3f89:a08d",0
26836,2020-01-01 10:06:53.848540068,fe80::aecc:97dd:3f89:a08d,0,udp,PTR,5353,5353,15,55,3.056565,_ipp._tcp.local,"cogitopython-virtualbox.local,fe80::aecc:97dd:3f89:a08d",0
26837,2020-01-01 10:06:55.849909067,fe80::aecc:97dd:3f89:a08d,0,udp,PTR,5353,5353,15,55,3.056565,_ipp._tcp.local,"cogitopython-virtualbox.local,fe80::aecc:97dd:3f89:a08d",0
76815,2020-01-06 07:33:03.904728889,fe80::743e:a5df:e3e0:5fa2,0,udp,PTR,5353,5353,15,55,3.056565,_ipp._tcp.local,"cogitopython-virtualbox.local,fe80::743e:a5df:e3e0:5fa2",0
76832,2020-01-06 07:33:48.544361115,fe80::743e:a5df:e3e0:5fa2,0,udp,PTR,5353,5353,15,55,3.056565,_ipp._tcp.local,"cogitopython-virtualbox.local,fe80::743e:a5df:e3e0:5fa2",0
76846,2020-01-06 07:34:33.728427887,fe80::743e:a5df:e3e0:5fa2,0,udp,PTR,5353,5353,15,55,3.056565,_ipp._tcp.local,"cogitopython-virtualbox.local,fe80::743e:a5df:e3e0:5fa2",0
76863,2020-01-06 07:35:18.497914076,fe80::743e:a5df:e3e0:5fa2,0,udp,PTR,5353,5353,15,55,3.056565,_ipp._tcp.local,"cogitopython-virtualbox.local,fe80::743e:a5df:e3e0:5fa2",0
261020,2020-01-22 21:13:15.546655893,172.16.2.211,0,udp,A,58263,53,23,150,3.795089,ctldl.windowsupdate.com,"audownload.windowsupdate.nsatc.net,auto.au.download.windowsupdate.com.c.footprint.net,8.241.91.126,8.241.78.126,8.241.80.126,8.241.90.126,8.241.80.254",1
325696,2020-01-25 09:14:01.900773048,172.16.2.211,0,udp,A,49853,53,23,127,3.795089,ctldl.windowsupdate.com,"audownload.windowsupdate.nsatc.net,au.download.windowsupdate.com.hwcdn.net,cds.d2s7q6s2.hwcdn.net,205.185.216.10,205.185.216.42",1
331218,2020-01-25 12:49:35.744204044,172.16.2.210,0,udp,A,51147,53,23,153,3.795089,ctldl.windowsupdate.com,"audownload.windowsupdate.nsatc.net,auto.au.download.windowsupdate.com.c.footprint.net,67.27.233.126,8.241.78.126,8.253.207.120,8.248.141.254,8.253.95.249",1


In [19]:
# Now group the dataframe by cluster
if log_type == 'dns':
    features += ['query']
    features+=['ts']  
    features += ['id.orig_h']
    features += ['answers']




else:
    features += ['host']
    features += ['uri']
    features+=['ts']


cluster_groups = display_df[features+['cluster']].groupby('cluster')

# Now print out the details for each cluster
print('<<< Outliers Detected! >>>')
for key, group in cluster_groups:
    print('\nCluster {:d}: {:d} observations'.format(key, len(group)))
    group = group.T.drop_duplicates().T
    display(HTML(group.head(100).to_html()))

<<< Outliers Detected! >>>

Cluster 0: 7 observations


Unnamed: 0,id.orig_h_e,Z,proto,qtype_name,id.orig_p,query_length,answer_length,entropy,query,ts,id.orig_h,answers
26829,3.3828899999999997e+38,0,udp,PTR,5353,15,55,3.05656,_ipp._tcp.local,2020-01-01 10:06:52.847625017,fe80::aecc:97dd:3f89:a08d,"cogitopython-virtualbox.local,fe80::aecc:97dd:3f89:a08d"
26836,3.3828899999999997e+38,0,udp,PTR,5353,15,55,3.05656,_ipp._tcp.local,2020-01-01 10:06:53.848540068,fe80::aecc:97dd:3f89:a08d,"cogitopython-virtualbox.local,fe80::aecc:97dd:3f89:a08d"
26837,3.3828899999999997e+38,0,udp,PTR,5353,15,55,3.05656,_ipp._tcp.local,2020-01-01 10:06:55.849909067,fe80::aecc:97dd:3f89:a08d,"cogitopython-virtualbox.local,fe80::aecc:97dd:3f89:a08d"
76815,3.3828899999999997e+38,0,udp,PTR,5353,15,55,3.05656,_ipp._tcp.local,2020-01-06 07:33:03.904728889,fe80::743e:a5df:e3e0:5fa2,"cogitopython-virtualbox.local,fe80::743e:a5df:e3e0:5fa2"
76832,3.3828899999999997e+38,0,udp,PTR,5353,15,55,3.05656,_ipp._tcp.local,2020-01-06 07:33:48.544361115,fe80::743e:a5df:e3e0:5fa2,"cogitopython-virtualbox.local,fe80::743e:a5df:e3e0:5fa2"
76846,3.3828899999999997e+38,0,udp,PTR,5353,15,55,3.05656,_ipp._tcp.local,2020-01-06 07:34:33.728427887,fe80::743e:a5df:e3e0:5fa2,"cogitopython-virtualbox.local,fe80::743e:a5df:e3e0:5fa2"
76863,3.3828899999999997e+38,0,udp,PTR,5353,15,55,3.05656,_ipp._tcp.local,2020-01-06 07:35:18.497914076,fe80::743e:a5df:e3e0:5fa2,"cogitopython-virtualbox.local,fe80::743e:a5df:e3e0:5fa2"



Cluster 1: 25 observations


Unnamed: 0,id.orig_h_e,Z,proto,qtype_name,id.orig_p,id.resp_p,query_length,answer_length,entropy,query,ts,id.orig_h,answers,cluster
261020,2886730000.0,0,udp,A,58263,53,23,150,3.79509,ctldl.windowsupdate.com,2020-01-22 21:13:15.546655893,172.16.2.211,"audownload.windowsupdate.nsatc.net,auto.au.download.windowsupdate.com.c.footprint.net,8.241.91.126,8.241.78.126,8.241.80.126,8.241.90.126,8.241.80.254",1
325696,2886730000.0,0,udp,A,49853,53,23,127,3.79509,ctldl.windowsupdate.com,2020-01-25 09:14:01.900773048,172.16.2.211,"audownload.windowsupdate.nsatc.net,au.download.windowsupdate.com.hwcdn.net,cds.d2s7q6s2.hwcdn.net,205.185.216.10,205.185.216.42",1
331218,2886730000.0,0,udp,A,51147,53,23,153,3.79509,ctldl.windowsupdate.com,2020-01-25 12:49:35.744204044,172.16.2.210,"audownload.windowsupdate.nsatc.net,auto.au.download.windowsupdate.com.c.footprint.net,67.27.233.126,8.241.78.126,8.253.207.120,8.248.141.254,8.253.95.249",1
348683,2886730000.0,0,udp,A,51465,53,23,152,3.79509,ctldl.windowsupdate.com,2020-01-26 00:49:37.688864946,172.16.2.210,"audownload.windowsupdate.nsatc.net,auto.au.download.windowsupdate.com.c.footprint.net,8.241.79.254,67.27.159.254,8.241.123.254,8.241.9.126,67.27.157.126",1
359613,2886730000.0,0,udp,A,56792,53,23,155,3.79509,ctldl.windowsupdate.com,2020-01-26 09:14:07.366554976,172.16.2.211,"audownload.windowsupdate.nsatc.net,auto.au.download.windowsupdate.com.c.footprint.net,8.241.122.126,67.27.159.126,67.26.137.254,67.27.159.254,8.241.121.126",1
424519,2886730000.0,0,udp,A,49628,53,23,152,3.79509,ctldl.windowsupdate.com,2020-01-29 00:49:46.921534061,172.16.2.210,"audownload.windowsupdate.nsatc.net,auto.au.download.windowsupdate.com.c.footprint.net,8.248.131.254,8.241.82.254,8.241.88.254,8.241.83.126,8.247.205.126",1
437103,2886730000.0,0,udp,A,54160,53,23,153,3.79509,ctldl.windowsupdate.com,2020-01-29 09:14:23.293169022,172.16.2.211,"audownload.windowsupdate.nsatc.net,auto.au.download.windowsupdate.com.c.footprint.net,67.26.139.254,8.248.119.254,8.241.89.126,8.241.123.254,8.241.91.126",1
442841,2886730000.0,0,udp,A,60336,53,23,153,3.79509,ctldl.windowsupdate.com,2020-01-29 12:49:48.289944887,172.16.2.210,"audownload.windowsupdate.nsatc.net,auto.au.download.windowsupdate.com.c.footprint.net,8.241.81.254,67.26.139.254,8.241.91.126,67.26.137.254,8.248.113.254",1
456092,2886730000.0,0,udp,A,51004,53,23,151,3.79509,ctldl.windowsupdate.com,2020-01-29 21:13:54.513319015,172.16.2.211,"audownload.windowsupdate.nsatc.net,auto.au.download.windowsupdate.com.c.footprint.net,8.241.89.126,8.241.91.126,8.248.141.254,8.241.88.254,8.241.81.254",1
492948,2886730000.0,0,udp,A,50618,53,23,152,3.79509,ctldl.windowsupdate.com,2020-01-30 21:13:59.969863892,172.16.2.211,"audownload.windowsupdate.nsatc.net,auto.au.download.windowsupdate.com.c.footprint.net,8.241.78.254,8.241.9.126,8.241.126.249,8.241.121.126,8.241.123.126",1



Cluster 2: 5 observations


Unnamed: 0,id.orig_h_e,Z,proto,qtype_name,id.orig_p,query_length,answer_length,entropy,query,ts,id.orig_h,answers,cluster
733,2886730000.0,0,udp,*,5353,16,24,3.15564,shakilkhan.local,2019-12-24 09:00:10.883049965,172.16.4.56,0._teamviewer._tcp.local,2
734,2886730000.0,0,udp,*,5353,16,24,3.15564,shakilkhan.local,2019-12-24 09:00:12.894203901,172.16.4.56,0._teamviewer._tcp.local,2
735,2886730000.0,0,udp,*,5353,16,24,3.15564,shakilkhan.local,2019-12-24 09:00:14.905786037,172.16.4.56,0._teamviewer._tcp.local,2
736,2886730000.0,0,udp,*,5353,16,33,3.15564,shakilkhan.local,2019-12-24 09:00:18.827792883,172.16.4.56,"shakilkhan.local,shakilkhan.local",2
738,2886730000.0,0,udp,*,5353,16,56,3.15564,shakilkhan.local,2019-12-24 09:00:19.837270975,172.16.4.56,"shakilkhan.local,shakilkhan.local,_teamviewer._tcp.local",2



Cluster 3: 3 observations


Unnamed: 0,id.orig_h_e,Z,proto,qtype_name,id.orig_p,id.resp_p,query_length,answer_length,entropy,query,ts,id.orig_h,answers,cluster
474488,2886730000.0,0,udp,A,58330,53,23,223,3.79509,ctldl.windowsupdate.com,2020-01-30 09:14:26.088066101,172.16.2.211,"audownload.windowsupdate.nsatc.net,au.download.windowsupdate.com.edgesuite.net,a767.dscg3.akamai.net,23.10.249.169,23.10.249.187,23.10.249.185,23.0.174.9,23.10.249.162,23.10.249.179,23.10.249.154,23.10.249.163,23.10.249.168",3
480173,2886730000.0,0,udp,A,56558,53,23,208,3.79509,ctldl.windowsupdate.com,2020-01-30 12:49:50.984493971,172.16.2.210,"audownload.windowsupdate.nsatc.net,au.download.windowsupdate.com.edgesuite.net,a767.dscg3.akamai.net,2.16.186.16,2.16.186.35,2.16.186.19,2.16.186.33,2.16.186.17,2.16.186.27,2.16.186.25,2.16.186.42,2.16.186.26",3
535611,2886730000.0,0,udp,A,56961,53,23,195,3.79509,ctldl.windowsupdate.com,2020-02-01 00:49:55.336641073,172.16.2.210,"audownload.windowsupdate.nsatc.net,au.au-msedge.net,elasticshed.au.au-msedge.net,edge-prod-zrhr0.env.au.au-msedge.net,afdap.au.au-msedge.net,au.c-0001.c-msedge.net,c-0001.c-msedge.net,13.107.4.50",3
