In [1]:
"""Anomaly Detection Example"""
from __future__ import print_function
from IPython.display import display, HTML
import warnings
warnings.filterwarnings('ignore')
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
from matplotlib import pyplot as plt
%matplotlib inline
import os
import sys
import argparse
import math
from collections import Counter
import ipaddress
import seaborn as sns
# Third Party Imports
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.externals import joblib
# Local imports
from zat import log_to_dataframe
from zat import dataframe_to_matrix
import numpy as np
pd.set_option('display.max_rows', 500)
import swifter




## IESCO 'Conn' data is being loaded
##### Total length of data ist wo million six hundred sixty-two thousand eight hundred sixty-three records

In [2]:
pd.set_option('display.width', 1000)
bro_log = 'http'
bro_df = pd.read_csv('../../../IESCO_complete_dec_may/preprocessed_csv/conn_full.csv')

In [3]:
bro_df.head(5)

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,conn_state,local_orig,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents
0,2020-03-31 19:59:54.307151079,C8TyRZDMgWpC3Vevf,39.41.241.162,36186,172.16.2.211,8081,tcp,http,0.305089,140,...,SF,F,F,0,ShADadfF,6,460,5,405,(empty)
1,2020-03-31 19:59:56.170258045,CIGXkd4ajdDaSIaLcd,39.40.41.30,35130,172.16.2.211,8081,tcp,http,0.241368,140,...,SF,F,F,0,ShADadfF,5,408,5,405,(empty)
2,2020-03-31 19:59:56.478173971,Cil8w34oLPNQ8mrXWd,39.50.98.217,45948,172.16.2.211,8081,tcp,http,0.241572,140,...,SF,F,F,0,ShADadfF,5,408,5,405,(empty)
3,2020-03-31 19:59:56.905885935,C3YWKFgSzSfTUUvz,39.41.160.14,43330,172.16.2.211,8081,tcp,http,0.265524,140,...,SF,F,F,0,ShADadfF,5,408,5,405,(empty)
4,2020-03-31 19:59:58.210148096,C3vir25kXkLv8MVL2,39.41.194.146,56366,172.16.2.211,8081,tcp,http,0.252428,140,...,SF,F,F,0,ShADadfF,5,408,5,405,(empty)


In [4]:
# bro_df[bro_df['id.orig_h'].isin(display_df.reset_index()['SrcAddr'].values)].to_csv('raw_conn.csv',index=False)

### Loading IESCO IP List & Mapping IPS

In [5]:
ip_info = pd.read_csv('../../IESCO_IP_LIST.csv').dropna()
hostnames = list(ip_info['Source'].values)
ips = list(ip_info['IP'].values)
d = {k:v for k, v in zip(ips,hostnames)}


In [6]:
ip_info.head(10)

Unnamed: 0,Source,IP
0,Vmware Vcenter,172.16.1.50
1,Vcenter server Appliance,172.16.1.50/ port# 5480
2,Vmware Vcenter- HA user,172.16.1.50
3,Vmware Vcenter- Veeam user,172.16.1.50
4,ESXI-1- Veeam user,172.16.1.51
5,ESXI-2- Veeam user,172.16.1.52
6,ESXI-3- Veeam user,172.16.1.53
7,ESXI-4- Veeam user,172.16.1.54
8,ESXI-5- Veeam user,172.16.1.55
10,iBMC,192.168.1.51


## Helping function

1.  validate_ip
2.  traffic_direction using RFC 1918 Logic
3. entropy

In [7]:
def validate_ip(ip):
    try:
        ipaddress.ip_address(ip)
        return ip
    except:
        return np.nan

def traffic_direction(conn_row):
    # First try to use the local orig/resp fields
#     if conn_row.get('local_orig') and conn_row.get('local_resp'):
#         local_orig = conn_row['local_orig']
#         local_resp = conn_row['local_resp']
#     else:
        # Well we don't have local orig/resp fields so use RFC1918 logic
    local_orig = ipaddress.ip_address(conn_row['id.orig_h']).is_private
    local_resp = ipaddress.ip_address(conn_row['id.resp_h']).is_private

    # Determine north/south or internal traffic
    if (not local_orig) and local_resp:
        return 'incoming'
    if local_orig and not local_resp:
        return 'outgoing'

    # Neither host is in the allocated private ranges
    if ipaddress.ip_address(conn_row['id.orig_h']).is_multicast or \
       ipaddress.ip_address(conn_row['id.resp_h']).is_multicast:
        return 'multicast'

    # Both hosts are internal
    return 'internal'

def entropy(string):
    """Compute entropy on the string"""
    try:
        p, lns = Counter(string), float(len(string))
        return -sum(count/lns * math.log(count/lns, 2) for count in p.values())
    except Exception as e:
        return pd.np.nan

def ip_inv_embedding_ext(ip):
    tranformed = None
    try:
        tranformed = str(ipaddress.ip_address(ip))
        return tranformed
    except:
        return tranformed

def enrich_names(bro_df):
    bro_df['Hostname_orig'] = bro_df['id.orig_h'].swifter.apply(lambda x : d.get(x,np.nan))
    bro_df['Hostname_resp'] = bro_df['id.resp_h'].swifter.apply(lambda x : d.get(x,np.nan))
    return bro_df


### Preprocessing

Applied various preprocessing function to clean raw bro logs.

In [8]:
def preprocessing(bro_df):
    print('preprocessing....') 
    print('4. Cleaning Services.....')
    bro_df=bro_df[bro_df['service'].isin(['http','dns','dhcp','ssl','ssh'])]
    bro_df['orig_bytes'] = pd.to_numeric(bro_df['orig_bytes'],errors='coerce')
    bro_df['orig_bytes'] = bro_df['orig_bytes'].fillna(0)
    
    print('1. Cleaning/ Validating IP address.....')
    bro_df['id.orig_h'] = bro_df['id.orig_h'].swifter.apply(lambda x : validate_ip(x))
    bro_df['id.resp_h'] = bro_df['id.resp_h'].swifter.apply(lambda x : validate_ip(x))
    bro_df = bro_df[bro_df['id.orig_h'].notna() & bro_df['id.resp_h'].notna()]
    
    print('2. Assinging Names to IP address.....')
    bro_df['Hostname_orig'] = bro_df['id.orig_h'].swifter.apply(lambda x : d.get(x,np.nan))
    bro_df['Hostname_resp'] = bro_df['id.resp_h'].swifter.apply(lambda x : d.get(x,np.nan))

    print('3. Generating direction..........')
    bro_df['direction']=bro_df.swifter.apply(lambda row :traffic_direction(row) , axis = 1) 
   
 
    
    print('5. Cleansing resp bytes')
    bro_df['resp_bytes'] = pd.to_numeric(bro_df['resp_bytes'],errors='coerce')
    bro_df['resp_bytes'] = bro_df['resp_bytes'].fillna(0)
    
    print('6. Cleansing resp packets')
    bro_df['resp_pkts'] = pd.to_numeric(bro_df['resp_pkts'],errors='coerce')
    bro_df['resp_pkts'] = bro_df['resp_pkts'].fillna(0)
    
    print('7. Cleansing orig_ip_bytes ')
    bro_df['orig_ip_bytes'] = pd.to_numeric(bro_df['orig_ip_bytes'],errors='coerce')
    bro_df['orig_ip_bytes'] = bro_df['orig_ip_bytes'].fillna(0)

    print('8. Cleansing resp_ip_bytes ')
    bro_df['resp_ip_bytes'] = pd.to_numeric(bro_df['resp_ip_bytes'],errors='coerce')
    bro_df['resp_ip_bytes'] = bro_df['resp_ip_bytes'].fillna(0)

    print('9. Cleansing id.resp_p')
    bro_df['id.resp_p'] = pd.to_numeric(bro_df['id.resp_p'],errors='coerce')
    bro_df['id.resp_p'] = bro_df['id.resp_p'].fillna(0)
    
    print('10. Cleansing id.orig_p')
    bro_df['id.orig_p'] = pd.to_numeric(bro_df['id.resp_p'],errors='coerce')
    bro_df['id.orig_p'] = bro_df['id.resp_p'].fillna(0)
    
    print('11. Converting Duration to Seconds')
    bro_df['durationsec'] = pd.to_numeric(bro_df.duration,errors='coerce')
    bro_df['durationsec'] = bro_df['durationsec'].fillna(0)
    
    #bro_df['id.orig_h_e']=bro_df['id.orig_h'].apply(lambda x : ip_embedding_ext(x))
    return bro_df

### Apply Aggregation 

After preprocessing. We engineered 20 features from raw bro logs and aggregated them based on Source Address, Destination Address & 5 minutes time windows.


In [9]:
import pandas as pd
def getFeatures(df):
    window_size = 1000
    total_batch = int(len(df)/window_size)
    trainX = pd.DataFrame()
    # Loop over all batches
    for i in range(total_batch):
        n = min(window_size, len(df)-i*window_size)
        df_win = df[i*window_size:i*window_size+n]
        df_win['ts'] = pd.to_datetime(df_win['ts'])
        df_win.set_index(['SrcAddr','DstAddr','ts'],inplace=True,drop=True)
        group = df_win.groupby([pd.Grouper(level='SrcAddr'),pd.Grouper(level='DstAddr'),pd.Grouper(level='ts',freq='5T')])
        #df_win.groupby(['SrcAddr'])
        train_win = group.Sport.nunique()
        train_win = pd.DataFrame(train_win)
        #train_win['udest_ip'] = group.DstAddr.nunique()
        train_win['udest_port'] = group.Dport.nunique()
        train_win['netflows'] = group.Dport.count()
        train_win['uproto'] = group.Proto.nunique()
        train_win['TotDur'] = group.Dur.sum()
        train_win['minDur'] = group.Dur.min()
        train_win['maxDur'] = group.Dur.max()
        temp = group.Proto.value_counts().unstack().fillna(0)
        try:
            train_win['tcp'] = temp['tcp']
        except:
            train_win['tcp'] = group.Dport.count()*0
        try:
            train_win['udp'] = temp['udp']
        except:
             train_win['udp'] = group.Dport.count()*0
                
                
                
                
        temp = group.dir.value_counts().unstack().fillna(0)
        try:
            train_win['internal'] = temp['internal']
        except:
            train_win['internal'] = 0
        try:
            train_win['incoming'] = temp['incoming']
        except:
             train_win['incoming'] = 0
        try:
            train_win['outgoing'] = temp['outgoing']
        except:
             train_win['outgoing'] = 0
        
        temp = group.service.value_counts().unstack().fillna(0)
        try:
            train_win['http'] = temp['http']
        except:
            train_win['http'] = 0
        try:
            train_win['dns'] = temp['dns']
        except:
             train_win['dns'] = 0
        try:
            train_win['dhcp'] = temp['dhcp']
        except:
             train_win['dhcp'] = 0
        try:
            train_win['ssl'] = temp['ssl']
        except:
             train_win['ssl'] = 0
        try:
            train_win['ssh'] = temp['ssh']
        except:
             train_win['ssh'] = 0
                
                
        train_win['bytes'] = group.TotBytes.sum()
        train_win['packets'] = group.TotPkts.sum()
        #train_win['Label'] = (group.Label.sum()*1.0/group.Dport.count())> 0
        trainX = pd.concat([trainX, train_win], axis=0)
    return trainX.round(2)

In [10]:
%%time 
bro_df=preprocessing(bro_df)

preprocessing....
4. Cleaning Services.....
1. Cleaning/ Validating IP address.....


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=11042093.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=11042093.0, style=ProgressStyle(descri…


2. Assinging Names to IP address.....


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=11042093.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=11042093.0, style=ProgressStyle(descri…


3. Generating direction..........


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=11042093.0, style=ProgressStyle(descri…


5. Cleansing resp bytes
6. Cleansing resp packets
7. Cleansing orig_ip_bytes 
8. Cleansing resp_ip_bytes 
9. Cleansing id.resp_p
10. Cleansing id.orig_p
11. Converting Duration to Seconds
Wall time: 9min 55s


In [11]:
bro_df['TotPkts']= pd.to_numeric(bro_df['orig_pkts'],errors='coerce')+pd.to_numeric(bro_df['resp_pkts'],errors='coerce')
bro_df['TotBytes'] = pd.to_numeric(bro_df['resp_bytes'],errors='coerce')+pd.to_numeric(bro_df['orig_bytes'],errors='coerce')

In [12]:
df_test = bro_df[['ts','id.orig_h','uid','id.resp_h','proto','id.resp_p','id.orig_p','TotBytes','TotPkts','durationsec','direction','service']]
df_test.columns=['ts','SrcAddr','session_id','DstAddr','Proto','Dport','Sport','TotBytes','TotPkts','Dur','dir','service']

In [13]:
df_test.shape

(11042093, 12)

In [36]:
11042093%8

5

In [14]:
splits = np.split(df_test.iloc[df_test.shape[0] % 8:,:],8)

In [15]:
from joblib import delayed,Parallel
list_df = Parallel(n_jobs=8)(delayed(getFeatures)(i) for i in splits)
df_test = pd.concat(list_df, axis=0)

In [16]:
joblib.dump(df_test,'feature_set/con_featureset_v2.pkl')

['feature_set/con_featureset_v2.pkl']

In [17]:
df_test.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Sport,udest_port,netflows,uproto,TotDur,minDur,maxDur,tcp,udp,internal,incoming,outgoing,http,dns,dhcp,ssl,ssh,bytes,packets
SrcAddr,DstAddr,ts,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10.1.1.210,10.1.1.255,2020-03-31 19:55:00,1,1,1,1,1.51,1.51,1.51,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,300,6
10.1.1.23,224.0.0.251,2020-03-31 20:05:00,1,1,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,82,2
10.1.1.5,10.1.1.255,2020-03-31 19:55:00,1,1,1,1,1.52,1.52,1.52,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,300,6
10.1.1.5,10.1.1.255,2020-03-31 20:00:00,1,1,6,1,9.31,1.53,1.57,0.0,6.0,6.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,1800,36
10.1.1.5,10.1.1.255,2020-03-31 20:05:00,1,1,5,1,15.12,1.5,8.93,0.0,5.0,5.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,1800,36


In [18]:
bro_log ='conn'

In [19]:
if 'conn' in bro_log:
    log_type = 'conn'
    features =['Sport', 'udest_port', 'netflows', 'uproto', 'TotDur', 'minDur', 'maxDur', 'tcp', 'udp', 'internal', 'incoming', 'outgoing', 'http', 'dns', 'dhcp', 'ssl', 'ssh','bytes','packets']
    #features = ['durationsec', 'orig_bytes', 'id.resp_p', 'resp_bytes', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes','service','proto','direction']
elif 'dns' in bro_log:
    log_type = 'dns'
    features = ['Z', 'proto', 'qtype_name','id.orig_p','id.resp_p', 'query_length', 'answer_length', 'entropy']
else:
    print('This example only works with Zeek with http.log or dns.log files..')
    sys.exit(1)

# Create a Pandas dataframe from a Zeek log
try:
    log_to_df = log_to_dataframe.LogToDataFrame()
    #bro_df = pd.read_csv('http_Farrukh-Naveed-Anjum.csv')#log_to_df.create_dataframe(bro_log)
    #print(bro_df.head())
except IOError:
    print('Could not open or parse the specified logfile: %s' % bro_log)
    sys.exit(1)
print('Read in {:d} Rows...'.format(len(bro_df)))

# Using Pandas we can easily and efficiently compute additional data metrics
# Here we use the vectorized operations of Pandas/Numpy to compute query length
# We'll also compute entropy of the query
if log_type == 'dns':
    bro_df['query_length'] = bro_df['query'].str.len()
    bro_df['answer_length'] = bro_df['answers'].str.len()
    bro_df['entropy'] = bro_df['query'].map(lambda x: entropy(x))
# Use the zat DataframeToMatrix class
to_matrix = dataframe_to_matrix.DataFrameToMatrix()
bro_matrix = to_matrix.fit_transform(df_test[features])
print(bro_matrix.shape)

Read in 11042093 Rows...
Normalizing column Sport...
Normalizing column udest_port...
Normalizing column netflows...
Normalizing column uproto...
Normalizing column TotDur...
Normalizing column minDur...
Normalizing column maxDur...
Normalizing column tcp...
Normalizing column udp...
Normalizing column internal...
Normalizing column incoming...
Normalizing column outgoing...
Normalizing column http...
Normalizing column dns...
Normalizing column dhcp...
Normalizing column ssl...
Normalizing column ssh...
Normalizing column bytes...
Normalizing column packets...
(2079239, 19)


In [20]:
# Train/fit and Predict anomalous instances using the Isolation Forest model
odd_clf = IsolationForest(behaviour='new',contamination=0.0005,n_jobs=-1,verbose=0)  # Marking 10% as odd
odd_clf.fit(bro_matrix)

IsolationForest(behaviour='new', bootstrap=False, contamination=0.0005,
                max_features=1.0, max_samples='auto', n_estimators=100,
                n_jobs=-1, random_state=None, verbose=0, warm_start=False)

In [21]:
conn_pickle = {'model':odd_clf,'tranformer':to_matrix}

In [22]:
with open('../models/iiesco_iforest_conn_v0.6.pkl', 'wb') as pickle_file:
    joblib.dump(conn_pickle, pickle_file)


In [23]:
# Now we create a new dataframe using the prediction from our classifier
predictions = odd_clf.predict(bro_matrix)

In [24]:
odd_df = df_test[features][predictions == -1]
display_df = df_test[predictions == -1]

In [25]:
# Now we're going to explore our odd observations with help from KMeans
odd_matrix = to_matrix.fit_transform(odd_df)
num_clusters = min(len(odd_df), 4)  # 4 clusters unless we have less than 4 observations
display_df['cluster'] = KMeans(n_clusters=num_clusters).fit_predict(odd_matrix)
print(odd_matrix.shape)


Normalizing column Sport...
Normalizing column udest_port...
Normalizing column netflows...
Normalizing column uproto...
Normalizing column TotDur...
Normalizing column minDur...
Normalizing column maxDur...
Normalizing column tcp...
Normalizing column udp...
Normalizing column internal...
Normalizing column incoming...
Normalizing column outgoing...
Normalizing column http...
Normalizing column dns...
Normalizing column dhcp...
Cannot normalize series (div by 0) so not normalizing...
Normalizing column ssl...
Normalizing column ssh...
Cannot normalize series (div by 0) so not normalizing...
Normalizing column bytes...
Normalizing column packets...
(944, 19)


In [26]:
pd.set_option('display.max_rows', 100)
outputdf = pd.DataFrame()
# Now group the dataframe by cluster
# if log_type == 'dns':
#     features += ['query']
#     features+=['ts']  
#     features += ['id.resp_h']

# features += ['id.orig_h']
# features += ['ts']
# features +=['id.resp_h']
# features +=['direction']

In [27]:
display_df =display_df[(display_df['incoming']>0)  | (display_df['outgoing']>0)]
#display(display_df.sort_values(by=['TotDur','tcp','udp','internal','incoming','outgoing','http','dns','dhcp','ssl','ssh','bytes','packets'],ascending=False))
display_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Sport,udest_port,netflows,uproto,TotDur,minDur,maxDur,tcp,udp,internal,incoming,outgoing,http,dns,dhcp,ssl,ssh,bytes,packets,cluster
SrcAddr,DstAddr,ts,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
39.50.49.41,172.16.2.211,2020-04-01 06:10:00,1,1,19,1,134.22,0.27,120.49,19.0,0.0,0.0,19.0,0.0,19.0,0.0,0.0,0.0,0.0,5264,201,3
39.50.24.120,172.16.2.211,2020-04-01 16:25:00,1,1,20,1,155.53,0.49,122.21,20.0,0.0,0.0,20.0,0.0,20.0,0.0,0.0,0.0,0.0,5541,205,3
172.16.2.90,192.144.82.13,2020-04-02 01:40:00,1,1,77,1,110.35,0.51,13.76,77.0,0.0,0.0,0.0,77.0,77.0,0.0,0.0,0.0,0.0,60335422,76813,2
172.16.2.90,192.144.82.13,2020-04-02 01:45:00,1,1,75,1,80.59,0.66,2.96,75.0,0.0,0.0,0.0,75.0,75.0,0.0,0.0,0.0,0.0,28472426,35340,2
172.16.2.90,192.144.82.13,2020-04-02 01:45:00,1,1,137,1,197.28,0.66,13.20,137.0,0.0,0.0,0.0,137.0,137.0,0.0,0.0,0.0,0.0,162550982,186072,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172.16.2.211,8.8.8.8,2020-04-13 06:20:00,1,1,32,1,256.05,8.00,8.00,0.0,32.0,0.0,0.0,32.0,0.0,32.0,0.0,0.0,0.0,6825,160,0
172.16.2.211,8.8.8.8,2020-04-13 06:30:00,1,1,30,1,240.05,8.00,8.01,0.0,30.0,0.0,0.0,30.0,0.0,30.0,0.0,0.0,0.0,6415,150,0
172.16.2.211,8.8.8.8,2020-04-13 06:50:00,1,1,30,1,240.04,8.00,8.01,0.0,30.0,0.0,0.0,30.0,0.0,30.0,0.0,0.0,0.0,6300,150,0
172.16.2.211,8.8.8.8,2020-04-13 06:55:00,1,1,33,1,264.07,8.00,8.01,0.0,33.0,0.0,0.0,33.0,0.0,33.0,0.0,0.0,0.0,6990,165,0


In [28]:
display_df['incoming'].sum()
display_df['outgoing'].sum()
display_df['internal'].sum()

0.0

### Out of two million six hundred sixty-two thousand eight hundred sixty-three records 164 seems anamolous, which are further clustered down based on their similarity 

In [29]:
pd.set_option('display.max_rows', 100)
outputdf = pd.DataFrame()
# Now group the dataframe by cluster
# if log_type == 'dns':
#     features += ['query']
#     features+=['ts']  
#     features += ['id.resp_h']

# features += ['id.orig_h']
# features += ['ts']
# features +=['id.resp_h']
# features +=['direction']

cluster_groups = display_df[features+['cluster']].groupby('cluster')

# Now print out the details for each cluster
print('<<< Outliers Detected! >>>')
for key, group in cluster_groups:
    print('\nCluster {:d}: {:d} observations'.format(key, len(group)))
    group = group[['Sport', 'udest_port', 'netflows', 'uproto', 'TotDur', 'minDur', 'maxDur', 'tcp', 'udp', 'internal', 'incoming', 'outgoing', 'http', 'dns', 'dhcp', 'ssl', 'ssh','bytes','packets']]
    #group = group.T.drop_duplicates().T
    #outputdf = pd.concat([outputdf,group],axis=0)
    group.sort_values(by=['TotDur','tcp','udp','internal','incoming','outgoing','http','dns','dhcp','ssl','ssh','bytes','packets'],ascending=False).to_csv(str(key)+'.csv')
    display(group.sort_values(by=['TotDur','tcp','udp','internal','incoming','outgoing','http','dns','dhcp','ssl','ssh','bytes','packets'],ascending=False))
    

<<< Outliers Detected! >>>

Cluster 0: 432 observations


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Sport,udest_port,netflows,uproto,TotDur,minDur,maxDur,tcp,udp,internal,incoming,outgoing,http,dns,dhcp,ssl,ssh,bytes,packets
SrcAddr,DstAddr,ts,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
172.16.2.211,8.8.8.8,2020-04-09 22:50:00,1,1,50,1,400.08,8.0,8.01,0.0,50.0,0.0,0.0,50.0,0.0,50.0,0.0,0.0,0.0,10350,250
172.16.2.211,8.8.8.8,2020-04-10 01:50:00,1,1,45,1,360.09,8.0,8.01,0.0,45.0,0.0,0.0,45.0,0.0,45.0,0.0,0.0,0.0,9370,225
172.16.2.211,8.8.8.8,2020-04-10 22:50:00,1,1,45,1,360.07,8.0,8.01,0.0,45.0,0.0,0.0,45.0,0.0,45.0,0.0,0.0,0.0,9290,225
172.16.2.211,8.8.8.8,2020-04-08 13:40:00,1,1,44,1,352.07,8.0,8.01,0.0,44.0,0.0,0.0,44.0,0.0,44.0,0.0,0.0,0.0,9495,220
172.16.2.211,8.8.8.8,2020-04-11 09:50:00,1,1,44,1,352.07,8.0,8.01,0.0,44.0,0.0,0.0,44.0,0.0,44.0,0.0,0.0,0.0,9125,220
172.16.2.211,8.8.8.8,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172.16.2.211,8.8.8.8,2020-04-12 14:15:00,1,1,30,1,240.04,8.0,8.01,0.0,30.0,0.0,0.0,30.0,0.0,30.0,0.0,0.0,0.0,6415,150
172.16.2.211,8.8.8.8,2020-04-12 02:55:00,1,1,30,1,240.04,8.0,8.00,0.0,30.0,0.0,0.0,30.0,0.0,30.0,0.0,0.0,0.0,6395,150
172.16.2.211,8.8.8.8,2020-04-13 06:50:00,1,1,30,1,240.04,8.0,8.01,0.0,30.0,0.0,0.0,30.0,0.0,30.0,0.0,0.0,0.0,6300,150
172.16.2.211,8.8.8.8,2020-04-11 07:50:00,1,1,30,1,240.04,8.0,8.00,0.0,30.0,0.0,0.0,30.0,0.0,30.0,0.0,0.0,0.0,6240,150



Cluster 1: 40 observations


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Sport,udest_port,netflows,uproto,TotDur,minDur,maxDur,tcp,udp,internal,incoming,outgoing,http,dns,dhcp,ssl,ssh,bytes,packets
SrcAddr,DstAddr,ts,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
172.16.2.211,51.83.238.241,2020-04-03 05:25:00,1,1,1,1,161129.11,161129.11,161129.11,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,4582,36487
172.16.2.211,217.182.199.184,2020-01-22 13:45:00,1,1,1,1,86008.05,86008.05,86008.05,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,4583,19503
172.16.2.211,51.83.239.142,2020-04-02 13:05:00,1,1,1,1,56253.35,56253.35,56253.35,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,4584,12753
172.16.2.53,93.184.221.240,2020-03-05 11:00:00,1,1,25,1,3363.41,131.45,206.87,25.0,0.0,0.0,0.0,25.0,25.0,0.0,0.0,0.0,0.0,156088401,165306
172.16.2.53,207.189.124.32,2020-03-05 10:30:00,1,1,4,1,3016.3,751.32,755.71,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,0.0,56194,252
172.16.2.53,23.50.146.63,2020-03-05 10:30:00,1,1,3,1,2012.09,622.06,747.79,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,878195894,930563
172.16.2.53,151.101.12.157,2020-03-12 07:55:00,1,1,2,1,1248.18,601.59,646.59,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,10272,93
172.16.2.53,23.213.140.183,2020-03-11 22:50:00,1,1,2,1,1181.72,590.73,590.99,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,25450,106
172.16.2.53,13.35.15.46,2020-03-11 22:50:00,1,1,4,1,1141.83,240.31,330.51,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,0.0,36961,158
172.16.2.53,23.50.146.63,2020-03-05 10:35:00,1,1,2,1,1071.84,535.85,535.99,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,52298321,55163



Cluster 2: 7 observations


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Sport,udest_port,netflows,uproto,TotDur,minDur,maxDur,tcp,udp,internal,incoming,outgoing,http,dns,dhcp,ssl,ssh,bytes,packets
SrcAddr,DstAddr,ts,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
172.16.2.53,93.184.221.240,2020-03-05 10:55:00,1,1,34,1,4519.28,95.44,144.29,34.0,0.0,0.0,0.0,34.0,34.0,0.0,0.0,0.0,0.0,24068134,26088
172.16.2.90,192.144.82.13,2020-04-02 01:45:00,1,1,137,1,197.28,0.66,13.2,137.0,0.0,0.0,0.0,137.0,137.0,0.0,0.0,0.0,0.0,162550982,186072
172.16.2.90,192.144.82.13,2020-04-02 01:50:00,1,1,92,1,160.53,0.51,33.68,92.0,0.0,0.0,0.0,92.0,92.0,0.0,0.0,0.0,0.0,175084866,191470
172.16.2.90,192.144.82.13,2020-04-02 01:50:00,1,1,79,1,141.86,0.51,24.5,79.0,0.0,0.0,0.0,79.0,79.0,0.0,0.0,0.0,0.0,161240158,178350
172.16.2.90,192.144.82.13,2020-04-02 01:55:00,1,1,73,1,137.38,0.66,37.46,73.0,0.0,0.0,0.0,73.0,73.0,0.0,0.0,0.0,0.0,171555713,186812
172.16.2.90,192.144.82.13,2020-04-02 01:40:00,1,1,77,1,110.35,0.51,13.76,77.0,0.0,0.0,0.0,77.0,77.0,0.0,0.0,0.0,0.0,60335422,76813
172.16.2.90,192.144.82.13,2020-04-02 01:45:00,1,1,75,1,80.59,0.66,2.96,75.0,0.0,0.0,0.0,75.0,75.0,0.0,0.0,0.0,0.0,28472426,35340



Cluster 3: 32 observations


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Sport,udest_port,netflows,uproto,TotDur,minDur,maxDur,tcp,udp,internal,incoming,outgoing,http,dns,dhcp,ssl,ssh,bytes,packets
SrcAddr,DstAddr,ts,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
39.41.2.158,172.16.2.211,2020-03-04 08:15:00,1,1,12,1,2349.3,143.19,214.04,12.0,0.0,0.0,12.0,0.0,12.0,0.0,0.0,0.0,0.0,2440803,3798
39.41.2.158,172.16.2.211,2020-03-03 04:20:00,1,1,15,1,996.92,43.82,84.69,15.0,0.0,0.0,15.0,0.0,15.0,0.0,0.0,0.0,0.0,385337,1352
39.50.104.121,172.16.2.211,2020-04-08 09:15:00,1,1,19,1,487.9,0.76,188.23,19.0,0.0,0.0,19.0,0.0,19.0,0.0,0.0,0.0,0.0,5265,199
39.40.85.165,172.16.2.211,2020-03-10 06:10:00,2,2,16,1,256.74,0.24,130.62,16.0,0.0,0.0,16.0,0.0,16.0,0.0,0.0,0.0,0.0,191429,233
39.50.1.121,172.16.2.211,2020-04-02 10:05:00,1,1,18,1,202.52,1.74,131.71,18.0,0.0,0.0,18.0,0.0,18.0,0.0,0.0,0.0,0.0,4987,189
39.50.5.79,172.16.2.211,2020-03-20 06:30:00,1,1,18,1,184.52,0.69,132.16,18.0,0.0,0.0,18.0,0.0,18.0,0.0,0.0,0.0,0.0,4987,183
39.50.102.237,172.16.2.211,2020-03-31 07:40:00,1,1,19,1,164.42,0.36,132.22,19.0,0.0,0.0,19.0,0.0,19.0,0.0,0.0,0.0,0.0,5264,202
39.50.83.87,172.16.2.211,2020-03-14 14:40:00,1,1,19,1,157.74,0.27,121.35,19.0,0.0,0.0,19.0,0.0,19.0,0.0,0.0,0.0,0.0,5264,209
39.50.24.120,172.16.2.211,2020-04-01 16:25:00,1,1,20,1,155.53,0.49,122.21,20.0,0.0,0.0,20.0,0.0,20.0,0.0,0.0,0.0,0.0,5541,205
39.50.31.235,172.16.2.211,2020-04-02 07:25:00,1,1,19,1,153.91,0.3,127.9,19.0,0.0,0.0,19.0,0.0,19.0,0.0,0.0,0.0,0.0,5264,202


In [30]:
display_df[features+['cluster']].groupby('cluster')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001828E0B1518>

<html>
    <center>
        <hr>
        <h1>The End ! </h1>
    </center>
    </html>

In [31]:
from IPython.core.display import HTML
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import _plotly_future_
from _plotly_future_ import v4_subplots
import plotly.plotly as pl
from plotly import __version__
print(__version__)
import cufflinks as cf
import matplotlib.pyplot as plt
import joblib

from plotly import __version__ 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
cf.go_offline()

import plotly.graph_objs as go
from cufflinks import tools
from plotly.subplots import make_subplots

from sklearn.metrics import r2_score
%matplotlib inline

3.10.0


In [32]:
### Case:1
testdf = df_test.reset_index()
testdf = testdf[(testdf['SrcAddr']=='172.16.2.90')]
testdf.set_index('ts',inplace=True)
testdf.sort_index(inplace=True)
testdf[['bytes','packets','TotDur']].iplot(yTitle='count',xTitle='Timestamp',title='IP :172.16.2.211 Service: SSL')

In [33]:
### Case:1
testdf = df_test.reset_index()
testdf = testdf[(testdf['SrcAddr']=='172.16.2.210')]
testdf.set_index('ts',inplace=True)
testdf.sort_index(inplace=True)
testdf[['bytes','packets']].iplot(yTitle='count',xTitle='Timestamp',title='IP :172.16.2.210 Service: DNS')

In [34]:
from IPython.html import widgets
from IPython.display import display, clear_output
from plotly.widgets import GraphWidget



def plot_graph(text):
    container = widgets.HBox(children=[text_input,text_input2, button, valid, message])

    try:
        testdf = df_test.reset_index()
        testdf = testdf[(testdf['SrcAddr']==text_input.value)]
        testdf.set_index('ts',inplace=True)
        testdf.sort_index(inplace=True)
        g2.plot({'data': [{'x': testdf.index, 'y': testdf[text_input2.value] }]})
        g2.relayout({'title': 'Source Address in {}'.format(text_input.value)}) 
    except:
        pass

g2 = GraphWidget('https://plot.ly/~kevintest/1178/')
button = widgets.Button(description="Submit")

text_input = widgets.Text(
    description='IP Address:',
    value='172.16.2.211',
)

text_input2= widgets.Dropdown(
    options=['Sport', 'udest_ip', 'udest_port', 'netflows', 'uproto', 'TotDur', 'minDur', 'maxDur', 'tcp', 'udp', 'internal',
    'incoming', 'outgoing', 'http', 'dns', 'dhcp', 'ssl', 'ssh', 'bytes', 'packets'],
    value='bytes',
    description='Number:',
    disabled=False,
)
message = widgets.HTML(
    value="",
)


valid = widgets.Valid(
    value=True,
)

# this will be initalize our listener
button.on_click(plot_graph)


container = widgets.HBox(children=[text_input,text_input2, button, valid, message])
display(container)
display(g2)

<IPython.core.display.Javascript object>

HBox(children=(Text(value='172.16.2.211', description='IP Address:'), Dropdown(description='Number:', index=18…

GraphWidget()

In [35]:
df_test[['http','dns','dhcp','ssl','ssh']].

SyntaxError: invalid syntax (<ipython-input-35-e1770bbc673a>, line 1)

In [None]:
getFeatures(df_test.iloc[1:100000,:])

In [None]:
bro_df.iloc[1:10000,:].to_json('Anamoly_test.json')

In [None]:
display_df.reset_index().iloc[:100]

In [None]:
df_test.reset_index()[df_test.reset_index()['SrcAddr']=='39.32.25.72'][['http','dns','dhcp','ssl','ssh']].iplot()

In [None]:
bro_df.head()

In [None]:
df_test.reset_index()[df_test.reset_index()['SrcAddr']=='39.32.25.72'][['http','dns','dhcp','ssl','ssh']]

In [None]:
df_test['Prediction'] = predictions

In [None]:
df_test['Prediction']