In [1]:
import re
import json
from colorama import Fore, Back, Style
import sys
import pickle
import pandas as pd
import numpy as np
import time

# Input File

In [2]:
data = 'datasets/flat-sample/namedStructureProperties_ip-accesslist.json'

# Parsing data from file

In [3]:
t1 = time.time()
props = []
datas = []
f = open(data)
for line in f:

    match = re.match('.*?:(.*)=>(.*);', line)

    try:
        props.append(match.group(1))
        extracted = match.group(2)
        extracted = '[' + extracted + ']'
        data = json.loads(extracted)

    except AttributeError: 
        pass

    for i in range(len(data)):
        data[i] = [data[i]]

    
    datas.append(data)

print("\nTime taken for Extraction is")
print(time.time()-t1)


Time taken for Extraction is
1.201422929763794


# Defining method to check the homogenity among a structure. 

In [4]:
def isHomogeneous(input_dict):

    values = list(input_dict.values())

    print(values)

    sum = 0
    max = 0
    for value in values:
        if value > max:
            max = value
        sum += value

    ratio = max / sum
    print(ratio)

    if ratio < 0.7:
        return False
    else:
        return True

# Method to parse a structure and get the keys and values

In [5]:
# Recursively extract keys from a dictionary
def extract_keys(the_dict, prefix=''):
    # TODO
    # fix bug with list of dicts not being extracted
    # but only first element

    key_list = []

    for key, value in the_dict.items():

        # set the prefix
        if len(prefix) == 0:
            new_prefix = key
        else:
            new_prefix = prefix + '.' + key

        # recursively call extract_keys for nested dicts
        if type(value) == dict:
            key_list.extend(extract_keys(value, new_prefix))
        elif type(value) == list and type(value[0]) == dict:
            key_list.extend(extract_keys(value[0], new_prefix))
        else:
            key_list.append(new_prefix)

    return key_list

# Saving the node names and deleting from data as it will not be an input for extract_keys method 

In [6]:
names = datas[0]

In [7]:
del datas[0]

# Calculating overall structure of our data

In [8]:
overall = {}

for data in datas:

    for item in data:

        if item[0] is None:
            continue

        result = extract_keys(item[0])
#         print(result)

        for element in result:

            value = item[0]
            for key in element.split('.'):

                new_value = value[key]
                if type(new_value) == list:
                    new_value = new_value[0]

                value = new_value

            # print(element, value)
            if element not in overall:
                # overall[element] = [value]
                overall[element] = {}

            if value not in overall[element]:
                overall[element][value] = 1
            else:
                overall[element][value] += 1

In [9]:
overall

{'lines.action': {'PERMIT': 5820},
 'lines.matchCondition.class': {'org.batfish.datamodel.acl.MatchHeaderSpace': 5820},
 'lines.matchCondition.headerSpace.dstIps.class': {'org.batfish.datamodel.IpWildcardIpSpace': 5820},
 'lines.matchCondition.headerSpace.dstIps.ipWildcard': {'0.0.0.0/0': 5820},
 'lines.matchCondition.headerSpace.dstPorts': {'161-161': 120, '500-500': 60},
 'lines.matchCondition.headerSpace.ipProtocols': {'OSPF': 840, 'UDP': 180},
 'lines.matchCondition.headerSpace.negate': {False: 5820},
 'lines.matchCondition.headerSpace.srcIps.class': {'org.batfish.datamodel.IpWildcardIpSpace': 5820},
 'lines.matchCondition.headerSpace.srcIps.ipWildcard': {'0.0.0.0/0': 1320,
  '126.137.164.0/22': 60,
  '126.170.160.0/20': 180,
  '126.172.0.0/14': 1080,
  '126.182.152.234': 2940,
  '126.62.201.0/24': 120,
  '126.62.63.0/24': 120},
 'lines.matchCondition.headerSpace.srcPorts': {'500-500': 60},
 'lines.name': {'10 permit ip 126.173.34.41/14 any': 1080,
  'permit 126.137.165.207 0.0.3.2

# Deciding which features to exclude from overall based on their homogenity

In [10]:
excluded = []

for key, value in overall.items():
    if isHomogeneous(value):
        print(Fore.GREEN + key, ": ", value)
    else:
        print(Fore.RED + key, ": ", value)
        excluded.append(key)
    print(Style.RESET_ALL)
    print()
print()

print('Excluded:', excluded)

[240, 240, 120, 4200, 240, 360, 120, 120, 60, 60, 60]
0.7216494845360825
[32msourceName :  {'HADOOP_SUBNET': 240, 'CRYPTO': 240, 'BRP_out': 120, '99': 4200, 'BRP_OUT': 240, 'Vital': 360, 'snmpline4': 120, 'InterNWOCX': 120, 'SHAPE_SCAN': 60, 'IKE': 60, 'static2ospf': 60}
[0m

[120, 240, 840, 1080, 2940, 120, 60, 180, 120, 60, 60]
0.5051546391752577
[31mlines.name :  {'permit ip 126.62.201.122 0.0.0.255 any': 120, 'permit ip any any': 240, 'permit ospf any any': 840, '10 permit ip 126.173.34.41/14 any': 1080, 'permit 126.182.152.234': 2940, 'permit udp any any eq snmp': 120, 'permit any': 60, 'permit 126.170.164.59 0.0.15.255': 180, 'permit ip 126.62.63.184 0.0.0.255 any': 120, 'permit udp any eq isakmp any eq isakmp': 60, 'permit 126.137.165.207 0.0.3.255': 60}
[0m

[5820]
1.0
[32mlines.action :  {'PERMIT': 5820}
[0m

[5820]
1.0
[32mlines.matchCondition.headerSpace.dstIps.class :  {'org.batfish.datamodel.IpWildcardIpSpace': 5820}
[0m

[5820]
1.0
[32mlines.matchCondition.header

# Calculating the Signature based on our Overall Data set

In [11]:
signature = {}

for key, value in overall.items():
    # print(key, value)
    max = 0
    sum = 0
    most = None
    for k, v in value.items():
        sum += v
        if v > max:
            max = v
            most = k
    weight = int(max / sum * 100)

    signature[key] = (most, weight)

In [12]:
signature

{'lines.action': ('PERMIT', 100),
 'lines.matchCondition.class': ('org.batfish.datamodel.acl.MatchHeaderSpace',
  100),
 'lines.matchCondition.headerSpace.dstIps.class': ('org.batfish.datamodel.IpWildcardIpSpace',
  100),
 'lines.matchCondition.headerSpace.dstIps.ipWildcard': ('0.0.0.0/0', 100),
 'lines.matchCondition.headerSpace.dstPorts': ('161-161', 66),
 'lines.matchCondition.headerSpace.ipProtocols': ('OSPF', 82),
 'lines.matchCondition.headerSpace.negate': (False, 100),
 'lines.matchCondition.headerSpace.srcIps.class': ('org.batfish.datamodel.IpWildcardIpSpace',
  100),
 'lines.matchCondition.headerSpace.srcIps.ipWildcard': ('126.182.152.234', 50),
 'lines.matchCondition.headerSpace.srcPorts': ('500-500', 100),
 'lines.name': ('permit 126.182.152.234', 50),
 'name': ('99', 72),
 'sourceName': ('99', 72),
 'sourceType': ('standard ipv4 access-list', 55)}

# Printing the Signature

In [13]:
print()
for key, value in overall.items():
    print(key, ':', value)
    print()

print(Fore.MAGENTA)
print(signature)
print(Style.RESET_ALL)


sourceName : {'HADOOP_SUBNET': 240, 'CRYPTO': 240, 'BRP_out': 120, '99': 4200, 'BRP_OUT': 240, 'Vital': 360, 'snmpline4': 120, 'InterNWOCX': 120, 'SHAPE_SCAN': 60, 'IKE': 60, 'static2ospf': 60}

lines.name : {'permit ip 126.62.201.122 0.0.0.255 any': 120, 'permit ip any any': 240, 'permit ospf any any': 840, '10 permit ip 126.173.34.41/14 any': 1080, 'permit 126.182.152.234': 2940, 'permit udp any any eq snmp': 120, 'permit any': 60, 'permit 126.170.164.59 0.0.15.255': 180, 'permit ip 126.62.63.184 0.0.0.255 any': 120, 'permit udp any eq isakmp any eq isakmp': 60, 'permit 126.137.165.207 0.0.3.255': 60}

lines.action : {'PERMIT': 5820}

lines.matchCondition.headerSpace.dstIps.class : {'org.batfish.datamodel.IpWildcardIpSpace': 5820}

lines.matchCondition.headerSpace.dstIps.ipWildcard : {'0.0.0.0/0': 5820}

lines.matchCondition.headerSpace.srcIps.class : {'org.batfish.datamodel.IpWildcardIpSpace': 5820}

lines.matchCondition.headerSpace.srcIps.ipWildcard : {'126.62.201.0/24': 120, '0.0

# Comparing the Signature with our Data Sample and checking for iutliers by scoring each structure.

In [14]:
for i, item in enumerate(datas[0]):

    match = 0
    total = 0
#     print('=' * 141, end='\n\n')
    print("Entry #%d" % i, end='\n\n')

    for data in datas:

        item = data[i]


#         print(item, end='\n\n')

        for key, value in signature.items():

            if key in excluded:
                continue

            current = item[0]
            if current is None:
                continue

            key_list = key.split('.')

            for k in key_list:
                # print(k)
                if k in current:
                    current = current[k]
                    if type(current) == list:
                        current = current[0]
                else:
                    break

            if type(current) == dict:
                continue

#             print(key)

            if current == value[0]:
                match += value[1]
#                 print(Fore.BLUE, end='')
            else:
                print(Fore.RED, end='')

#             print('Entry value: ', current)
#             print('Signature value: ', value[0])
#             print(Style.RESET_ALL)

            total += value[1]

    print(Fore.GREEN, end='')
    print(match, '/', total)
    print(Style.RESET_ALL)

Entry #0

[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[32m36000 / 44640
[0m
Entry #1

[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m

[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[32m36000 / 44640
[0m
Entry #38

[32m44640 / 44640
[0m
Entry #39

[32m44640 / 44640
[0m
Entry #40

[32m44640 / 44640
[0m
Entry #41

[32m44640 / 44640
[0m
Entry #42

[32m44640 / 44640
[0m
Entry #43

[32m44640 / 44640
[0m
Entry #44

[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m

[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[32m36000 / 44640
[0m
Entry #85

[32m44640 / 44640
[0m
Entry #86

[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m[31m