In [1]:
import os
import urllib

from pyspark import SparkContext, SparkConf

try:
    from urllib.request import urlretrieve  # Python 3 location
except ImportError:
    from urllib import urlretrieve  # Python 2 location

# Fetch the 10% KDD Cup 1999 sample once; later runs reuse the local copy
# instead of downloading the archive again on every "Run All".
DATA_URL = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz"
DATA_FILE = "kddcup.data_10_percent.gz"

if os.path.exists(DATA_FILE):
    # Same (filename, headers) shape urlretrieve returns; no headers for a cached file.
    f = (DATA_FILE, None)
else:
    # Download under a temporary name and rename on success, so an interrupted
    # transfer is never mistaken for the complete archive on the next run.
    partial_file = DATA_FILE + ".part"
    headers = urlretrieve(DATA_URL, partial_file)[1]
    os.rename(partial_file, DATA_FILE)
    f = (DATA_FILE, headers)


In [2]:
# prepare the RDD
data_file = "./kddcup.data_10_percent.gz"

conf = SparkConf().setAppName("KDD_Sets")
# getOrCreate reuses a SparkContext that is already running in this kernel,
# so re-executing this cell does not fail with a "multiple SparkContexts" error.
sc = SparkContext.getOrCreate(conf=conf)

# One RDD element per line of the (gzip-compressed) CSV file.
raw_data = sc.textFile(data_file)

In [3]:
# get the Normal interactions from the RDD
# The label is the last comma-separated field of each record, so compare that
# field exactly instead of searching the whole line for the substring 'normal.'.
normal = raw_data.filter(lambda x: x.rsplit(',', 1)[-1] == 'normal.')

In [4]:
# get the ATTACK interactions
# Every record that is not labelled normal is treated as an attack, so we can
# simply subtract the normal records from the raw ones (set difference on RDDs).
# NOTE(review): subtract presumably needs a shuffle; a negated filter would likely
# be cheaper if demonstrating the set operation is not the goal - confirm.
attack = raw_data.subtract(normal)

In [7]:
from time import time

# Time the three counts together; each count() triggers its own Spark job,
# so the elapsed time covers evaluating raw_data, normal and attack.
t0 = time()
# print the results
# Single-argument print(...) behaves the same on Python 2 and also runs on Python 3.
print("Total interactions = {}".format(raw_data.count()))
print("Of which Normal interactions = {}".format(normal.count()))
print("Of which Attack interactions = {}".format(attack.count()))
t1 = time()
print("Time taken to count and print - {} sec".format(t1 - t0))

Total interactions = 494021
Of which Normal interactions = 97278
Of which Attack interactions = 396743
Time taken to count and print - 23.0620000362 sec


In [8]:
# Preview 8 records from the attack RDD to sanity-check the subtraction above.
attack.take(8)

[u'0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,244,2,1.00,1.00,0.00,0.00,0.01,0.06,0.00,255,2,0.01,0.08,0.00,0.00,1.00,1.00,0.00,0.00,neptune.',
 u'0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,244,2,1.00,1.00,0.00,0.00,0.01,0.06,0.00,255,2,0.01,0.08,0.00,0.00,1.00,1.00,0.00,0.00,neptune.',
 u'0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,280,7,1.00,1.00,0.00,0.00,0.03,0.05,0.00,255,7,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune.',
 u'0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,280,7,1.00,1.00,0.00,0.00,0.03,0.05,0.00,255,7,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune.',
 u'0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,280,7,1.00,1.00,0.00,0.00,0.03,0.05,0.00,255,7,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune.',
 u'0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,280,7,1.00,1.00,0.00,0.00,0.03,0.05,0.00,255,7,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune.',
 u'0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,280,7,1.00,1.00,0.00

In [9]:
# Goal: enumerate every possible (protocol, service) pairing for the network interactions.
# Each record is a CSV line: protocol is the 2nd column (index 1), service the 3rd (index 2).
csv = raw_data.map(lambda line: line.split(','))
protocol = (
    csv
    .map(lambda fields: fields[1])
    .distinct()
)
protocol.collect()


[u'udp', u'icmp', u'tcp']

In [11]:
# Repeat the distinct-values step for the service column (3rd CSV field, index 2).
service = (
    csv
    .map(lambda fields: fields[2])
    .distinct()
)
service.collect()

[u'domain',
 u'http_443',
 u'Z39_50',
 u'smtp',
 u'urp_i',
 u'private',
 u'echo',
 u'shell',
 u'red_i',
 u'eco_i',
 u'sunrpc',
 u'ftp_data',
 u'urh_i',
 u'pm_dump',
 u'pop_3',
 u'pop_2',
 u'systat',
 u'ftp',
 u'uucp',
 u'whois',
 u'netbios_dgm',
 u'efs',
 u'remote_job',
 u'daytime',
 u'ntp_u',
 u'finger',
 u'ldap',
 u'netbios_ns',
 u'kshell',
 u'iso_tsap',
 u'ecr_i',
 u'nntp',
 u'printer',
 u'domain_u',
 u'uucp_path',
 u'courier',
 u'exec',
 u'time',
 u'netstat',
 u'telnet',
 u'gopher',
 u'rje',
 u'sql_net',
 u'link',
 u'auth',
 u'netbios_ssn',
 u'csnet_ns',
 u'X11',
 u'IRC',
 u'tftp_u',
 u'login',
 u'supdup',
 u'name',
 u'nnsp',
 u'mtp',
 u'http',
 u'bgp',
 u'ctf',
 u'hostnames',
 u'klogin',
 u'vmnet',
 u'tim_i',
 u'discard',
 u'imap4',
 u'other',
 u'ssh']

In [13]:
# Now to generate all combos of protocol and service we can do CARTESIAN product
# cartesian() pairs every distinct protocol with every distinct service;
# collect() brings the resulting pairs back to the driver as a Python list.
product = protocol.cartesian(service).collect()
# Single-argument print(...) behaves the same on Python 2 and also runs on Python 3.
print("There are {} combinations of protocol and service".format(len(product)))

There are 198 combinations of protocol and service
