In [1]:
import urllib.request
# Download the gzip archive into the current working directory.
# `f` keeps urlretrieve's return value, a (local filename, headers) tuple.
f = urllib.request.urlretrieve(
    url="http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz",
    filename="kddcup.data_10_percent.gz",
)

In [2]:
from pyspark import SparkContext
# Path of the gzip archive, relative to the working directory.
data_file = "./kddcup.data_10_percent.gz"

# Reuse the running SparkContext if one exists, otherwise start one.
sc = SparkContext.getOrCreate()

# RDD with one string element per line of the file.
raw_data = sc.textFile(data_file)

In [3]:
# Lines containing the substring 'normal.' are treated as normal traffic.
normal_raw_data = raw_data.filter(lambda line: 'normal.' in line)

# Everything in raw_data that is not in normal_raw_data counts as an attack.
# NOTE(review): the recorded timings show the attack count taking several
# times longer than the other two counts; if demonstrating subtract() is not
# the goal, a complementary filter would select the same lines more cheaply.
attack_raw_data = raw_data.subtract(other=normal_raw_data)

In [5]:
from time import perf_counter, time  # `time` stays imported in case other cells still call it


def timed_count(rdd, label):
    """Count the elements of ``rdd`` and print how long the count took.

    Args:
        rdd: Object exposing a ``count()`` method (here, a Spark RDD).
        label: Word placed at the start of the timing message, e.g. ``"Raw"``.

    Returns:
        Whatever ``rdd.count()`` returns.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way a time() difference can.
    started = perf_counter()
    n_elements = rdd.count()
    elapsed = perf_counter() - started
    print("{} data count completed in {} seconds".format(label, round(elapsed, 3)))
    return n_elements


# Time the same count() action on each of the three RDDs.
raw_data_count = timed_count(raw_data, "Raw")
normal_data_count = timed_count(normal_raw_data, "Normal")
attack_data_count = timed_count(attack_raw_data, "Attack")

print("There are {} normal interactions and {} attack interactions with total {} interactions".format(normal_data_count, attack_data_count, raw_data_count))

Raw data count completed in 0.961 seconds
Normal data count completed in 1.032 seconds
Attack data count completed in 5.254 seconds
There are 97278 normal interactions and 396743 attack interactions with total 494021 interactions


In [7]:
# Show five elements of the attack RDD as a quick sanity check of the split.
attack_raw_data.take(5)

['305,tcp,telnet,SF,1735,2766,0,0,0,3,0,1,2,1,0,0,1,0,0,0,0,0,1,1,0.00,0.00,0.00,0.00,1.00,0.00,0.00,2,4,1.00,0.00,0.50,0.50,0.00,0.00,0.00,0.00,buffer_overflow.',
 '0,tcp,telnet,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,5,0.83,1.00,0.00,0.00,0.83,0.33,0.00,5,6,1.00,0.00,0.20,0.33,1.00,0.83,0.00,0.00,neptune.',
 '0,icmp,ecr_i,SF,1032,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,511,511,0.00,0.00,0.00,0.00,1.00,0.00,0.00,255,123,0.48,0.01,0.48,0.00,0.00,0.00,0.00,0.00,smurf.',
 '0,icmp,ecr_i,SF,1032,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,511,511,0.00,0.00,0.00,0.00,1.00,0.00,0.00,255,133,0.52,0.01,0.52,0.00,0.00,0.00,0.00,0.00,smurf.',
 '0,icmp,ecr_i,SF,1032,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,511,511,0.00,0.00,0.00,0.00,1.00,0.00,0.00,255,183,0.72,0.01,0.72,0.00,0.00,0.00,0.00,0.00,smurf.']

In [8]:
# Show five elements of the normal RDD as a quick sanity check of the split.
normal_raw_data.take(5)

['0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,9,9,1.00,0.00,0.11,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,19,19,1.00,0.00,0.05,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,29,29,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,0.00,0.00,39,39,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.',
 '0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,0.00,0.00,49,49,1.00,0.00,0.02,0.00,0.00,0.00,0.00,0.00,normal.']

In [17]:
# Turn every comma-separated line into a list of its fields.
csv_data = raw_data.map(lambda line: line.split(','))

# Distinct values found at field index 1 of each record.
protocols = csv_data.map(lambda fields: fields[1]).distinct()

In [25]:
# Bring the distinct protocol values back to the driver as a Python list.
protocols.collect()

['tcp', 'udp', 'icmp']

In [24]:
# Distinct values found at field index 2 of each record.
services = csv_data.map(lambda fields: fields[2]).distinct()

# Bring the distinct service values back to the driver as a Python list.
services.collect()

['http',
 'smtp',
 'finger',
 'domain_u',
 'auth',
 'telnet',
 'ftp',
 'eco_i',
 'ntp_u',
 'ecr_i',
 'other',
 'private',
 'pop_3',
 'ftp_data',
 'rje',
 'time',
 'mtp',
 'link',
 'remote_job',
 'gopher',
 'ssh',
 'name',
 'whois',
 'domain',
 'login',
 'imap4',
 'daytime',
 'ctf',
 'nntp',
 'shell',
 'IRC',
 'nnsp',
 'http_443',
 'exec',
 'printer',
 'efs',
 'courier',
 'uucp',
 'klogin',
 'kshell',
 'echo',
 'discard',
 'systat',
 'supdup',
 'iso_tsap',
 'hostnames',
 'csnet_ns',
 'pop_2',
 'sunrpc',
 'uucp_path',
 'netbios_ns',
 'netbios_ssn',
 'netbios_dgm',
 'sql_net',
 'vmnet',
 'bgp',
 'Z39_50',
 'ldap',
 'netstat',
 'urh_i',
 'X11',
 'urp_i',
 'pm_dump',
 'tftp_u',
 'tim_i',
 'red_i']

In [23]:
# Pair every protocol with every service, then collect the pairs on the
# driver; `product` is therefore a plain Python list, not an RDD.
product = (
    protocols
    .cartesian(services)
    .collect()
)
print("There are {} combinations of protocols X services".format(len(product)))

There are 198 combinations of protocols X services


In [26]:
# Compare the types: `protocols` was built lazily, `product` was collected.
(type(protocols), type(product))

(pyspark.rdd.PipelinedRDD, list)