In [1]:
from pyspark import SparkContext, SparkConf
import urllib

# Download the 10-percent KDD Cup 99 archive into the working directory.
# `urllib.urlretrieve` is the Python 2 location of this function; `f` keeps
# whatever the call returns.
f = urllib.urlretrieve(
    "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz",
    "kddcup.data_10_percent.gz",
)

In [2]:
# Prepare the RDD: point Spark at the downloaded archive and load it as text.
data_file = "./kddcup.data_10_percent.gz"

conf = SparkConf().setAppName("KDD_MLlib")
# getOrCreate() hands back the context already running in this kernel when
# there is one, so the cell can be re-executed. Constructing SparkContext
# directly a second time raises an error, because only one context may be
# active at a time.
sc = SparkContext.getOrCreate(conf=conf)

# Each element of raw_data is one line of the file, as a string.
raw_data = sc.textFile(data_file)

In [11]:
# represent each network interaction in our dataset as a dense vector
import numpy as np

def parse_interaction(line):
    """Convert one comma-separated record into a numeric numpy vector.

    Fields at positions 1, 2, 3 and 41 are treated as symbolic and left out;
    every remaining field is converted with ``float`` in its original order.
    """
    skipped_positions = (1, 2, 3, 41)
    numeric_values = []
    for position, field in enumerate(line.split(',')):
        if position in skipped_positions:
            continue
        numeric_values.append(float(field))
    return np.array(numeric_values)

# Apply parse_interaction to every line of raw_data, giving an RDD of numeric vectors.
vector_data = raw_data.map(parse_interaction)

In [7]:
# Echo the RDD handle; the cell output is the RDD's repr, not its rows.
vector_data

PythonRDD[2] at RDD at PythonRDD.scala:48

In [12]:
from pyspark.mllib.stat import Statistics 
from math import sqrt 

# Column-wise summary statistics over every parsed interaction.
summary = Statistics.colStats(vector_data)

# Report column 0 under the "Duration" heading. Computed figures are rounded
# to three decimals; the two counts are printed as returned.
# Each print takes a single pre-formatted string, so the parenthesised form
# emits exactly the same text as the bare print statement.
print("Duration Statistics:")
print(" Mean: {}".format(round(summary.mean()[0], 3)))
print(" St. deviation: {}".format(round(sqrt(summary.variance()[0]), 3)))
print(" Max value: {}".format(round(summary.max()[0], 3)))
print(" Min value: {}".format(round(summary.min()[0], 3)))
print(" Total value count: {}".format(summary.count()))
print(" Number of non-zero values: {}".format(summary.numNonzeros()[0]))

Duration Statistics:
 Mean: 47.979
 St. deviation: 707.746
 Max value: 58329.0
 Min value: 0.0
 Total value count: 494021
 Number of non-zero values: 12350.0


In [22]:
# Obtain summary statistics per type of network attack, i.e. per 'label' in our dataset.
# To do so, build an RDD of (label, vector) pairs that can then be filtered by label.

def parse_interaction_with_key(line):
    """Split one comma-separated record into a ``(label, vector)`` pair.

    The field at position 41 is returned as the key. Fields at positions
    1, 2, 3 and 41 are left out of the vector; every other field is converted
    with ``float`` and packed into a numpy array.
    """
    fields = line.split(',')
    excluded_positions = (1, 2, 3, 41)
    # Read the key first so a record with too few fields fails on this lookup
    # before any numeric conversion is attempted.
    label = fields[41]
    numeric_values = []
    for position, field in enumerate(fields):
        if position not in excluded_positions:
            numeric_values.append(float(field))
    return (label, np.array(numeric_values))
            
# Pair every line of raw_data with its label via parse_interaction_with_key.
label_vector_data = raw_data.map(parse_interaction_with_key)

In [23]:
# Keep only the pairs whose key is exactly "normal." (the trailing dot is part of the key).
normal_label_data = label_vector_data.filter(lambda x: x[0]=="normal.")

In [26]:
# Drop the keys and compute column statistics over the remaining vectors.
normal_summary = Statistics.colStats(normal_label_data.values())

In [27]:
# Column 0 ("Duration") statistics for the interactions labelled "normal.".
print("Duration Statistics for label: {}".format("normal"))
# The mean goes through round() like the other figures. Previously the 3 was
# handed to format() as a stray extra argument, which format() silently
# ignores, so the mean was printed unrounded.
print(" Mean: {}".format(round(normal_summary.mean()[0], 3)))
print(" St. deviation: {}".format(round(sqrt(normal_summary.variance()[0]), 3)))
print(" Max value: {}".format(round(normal_summary.max()[0], 3)))
print(" Min value: {}".format(round(normal_summary.min()[0], 3)))
print(" Total value count: {}".format(normal_summary.count()))
print(" Number of non-zero values: {}".format(normal_summary.numNonzeros()[0]))

Duration Statistics for label: normal
 Mean: 216.657322313
 St. deviation: 1359.213
 Max value: 58329.0
 Min value: 0.0
 Total value count: 97278
 Number of non-zero values: 11690.0


In [28]:
# create a function so that we can reuse this logic for any label
def summary_by_label(raw_data, label):
    """Return column summary statistics for the records carrying ``label``.

    Each line of ``raw_data`` is parsed with ``parse_interaction_with_key``;
    only pairs whose key equals ``label`` are kept, and the statistics are
    computed over the vectors of those pairs.
    """
    keyed_vectors = raw_data.map(parse_interaction_with_key)
    matching = keyed_vectors.filter(lambda pair: pair[0] == label)
    return Statistics.colStats(matching.values())
    

In [29]:
# Use summary_by_label for the label 'normal.' and report column 0 ("Duration").
normal_sum = summary_by_label(raw_data, 'normal.')

print("Duration Statistics for label: {}".format("normal"))
# The mean goes through round() like the other figures. Previously the 3 was
# handed to format() as a stray extra argument, which format() silently
# ignores, so the mean was printed unrounded.
print(" Mean: {}".format(round(normal_sum.mean()[0], 3)))
print(" St. deviation: {}".format(round(sqrt(normal_sum.variance()[0]), 3)))
print(" Max value: {}".format(round(normal_sum.max()[0], 3)))
print(" Min value: {}".format(round(normal_sum.min()[0], 3)))
print(" Total value count: {}".format(normal_sum.count()))
print(" Number of non-zero values: {}".format(normal_sum.numNonzeros()[0]))

Duration Statistics for label: normal
 Mean: 216.657322313
 St. deviation: 1359.213
 Max value: 58329.0
 Min value: 0.0
 Total value count: 97278
 Number of non-zero values: 11690.0


In [33]:
# Labels to summarise; each one carries the trailing dot used as the key.
label_list = [
    "back.",
    "buffer_overflow.",
    "ftp_write.",
    "guess_passwd.",
    "imap.",
    "ipsweep.",
    "land.",
    "loadmodule.",
    "multihop.",
    "neptune.",
    "nmap.",
    "normal.",
    "perl.",
    "phf.",
    "pod.",
    "portsweep.",
    "rootkit.",
    "satan.",
    "smurf.",
    "spy.",
    "teardrop.",
    "warezclient.",
    "warezmaster.",
]

# Pair each label with its column statistics; every entry comes from its own
# call to summary_by_label on raw_data.
stats_by_label = [
    (label, summary_by_label(raw_data, label))
    for label in label_list
]

In [34]:
# Reduce each label's summary to its figures for column 0 (the duration
# column): mean, standard deviation, minimum, maximum and record count.
duration_by_label = [
    (
        label_name,
        np.array([
            float(label_stats.mean()[0]),
            float(sqrt(label_stats.variance()[0])),
            float(label_stats.min()[0]),
            float(label_stats.max()[0]),
            int(label_stats.count()),
        ]),
    )
    for label_name, label_stats in stats_by_label
]

In [35]:
# Use Pandas
import pandas as pd
# Let wide tables show up to 50 columns before pandas truncates the display.
pd.set_option('display.max_columns', 50)

# Build the table with the plain DataFrame constructor: one row per
# (label, values) entry of duration_by_label, in list order, indexed by label.
# DataFrame.from_items is deprecated and no longer exists from pandas 1.0
# onwards, so it is avoided here.
stats_by_label_df = pd.DataFrame(
    data=[entry[1] for entry in duration_by_label],
    index=[entry[0] for entry in duration_by_label],
    columns=["Mean", "Std Dev", "Min", "Max", "Count"],
)

In [36]:
# Caption first, then the per-label table as the cell's displayed value.
print("Duration statistics, by label")
stats_by_label_df

Duration statistics, by label


Unnamed: 0,Mean,Std Dev,Min,Max,Count
back.,0.128915,1.110062,0,14,2203
buffer_overflow.,91.7,97.514685,0,321,30
ftp_write.,32.375,47.449033,0,134,8
guess_passwd.,2.716981,11.879811,0,60,53
imap.,6.0,14.17424,0,41,12
ipsweep.,0.034483,0.438439,0,7,1247
land.,0.0,0.0,0,0,21
loadmodule.,36.222222,41.408869,0,103,9
multihop.,184.0,253.851006,0,718,7
neptune.,0.0,0.0,0,0,107201


In [38]:
def get_variable_stats_df(stats_by_label, column_i):
    """Tabulate one column's statistics for every label.

    Args:
        stats_by_label: iterable of ``(label, summary)`` pairs, where
            ``summary`` exposes ``mean()``, ``variance()``, ``min()`` and
            ``max()`` (each indexable by column) plus ``count()``.
        column_i: index of the column to report.

    Returns:
        pandas.DataFrame indexed by label, rows in input order, with the
        columns "Mean", "Std Dev", "Min", "Max" and "Count".
    """
    column_names = ["Mean", "Std Dev", "Min", "Max", "Count"]
    labels = []
    rows = []
    for label, label_stats in stats_by_label:
        labels.append(label)
        rows.append([
            float(label_stats.mean()[column_i]),
            # Standard deviation is derived from the reported variance.
            float(sqrt(label_stats.variance()[column_i])),
            float(label_stats.min()[column_i]),
            float(label_stats.max()[column_i]),
            # Stored as float, as before, so every column shares one dtype.
            float(label_stats.count()),
        ])
    # The plain constructor replaces DataFrame.from_items, which is deprecated
    # and no longer exists from pandas 1.0 onwards.
    return pd.DataFrame(data=rows, index=labels, columns=column_names)

In [39]:
# Per-label statistics for column 0, displayed as the cell's value.
get_variable_stats_df(stats_by_label,0)

Unnamed: 0,Mean,Std Dev,Min,Max,Count
back.,0.128915,1.110062,0,14,2203
buffer_overflow.,91.7,97.514685,0,321,30
ftp_write.,32.375,47.449033,0,134,8
guess_passwd.,2.716981,11.879811,0,60,53
imap.,6.0,14.17424,0,41,12
ipsweep.,0.034483,0.438439,0,7,1247
land.,0.0,0.0,0,0,21
loadmodule.,36.222222,41.408869,0,103,9
multihop.,184.0,253.851006,0,718,7
neptune.,0.0,0.0,0,0,107201


In [40]:
# Caption first, then the per-label table for column 1 as the cell's value.
print("src_bytes statistics, by label")
get_variable_stats_df(stats_by_label, 1)

src_bytes statistics, by label


Unnamed: 0,Mean,Std Dev,Min,Max,Count
back.,54156.355878,3159.360232,13140,54540,2203
buffer_overflow.,1400.433333,1337.132616,0,6274,30
ftp_write.,220.75,267.747616,0,676,8
guess_passwd.,125.339623,3.03786,104,126,53
imap.,347.583333,629.926036,0,1492,12
ipsweep.,10.0834,5.231658,0,18,1247
land.,0.0,0.0,0,0,21
loadmodule.,151.888889,127.745298,0,302,9
multihop.,435.142857,540.960389,0,1412,7
neptune.,0.0,0.0,0,0,107201
