In [3]:
from pyspark import SparkContext
# Reuse the active SparkContext if one exists, otherwise create one.
sc = SparkContext.getOrCreate()
# Gzip-compressed training file, resolved relative to the working directory.
data_file = "./kddcup.data.gz"
# RDD of text lines; each line is later split on commas by the parse_* functions.
raw_data = sc.textFile(data_file)

In [4]:
# Report how many lines (records) the training file contains.
print("Train data size is {}".format(raw_data.count()))

Train data size is 4898431


In [5]:
# load test data
import os
import urllib.request as rq

# Download the test archive only when it is not already on disk, so that
# re-running the notebook does not fetch the same file again.
if os.path.exists("corrected.gz"):
    # Keep the (filename, headers) shape urlretrieve returns; there are no
    # headers because no request was made.
    ft = ("corrected.gz", None)
else:
    # Fetch into a scratch name and rename only on success, so an interrupted
    # transfer never leaves a truncated "corrected.gz" behind.
    ft_headers = rq.urlretrieve("http://kdd.ics.uci.edu/databases/kddcup99/corrected.gz", "corrected.gz.part")[1]
    os.replace("corrected.gz.part", "corrected.gz")
    ft = ("corrected.gz", ft_headers)

In [7]:
# Gzip-compressed test file, resolved relative to the working directory.
test_data_file = "./corrected.gz"
# RDD of text lines, in the same comma-separated layout the parse_* functions expect.
test_raw_data = sc.textFile(test_data_file)

# Report how many lines (records) the test file contains.
print("Test data size is {}".format(test_raw_data.count()))

Test data size is 311029


In [8]:
from pyspark.mllib.regression import LabeledPoint
from numpy import array

def parse_interaction(line):
    """Convert one comma-separated record into a binary-labelled LabeledPoint.

    Field 0 and fields 4 through 40 are converted to floats and become the
    feature vector; fields 1-3 are skipped and field 41 supplies the label.

    Args:
        line: One comma-separated line of the data file.

    Returns:
        A LabeledPoint whose label is 0.0 when field 41 equals 'normal.'
        and 1.0 for any other value.
    """
    fields = line.split(",")
    # Positions left out of the features: 1, 2, 3 and 41 (the label).
    kept = fields[:1] + fields[4:41]
    label = 0.0 if fields[41] == 'normal.' else 1.0
    return LabeledPoint(label, array(list(map(float, kept))))

# Apply parse_interaction to every training line, giving an RDD of LabeledPoint.
training_data = raw_data.map(parse_interaction)

In [10]:
# Peek at the first parsed training record to sanity-check label and features.
training_data.take(1)

[LabeledPoint(0.0, [0.0,215.0,45076.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0])]

In [11]:
# Parse the test lines with the same function used for the training data.
test_data = test_raw_data.map(parse_interaction)

In [12]:
# Peek at the first parsed test record to sanity-check label and features.
test_data.take(1)

[LabeledPoint(0.0, [0.0,105.0,146.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255.0,254.0,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0])]

In [23]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from time import time

# Build the model
# Fit logistic regression (L-BFGS) on the parsed training RDD with default
# settings, and record the wall-clock time the fit takes.
t0 = time()
logit_model = LogisticRegressionWithLBFGS.train(training_data)
tt = time() - t0

print("Classifier trained in {} seconds".format(round(tt,3)))

Classifier trained in 2919.667 seconds


In [24]:
# Pair each test record's true label with the class logit_model predicts for its features.
labels_and_preds = test_data.map(lambda p: (p.label, logit_model.predict(p.features)))

In [25]:
# Check what kind of object the map() call above produced.
type(labels_and_preds)

pyspark.rdd.PipelinedRDD

In [26]:
# Inspect the first five (true label, predicted class) pairs.
labels_and_preds.take(5)

[(0.0, 0), (0.0, 0), (0.0, 0), (1.0, 0), (1.0, 0)]

In [27]:
# Accuracy = pairs where label and prediction agree / total number of test records.
# The timed region covers both count() calls.
t0 = time()
test_accuracy = (labels_and_preds.filter(lambda x: x[0] == x[1])).count() / float(test_data.count())
tt = time() - t0
print("Prediction made in {} seconds. Test accuracy is {}".format(round(tt,3), round(test_accuracy,4)))

Prediction made in 17.196 seconds. Test accuracy is 0.8626


In [32]:
def parse_interaction_corr(line):
    """Convert one record into a LabeledPoint using a reduced set of fields.

    Args:
        line: One comma-separated line of the data file.

    Returns:
        A LabeledPoint whose label is 0.0 when field 41 equals 'normal.'
        and 1.0 otherwise, with the kept fields converted to floats.
    """
    fields = line.split(',')
    # Half-open [start, stop) ranges of the fields kept as features; positions
    # 1-3, 25, 27, 35, 38, 40 and 41 fall outside all of them.
    keep_ranges = ((0, 1), (4, 25), (26, 27), (28, 35), (36, 38), (39, 40))
    kept = [value for start, stop in keep_ranges for value in fields[start:stop]]
    if fields[41] == 'normal.':
        label = 0.0
    else:
        label = 1.0
    return LabeledPoint(label, array([float(value) for value in kept]))

# Parse both files with the reduced field selection of parse_interaction_corr.
corr_reduced_training_data = raw_data.map(parse_interaction_corr)
corr_reduced_test_data = test_raw_data.map(parse_interaction_corr)

In [33]:
# Build new model
# Same default-settings fit as before, but on the reduced feature set; the
# wall-clock time of the fit is recorded for comparison.
t0 = time()
logit_model_2 = LogisticRegressionWithLBFGS.train(corr_reduced_training_data)
tt = time() - t0

print("Classifier trained in {} seconds".format(round(tt,3)))

Classifier trained in 1600.278 seconds


In [35]:
# Rebinds labels_and_preds: true label paired with logit_model_2's prediction
# on the reduced-feature test records.
labels_and_preds = corr_reduced_test_data.map(lambda p: (p.label, logit_model_2.predict(p.features)))

# Accuracy = agreeing pairs / total test records; the timer covers both count() calls.
t0 = time()
test_accuracy = (labels_and_preds.filter(lambda x: x[0] == x[1])).count() / float(corr_reduced_test_data.count())
tt = time() - t0
print("Prediction made in {} seconds. Test accuracy is {}".format(round(tt,3), round(test_accuracy,4)))

Prediction made in 16.06 seconds. Test accuracy is 0.8134


In [36]:
# Names of the 35 fields at positions 6 through 40 of each record, in file order.
# They label the chi-squared results positionally, so the order must match the
# feature vectors built by parse_interaction_categorical (line_split[6:41]).
feature_names = ["land","wrong_fragment",
             "urgent","hot","num_failed_logins","logged_in","num_compromised",
             "root_shell","su_attempted","num_root","num_file_creations",
             "num_shells","num_access_files","num_outbound_cmds",
             "is_host_login","is_guest_login","count","srv_count","serror_rate",
             "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
             "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
             "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
             "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
             "dst_host_rerror_rate","dst_host_srv_rerror_rate"]

In [37]:
def parse_interaction_categorical(line):
    """Convert one record into a LabeledPoint built from fields 6 through 40.

    Args:
        line: One comma-separated line of the data file.

    Returns:
        A LabeledPoint whose label is 0.0 when field 41 equals 'normal.'
        and 1.0 otherwise, with fields 6-40 converted to floats as features.
    """
    fields = line.split(",")
    # True -> 1.0 for anything that is not 'normal.', False -> 0.0 otherwise.
    label = float(fields[41] != 'normal.')
    features = array(list(map(float, fields[6:41])))
    return LabeledPoint(label, features)

# Training records reduced to fields 6-40; this RDD is the input to the chi-squared test.
training_data_categorical = raw_data.map(parse_interaction_categorical)

In [38]:
from pyspark.mllib.stat import Statistics

# Chi-squared test of each feature in the LabeledPoint RDD against the label;
# the result holds one test summary per feature, in feature order.
chi = Statistics.chiSqTest(training_data_categorical)

In [39]:
# Check what container chiSqTest returned.
type(chi)

list

In [40]:
# Look at the first five per-feature result objects.
chi[0:5]

[<pyspark.mllib.stat.test.ChiSqTestResult at 0x116a03278>,
 <pyspark.mllib.stat.test.ChiSqTestResult at 0x116a03208>,
 <pyspark.mllib.stat.test.ChiSqTestResult at 0x116a03390>,
 <pyspark.mllib.stat.test.ChiSqTestResult at 0x116a03438>,
 <pyspark.mllib.stat.test.ChiSqTestResult at 0x116a034e0>]

In [41]:
# The bare repr of a single result is not informative; print() it to get the summary.
chi[0]

<pyspark.mllib.stat.test.ChiSqTestResult at 0x116a03278>

In [42]:
# Human-readable summary of the test for the first feature.
print(chi[0])

Chi squared test summary:
method: pearson
degrees of freedom = 1 
statistic = 0.46498353951455057 
pValue = 0.4953040728284166 
No presumption against null hypothesis: the occurrence of the outcomes is statistically independent..


In [52]:
import pandas as pd
# Cap the width pandas uses for a column when it renders the table.
pd.set_option('display.max_colwidth', 30)

# One (statistic, p-value) pair per tested feature, formatted as 4-decimal strings.
records = [("{:.4f}".format(result.statistic), "{:.4f}".format(result.pValue)) for result in chi]

# Rows are labelled positionally, so feature_names must list the features in the
# same order as the vectors that were passed to chiSqTest.
chi_df = pd.DataFrame(data=records, index= feature_names, columns=["Statistic","p-value"])

chi_df

Unnamed: 0,Statistic,p-value
land,0.465,0.4953
wrong_fragment,306.8555,0.0
urgent,38.7184,0.0
hot,19463.3143,0.0
num_failed_logins,127.7691,0.0
logged_in,3273098.0557,0.0
num_compromised,2011.8627,0.0
root_shell,1044.9179,0.0
su_attempted,434.0,0.0
num_root,22871.6756,0.0


In [56]:
# Human-readable summary of the test for the second feature.
print(chi[1])

Chi squared test summary:
method: pearson
degrees of freedom = 2 
statistic = 306.8555075496753 
pValue = 0.0 
Very strong presumption against null hypothesis: the occurrence of the outcomes is statistically independent..


In [54]:
def parse_interaction_chi(line):
    """Convert one record into a LabeledPoint, dropping fields 1, 2, 3, 6 and 19.

    Args:
        line: One comma-separated line of the data file.

    Returns:
        A LabeledPoint whose label is 0.0 when field 41 equals 'normal.'
        and 1.0 otherwise, with the remaining fields below position 41
        converted to floats as features.
    """
    fields = line.split(",")
    # leave_out = [1,2,3,6,19,41]
    dropped = {1, 2, 3, 6, 19}
    # Only positions below 41 are candidates; field 41 is the label.
    kept = [value for position, value in enumerate(fields[:41]) if position not in dropped]
    label = 0.0 if fields[41] == 'normal.' else 1.0
    return LabeledPoint(label, array([float(value) for value in kept]))

# Parse both files with the field selection of parse_interaction_chi.
training_data_chi = raw_data.map(parse_interaction_chi)
test_data_chi = test_raw_data.map(parse_interaction_chi)

In [57]:
# Build the model
# Default-settings logistic regression fit on the parse_interaction_chi
# features; the wall-clock time of the fit is recorded.
t0 = time()
logit_model_chi = LogisticRegressionWithLBFGS.train(training_data_chi)
tt = time() - t0

print("Classifier trained in {} seconds".format(round(tt,3)))

Classifier trained in 1595.321 seconds


In [59]:
# Rebinds labels_and_preds: true label paired with logit_model_chi's prediction.
labels_and_preds = test_data_chi.map(lambda p: (p.label, logit_model_chi.predict(p.features)))
# Accuracy = agreeing pairs / total test records; the timer covers both count() calls.
t0 = time()
test_accuracy = labels_and_preds.filter(lambda p: p[0] == p[1]).count() / float(test_data_chi.count())
tt = time() - t0

print("Prediction made in {} seconds. Test accuracy is {}".format(round(tt,3), round(test_accuracy,4)))

Prediction made in 20.386 seconds. Test accuracy is 0.872
