# PySpark MLlib 

This notebook is based on https://github.com/jadianes/spark-py-notebooks

MLlib is Spark's machine learning library.  

## Getting the data

In [1]:
import os
import urllib.request

# Where the 10% KDD Cup 99 sample comes from and where it is stored locally.
DATA_URL = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz"
DATA_PATH = "kddcup.data_10_percent.gz"

if os.path.exists(DATA_PATH):
    # Already downloaded by a previous run: skip the network round-trip so
    # "Restart Kernel -> Run All" stays cheap. `f` keeps the
    # (filename, headers) shape that urlretrieve returns.
    f = (DATA_PATH, None)
else:
    # Download to a temporary name first and rename afterwards, so an
    # interrupted transfer never leaves a truncated file under DATA_PATH.
    partial_path = DATA_PATH + ".part"
    _, headers = urllib.request.urlretrieve(DATA_URL, partial_path)
    os.replace(partial_path, DATA_PATH)
    f = (DATA_PATH, headers)

## Create a PySpark session

In [2]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Local application configuration for this notebook.
conf = pyspark.SparkConf().setAppName('MLlib-PySpark').setMaster('local')

# Only one SparkContext may be active at a time, so constructing a new one
# fails when this cell is executed a second time in the same kernel.
# getOrCreate reuses the running context/session instead of raising.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

## Create RDD

In [3]:
# Local path of the archive fetched in the "Getting the data" cell.
data_file = './kddcup.data_10_percent.gz'
# Read the file through the SparkContext; the cells below treat each element
# of raw_data as one comma-separated text line.
raw_data = sc.textFile(data_file)

## Local vectors

A local vector is often used as a base type for RDDs in Spark MLlib. A local vector has integer-typed, 0-based indices and double-typed values, and is stored on a single machine. MLlib supports two types of local vectors: dense and sparse. A dense vector is backed by a double array representing its entry values, while a sparse vector is backed by two parallel arrays: indices and values.

- For dense vectors, MLlib uses either the Python list or the NumPy array type. The latter is recommended, so you can simply pass NumPy arrays around.

- For sparse vectors, users can construct a SparseVector object from MLlib or pass SciPy scipy.sparse column vectors if SciPy is available in their environment. The easiest way to create sparse vectors is to use the factory methods implemented in Vectors.

### An RDD of dense vectors

Let's represent each network interaction in our dataset as a dense vector. For that we will use the NumPy array type.

In [4]:
import numpy as np

def parse_interaction(line):
    """Convert one comma-separated record into a NumPy vector of floats.

    The fields at positions 1, 2, 3 and 41 are symbolic and are skipped;
    every remaining field is converted with ``float``.

    Parameters
    ----------
    line : str
        A single comma-separated record.

    Returns
    -------
    numpy.ndarray
        The kept fields, in their original order, as floats.

    Raises
    ------
    ValueError
        If a kept field cannot be converted to ``float``.
    """
    skipped_positions = (1, 2, 3, 41)
    numeric_values = []
    for position, field in enumerate(line.split(",")):
        if position in skipped_positions:
            continue
        numeric_values.append(float(field))
    return np.array(numeric_values)

# RDD of NumPy vectors: parse_interaction applied to every raw text line.
vector_data = raw_data.map(parse_interaction)

In [5]:
# Summary statistics
from pyspark.mllib.stat import Statistics

# Column-wise summary statistics over the RDD of dense vectors.
summary = Statistics.colStats(vector_data)

# Column 0 is the duration feature; report its mean rounded to 3 decimals.
print(
    'Duration statistics:',
    'Mean: {}'.format(round(summary.mean()[0], 3)),
    sep='\n',
)

Duration statistics:
Mean: 47.979


In [8]:
from math import sqrt
def parse_interaction_with_key(line):
    """Convert one comma-separated record into a ``(key, vector)`` pair.

    The key is the raw text of field 41. The vector holds every field except
    those at positions 1, 2, 3 and 41, each converted with ``float``.

    Parameters
    ----------
    line : str
        A single comma-separated record with at least 42 fields.

    Returns
    -------
    tuple
        ``(key, numpy.ndarray)`` for the record.

    Raises
    ------
    IndexError
        If the record has fewer than 42 fields.
    ValueError
        If a kept field cannot be converted to ``float``.
    """
    fields = line.split(",")
    # Look the key up before converting anything, so a short record fails
    # on the missing key rather than on a conversion.
    key = fields[41]
    skipped_positions = (1, 2, 3, 41)
    numeric_values = []
    for position, field in enumerate(fields):
        if position in skipped_positions:
            continue
        numeric_values.append(float(field))
    return (key, np.array(numeric_values))

# RDD of (label, vector) pairs built from every raw line.
# Note: summary_by_label below builds its own keyed RDD from raw_data
# instead of reusing this one.
label_vector_data = raw_data.map(parse_interaction_with_key)

def summary_by_label(raw_data, label):
    """Compute column summary statistics for the records carrying ``label``.

    Parameters
    ----------
    raw_data
        RDD of comma-separated text records.
    label : str
        Key value to keep, compared for equality with the key produced by
        ``parse_interaction_with_key``.

    Returns
    -------
    The result of ``Statistics.colStats`` over the vectors of the matching
    records.
    """
    keyed_vectors = raw_data.map(parse_interaction_with_key)
    matching = keyed_vectors.filter(lambda pair: pair[0] == label)
    return Statistics.colStats(matching.values())

# Every label value for which statistics are computed.
label_list = [
    "back.", "buffer_overflow.", "ftp_write.", "guess_passwd.",
    "imap.", "ipsweep.", "land.", "loadmodule.", "multihop.",
    "neptune.", "nmap.", "normal.", "perl.", "phf.", "pod.", "portsweep.",
    "rootkit.", "satan.", "smurf.", "spy.", "teardrop.", "warezclient.",
    "warezmaster.",
]

# One (label, column statistics) pair per label.
stats_by_label = [
    (current_label, summary_by_label(raw_data, current_label))
    for current_label in label_list
]

# For the duration column (index 0), collect
# [mean, standard deviation, min, max, record count] per label.
duration_by_label = [
    (
        current_label,
        np.array([
            float(label_stats.mean()[0]),
            float(sqrt(label_stats.variance()[0])),
            float(label_stats.min()[0]),
            float(label_stats.max()[0]),
            int(label_stats.count()),
        ]),
    )
    for current_label, label_stats in stats_by_label
]

In [10]:
duration_by_label

[('back.',
  array([1.28915116e-01, 1.11006217e+00, 0.00000000e+00, 1.40000000e+01,
         2.20300000e+03])),
 ('buffer_overflow.',
  array([ 91.7       ,  97.51468501,   0.        , 321.        ,
          30.        ])),
 ('ftp_write.',
  array([ 32.375     ,  47.44903281,   0.        , 134.        ,
           8.        ])),
 ('guess_passwd.',
  array([ 2.71698113, 11.87981054,  0.        , 60.        , 53.        ])),
 ('imap.',
  array([ 6.       , 14.1742404,  0.       , 41.       , 12.       ])),
 ('ipsweep.',
  array([3.44827586e-02, 4.38439193e-01, 0.00000000e+00, 7.00000000e+00,
         1.24700000e+03])),
 ('land.', array([ 0.,  0.,  0.,  0., 21.])),
 ('loadmodule.',
  array([ 36.22222222,  41.40886915,   0.        , 103.        ,
           9.        ])),
 ('multihop.',
  array([184.        , 253.85100617,   0.        , 718.        ,
           7.        ])),
 ('neptune.', array([     0.,      0.,      0.,      0., 107201.])),
 ('nmap.', array([  0.,   0.,   0.,   0., 231

### Correlations
Spark's MLlib supports Pearson's and Spearman's methods for calculating pairwise correlations among many series. Both are provided by the `corr` method in the `Statistics` package.

In [11]:
import pandas as pd
from pyspark.mllib.stat import Statistics

# Show up to 50 columns so the full correlation table is rendered.
pd.set_option('display.max_columns', 50)

# Names of the numeric features, in the order they appear in each vector.
col_names = [
    "duration", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted",
    "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds", "is_hot_login",
    "is_guest_login", "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate",
    "srv_rerror_rate", "same_srv_rate", "diff_srv_rate",
    "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
]

# Pairwise Spearman correlation between all feature columns.
correlation_matrix = Statistics.corr(vector_data, method="spearman")

# Label both axes with the feature names for display.
corr_df = pd.DataFrame(correlation_matrix, index=col_names, columns=col_names)

corr_df

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_hot_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
duration,1.0,0.014196,0.299189,-0.001068,-0.008025,0.017884,0.108639,0.014363,0.159564,0.010687,0.040425,0.026012,0.013401,0.061099,0.008633,0.019407,,,0.205607,-0.259032,-0.250139,-0.074211,-0.073663,-0.025936,-0.02642,0.062291,-0.050875,0.123621,-0.161107,-0.217167,-0.211979,0.231644,-0.065202,0.100692,-0.056753,-0.057298,-0.007759,-0.013891
src_bytes,0.014196,1.0,-0.167931,-0.009404,-0.019358,9.4e-05,0.11392,-0.008396,-0.089702,0.118562,0.003067,0.002282,-0.00205,0.02771,0.014403,-0.001497,,,0.027511,0.66623,0.722609,-0.65746,-0.652391,-0.34218,-0.332977,0.744046,-0.739988,-0.104042,0.130377,0.741979,0.729151,-0.712965,0.815039,-0.140231,-0.645919,-0.641792,-0.297338,-0.300581
dst_bytes,0.299189,-0.167931,1.0,-0.00304,-0.022659,0.007234,0.193156,0.021952,0.882185,0.169772,0.026054,0.012191,-0.003884,0.034154,-5.5e-05,0.065775,,,0.085947,-0.639157,-0.497683,-0.205848,-0.198715,-0.100958,-0.081307,0.229677,-0.222572,0.521003,-0.611972,0.024124,0.055033,-0.035073,-0.396195,0.578557,-0.167047,-0.158378,-0.003042,0.001621
land,-0.001068,-0.009404,-0.00304,1.0,-0.000334,-1.9e-05,-0.000538,-7.5e-05,-0.002784,-0.000449,-7e-05,-3.3e-05,-0.00023,-0.000155,-6.8e-05,-0.000202,,,-0.000249,-0.010939,-0.010128,0.01416,0.014343,-0.000452,-0.001688,0.002153,-0.001846,0.02068,-0.019922,-0.012342,0.002574,-0.001803,0.004265,0.016173,0.013565,0.012264,0.000386,-0.00182
wrong_fragment,-0.008025,-0.019358,-0.022659,-0.000334,1.0,-0.000143,-0.004042,-0.000566,-0.020911,-0.003371,-0.000529,-0.000247,-0.001726,-0.001161,-0.000509,-0.00152,,,-0.001868,-0.057711,-0.029117,-0.00885,-0.023382,0.00043,-0.012676,0.010218,-0.009386,0.012117,-0.029149,-0.058225,-0.04956,0.055542,-0.015449,0.007306,0.010387,-0.024117,0.046655,-0.013666
urgent,0.017884,9.4e-05,0.007234,-1.9e-05,-0.000143,1.0,0.008596,0.062973,0.006821,0.031781,0.067394,-1.4e-05,0.061989,0.061373,-2.9e-05,0.023389,,,-0.000106,-0.00478,-0.004798,-0.001335,-0.001327,-0.000711,-0.00072,0.001524,-0.001526,-0.000781,-0.005898,-0.0057,-0.004081,0.00521,-0.001941,-0.000975,-0.001379,-0.001369,-0.000788,-0.000776
hot,0.108639,0.11392,0.193156,-0.000538,-0.004042,0.008596,1.0,0.112558,0.189126,0.811529,0.101986,-0.000397,0.003096,0.028693,0.009144,0.004223,,,0.463709,-0.120847,-0.114735,-0.035487,-0.034934,0.013468,0.052002,0.041342,-0.040555,0.032141,-0.074178,-0.01796,0.018783,-0.017198,-0.086998,-0.014141,-0.004706,-0.010721,0.199018,0.189142
num_failed_logins,0.014363,-0.008396,0.021952,-7.5e-05,-0.000566,0.062973,0.112558,1.0,-0.002189,0.004621,0.016873,0.072693,0.010047,0.015221,-0.000115,0.005573,,,-0.000421,-0.018024,-0.018027,-0.003674,-0.004027,0.035325,0.034879,0.005716,-0.005538,-0.003099,-0.028371,-0.015092,0.003003,-0.002961,-0.006617,-0.002585,0.014713,0.014914,0.032393,0.032151
logged_in,0.159564,-0.089702,0.882185,-0.002784,-0.020911,0.006821,0.189126,-0.002189,1.0,0.16119,0.025293,0.011814,0.082533,0.05553,0.024356,0.072697,,,0.089318,-0.578287,-0.438947,-0.187114,-0.180122,-0.091962,-0.072287,0.216969,-0.214019,0.503807,-0.682721,0.080352,0.114526,-0.093565,-0.359506,0.659078,-0.143283,-0.132474,0.007236,0.012979
num_compromised,0.010687,0.118562,0.169772,-0.000449,-0.003371,0.031781,0.811529,0.004621,0.16119,1.0,0.085552,0.04897,0.028557,0.031221,0.011261,0.006979,,,-0.002506,-0.097212,-0.091154,-0.030516,-0.030264,0.008573,0.054006,0.035253,-0.034953,0.036497,-0.041615,0.003465,0.038979,-0.039091,-0.078844,-0.020978,-0.005019,-0.004504,0.214115,0.217859
