In [1]:
from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS
from pyspark.mllib.recommendation import Rating
from numpy import array

In [2]:
!ls

aas			     ML KNN.ipynb
boston-airbnb-geo.ipynb      notebooks
boston-airbnb-open-data.zip  Pandas - geospatial plot.ipynb
calendar.csv		     passwd
derby.log		     reviews.csv
Desktop			     Spark Kmeans Demo.ipynb
ds-20150527-164028	     Spark ML test.ipynb
ds-20150527-164028.tar.gz    Spark practice and ML tests.ipynb
Folium Test.ipynb	     Spark Works.ipynb
kddcup			     taxidata
kddcup.data_10_percent.gz    tmp
KDDCup Example.ipynb	     uber-raw-data-noheader.csv
kddcup Kmeans.ipynb	     uber-tlc-foil-response
listings.csv


In [3]:
#remove existing copies of dataset from HDFS
#!hdfs dfs -rm  /tmp/expenses.csv

#fetch the dataset
#!wget https://data.gov.au/dataset/f84b9baf-c1c1-437c-8c1e-654b2829848c/resource/88399d53-d55c-466c-8f4a-6cb965d24d6d/download/healthexpenditurebyareaandsource.csv -O /tmp/expenses.csv

#remove header
#!sed -i '1d' /tmp/expenses.csv
#remove empty fields
#!sed -i "s/,,,,,//g" /tmp/expenses.csv
#!sed -i '/^\s*$/d' /tmp/expenses.csv

#put data into HDFS
#!hdfs dfs -put /tmp/expenses.csv /tmp
!hdfs dfs -ls -h /tmp/expenses.csv
#!rm /tmp/expenses.csv

-rw-r--r--   3 cloud-user hdfs    456.4 K 2017-01-13 07:32 /tmp/expenses.csv


In [4]:
dataset=sc.textFile("/tmp/expenses.csv")

In [5]:
dataset.count()

6778

In [6]:
dataset.first()

u'1997-98,NSW,Administration,Government,Australian Government,315'

In [7]:
health = dataset.map(lambda l: l.split(',')).map(lambda row: (row[0],row[1],row[2],row[3],row[4],int(row[5])))

In [8]:
health.take(3)

[(u'1997-98',
  u'NSW',
  u'Administration',
  u'Government',
  u'Australian Government',
  315),
 (u'1997-98',
  u'NSW',
  u'Administration',
  u'Government',
  u'State and local',
  120),
 (u'1997-98',
  u'NSW',
  u'Administration',
  u'Non-government',
  u'Private health insurance funds',
  314)]

In [9]:
hdf = health.toDF(["year","state","category","funding_src1","funding_scr2","spending"])

In [10]:
from pyspark.sql import HiveContext, Row
sqlContext = HiveContext(sc)

In [11]:
hdf.show()

+-------+-----+--------------------+--------------+--------------------+--------+
|   year|state|            category|  funding_src1|        funding_scr2|spending|
+-------+-----+--------------------+--------------+--------------------+--------+
|1997-98|  NSW|      Administration|    Government|Australian Govern...|     315|
|1997-98|  NSW|      Administration|    Government|     State and local|     120|
|1997-98|  NSW|      Administration|Non-government|Private health in...|     314|
|1997-98|  NSW| Aids and appliances|    Government|Australian Govern...|      65|
|1997-98|  NSW| Aids and appliances|Non-government|         Individuals|     168|
|1997-98|  NSW| Aids and appliances|Non-government|Other non-government|      18|
|1997-98|  NSW| Aids and appliances|Non-government|Private health in...|      78|
|1997-98|  NSW|All other medicat...|    Government|Australian Govern...|       5|
|1997-98|  NSW|All other medicat...|Non-government|         Individuals|     559|
|1997-98|  NSW|A

In [12]:
hdf.printSchema()

root
 |-- year: string (nullable = true)
 |-- state: string (nullable = true)
 |-- category: string (nullable = true)
 |-- funding_src1: string (nullable = true)
 |-- funding_scr2: string (nullable = true)
 |-- spending: long (nullable = true)



In [13]:
hdf.registerTempTable("healthtable")

In [14]:
sqlContext.sql("SHOW TABLES").show()

+---------+-----------+
|tableName|isTemporary|
+---------+-----------+
+---------+-----------+



In [15]:
hdf.select("category").take(10)

[Row(category=u'Administration'),
 Row(category=u'Administration'),
 Row(category=u'Administration'),
 Row(category=u'Aids and appliances'),
 Row(category=u'Aids and appliances'),
 Row(category=u'Aids and appliances'),
 Row(category=u'Aids and appliances'),
 Row(category=u'All other medications'),
 Row(category=u'All other medications'),
 Row(category=u'All other medications')]

In [18]:
sc.version

u'1.6.1'

Training Error = 0.366459627329


In [3]:
import urllib
#f = urllib.urlretrieve ("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz", "kddcup.data_10_percent.gz")
#put to hdfs or read local file

data_file = "/tmp/kddcup.data_10_percent.gz"
raw_data = sc.textFile(data_file)

In [4]:
raw_data.take(1)

[u'0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,9,9,1.00,0.00,0.11,0.00,0.00,0.00,0.00,0.00,normal.']

## Detecting network attacks using Logistic Regression

In [37]:
data_file = "/tmp/kddcup.data.gz"
raw_data = sc.textFile(data_file)

print "Train data size is {}".format(raw_data.count())


Train data size is 4898431


In [38]:
test_data_file = "/tmp/corrected.gz"
test_raw_data = sc.textFile(test_data_file)

print "Test data size is {}".format(test_raw_data.count())


Test data size is 311029


In [40]:
from pyspark.mllib.regression import LabeledPoint
from numpy import array

def parse_interaction(line):
    line_split = line.split(",")
    # leave_out = [1,2,3,41]
    clean_line_split = line_split[0:1]+line_split[4:41]
    attack = 1.0
    if line_split[41]=='normal.':
        attack = 0.0
    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))

training_data = raw_data.map(parse_interaction)
test_data = test_raw_data.map(parse_interaction)


In [41]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from time import time

# Build the model
t0 = time()
logit_model = LogisticRegressionWithLBFGS.train(training_data)
tt = time() - t0

print "Classifier trained in {} seconds".format(round(tt,3))


Classifier trained in 570.635 seconds


##  Local vectors

source : https://www.codementor.io/spark/tutorial/mllib-basic-statistics-exploratory-data-analysis

A local vector is often used as a base type for RDDs in Spark MLlib. A local vector has integer-typed and 0-based indices and double-typed values, stored on a single machine. MLlib supports two types of local vectors: dense and sparse. A dense vector is backed by a double array representing its entry values, while a sparse vector is backed by two parallel arrays: indices and values.

For dense vectors, MLlib uses either Python lists or the NumPy array type. The later is recommended, so you can simply pass NumPy arrays around.

For sparse vectors, users can construct a SparseVector object from MLlib or pass SciPy scipy.sparse column vectors if SciPy is available in their environment. The easiest way to create sparse vectors is to use the factory methods implemented in Vectors.

In [17]:
## An RDD of Dense vectors

In [18]:
import numpy as np

data_file = "/tmp/kddcup.data_10_percent.gz"
raw_data = sc.textFile(data_file)

def parse_interaction(line):
    line_split = line.split(",")
    # keep just numeric and logical values
    symbolic_indexes = [1,2,3,41]
    clean_line_split = [item for i,item in enumerate(line_split) if i not in symbolic_indexes]
    return np.array([float(x) for x in clean_line_split])

vector_data = raw_data.map(parse_interaction)

In [19]:
#vector_data.take(1)

In [20]:
from pyspark.mllib.stat import Statistics 
from math import sqrt 

# Compute column summary statistics.
summary = Statistics.colStats(vector_data)

print "Duration Statistics:"
print " Mean: {}".format(round(summary.mean()[0],3))
print " St. deviation: {}".format(round(sqrt(summary.variance()[0]),3))
print " Max value: {}".format(round(summary.max()[0],3))
print " Min value: {}".format(round(summary.min()[0],3))
print " Total value count: {}".format(summary.count())
print " Number of non-zero values: {}".format(summary.numNonzeros()[0])

Duration Statistics:
 Mean: 47.979
 St. deviation: 707.746
 Max value: 58329.0
 Min value: 0.0
 Total value count: 494021
 Number of non-zero values: 12350.0


In [21]:
def parse_interaction_with_key(line):
    line_split = line.split(",")
    # keep just numeric and logical values
    symbolic_indexes = [1,2,3,41]
    clean_line_split = [item for i,item in enumerate(line_split) if i not in symbolic_indexes]
    return (line_split[41], np.array([float(x) for x in clean_line_split]))

label_vector_data = raw_data.map(parse_interaction_with_key)


In [22]:
normal_label_data = label_vector_data.filter(lambda x: x[0]=="normal.")
normal_summary = Statistics.colStats(normal_label_data.values())

In [23]:
print "Duration Statistics for label: {}".format("normal")
print " Mean: {}".format(normal_summary.mean()[0],3)
print " St. deviation: {}".format(round(sqrt(normal_summary.variance()[0]),3))
print " Max value: {}".format(round(normal_summary.max()[0],3))
print " Min value: {}".format(round(normal_summary.min()[0],3))
print " Total value count: {}".format(normal_summary.count())
print " Number of non-zero values: {}".format(normal_summary.numNonzeros()[0])


Duration Statistics for label: normal
 Mean: 216.657322313
 St. deviation: 1359.213
 Max value: 58329.0
 Min value: 0.0
 Total value count: 97278
 Number of non-zero values: 11690.0


In [24]:
def summary_by_label(raw_data, label):
    label_vector_data = raw_data.map(parse_interaction_with_key).filter(lambda x: x[0]==label)
    return Statistics.colStats(label_vector_data.values())


In [25]:
normal_sum = summary_by_label(raw_data, "normal.")

print "Duration Statistics for label: {}".format("normal")
print " Mean: {}".format(normal_sum.mean()[0],3)
print " St. deviation: {}".format(round(sqrt(normal_sum.variance()[0]),3))
print " Max value: {}".format(round(normal_sum.max()[0],3))
print " Min value: {}".format(round(normal_sum.min()[0],3))
print " Total value count: {}".format(normal_sum.count())
print " Number of non-zero values: {}".format(normal_sum.numNonzeros()[0])


Duration Statistics for label: normal
 Mean: 216.657322313
 St. deviation: 1359.213
 Max value: 58329.0
 Min value: 0.0
 Total value count: 97278
 Number of non-zero values: 11690.0


In [31]:
guess_passwd_summary = summary_by_label(raw_data, "guess_passwd.")

print "Duration Statistics for label: {}".format("guess_password")
print " Mean: {}".format(guess_passwd_summary.mean()[0],3)
print " St. deviation: {}".format(round(sqrt(guess_passwd_summary.variance()[0]),3))
print " Max value: {}".format(round(guess_passwd_summary.max()[0],3))
print " Min value: {}".format(round(guess_passwd_summary.min()[0],3))
print " Total value count: {}".format(guess_passwd_summary.count())
print " Number of non-zero values: {}".format(guess_passwd_summary.numNonzeros()[0])


Duration Statistics for label: guess_password
 Mean: 2.71698113208
 St. deviation: 11.88
 Max value: 60.0
 Min value: 0.0
 Total value count: 53
 Number of non-zero values: 4.0


In [34]:
label_list = ["back.","buffer_overflow.","ftp_write.",
              "guess_passwd.","imap.","ipsweep.",
              "land.","loadmodule.","multihop.",
              "neptune.","nmap.","normal.","perl.",
              "phf.","pod.","portsweep.",
              "rootkit.","satan.","smurf.","spy.",
              "teardrop.","warezclient.",
              "warezmaster."]

stats_by_label = [(label, summary_by_label(raw_data, label)) for label in label_list]



duration_by_label = [ 
    (stat[0], 
     np.array([
         float(stat[1].mean()[0]), 
         float(sqrt(stat[1].variance()[0])), 
         float(stat[1].min()[0]), 
         float(stat[1].max()[0]), 
         int(stat[1].count())])) 
    for stat in stats_by_label]

KeyboardInterrupt: 

In [29]:
import pandas as pd
pd.set_option('display.max_columns', 50)

stats_by_label_df = pd.DataFrame.from_items(duration_by_label, columns=["Mean", "Std Dev", "Min", "Max", "Count"], orient='index')

stats_by_label_df

NameError: name 'duration_by_label' is not defined

In [None]:
def get_variable_stats_df(stats_by_label, column_i):
    column_stats_by_label = [
        (stat[0], 
         np.array([
             float(stat[1].mean()[column_i]), 
             float(sqrt(stat[1].variance()[column_i])), 
             float(stat[1].min()[column_i]), 
             float(stat[1].max()[column_i]), 
             int(stat[1].count())])) 
        for stat in stats_by_label
    ]
    return pd.DataFrame.from_items(
        column_stats_by_label, 
        columns=["Mean", "Std Dev", "Min", "Max", "Count"], 
        orient='index')


In [None]:
print "duration statistics, by label"
get_variable_stats_df(stats_by_label,0)

In [None]:
print "src_bytes statistics, by label"
get_variable_stats_df(stats_by_label,1)

In [None]:
from pyspark.mllib.stat import Statistics 
correlation_matrix = Statistics.corr(vector_data, method="spearman")


In [None]:
import pandas as pd
pd.set_option('display.max_columns', 50)

col_names = ["duration","src_bytes","dst_bytes",
             "land","wrong_fragment",
             "urgent","hot","num_failed_logins",
             "logged_in","num_compromised",
             "root_shell","su_attempted",
             "num_root","num_file_creations",
             "num_shells","num_access_files",
             "num_outbound_cmds",
             "is_hot_login","is_guest_login","count",
             "srv_count","serror_rate",
             "srv_serror_rate","rerror_rate",
             "srv_rerror_rate","same_srv_rate",
             "diff_srv_rate","srv_diff_host_rate",
             "dst_host_count","dst_host_srv_count",
             "dst_host_same_srv_rate","dst_host_diff_srv_rate",
             "dst_host_same_src_port_rate",
             "dst_host_srv_diff_host_rate","dst_host_serror_rate",
             "dst_host_srv_serror_rate",
             "dst_host_rerror_rate","dst_host_srv_rerror_rate"]

corr_df = pd.DataFrame(
                    correlation_matrix, 
                    index=col_names, 
                    columns=col_names)

corr_df

In [None]:
# get a boolean dataframe where true means that 
# a pair of variables is highly correlated
highly_correlated_df = (abs(corr_df) > .8) & (corr_df < 1.0)

# get the names of the variables so we can use 
# them to slice the dataframe
correlated_vars_index = (highly_correlated_df==True).any()
correlated_var_names = correlated_vars_index[correlated_vars_index==True].index

# slice it
highly_correlated_df.loc[correlated_var_names,correlated_var_names]
