<a href="https://colab.research.google.com/github/venu-analytics/Analytics-Projects/blob/main/Session10_Decision_Trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Initialisation and Loading the data

In [None]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://apache.osuosl.org/spark/spark-2.2.1/spark-2.2.1-bin-hadoop2.7.tgz
!tar xf spark-2.2.1-bin-hadoop2.7.tgz
!pip install -q findspark

Get:1 http://security.ubuntu.com/ubuntu artful-security InRelease [83.2 kB]
Hit:2 http://archive.ubuntu.com/ubuntu artful InRelease
Get:3 http://archive.ubuntu.com/ubuntu artful-updates InRelease [88.7 kB]
Get:4 http://archive.ubuntu.com/ubuntu artful-backports InRelease [74.6 kB]
Get:5 http://security.ubuntu.com/ubuntu artful-security/universe Sources [21.5 kB]
Get:6 http://security.ubuntu.com/ubuntu artful-security/universe amd64 Packages [84.3 kB]
Get:7 http://security.ubuntu.com/ubuntu artful-security/main amd64 Packages [237 kB]
Get:8 http://archive.ubuntu.com/ubuntu artful-updates/universe Sources [46.2 kB]
Get:9 http://archive.ubuntu.com/ubuntu artful-updates/universe amd64 Packages [150 kB]
Get:10 http://archive.ubuntu.com/ubuntu artful-updates/main amd64 Packages [365 kB]
Fetched 1,150 kB in 1s (819 kB/s)
Reading package lists... Done


In [None]:
import os
# Point the process environment at the JDK and at the unpacked Spark
# distribution; both entries are written in a single call.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.2.1-bin-hadoop2.7",
})

In [None]:
import findspark
# Called before the pyspark import below; presumably this is what makes the
# Spark install referenced by SPARK_HOME importable — confirm against the
# findspark documentation.
findspark.init()
from pyspark.sql import SparkSession
# Get the existing session or create a new one, using the "local[*]" master.
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [None]:
# Keep a handle on the session's SparkContext; later cells call sc.textFile.
sc = spark.sparkContext
# Bare expression so the notebook displays the context as the cell output.
sc

In [None]:
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` has been loaded, so the call below could fail with
# AttributeError on a fresh interpreter. This still binds the name `urllib`.
import urllib.request

# Download the 10-percent training archive into the working directory.
# urlretrieve returns a (local_filename, headers) tuple, kept in `f`.
f = urllib.request.urlretrieve("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz", "kddcup.data_10_percent.gz")

In [None]:
# Local path of the training archive written by the download cell.
data_file = "./kddcup.data_10_percent.gz"
# Read the file as an RDD of text lines.
raw_data = sc.textFile(data_file)

# Report how many lines the training RDD contains.
print ("Train data size is {}".format(raw_data.count()))

Train data size is 494021


In [None]:
# Download the test archive into the working directory. This cell relies on
# `urllib.request` already being available from the earlier download cell.
ft = urllib.request.urlretrieve("http://kdd.ics.uci.edu/databases/kddcup99/corrected.gz", "corrected.gz")

In [None]:
# Local path of the test archive written by the download cell.
test_data_file = "./corrected.gz"
# Read the file as an RDD of text lines.
test_raw_data = sc.textFile(test_data_file)

# Report how many lines the test RDD contains.
print ("Test data size is {}".format(test_raw_data.count()))

Test data size is 311029


## Preparing the data

In [None]:
from pyspark.mllib.regression import LabeledPoint
from numpy import array

# Split every record into its comma-separated fields.
csv_data = raw_data.map(lambda x: x.split(","))
test_csv_data = test_raw_data.map(lambda x: x.split(","))

# Distinct values of fields 1, 2 and 3, taken from the training data only.
# A value's position in these lists is the numeric code that
# create_labeled_point assigns to it.
# NOTE(review): the order returned by distinct().collect() is presumably not
# guaranteed to be stable across runs — confirm before relying on hard-coded
# indices into these lists.
protocols = csv_data.map(lambda x: x[1]).distinct().collect()
services = csv_data.map(lambda x: x[2]).distinct().collect()
flags = csv_data.map(lambda x: x[3]).distinct().collect()

In [None]:
def create_labeled_point(line_split):
    """Convert one split CSV record into a binary-labelled ``LabeledPoint``.

    Parameters
    ----------
    line_split : list of str
        Fields of one record. Fields 0-40 become the features and field 41
        is read as the label.

    Returns
    -------
    LabeledPoint
        Label is 0.0 when field 41 equals ``'normal.'`` and 1.0 otherwise.
        Features are the first 41 fields converted to float, with fields 1,
        2 and 3 replaced by their position in the module-level ``protocols``,
        ``services`` and ``flags`` lists respectively.

    Raises
    ------
    IndexError
        If the record has too few fields.
    ValueError
        If a non-categorical field cannot be converted to float.
    """
    # Everything except the label column (index 41).
    clean_line_split = line_split[0:41]

    # Replace the protocol, service and flag strings by their numeric codes.
    for position, vocabulary in ((1, protocols), (2, services), (3, flags)):
        try:
            clean_line_split[position] = vocabulary.index(clean_line_split[position])
        except ValueError:
            # list.index raises ValueError for a value that is not in the
            # vocabulary; such values all share the code len(vocabulary).
            # Only this error is handled so that unrelated failures (or an
            # interrupt) are no longer swallowed by a bare except.
            clean_line_split[position] = len(vocabulary)

    # Binary label: anything that is not 'normal.' counts as an attack.
    attack = 0.0 if line_split[41] == 'normal.' else 1.0

    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))

In [None]:
# Encode both the training and the test records with the same function, so
# both use the category codes derived from the training data.
training_data = csv_data.map(create_labeled_point)
test_data = test_csv_data.map(create_labeled_point)

## Training a classifier

In [None]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from time import time

# Build the model
# t0/tt measure the wall-clock time of the trainClassifier call only.
t0 = time()
# categoricalFeaturesInfo maps a feature index to its number of categories:
# features 1, 2 and 3 are the encoded protocol, service and flag fields.
# NOTE(review): maxBins=100 is presumably sized to be at least the largest
# category count (len(services)) — confirm.
tree_model = DecisionTree.trainClassifier(training_data, numClasses=2, 
                                          categoricalFeaturesInfo={1: len(protocols), 2: len(services), 3: len(flags)},
                                          impurity='gini', maxDepth=4, maxBins=100)
tt = time() - t0

print ("Classifier trained in ",round(tt,3)," seconds")

Classifier trained in  17.969  seconds


## Evaluating the model

In [None]:
# Score the feature vectors of the test set with the trained tree.
predictions = tree_model.predict(test_data.map(lambda p: p.features))
# Pair every true label with its prediction: (label, prediction).
labels_and_preds = test_data.map(lambda p: p.label).zip(predictions)

In [None]:
t0 = time()
# Accuracy = number of pairs where label equals prediction, divided by the
# number of test points. Both count() calls fall inside the timed region.
test_accuracy = labels_and_preds.filter(lambda x: x[0] == x[1]).count() / float(test_data.count())
tt = time() - t0

print ("Prediction made in ",round(tt,3)," seconds. Test accuracy is",round(test_accuracy,4))

Prediction made in  17.571  seconds. Test accuracy is 0.9186


## Interpreting the model

In [None]:
# Print the learned tree in its textual debug form (nested If/Else rules).
print ("Learned classification tree model:")
print (tree_model.toDebugString())

Learned classification tree model:
DecisionTreeModel classifier of depth 4 with 27 nodes
  If (feature 22 <= 68.0)
   If (feature 25 <= 0.67)
    If (feature 9 <= 0.0)
     If (feature 36 <= 0.43)
      Predict: 0.0
     Else (feature 36 > 0.43)
      Predict: 1.0
    Else (feature 9 > 0.0)
     If (feature 4 <= 1091.0)
      Predict: 0.0
     Else (feature 4 > 1091.0)
      Predict: 1.0
   Else (feature 25 > 0.67)
    If (feature 3 in {0.0,5.0,1.0,2.0,3.0})
     If (feature 2 in {0.0,5.0,10.0,1.0,6.0,2.0,30.0,4.0,15.0})
      Predict: 0.0
     Else (feature 2 not in {0.0,5.0,10.0,1.0,6.0,2.0,30.0,4.0,15.0})
      Predict: 1.0
    Else (feature 3 not in {0.0,5.0,1.0,2.0,3.0})
     If (feature 38 <= 0.06)
      Predict: 0.0
     Else (feature 38 > 0.06)
      Predict: 1.0
  Else (feature 22 > 68.0)
   If (feature 5 <= 0.0)
    If (feature 11 <= 0.0)
     Predict: 1.0
    Else (feature 11 > 0.0)
     Predict: 0.0
   Else (feature 5 > 0.0)
    If (feature 2 in {0.0,10.0,1.0,3.0,23.0})
   

In [None]:
# Translate service codes back to names: a code is an index into `services`.
# The indices 0 and 52 are hard-coded, so they depend on the order in which
# `services` was collected.
print ("Service 0 is {}".format(services[0]))
print ("Service 52 is {}".format(services[52]))

Service 0 is http
Service 52 is netbios_dgm


## Building a minimal model using the three main splits

In [None]:
def create_labeled_point_minimal(line_split):
    """Convert one split CSV record into a three-feature ``LabeledPoint``.

    Parameters
    ----------
    line_split : list of str
        Fields of one record. Fields 3, 5 and 22 become the features and
        field 41 is read as the label.

    Returns
    -------
    LabeledPoint
        Label is 0.0 when field 41 equals ``'normal.'`` and 1.0 otherwise.
        Features are, in order: field 3 (the flag) replaced by its position
        in the module-level ``flags`` list, then fields 5 and 22 converted
        to float.

    Raises
    ------
    IndexError
        If the record has too few fields.
    ValueError
        If field 5 or 22 cannot be converted to float.
    """
    # Keep only fields 3, 5 and 22; one-element slices keep this a list.
    clean_line_split = line_split[3:4] + line_split[5:6] + line_split[22:23]

    # Replace the flag string by its numeric code.
    try:
        clean_line_split[0] = flags.index(clean_line_split[0])
    except ValueError:
        # list.index raises ValueError for a flag that is not in the
        # vocabulary; such flags share the code len(flags). Only this error
        # is handled so that unrelated failures are no longer swallowed.
        clean_line_split[0] = len(flags)

    # Binary label: anything that is not 'normal.' counts as an attack.
    attack = 0.0 if line_split[41] == 'normal.' else 1.0

    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))

In [None]:
# Encode the training and test records with the three-feature encoder.
training_data_minimal = csv_data.map(create_labeled_point_minimal)
test_data_minimal = test_csv_data.map(create_labeled_point_minimal)

In [None]:
# Build the model
# t0/tt measure the wall-clock time of the trainClassifier call only.
t0 = time()
# Only feature 0 is categorical here: it is the encoded flag, the first
# element produced by create_labeled_point_minimal.
tree_model_minimal = DecisionTree.trainClassifier(training_data_minimal, numClasses=2, 
                                          categoricalFeaturesInfo={0: len(flags)},
                                          impurity='gini', maxDepth=3, maxBins=32)
tt = time() - t0

print ("Classifier trained in ",round(tt,3)," seconds")

Classifier trained in  8.042  seconds


In [None]:
# Score the three-feature test vectors with the minimal tree.
predictions_minimal = tree_model_minimal.predict(test_data_minimal.map(lambda p: p.features))
# Pair every true label with its prediction: (label, prediction).
labels_and_preds_minimal = test_data_minimal.map(lambda p: p.label).zip(predictions_minimal)

In [None]:
t0 = time()
# Accuracy = number of pairs where label equals prediction, divided by the
# number of test points. Both count() calls fall inside the timed region.
# This assigns to `test_accuracy` again, replacing any value it already held.
test_accuracy = labels_and_preds_minimal.filter(lambda x: x[0] == x[1]).count() / float(test_data_minimal.count())
tt = time() - t0

print ("Prediction made in ",round(tt,3)," seconds. Test accuracy is ",round(test_accuracy,4))

Prediction made in  8.889  seconds. Test accuracy is  0.9143
