# Decision Tree Model with Discrete Values (Classifier)
This example uses the Spark MLlib library (the RDD-based `pyspark.mllib` API) to train a decision tree classifier.


In [1]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession, Row
from numpy import array

In [2]:
# Create (or reuse) the SparkSession for this notebook.
spark = (
    SparkSession.builder
    .appName("decision-app")
    .config("spark.config.option", "value")
    .getOrCreate()
)
# NOTE(review): this SparkConf is built but never handed to the builder above,
# and nothing else in the visible notebook reads it.
scfg = SparkConf().setAppName('decision-app')
# The RDD-based calls below (textFile, parallelize) go through the SparkContext.
sc = spark.sparkContext

In [3]:
# Encode a Y/N flag as a binary 0 or 1.
def binary(YN):
    """Return 1 when ``YN`` equals the string 'Y', otherwise 0.

    The comparison is exact: any other value (including 'N', lowercase 'y'
    or an empty string) maps to 0.
    """
    return 1 if YN == 'Y' else 0

In [4]:
# Encode a degree name as an ordinal number 0-3.
def mapEducation(degree):
    """Map a degree label to an integer code.

    'BS' -> 1, 'MS' -> 2, 'PhD' -> 3; every other value maps to 0.
    Matching is exact and case-sensitive.
    """
    # Position in this tuple (counting from 1) is the code for that degree.
    for code, label in enumerate(('BS', 'MS', 'PhD'), start=1):
        if degree == label:
            return code
    return 0

In [5]:
# MLlib needs numeric inputs, so every CSV field is converted to an integer.
def createLabeledPoints(fields):
    """Convert one split CSV record into a ``LabeledPoint``.

    ``fields`` is indexed positionally:
        0 years of experience (int), 1 employed (Y/N), 2 previous employers (int),
        3 education level (degree name), 4 top-tier school (Y/N),
        5 interned (Y/N), 6 hired (Y/N).

    Field 6 becomes the label; fields 0-5, in that order, become the features.
    """
    features = array([
        int(fields[0]),           # yearsExperience
        binary(fields[1]),        # employed
        int(fields[2]),           # previousEmployers
        mapEducation(fields[3]),  # educationLevel
        binary(fields[4]),        # topTier
        binary(fields[5]),        # interned
    ])
    label = binary(fields[6])     # hired

    return LabeledPoint(label, features)

In [6]:
# Load the training file, drop its header row and split each record into fields.
input_file = "/user/student/PastHires.csv"
lines_with_header = sc.textFile(input_file)
# The first line is the column header; keep it so matching lines can be filtered out.
header = lines_with_header.first()
raw_data = lines_with_header.filter(lambda line: line != header)
csv_data = raw_data.map(lambda line: line.split(','))
# Show the header-less records as the cell output.
raw_data.collect()

['10,Y,4,BS,N,N,Y',
 '0,N,0,BS,Y,Y,Y',
 '7,N,6,BS,N,N,N',
 '2,Y,1,MS,Y,N,Y',
 '20,N,2,PhD,Y,N,N',
 '0,N,0,PhD,Y,Y,Y',
 '5,Y,2,MS,N,Y,Y',
 '3,N,1,BS,N,Y,Y',
 '15,Y,5,BS,N,N,Y',
 '0,N,0,BS,N,N,N',
 '1,N,1,PhD,Y,N,N',
 '4,Y,1,BS,N,Y,Y',
 '0,N,0,PhD,Y,N,Y']

In [7]:
# Build the training set: pass every split CSV record in csv_data through
# createLabeledPoints, which yields a LabeledPoint (label = hired,
# features = the other six columns).
training_data = csv_data.map(createLabeledPoints)

In [15]:
# Hand-written candidates to score. Columns follow the feature order used by
# createLabeledPoints:
# [yearsExperience, employed, previousEmployers, educationLevel, topTier, interned]
candidate_rows = [
    [10, 1, 3, 1, 0, 0],
    [20, 0, 2, 3, 1, 0],
    [39, 1, 2, 1, 0, 0],
    [5, 1, 3, 1, 0, 1],
    [1, 0, 1, 0, 1, 1],
    [0, 1, 3, 0, 1, 1],
    [29, 1, 3, 0, 1, 0],
    [10, 0, 3, 0, 1, 1],
    [40, 1, 2, 0, 1, 0],
    [0, 1, 3, 1, 1, 1],
]
test_candidates = [array(row) for row in candidate_rows]
# Distribute the candidates as an RDD for model.predict.
test_data = sc.parallelize(test_candidates)

In [16]:
# Train a binary decision tree classifier, using Gini impurity as the split criterion.
# categoricalFeaturesInfo maps a feature index to its number of categories:
#   1 employed (2), 3 educationLevel (4: codes 0-3), 4 topTier (2), 5 interned (2).
# Features 0 (yearsExperience) and 2 (previousEmployers) are left out of the map.
categorical_features = {1: 2, 3: 4, 4: 2, 5: 2}
model = DecisionTree.trainClassifier(
    training_data,
    numClasses=2,
    categoricalFeaturesInfo=categorical_features,
    impurity='gini',
    maxDepth=5,
    maxBins=32,
)

In [17]:
# Score every test candidate and bring the predicted labels back to the driver.
predictions = model.predict(test_data)
results = predictions.collect()

# One predicted label per line, in the same order as test_candidates.
print('Hire prediction:')
for result in results:
    print(result)

Hire prediction:
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


In [18]:
# Print the fitted tree's if/else split rules in text form.
tree_description = model.toDebugString()
print('Learned classification tree model:')
print(tree_description)

Learned classification tree model:
DecisionTreeModel classifier of depth 4 with 9 nodes
  If (feature 1 in {0.0})
   If (feature 5 in {0.0})
    If (feature 0 <= 0.5)
     If (feature 3 in {1.0})
      Predict: 0.0
     Else (feature 3 not in {1.0})
      Predict: 1.0
    Else (feature 0 > 0.5)
     Predict: 0.0
   Else (feature 5 not in {0.0})
    Predict: 1.0
  Else (feature 1 not in {0.0})
   Predict: 1.0

