# Decision Tree Model with Discrete Values (Classifier)
In this example, we use the Spark MLlib library to train a decision tree classifier.


In [None]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark import SparkConf, SparkContext
from numpy import array

In [None]:
# converting Y/N into a binary 0 or 1.
def binary(YN):
    """Encode a yes/no flag as an integer.

    Returns 1 only when ``YN`` compares equal to the string 'Y'; every
    other value (for example 'N', a lowercase 'y', or an empty string)
    yields 0.
    """
    return 1 if YN == 'Y' else 0

In [None]:
# converting a degree into a number 0-3
def mapEducation(degree):
    """Translate a degree name into an ordinal education level.

    'BS' maps to 1, 'MS' to 2 and 'PhD' to 3. Any other value falls
    through to 0.
    """
    known_degrees = (('BS', 1), ('MS', 2), ('PhD', 3))
    for name, level in known_degrees:
        if degree == name:
            return level
    return 0

In [None]:
# MLlib's LabeledPoint needs numeric values, so map each record's string fields to integers.
def createLabeledPoints(fields):
    """Build a LabeledPoint from one already-split CSV record.

    ``fields`` is read positionally: index 0 is yearsExperience, 1 is the
    employed flag, 2 is previousEmployers, 3 is the degree name, 4 is the
    topTier flag, 5 is the interned flag and 6 is the hired flag.  The
    hired flag becomes the label; the other six values, in that order,
    become the feature vector.
    """
    feature_values = [
        int(fields[0]),            # yearsExperience
        binary(fields[1]),         # employed
        int(fields[2]),            # previousEmployers
        mapEducation(fields[3]),   # educationLevel
        binary(fields[4]),         # topTier
        binary(fields[5]),         # interned
    ]
    label = binary(fields[6])      # hired

    return LabeledPoint(label, array(feature_values))

In [None]:
# Load the training CSV, drop the header row, and split each line into fields.
# Bind the SparkContext explicitly: getOrCreate() hands back the context that
# is already running (e.g. one supplied by a PySpark notebook kernel) or
# creates a new one, so this cell does not rely on an `sc` defined elsewhere.
sc = SparkContext.getOrCreate()
input_file = "/user/student/PastHires.csv"
raw_data = sc.textFile(input_file)
# Treat the first line as the header and remove every line equal to it.
header = raw_data.first()
raw_data = raw_data.filter(lambda line: line != header)
csv_data = raw_data.map(lambda line: line.split(','))
# Notebook preview of the remaining raw lines; collect() brings the whole
# RDD back to the driver.
raw_data.collect()

In [None]:
# Making training data: run createLabeledPoints over every split CSV record,
# producing an RDD of LabeledPoint objects (label = hired flag, features = the
# six encoded columns).
training_data = csv_data.map(createLabeledPoints)

In [None]:
# A single candidate to score. The values follow the feature order built in
# createLabeledPoints: [yearsExperience, employed, previousEmployers,
# educationLevel, topTier, interned].
test_candidates = [ array([10, 1, 3, 1, 0, 0])]
# Alternative candidate kept for experimentation:
# test_candidates = [ array([20, 0, 2, 3, 1, 0])]
# Distribute the candidate list as an RDD so the model can predict on it.
test_data = sc.parallelize(test_candidates)

In [None]:
# Train a two-class decision tree classifier, using the Gini impurity
# (Gini diversity index) as the split criterion.
# categoricalFeaturesInfo maps feature index -> number of categories:
# employed (1), topTier (4) and interned (5) are binary, educationLevel (3)
# takes the four values 0-3.
model = DecisionTree.trainClassifier(
    training_data,
    numClasses=2,
    categoricalFeaturesInfo={1: 2, 3: 4, 4: 2, 5: 2},
    impurity='gini',
    maxDepth=5,
    maxBins=32,
)

In [None]:
# Score the test candidates with the trained model and print one predicted
# label per candidate.
predictions = model.predict(test_data)
print('Hire prediction:')
# collect() brings the predictions back to the driver as a local list.
results = predictions.collect()
for result in results:
    print(result)

In [None]:
# Print the model's debug string, a text rendering of the learned tree.
print('Learned classification tree model:')
print(model.toDebugString())