# PastHires

### Import libraries

In [3]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark import SparkConf, SparkContext
from numpy import array
import requests

### Load Data

Load the data from DBFS and filter out the header line.

In [5]:
# Read the CSV from DBFS as an RDD of raw text lines (one string per line).
rawData = sc.textFile("/FileStore/tables/0xwtlnmf1469416481691/PastHires.csv")
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print('Raw data:')
print(rawData.take(5))

# The first line of the file is the column header; drop every line equal to
# it so that only data rows remain.
header = rawData.first()
rawData = rawData.filter(lambda x: x != header)
print('Filtered raw data:')
print(rawData.take(5))

### Data Cleaning

We can see that each line is a string containing values separated by commas.

Split each line into a list.

Use **map()** to apply the split() function to every line in the RDD.

In [7]:
# Break every CSV line into its list of comma-separated string fields.
csvData = rawData.map(lambda line: line.split(","))

Convert the lists into **LabeledPoint** objects,

in order to create the **training data set**, and convert non-numerical features into **numerical features**.

In [9]:
# Encode a yes/no flag as a number: 'Y' -> 1, anything else -> 0
def binary(YN):
    """Return 1 when ``YN`` equals 'Y' (case-sensitive), otherwise 0."""
    return 1 if YN == 'Y' else 0
      
# Encode the degree as a number: 'BS' -> 1, 'MS' -> 2, 'PhD' -> 3, other -> 0
def mapEducation(degree):
    """Return the numeric code for ``degree``; unrecognised values map to 0."""
    # Checked in the same order as the codes; matching is case-sensitive.
    for known, code in (('BS', 1), ('MS', 2), ('PhD', 3)):
        if degree == known:
            return code
    return 0

def createLabeledPoints(fields):
    """Turn one parsed CSV row into a LabeledPoint.

    ``fields`` is indexed by position: columns 0-5 become the feature
    vector and column 6 becomes the label.

    Feature order: years of experience (int), employed (Y/N -> 1/0),
    previous employers (int), education level (BS/MS/PhD -> 1/2/3, else 0),
    top tier (Y/N -> 1/0), interned (Y/N -> 1/0).
    Label: hired (Y/N -> 1/0).
    """
    features = array([
        int(fields[0]),           # years of experience
        binary(fields[1]),        # employed
        int(fields[2]),           # previous employers
        mapEducation(fields[3]),  # education level
        binary(fields[4]),        # top tier
        binary(fields[5]),        # interned
    ])
    label = binary(fields[6])     # hired
    return LabeledPoint(label, features)

In [10]:
# Apply createLabeledPoints to every parsed row, giving the RDD that is passed to the trainer.
trainingData = csvData.map(createLabeledPoints)

Construct a **DecisionTree** model and train it.

In [12]:
# Train a two-class decision-tree classifier on the labeled points.
model = DecisionTree.trainClassifier(
    trainingData,
    numClasses=2,
    # Maps feature index -> number of categories for the categorical features:
    # 1 = employed (0/1), 3 = educationLevel (0..3), 4 = topTier (0/1),
    # 5 = interned (0/1). Indices 0 and 2 are left out of the map.
    categoricalFeaturesInfo={1: 2, 3: 4, 4: 2, 5: 2},
    impurity='gini',
    maxDepth=5,
    maxBins=32,
)

Create a test dataset containing a single hypothetical candidate.

In [14]:
# One candidate, with features in the same order used for the training vectors:
# [yearsExperience, employed, previousEmployers, educationLevel, topTier, interned]
testCandidates = [
    array([10, 1, 3, 1, 0, 0]),
]
# Distribute the list as an RDD.
testData = sc.parallelize(testCandidates)

Predict whether the test candidate would be hired (1) or not (0).

In [16]:
# Run the trained model over the test RDD and collect the predicted labels
# into a local list.
prediction = model.predict(testData).collect()
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print(prediction)