Chapter 10. Faster decision making with machine learning and PySpark
====
### Mastering Large Datasets with Python by JT Wolohan 



### Decision Trees

In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from functools import reduce
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [22]:
def string_to_index(df, label):
    """Add an indexed version of one column to a DataFrame.

    A ``StringIndexer`` is fitted on ``df`` for the column named ``label``
    and then applied to the same ``df``; the indexer writes its output to a
    column named ``"i-" + label``.

    The ``(df, label)`` argument order matches what ``functools.reduce``
    expects from a folding function, so a list of column names can be
    folded over a starting DataFrame.

    Args:
        df: DataFrame used both to fit the indexer and as transform input.
        label: Name of the column to index.

    Returns:
        The DataFrame returned by the fitted indexer's ``transform(df)``.
    """
    indexer = StringIndexer(inputCol=label, outputCol="i-" + label)
    fitted = indexer.fit(df)
    return fitted.transform(df)

In [23]:
# Get (or create) a Spark session running against a local master for the
# decision tree section of the notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Decision Trees")
    .getOrCreate()
)

In [24]:
# Load the mushroom data from a path relative to the notebook, reading the
# CSV with header=True and letting Spark infer the column types.
df = spark.read.csv("../Ch10/mushrooms.data", header=True, inferSchema=True)

In [25]:
# Columns used as features for the decision tree.
categories = ['cap-shape', 'cap-surface', 'cap-color']
# Fold string_to_index over the column names: each step takes the DataFrame
# produced so far and adds an "i-<column>" index column for the next name.
df = reduce(string_to_index, categories, df)

In [26]:
# Combine the indexed category columns into a single "features" vector
# column. The input names are derived from `categories` (string_to_index
# writes each index to "i-" + <column>) rather than being typed out a second
# time, so the feature list cannot drift out of sync with the columns that
# were actually indexed.
df = VectorAssembler(inputCols=["i-" + c for c in categories],
                     outputCol="features").transform(df)

# Index the target column 'edible?' into a numeric 'label' column.
df = StringIndexer(inputCol='edible?', outputCol='label').fit(df).transform(df)

In [27]:
# Decision tree classifier with all parameters left at their defaults.
tree = DecisionTreeClassifier()
# Fit on the DataFrame prepared above (it carries the "features" and
# "label" columns added in the previous cell).
model = tree.fit(df)

In [28]:
# Print the fitted tree as nested If/Else rules over feature indices.
print(model.toDebugString)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_06513451d79b) of depth 5 with 29 nodes
  If (feature 1 in {2.0,3.0})
   If (feature 2 in {0.0,2.0,4.0,6.0,7.0})
    If (feature 2 in {0.0,2.0,7.0})
     If (feature 0 in {0.0,1.0,2.0,4.0})
      Predict: 0.0
     Else (feature 0 not in {0.0,1.0,2.0,4.0})
      Predict: 1.0
    Else (feature 2 not in {0.0,2.0,7.0})
     If (feature 2 in {6.0})
      Predict: 1.0
     Else (feature 2 not in {6.0})
      Predict: 0.0
   Else (feature 2 not in {0.0,2.0,4.0,6.0,7.0})
    If (feature 2 in {3.0})
     Predict: 1.0
    Else (feature 2 not in {3.0})
     Predict: 0.0
  Else (feature 1 not in {2.0,3.0})
   If (feature 0 in {3.0,5.0})
    If (feature 2 in {0.0,1.0,3.0})
     If (feature 0 in {5.0})
      Predict: 1.0
     Else (feature 0 not in {5.0})
      Predict: 0.0
    Else (feature 2 not in {0.0,1.0,3.0})
     If (feature 2 in {5.0,6.0})
      Predict: 1.0
     Else (feature 2 not in {5.0,6.0})
      If (feature 0 in {5.0})
       

In [29]:
# Evaluator with default settings; presumably this reports area under the
# ROC curve (the message below labels it "AUC") — confirm the default metric.
bce = BinaryClassificationEvaluator()

# NOTE(review): `model` was fitted on this same `df`, so this score is
# measured on the training data, not on a held-out set.
auc = bce.evaluate(model.transform(df))
print("Decision Tree AUC: {:0.4f}".format(auc))

Decision Tree AUC: 0.6333


### Random Forests

In [30]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from functools import reduce
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [31]:
# Get (or create) a Spark session running against a local master for the
# random forest section of the notebook.
# NOTE(review): if the session from the Decision Trees section is still
# alive, getOrCreate() presumably returns that one — confirm whether the
# new app name actually takes effect in that case.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Random Forests")
    .getOrCreate()
)

In [32]:
# Evaluator used both inside cross-validation and for the final score.
bce = BinaryClassificationEvaluator()
# Random forests are stochastic, so pin the seed explicitly. Without one,
# the fitted trees (and the scores derived from them) may differ from one
# kernel session to the next.
forest = RandomForestClassifier(seed=42)
# Reload the raw data so this section starts from the un-indexed columns.
df = spark.read.csv("../Ch10/mushrooms.data", header=True, inferSchema=True)

In [33]:
# Every column except the target 'edible?' is used as a feature.
categories = list(df.columns)
categories.remove('edible?')
# Names of the index columns that string_to_index produces for the features.
indexes = ["i-" + name for name in categories]
# Add one "i-<column>" index column per feature column.
df = reduce(string_to_index, categories, df)

In [34]:
# Combine all indexed feature columns into a single "features" vector column.
df = VectorAssembler(inputCols=indexes,
                     outputCol="features").transform(df)
# Index the target column 'edible?' into a numeric 'label' column.
df = StringIndexer(inputCol='edible?',
                   outputCol='label').fit(df).transform(df)

In [35]:
# Candidate values for the forest's maximum tree depth. In Spark ML a depth
# of 0 means a tree consisting of a single leaf node, so this grid compares
# a trivial baseline against shallow depth-2 trees.
grid = ParamGridBuilder().addGrid(forest.maxDepth, [0, 2]).build()

# 10-fold cross-validation over the grid, fitting up to 4 models in parallel.
# The seed fixes how rows are assigned to folds, so the selected model and
# its scores are reproducible across runs.
cv = CrossValidator(estimator=forest,
                    estimatorParamMaps=grid,
                    evaluator=bce,
                    numFolds=10,
                    parallelism=4,
                    seed=42)
cv_model = cv.fit(df)

In [36]:
# NOTE(review): `cv_model` was fitted on this same `df`, so this score is
# measured on data the model has already seen, not on a held-out set.
area_under_curve = bce.evaluate(cv_model.transform(df))
print("Random Forest AUC: {:0.4f}".format(area_under_curve))

Random Forest AUC: 0.9950


In [37]:
# Print the trees of the best model found by cross-validation as nested
# If/Else rules over feature indices.
print(cv_model.bestModel.toDebugString)

RandomForestClassificationModel (uid=RandomForestClassifier_3715b1717fde) with 20 trees
  Tree 0 (weight 1.0):
    If (feature 7 in {0.0})
     If (feature 11 in {0.0,2.0,3.0})
      Predict: 0.0
     Else (feature 11 not in {0.0,2.0,3.0})
      Predict: 1.0
    Else (feature 7 not in {0.0})
     If (feature 1 in {2.0,3.0})
      Predict: 0.0
     Else (feature 1 not in {2.0,3.0})
      Predict: 1.0
  Tree 1 (weight 1.0):
    If (feature 19 in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})
     If (feature 4 in {0.0,4.0,5.0})
      Predict: 0.0
     Else (feature 4 not in {0.0,4.0,5.0})
      Predict: 1.0
    Else (feature 19 not in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})
     If (feature 21 in {6.0})
      Predict: 0.0
     Else (feature 21 not in {6.0})
      Predict: 1.0
  Tree 2 (weight 1.0):
    If (feature 11 in {0.0,2.0,3.0})
     Predict: 0.0
    Else (feature 11 not in {0.0,2.0,3.0})
     If (feature 20 in {2.0,3.0})
      Predict: 0.0
     Else (feature 20 not in {2.0,3.0})
      Predict: 1.0
  Tree 

[Ready for more? Go to chapter 11!](./Ch11_notebook.ipynb)