# Decision Trees

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/14 15:20:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Loading the data

In [2]:
data = spark.read.option("inferSchema", True).option("header", False).csv('covtype.data')
data.printSchema()

[Stage 1:>                                                        (0 + 18) / 18]

root
 |-- _c0: integer (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: integer (nullable = true)
 |-- _c4: integer (nullable = true)
 |-- _c5: integer (nullable = true)
 |-- _c6: integer (nullable = true)
 |-- _c7: integer (nullable = true)
 |-- _c8: integer (nullable = true)
 |-- _c9: integer (nullable = true)
 |-- _c10: integer (nullable = true)
 |-- _c11: integer (nullable = true)
 |-- _c12: integer (nullable = true)
 |-- _c13: integer (nullable = true)
 |-- _c14: integer (nullable = true)
 |-- _c15: integer (nullable = true)
 |-- _c16: integer (nullable = true)
 |-- _c17: integer (nullable = true)
 |-- _c18: integer (nullable = true)
 |-- _c19: integer (nullable = true)
 |-- _c20: integer (nullable = true)
 |-- _c21: integer (nullable = true)
 |-- _c22: integer (nullable = true)
 |-- _c23: integer (nullable = true)
 |-- _c24: integer (nullable = true)
 |-- _c25: integer (nullable = true)
 |-- _c26: integer (nullable = true)
 |-- _

                                                                                

In [3]:
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType, IntegerType

colnames = ["Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology",
            "Horizontal_Distance_To_Roadways", "Hillshade_9am", "Hillshade_noon", "Hillshade_3pm",
            "Horizontal_Distance_To_Fire_Points"] + \
           [f"Wilderness_Area_{i}" for i in range(4)] + [f"Soil_Type_{i}" for i in range(40)] + ["Cover_Type"]

data = data.toDF(*colnames)
data = data.withColumn("Cover_Type", col("Cover_Type").cast(DoubleType()))
for name in colnames[:-1]:
    data = data.withColumn(name, col(name).cast(IntegerType()))
data = data.na.drop()
data.printSchema()

root
 |-- Elevation: integer (nullable = true)
 |-- Aspect: integer (nullable = true)
 |-- Slope: integer (nullable = true)
 |-- Horizontal_Distance_To_Hydrology: integer (nullable = true)
 |-- Vertical_Distance_To_Hydrology: integer (nullable = true)
 |-- Horizontal_Distance_To_Roadways: integer (nullable = true)
 |-- Hillshade_9am: integer (nullable = true)
 |-- Hillshade_noon: integer (nullable = true)
 |-- Hillshade_3pm: integer (nullable = true)
 |-- Horizontal_Distance_To_Fire_Points: integer (nullable = true)
 |-- Wilderness_Area_0: integer (nullable = true)
 |-- Wilderness_Area_1: integer (nullable = true)
 |-- Wilderness_Area_2: integer (nullable = true)
 |-- Wilderness_Area_3: integer (nullable = true)
 |-- Soil_Type_0: integer (nullable = true)
 |-- Soil_Type_1: integer (nullable = true)
 |-- Soil_Type_2: integer (nullable = true)
 |-- Soil_Type_3: integer (nullable = true)
 |-- Soil_Type_4: integer (nullable = true)
 |-- Soil_Type_5: integer (nullable = true)
 |-- Soil_Type

In [4]:
train_data, test_data = data.randomSplit([0.9, 0.1])
train_data.cache()
test_data.cache()

24/09/14 15:20:59 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


DataFrame[Elevation: int, Aspect: int, Slope: int, Horizontal_Distance_To_Hydrology: int, Vertical_Distance_To_Hydrology: int, Horizontal_Distance_To_Roadways: int, Hillshade_9am: int, Hillshade_noon: int, Hillshade_3pm: int, Horizontal_Distance_To_Fire_Points: int, Wilderness_Area_0: int, Wilderness_Area_1: int, Wilderness_Area_2: int, Wilderness_Area_3: int, Soil_Type_0: int, Soil_Type_1: int, Soil_Type_2: int, Soil_Type_3: int, Soil_Type_4: int, Soil_Type_5: int, Soil_Type_6: int, Soil_Type_7: int, Soil_Type_8: int, Soil_Type_9: int, Soil_Type_10: int, Soil_Type_11: int, Soil_Type_12: int, Soil_Type_13: int, Soil_Type_14: int, Soil_Type_15: int, Soil_Type_16: int, Soil_Type_17: int, Soil_Type_18: int, Soil_Type_19: int, Soil_Type_20: int, Soil_Type_21: int, Soil_Type_22: int, Soil_Type_23: int, Soil_Type_24: int, Soil_Type_25: int, Soil_Type_26: int, Soil_Type_27: int, Soil_Type_28: int, Soil_Type_29: int, Soil_Type_30: int, Soil_Type_31: int, Soil_Type_32: int, Soil_Type_33: int, S

In [5]:
from pyspark.ml.feature import VectorAssembler
input_cols = colnames[:-1]
vector_assembler = VectorAssembler(inputCols=input_cols, outputCol="featureVector")
assembled_train_data = vector_assembler.transform(train_data)
assembled_train_data.select("featureVector").show(truncate = False)



+-----------------------------------------------------------------------------------------------------+
|featureVector                                                                                        |
+-----------------------------------------------------------------------------------------------------+
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1863.0,37.0,17.0,120.0,18.0,90.0,217.0,202.0,115.0,769.0,1.0,1.0])  |
|(54,[0,1,2,5,6,7,8,9,13,18],[1874.0,18.0,14.0,90.0,208.0,209.0,135.0,793.0,1.0,1.0])                 |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1879.0,28.0,19.0,30.0,12.0,95.0,209.0,196.0,117.0,778.0,1.0,1.0])   |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1888.0,33.0,22.0,150.0,46.0,108.0,209.0,185.0,103.0,735.0,1.0,1.0]) |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1896.0,337.0,12.0,30.0,6.0,175.0,195.0,224.0,168.0,732.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1898.0,34.0,23.0,175.0,56.0,134.0,210.0,184.0,99.0,765.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1899.0,355.0,22.0,153.0,43.0,1

                                                                                

Fiiting the model and displaying the rules learnt by the model to make classifications based on different features

In [6]:
from pyspark.ml.classification import DecisionTreeClassifier
classifier = DecisionTreeClassifier(seed=1234, labelCol="Cover_Type", featuresCol="featureVector",
                                    predictionCol="prediction")
model = classifier.fit(assembled_train_data)
print(model.toDebugString)



DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0f5b535f83ce, depth=5, numNodes=39, numClasses=8, numFeatures=54
  If (feature 0 <= 3047.5)
   If (feature 0 <= 2499.5)
    If (feature 3 <= 15.0)
     If (feature 12 <= 0.5)
      If (feature 23 <= 0.5)
       Predict: 4.0
      Else (feature 23 > 0.5)
       Predict: 3.0
     Else (feature 12 > 0.5)
      Predict: 6.0
    Else (feature 3 > 15.0)
     If (feature 16 <= 0.5)
      Predict: 3.0
     Else (feature 16 > 0.5)
      If (feature 9 <= 1318.5)
       Predict: 3.0
      Else (feature 9 > 1318.5)
       Predict: 4.0
   Else (feature 0 > 2499.5)
    If (feature 17 <= 0.5)
     If (feature 15 <= 0.5)
      Predict: 2.0
     Else (feature 15 > 0.5)
      Predict: 3.0
    Else (feature 17 > 0.5)
     If (feature 0 <= 2711.5)
      Predict: 3.0
     Else (feature 0 > 2711.5)
      If (feature 5 <= 1228.0)
       Predict: 5.0
      Else (feature 5 > 1228.0)
       Predict: 2.0
  Else (feature 0 > 3047.5)
   If (feature 0 <= 3

Displaying the relative importance of each feature in the model

In [7]:
import pandas as pd
pd.DataFrame(model.featureImportances.toArray(),index=input_cols, columns=['importance']).\
    sort_values(by="importance", ascending=False)

  from pandas.core import (


Unnamed: 0,importance
Elevation,0.830005
Soil_Type_3,0.037493
Soil_Type_1,0.032358
Hillshade_noon,0.027133
Horizontal_Distance_To_Hydrology,0.023537
Soil_Type_31,0.018429
Wilderness_Area_2,0.01643
Horizontal_Distance_To_Roadways,0.004736
Soil_Type_2,0.003698
Hillshade_9am,0.002741


Displaying performance metrics

In [8]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = model.transform(assembled_train_data)
predictions.select("Cover_Type", "prediction", "probability").show(10, truncate=False)

evaluator = MulticlassClassificationEvaluator(labelCol="Cover_Type", predictionCol="prediction")

accuracy = evaluator.setMetricName("accuracy").evaluate(predictions)
print("Accuracy:", accuracy)

precision = evaluator.setMetricName("weightedPrecision").evaluate(predictions)
print("Precision:", precision)

recall = evaluator.setMetricName("weightedRecall").evaluate(predictions)
print("Recall:", recall)

f1 = evaluator.setMetricName("f1").evaluate(predictions)
print("F1 Score: ",f1)

+----------+----------+-------------------------------------------------------------------------------------------------------------------------------------+
|Cover_Type|prediction|probability                                                                                                                          |
+----------+----------+-------------------------------------------------------------------------------------------------------------------------------------+
|6.0       |3.0       |[0.0,3.1514922315716493E-5,0.06759950836721187,0.6085216349941698,0.020547729349847152,0.0014496864265229586,0.30184992593993254,0.0]|
|6.0       |4.0       |[0.0,0.0,0.03747397640527411,0.238029146426093,0.6252602359472589,0.006939625260235947,0.0922970159611381,0.0]                       |
|6.0       |3.0       |[0.0,3.1514922315716493E-5,0.06759950836721187,0.6085216349941698,0.020547729349847152,0.0014496864265229586,0.30184992593993254,0.0]|
|6.0       |3.0       |[0.0,3.1514922315716493E-5,0.

                                                                                

Accuracy: 0.7022789084464002




Precision: 0.7005754082541524
Recall: 0.7022789084464002
F1 Score:  0.686712589094265


Displaying the confusion matrix

In [9]:
confusion_matrix = predictions.groupBy("Cover_Type").\
pivot("prediction", range(1,8)).count().\
na.fill(0.0).\
orderBy("Cover_Type")
confusion_matrix.show()

                                                                                

+----------+------+------+-----+----+---+---+-----+
|Cover_Type|     1|     2|    3|   4|  5|  6|    7|
+----------+------+------+-----+----+---+---+-----+
|       1.0|126362| 58949|  104|   0| 22|  6| 5392|
|       2.0| 49366|200954| 3685|  54|344| 55|  797|
|       3.0|     0|  3902|27877| 353| 20|119|    0|
|       4.0|     0|     3| 1327|1165|  0|  0|    0|
|       5.0|     0|  7822|  315|  10|431|  0|    0|
|       6.0|     0|  4423|10547| 133|  9|506|    0|
|       7.0|  7884|   241|    0|   0|  0|  0|10407|
+----------+------+------+-----+----+---+---+-----+



# Tuning HyperParameters

In [10]:
from pyspark.ml.tuning import ParamGridBuilder
paramGrid = ParamGridBuilder(). \
addGrid(classifier.impurity, ["gini", "entropy"]). \
addGrid(classifier.maxDepth, [1, 20]). \
addGrid(classifier.maxBins, [40, 300]). \
addGrid(classifier.minInfoGain, [0.0, 0.05]). \
build()
multiclassEval = MulticlassClassificationEvaluator(). \
setLabelCol("Cover_Type"). \
setPredictionCol("prediction"). \
setMetricName("accuracy")

In [16]:
param_grid_list = []
for param_map in paramGrid:
    param_grid_list.append(dict(param_map))

param_grid_df = pd.DataFrame(param_grid_list)

print("Parameter Grid:")
print(param_grid_df.to_string(index=False))


Parameter Grid:
DecisionTreeClassifier_0f5b535f83ce__impurity  DecisionTreeClassifier_0f5b535f83ce__maxDepth  DecisionTreeClassifier_0f5b535f83ce__maxBins  DecisionTreeClassifier_0f5b535f83ce__minInfoGain
                                         gini                                              1                                            40                                              0.00
                                         gini                                              1                                            40                                              0.05
                                         gini                                              1                                           300                                              0.00
                                         gini                                              1                                           300                                              0.05
                                       

In [13]:
# Display the settings of the evaluator
print(f"Label Column: {multiclassEval.getLabelCol()}")
print(f"Prediction Column: {multiclassEval.getPredictionCol()}")
print(f"Metric Name: {multiclassEval.getMetricName()}")


Label Column: Cover_Type
Prediction Column: prediction
Metric Name: accuracy
