# Tree Methods (Documentation Example)

This is just a quick walkthrough of the Documentation's Example of Random Forest.

Remember, you can use tree methods for both regression and classification problems. 

In [5]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import findspark
findspark.init('/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('tree_methods_doc').getOrCreate()

from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier, DecisionTreeClassifier)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [6]:
# Load and parse the data file, converting it to a DataFrame.
data = spark.read.format("libsvm").load("Datasets/sample_libsvm_data.txt")

22/05/31 00:27:20 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.
                                                                                

In [7]:
# Let's get a better look at the data.
data.show()

data.printSchema()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [8]:
# Split the data into training and test sets (30% held out for testing).
(trainingData, testData) = data.randomSplit([0.7, 0.3])

In [9]:
# Create all three models. Note the number of trees. 
# The more trees you have, the more computation time. But this could also significantly increase accuracy. So there's a tradeoff. 
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=20)

In [10]:
# Train model. 
model_rf = rf.fit(trainingData)
model_dt = dt.fit(trainingData)

In [11]:
# Now let's do the transformation.
prediction_rf = model_rf.transform(testData)
prediction_dt = model_dt.transform(testData)

In [12]:
# Let's have a look at the first one. 
prediction_rf.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[124,125,126...|   [20.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [20.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [17.0,3.0]|[0.85,0.15]|       0.0|
|  0.0|(692,[125,126,127...|   [14.0,6.0]|  [0.7,0.3]|       0.0|
|  0.0|(692,[126,127,128...|   [18.0,2.0]|  [0.9,0.1]|       0.0|
|  0.0|(692,[126,127,128...|   [20.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [18.0,2.0]|  [0.9,0.1]|       0.0|
|  0.0|(692,[152,153,154...|   [17.0,3.0]|[0.85,0.15]|       0.0|
|  0.0|(692,[154,155,156...|   [14.0,6.0]|  [0.7,0.3]|       0.0|
|  1.0|(692,[123,124,125...|   [0.0,20.0]|  [0.0,1.0]|       1.0|
|  1.0|(692,[124,125,126...|   [0.0,20.0]|  [0.0,1.0]|       1.0|
|  1.0|(692,[124,125,126...|   [0.0,20.0]|  [0.0,1.0]|       1.0|
|  1.0|(69

In [13]:
# Select example rows to display.
prediction_rf.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[125,126,127...|
|       0.0|  0.0|(692,[126,127,128...|
+----------+-----+--------------------+
only showing top 5 rows



In [14]:
# Select (prediction, true label) and compute test error.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

In [15]:
# A test error of zero means that the model accuracy is at 100%. 
# In most cases this is unrealistic, but here it's correct due to the simple data used in the documentation.
accuracy = evaluator.evaluate(prediction_rf)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0


## Gradient Boosted Trees

Gradient-boosted trees (GBTs) are a popular classification and regression method using ensembles of decision trees. More information about the spark.ml implementation can be found further in the section on [GBTs](http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-trees-gbts). For more information on the algorithm itself, please see the [spark.mllib documentation on GBTs.](http://spark.apache.org/docs/latest/mllib-ensembles.html#gradient-boosted-trees-gbts)

Luckily Spark makes very easy to use, basically just an import switch:

In [16]:
from pyspark.ml.classification import GBTClassifier

# Load and parse the data file, converting it to a DataFrame.
data = spark.read.format("libsvm").load("Datasets/sample_libsvm_data.txt")

# Split the data into training and test sets (30% held out for testing).
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a GBT model.
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)

# Train model.  This also runs the indexers.
model = gbt.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

22/05/31 00:27:35 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[122,123,148...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[126,127,128...|
|       0.0|  0.0|(692,[127,128,129...|
|       0.0|  0.0|(692,[129,130,131...|
+----------+-----+--------------------+
only showing top 5 rows



22/05/31 00:27:38 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/05/31 00:27:38 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


In [17]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.125


Let's move on to a more realistic example!