# Tree Methods - Documentation Examples

## Random Forest Example

In [1]:
# Create the SparkSession (application name 'rf') that the later cells
# use to read the data

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('rf')
    .getOrCreate()
)

In [2]:
# Library imports

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
# Load the libsvm-formatted sample file and parse it into a DataFrame

data = (
    spark.read
    .format('libsvm')
    .load('resources/sample_libsvm_data.txt')
)

# Preview the parsed rows (show() with no arguments prints the top 20)
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [5]:
# Split the data 70/30 into training and test sets.
# A fixed seed makes the split repeatable, so re-running the notebook
# evaluates on the same held-out rows instead of a new random sample.
training_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

# Confirm the columns the classifier will consume
training_data.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [6]:
# Configure a random forest classifier with 20 trees. The explicit seed
# pins the forest's internal randomness so repeated fits on the same
# training data are comparable.
rf = RandomForestClassifier(labelCol='label', featuresCol='features',
                            numTrees=20, seed=42)

# Train the model on the training split. No indexers or pipeline stages
# are involved here: the schema printed above already shows a double
# 'label' column and a vector 'features' column, which are used as-is.
model = rf.fit(training_data)

In [7]:
# Make predictions: apply the fitted model to the held-out test split
predictions = model.transform(test_data)

# Inspect the result's columns; the schema printed below lists
# rawPrediction, probability and prediction next to label and features
predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [9]:
# Select example rows to display: predicted label next to the true label
# (show() with no arguments prints only the top 20 rows)
predictions.select('prediction', 'label', 'features').show()

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[98,99,100,1...|
|       0.0|  0.0|(692,[100,101,102...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[126,127,128...|
|       0.0|  0.0|(692,[126,127,128...|
|       0.0|  0.0|(692,[126,127,128...|
|       0.0|  0.0|(692,[126,127,128...|
|       0.0|  0.0|(692,[127,128,129...|
|       0.0|  0.0|(692,[128,129,130...|
|       0.0|  0.0|(692,[150,151,152...|
|       0.0|  0.0|(692,[151,152,153...|
|       0.0|  0.0|(692,[152,153,154...|
|       0.0|  0.0|(692,[152,153,154...|
|       0.0|  0.0|(692,[153,154,155...|
|       0.0|  0.0|(692,[154,155,156...|
|       0.0|  0.0|(692,[181,182,183...|
|       0.0|  0.0|(692,[234,235,237...|
|       0.0|  1.0|(692,[99,100,101,...|
|       1.0|  1.0|(692,[100,101,102...|
+----------+-----+--------------------+
only showing top 20 rows



In [10]:
# Compare the predicted labels with the true labels. The metric requested
# here is accuracy; the test error would be 1 - accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)

# Evaluate on the predictions DataFrame produced from the test split
accuracy = evaluator.evaluate(predictions)

print(f'Accuracy : {accuracy}')

Accuracy : 0.9767441860465116


In [11]:
# We can also get the relative feature importances of the fitted model
# (displayed below as a sparse vector keyed by feature index)
model.featureImportances

SparseVector(692, {205: 0.0037, 216: 0.0072, 244: 0.0357, 262: 0.0401, 273: 0.0366, 328: 0.045, 358: 0.0653, 378: 0.0459, 379: 0.053, 405: 0.05, 406: 0.0184, 407: 0.0462, 427: 0.0395, 434: 0.0163, 439: 0.0068, 461: 0.0467, 462: 0.1527, 490: 0.1, 511: 0.0634, 512: 0.05, 515: 0.003, 540: 0.047, 555: 0.0163, 568: 0.0041, 597: 0.0038, 623: 0.0033})

## Gradient Boosted Trees

Gradient-boosted trees (GBTs) are a popular classification and regression method using ensembles of decision trees. More information about the spark.ml implementation can be found in the [section on GBTs](http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-trees-gbts). For more information on the algorithm itself, please see the [spark.mllib documentation on GBTs](http://spark.apache.org/docs/latest/mllib-ensembles.html#gradient-boosted-trees-gbts).

Luckily, Spark makes GBTs very easy to use; it is basically just an import switch:

In [12]:
from pyspark.ml.classification import GBTClassifier

# NOTE: this cell rebinds `data`, `test_data`, `model` and `predictions`,
# which the random forest section above also defines. Re-run the random
# forest cells before re-evaluating that model.

# Load and parse the data file (libsvm format) into a DataFrame
data = spark.read.format('libsvm').load('resources/sample_libsvm_data.txt')

# Split the data 70/30 into training and test sets; the fixed seed makes
# the split repeatable across runs
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

# Configure a GBT model (maxIter=15), seeded so repeated fits on the same
# training data are comparable
gbt = GBTClassifier(labelCol='label', featuresCol='features', maxIter=15, seed=42)

# Train the model on the training split. There are no indexers in this
# notebook: the loaded 'label' and 'features' columns are used as-is.
model = gbt.fit(train_data)

# Make predictions on test data
predictions = model.transform(test_data)

# Select rows to display (show() with no arguments prints the top 20)
predictions.select('prediction', 'label', 'features').show()

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[95,96,97,12...|
|       1.0|  0.0|(692,[98,99,100,1...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[126,127,128...|
|       0.0|  0.0|(692,[127,128,129...|
|       0.0|  0.0|(692,[127,128,129...|
|       0.0|  0.0|(692,[152,153,154...|
|       0.0|  0.0|(692,[153,154,155...|
|       1.0|  0.0|(692,[154,155,156...|
|       1.0|  1.0|(692,[123,124,125...|
|       1.0|  1.0|(692,[123,124,125...|
|       1.0|  1.0|(692,[124,125,126...|
|       1.0|  1.0|(692,[124,125,126...|
|       1.0|  1.0|(692,[124,125,126...|
|       1.0|  1.0|(692,[125,126,127...|
|       1.0|  1.0|(692,[126,127,128...|
|       1.0|  1.0|(692,[126,127,128...|
+----------+-----+--------------------+
only showing top 20 rows



In [13]:
# Evaluate the model: compare the 'prediction' column with the 'label'
# column using the accuracy metric
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)

# Evaluate on the most recently computed predictions DataFrame
accuracy = evaluator.evaluate(predictions)

print(f'Accuracy: {accuracy}')

Accuracy: 0.9090909090909091
