# Random Forest Example

quick walkthrough of the Documentation's Example of Random Forest:

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('mytree').getOrCreate()

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import (DecisionTreeClassifier, 
                                       RandomForestClassifier, 
                                       GBTClassifier)

In [4]:
data = spark.read.format('libsvm').load('sample_libsvm_data.txt')

In [5]:
data.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [6]:
data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                100|
|   mean|               0.57|
| stddev|0.49756985195624287|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+



In [7]:
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



## Gradient Boosted Trees

Gradient-boosted trees (GBTs) are a popular classification and regression method using ensembles of decision trees. More information about the spark.ml implementation can be found further in the section on [GBTs.](http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-trees-gbts). For more information on the algorithm itself, please see the [spark.mllib documentation on GBTs.](http://spark.apache.org/docs/latest/mllib-ensembles.html#gradient-boosted-trees-gbts)

In [8]:
dtc = DecisionTreeClassifier(featuresCol='features', labelCol='label', predictionCol='prediction')
rfc = RandomForestClassifier(featuresCol='features', labelCol='label', predictionCol='prediction', numTrees=100)
gbc = GBTClassifier(featuresCol='features', labelCol='label', predictionCol='prediction')

In [9]:
train_data, test_data = data.randomSplit([0.7, 0.3])

In [10]:
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbc_model = gbc.fit(train_data)

In [11]:
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbc_preds = gbc_model.transform(test_data)

In [12]:
dtc_preds.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [14]:
acc_eval = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label')

In [15]:
print('DTC Accuracy: ')
acc_eval.evaluate(dtc_preds)

DTC Accuracy: 


0.9618338650596716

In [16]:
print('Rfc Accuracy: ')
acc_eval.evaluate(rfc_preds)

Rfc Accuracy: 


1.0

In [17]:
print('GBC Accuracy: ')
acc_eval.evaluate(gbc_preds)

GBC Accuracy: 


0.9618338650596716

In [18]:
rfc_model.featureImportances

SparseVector(692, {119: 0.0005, 157: 0.0017, 176: 0.0005, 178: 0.0005, 180: 0.0012, 184: 0.0005, 186: 0.0005, 207: 0.0053, 218: 0.0006, 234: 0.0097, 237: 0.0005, 240: 0.0015, 242: 0.0007, 243: 0.0005, 244: 0.0001, 260: 0.0005, 262: 0.0111, 263: 0.0129, 266: 0.0005, 272: 0.0089, 273: 0.0072, 274: 0.0078, 290: 0.0075, 295: 0.0069, 299: 0.0026, 301: 0.0024, 302: 0.0006, 317: 0.0143, 318: 0.0071, 322: 0.0001, 323: 0.0145, 324: 0.0011, 326: 0.0013, 331: 0.0006, 344: 0.0075, 350: 0.0659, 351: 0.0188, 352: 0.0005, 356: 0.0078, 372: 0.001, 373: 0.0092, 374: 0.0079, 375: 0.0015, 377: 0.0247, 378: 0.0325, 380: 0.0016, 384: 0.0071, 385: 0.0055, 387: 0.0014, 398: 0.002, 399: 0.0024, 400: 0.0108, 401: 0.0026, 405: 0.0605, 406: 0.0184, 407: 0.0225, 408: 0.006, 409: 0.0009, 427: 0.0118, 428: 0.0076, 429: 0.0101, 432: 0.0016, 433: 0.0637, 434: 0.0137, 435: 0.01, 440: 0.0109, 442: 0.0005, 455: 0.0243, 456: 0.0433, 457: 0.001, 458: 0.0005, 460: 0.0164, 461: 0.0385, 462: 0.0186, 463: 0.0169, 464: 0.001, 

# Gradient Boosted Trees

In [20]:
# load data
my_data = spark.read.format('libsvm').load('sample_libsvm_data.txt')

# Split data
train_Data, test_Data = my_data.randomSplit([0.7,0.3])

# Train a GBT model
gbt = GBTClassifier(labelCol='label', featuresCol='features', maxIter=10)

# Train model, this also runs the indexers
model = gbt.fit(train_Data)

# Make a prediction
predictions = model.transform(test_Data)

# Select example row to display
predictions.select(['prediction', 'label', 'features']).show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[122,123,124...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
+----------+-----+--------------------+
only showing top 5 rows



In [22]:
# Select (prediction, true label) and comput test error
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print('Test Error = %g' % (1.0 - accuracy))

Test Error = 0.037037


So this data isn't really realistic enough to really judge to effectiveness of GBT models, this data makes it seem like they are perfection, instead of just an improvement on normal Random Forests.