## Random Forest Classification

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one; otherwise start one named 'rf'.
session_builder = SparkSession.builder.appName('rf')
spark = session_builder.getOrCreate()

In [0]:
# Read the sample dataset, stored in LIBSVM format, into a DataFrame.
libsvm_reader = spark.read.format('libsvm')
data = libsvm_reader.load('dbfs:/FileStore/sample_libsvm_data.txt')

In [0]:
# Print the leading rows of the loaded DataFrame (label + features) as a text table.
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [0]:
# Inspect the first record in full: a Row holding the label and its sparse feature vector.
data.head()

Row(label=0.0, features=SparseVector(692, {127: 51.0, 128: 159.0, 129: 253.0, 130: 159.0, 131: 50.0, 154: 48.0, 155: 238.0, 156: 252.0, 157: 252.0, 158: 252.0, 159: 237.0, 181: 54.0, 182: 227.0, 183: 253.0, 184: 252.0, 185: 239.0, 186: 233.0, 187: 252.0, 188: 57.0, 189: 6.0, 207: 10.0, 208: 60.0, 209: 224.0, 210: 252.0, 211: 253.0, 212: 252.0, 213: 202.0, 214: 84.0, 215: 252.0, 216: 253.0, 217: 122.0, 235: 163.0, 236: 252.0, 237: 252.0, 238: 252.0, 239: 253.0, 240: 252.0, 241: 252.0, 242: 96.0, 243: 189.0, 244: 253.0, 245: 167.0, 262: 51.0, 263: 238.0, 264: 253.0, 265: 253.0, 266: 190.0, 267: 114.0, 268: 253.0, 269: 228.0, 270: 47.0, 271: 79.0, 272: 255.0, 273: 168.0, 289: 48.0, 290: 238.0, 291: 252.0, 292: 252.0, 293: 179.0, 294: 12.0, 295: 75.0, 296: 121.0, 297: 21.0, 300: 253.0, 301: 243.0, 302: 50.0, 316: 38.0, 317: 165.0, 318: 253.0, 319: 233.0, 320: 208.0, 321: 84.0, 328: 253.0, 329: 252.0, 330: 165.0, 343: 7.0, 344: 178.0, 345: 252.0, 346: 240.0, 347: 71.0, 348: 19.0, 349: 28.0,

In [0]:
# Split the data into training (70%) and test (30%) sets.
# A fixed seed makes the split, and every metric computed from it,
# repeatable across runs on the same input data.
train, test = data.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Show the column names and types of the training split.
train.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [0]:
# Configure the random forest estimator (nothing is trained yet):
# 20 trees, reading inputs from `features` and targets from `label`.
# The seed fixes the forest's internal randomness so that repeated
# fits on the same data give the same model.
rf = RandomForestClassifier(
    featuresCol='features',
    labelCol='label',
    numTrees=20,
    seed=42,
)

In [0]:
# Fit the random forest on the training split; `modelrf` is the fitted model.
modelrf = rf.fit(train)

In [0]:
# Score the held-out test split with the fitted model.
# The result keeps the input columns and adds the model's output columns.
predictions = modelrf.transform(test)

In [0]:
# Show which columns the model added to the test data.
predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [0]:
# Preview predicted vs. actual labels for the first five test rows.
preview_cols = ['prediction', 'label', 'features']
predictions.select(*preview_cols).show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[126,127,128...|
|       0.0|  0.0|(692,[126,127,128...|
|       0.0|  0.0|(692,[126,127,128...|
|       0.0|  0.0|(692,[127,128,129...|
+----------+-----+--------------------+
only showing top 5 rows



In [0]:
# Evaluator that compares the `prediction` column against `label` and reports accuracy.
evalRf = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)

In [0]:
# Accuracy on the test predictions; the test error is its complement.
accuracy = evalRf.evaluate(predictions)
test_error = 1.0 - accuracy
print('Test error : {:.2f}'.format(test_error))

Test error : 0.00


## Gradient-Boosted Trees

Gradient-Boosted Trees (GBTs) are a popular classification and regression method using ensembles of decision trees. More information about the spark.ml implementation can be found in the [GBTs section](https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-trees-gbts). For more information on the algorithm itself, please see the [spark.mllib documentation on GBTs](https://spark.apache.org/docs/latest/mllib-ensembles.html#gradient-boosted-trees-gbts).

In [0]:
from pyspark.ml.classification import GBTClassifier

# Load the sample dataset (LIBSVM format) into a DataFrame.
df = spark.read.format('libsvm').load('dbfs:/FileStore/sample_libsvm_data.txt')

# Split into training (70%) and test (30%) sets. The fixed seed keeps the
# split repeatable across runs on the same input data.
# NOTE: this rebinds `train` and `test` from the random forest section.
train, test = df.randomSplit([0.7, 0.3], seed=42)

# Configure the gradient-boosted tree classifier with maxIter=10.
# Seeded so that repeated fits on the same data give the same model.
gbt = GBTClassifier(featuresCol='features', labelCol='label', maxIter=10, seed=42)

# Fit the classifier on the training split.
gbtModel = gbt.fit(train)

# Score the held-out test split (rebinds `predictions`).
predictions = gbtModel.transform(test)

# Preview predicted vs. actual labels for the first five test rows.
predictions.select('prediction', 'label', 'features').show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[122,123,148...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
+----------+-----+--------------------+
only showing top 5 rows



In [0]:
# Score the GBT predictions: accuracy of `prediction` against `label`,
# reported as test error (1 - accuracy).
evalgbt = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
accuracy = evalgbt.evaluate(predictions)
gbt_test_error = 1.0 - accuracy
print('Test Error : {:.2f}'.format(gbt_test_error))

Test Error : 0.06
