In [2]:
import os

import findspark

# findspark must be initialised before anything from pyspark is imported.
# Honour SPARK_HOME when it is set (and non-empty) so the notebook is not tied
# to one machine; otherwise fall back to the install path it was written for.
findspark.init(os.environ.get('SPARK_HOME') or '/home/mint/spark-2.1.0-bin-hadoop2.7')
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (app name: 'tree').
spark = SparkSession.builder.appName('tree').getOrCreate()

# Read the LIBSVM-formatted sample file; the resulting DataFrame has a `label`
# column and a `features` vector column (see the preview printed below).
data = spark.read.format('libsvm').load('sample_libsvm_data.txt')
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [12]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
# Split the rows roughly 70/30 into training and held-out test sets. The fixed
# seed makes the split repeatable across kernel restarts; without it every run
# trains and evaluates on different rows.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [13]:
# Three tree-based classifiers, grouped by stage instead of by model.
# Only the random forest overrides a default (100 trees).
dtc, gbt, rfc = (
    DecisionTreeClassifier(),
    GBTClassifier(),
    RandomForestClassifier(numTrees=100),
)

# Fit each estimator on the training split (decision tree, GBT, random forest).
dtc_model, gbt_model, rfc_model = (
    estimator.fit(train_data) for estimator in (dtc, gbt, rfc)
)

# Score the held-out split with each fitted model.
dtc_preds, gbt_preds, rfc_preds = (
    fitted.transform(test_data) for fitted in (dtc_model, gbt_model, rfc_model)
)

In [7]:
# Preview the decision tree's predictions on the test split
# (first 20 rows, long cell values truncated -- the defaults, spelled out).
dtc_preds.show(n=20, truncate=True)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[98,99,100,1...|    [0.0,1.0]|  [0.0,1.0]|       1.0|
|  0.0|(692,[123,124,125...|   [30.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [30.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [30.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[125,126,127...|   [30.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [30.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [30.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [30.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[151,152,153...|   [30.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[152,153,154...|   [30.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[153,154,155...|   [30.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[153,154,155...|   [30.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(69

In [11]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# One evaluator, configured for plain accuracy, is shared by all three models.
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')

# Print a heading followed by the accuracy for each model's test predictions.
for heading, scored in (
    ("Decision Tree Accuracy:", dtc_preds),
    ("Random Forest Accuracy:", rfc_preds),
    ("Gradient Boosting Accuracy:", gbt_preds),
):
    print(heading)
    print(acc_eval.evaluate(scored))

Decision Tree Accuracy:
0.9655172413793104
Random Forest Accuracy:
1.0
Gradient Boosting Accuracy:
0.9655172413793104


In [14]:
# Display the fitted random forest's per-feature importances as this cell's
# output (shown below as a sparse vector keyed by feature index).
rfc_model.featureImportances

SparseVector(692, {131: 0.0008, 183: 0.0019, 185: 0.0006, 186: 0.0005, 213: 0.0001, 215: 0.0008, 234: 0.0008, 262: 0.0093, 263: 0.0133, 264: 0.0004, 271: 0.0128, 272: 0.0082, 290: 0.0076, 291: 0.0005, 295: 0.002, 302: 0.0071, 314: 0.0037, 316: 0.0009, 318: 0.0015, 322: 0.0088, 323: 0.0044, 326: 0.0003, 327: 0.0009, 328: 0.007, 342: 0.0011, 351: 0.042, 370: 0.0008, 372: 0.0073, 373: 0.0157, 375: 0.0012, 377: 0.0039, 378: 0.0477, 379: 0.0018, 385: 0.0062, 388: 0.0014, 398: 0.0006, 402: 0.0014, 404: 0.0006, 405: 0.0034, 406: 0.0351, 407: 0.0253, 410: 0.0006, 411: 0.001, 412: 0.0107, 414: 0.0062, 415: 0.0023, 426: 0.0044, 427: 0.0088, 429: 0.0163, 431: 0.0005, 432: 0.0086, 433: 0.047, 434: 0.02, 435: 0.0224, 438: 0.0004, 439: 0.0006, 441: 0.0016, 442: 0.0015, 444: 0.0027, 454: 0.0025, 455: 0.0245, 456: 0.0022, 460: 0.0015, 461: 0.0297, 462: 0.0327, 463: 0.0005, 468: 0.0071, 472: 0.003, 481: 0.001, 482: 0.0104, 483: 0.0597, 488: 0.0032, 489: 0.0424, 490: 0.0718, 491: 0.0007, 495: 0.0116, 49