In [1]:
# Point findspark at the local Spark installation; this runs before pyspark is imported below.
# NOTE(review): the install path is a hard-coded absolute path — presumably specific to one
# machine; confirm or make it configurable.
import findspark
findspark.init('/home/ubuntu/spark-2.2.0-bin-hadoop2.7')

# Build the Spark session used by every later cell (application name 'dTrees').
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('dTrees').getOrCreate()

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier, 
                                       GBTClassifier, 
                                       DecisionTreeClassifier)
from pyspark.ml.regression import (RandomForestRegressor,
                                   GBTRegressor, 
                                   DecisionTreeRegressor)


In [4]:
# Folder and file name of the LIBSVM-formatted sample data set.
# NOTE(review): absolute path is hard-coded — confirm it matches the target machine.
data_folder = '/home/ubuntu/data/raw'
# file_name keeps its leading '/' because the full path is built by plain concatenation.
file_name = '/sample_libsvm_data.txt'

# Read the file with Spark's 'libsvm' reader into a DataFrame.
data = spark.read.format('libsvm').load(data_folder + file_name)

In [6]:
# Preview the first five rows of the loaded DataFrame.
data.show(n=5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
+-----+--------------------+
only showing top 5 rows



In [7]:
# 70/30 train/test split. A fixed seed is passed so that re-running the notebook
# yields the same split (and therefore comparable accuracy figures) each time.
SPLIT_SEED = 42
train_data, test_data = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [16]:
# Initialize the three tree-based classifiers to compare.

# Single decision tree, limited to depth 2.
dt_clf = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='label',
    maxDepth=2,
)

# Random forest built from 100 trees.
rf_clf = RandomForestClassifier(
    featuresCol='features',
    labelCol='label',
    numTrees=100,
)

# Gradient-boosted trees; no arguments are given, so the library defaults apply.
gbt_clf = GBTClassifier()

In [19]:
# Fit every classifier on the training split, in the order DT, RF, GBT.
dt_model, rf_model, gbt_model = [
    estimator.fit(train_data)
    for estimator in (dt_clf, rf_clf, gbt_clf)
]

In [27]:
# Run each fitted model over the test split to obtain its predictions DataFrame.
dt_preds, rf_preds, gbt_preds = [
    fitted.transform(test_data)
    for fitted in (dt_model, rf_model, gbt_model)
]

In [28]:
# Inspect the first three decision-tree predictions.
dt_preds.show(n=3)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[98,99,100,1...|    [0.0,1.0]|  [0.0,1.0]|       1.0|
|  0.0|(692,[100,101,102...|   [23.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [23.0,0.0]|  [1.0,0.0]|       0.0|
+-----+--------------------+-------------+-----------+----------+
only showing top 3 rows



In [29]:
# Evaluation (the multiclass evaluator can also be used for binary classification)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [30]:
# Build a multiclass classification evaluator that reports accuracy.
acc_eval = MulticlassClassificationEvaluator(
    metricName='accuracy',
)

In [38]:
# Report the test-set accuracy of each model, one line per model.
for _label, _preds in (
    ('DT Accuracy', dt_preds),
    ('RF Accuracy', rf_preds),
    ('GBT Accuracy', gbt_preds),
):
    print(_label, acc_eval.evaluate(_preds))

DT Accuracy 0.9090909090909091
RF Accuracy 1.0
GBT Accuracy 0.9090909090909091


In [39]:
# Feature importance: display the fitted random forest's per-feature importance
# values (shown in the cell output as a SparseVector keyed by feature index).
rf_model.featureImportances

SparseVector(692, {120: 0.0006, 121: 0.0006, 131: 0.0005, 154: 0.0008, 174: 0.0018, 177: 0.0011, 182: 0.0006, 184: 0.0016, 189: 0.0017, 208: 0.0015, 215: 0.0009, 216: 0.0009, 235: 0.0075, 236: 0.0004, 243: 0.0065, 244: 0.0092, 261: 0.0007, 262: 0.0075, 271: 0.015, 272: 0.0134, 287: 0.0066, 291: 0.0008, 295: 0.0094, 299: 0.0013, 300: 0.0479, 301: 0.0108, 313: 0.0005, 314: 0.003, 317: 0.0077, 318: 0.0038, 319: 0.0006, 322: 0.0012, 323: 0.0087, 327: 0.0071, 329: 0.0033, 330: 0.007, 331: 0.0046, 345: 0.0085, 347: 0.0007, 350: 0.0208, 351: 0.0302, 352: 0.0006, 353: 0.002, 354: 0.0014, 356: 0.0255, 359: 0.0043, 369: 0.0028, 371: 0.0004, 372: 0.0062, 373: 0.0082, 378: 0.0506, 379: 0.0007, 380: 0.0016, 385: 0.0174, 398: 0.004, 400: 0.0084, 401: 0.0062, 402: 0.0021, 405: 0.0263, 406: 0.0324, 407: 0.0232, 409: 0.0006, 410: 0.0011, 411: 0.0003, 412: 0.0006, 413: 0.0068, 414: 0.0053, 415: 0.0035, 425: 0.0008, 426: 0.0007, 428: 0.0132, 431: 0.0009, 432: 0.0007, 433: 0.048, 434: 0.0087, 435: 0.019, 