In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
# The notebook's import cell does not bring SparkSession into scope, so this
# cell raised NameError on a fresh kernel. Import it here so the cell is
# self-contained under Restart & Run All.
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = SparkSession.builder.appName("DecisionTreeRegression").getOrCreate()

In [3]:
# Load the video-game sales CSV: the first row supplies the column names and
# the column types are inferred from the data.
csv_reader = spark.read.options(header='true', inferSchema=True)
df = csv_reader.csv('vgsales.csv')

In [6]:
from pyspark.ml.feature import VectorAssembler
# Pack the three regional sales columns into a single 'features' vector column.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=['NA_Sales', 'EU_Sales', 'JP_Sales'],
)
# Assemble the feature vectors and rename the regression target, Global_Sales,
# to 'label' in a single step.
data = assembler.transform(df).withColumnRenamed("Global_Sales", "label")

In [7]:
# Preview the first 20 assembled feature vectors next to their labels.
data.select(['features', 'label']).show(n=20)

+------------------+-----+
|          features|label|
+------------------+-----+
|[41.49,29.02,3.77]|82.74|
| [29.08,3.58,6.81]|40.24|
|[15.85,12.88,3.79]|35.82|
|[15.75,11.01,3.28]| 33.0|
|[11.27,8.89,10.22]|31.37|
|  [23.2,2.26,4.22]|30.26|
|  [11.38,9.23,6.5]|30.01|
|  [14.03,9.2,2.93]|29.02|
|  [14.59,7.06,4.7]|28.62|
| [26.93,0.63,0.28]|28.31|
|  [9.07,11.0,1.93]|24.76|
|  [9.81,7.57,4.13]|23.42|
|    [9.0,6.18,7.2]| 23.1|
|   [8.94,8.03,3.6]|22.72|
|  [9.09,8.59,2.53]| 22.0|
| [14.97,4.94,0.24]|21.82|
|  [7.01,9.27,0.97]| 21.4|
|   [9.43,0.4,0.41]|20.81|
| [12.78,3.75,3.54]|20.61|
|  [4.75,9.26,4.16]|20.22|
+------------------+-----+
only showing top 20 rows



In [8]:
# Fit a VectorIndexer on the assembled data. Feature slots with at most
# `maxCategories` distinct values are treated as categorical and indexed;
# the rest are left as continuous. Rows with invalid values are skipped.
# NOTE(review): this is fitted on the full dataset, before the train/test split.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
    handleInvalid='skip',
).fit(data)

In [9]:
# Hold out roughly 30% of the rows for evaluation. Without a seed the split
# (and therefore the fitted tree and the reported RMSE) changed on every run;
# a fixed seed makes the results reproducible.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

In [10]:
# Decision-tree regressor that reads the indexed feature vectors produced by
# the VectorIndexer stage.
dt = DecisionTreeRegressor(
    featuresCol="indexedFeatures",
)

In [11]:
# Chain the feature indexer and the tree regressor so they run as one unit.
pipeline = Pipeline().setStages([featureIndexer, dt])

In [12]:
# Fit the whole pipeline on the training split only.
model = pipeline.fit(dataset=trainingData)

In [13]:
# Run the fitted pipeline over the held-out split to get a 'prediction' column.
predictions = model.transform(dataset=testData)

In [14]:
# Show predictions beside the true labels and input features.
# Note: toPandas() collects every selected row to the driver.
predictions.select(["prediction", "label", "features"]).toPandas()

Unnamed: 0,prediction,label,features
0,11.929872,82.74,"[41.49, 29.02, 3.77]"
1,4.052692,20.81,"[9.43, 0.4, 0.41]"
2,11.929872,20.22,"[4.75, 9.26, 4.16]"
3,11.929872,18.36,"[6.42, 4.52, 6.04]"
4,11.929872,17.28,"[9.54, 3.44, 3.84]"
...,...,...,...
5012,0.097888,0.01,"[0.0, 0.0, 0.01]"
5013,0.097888,0.01,"[0.0, 0.01, 0.0]"
5014,0.097888,0.01,"[0.0, 0.0, 0.01]"
5015,0.097888,0.01,"[0.01, 0.0, 0.0]"


In [15]:
# Score the test-set predictions against the labels using root mean squared error.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % (rmse,))

Root Mean Squared Error (RMSE) on test data = 1.21009


In [14]:
# The regressor is the last of the two pipeline stages; grab its fitted model.
treeModel = model.stages[-1]

In [16]:
# Exploratory introspection: list the attribute names available on the fitted
# tree-model stage.
dir(treeModel)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_call_java',
 '_copyValues',
 '_copy_params',
 '_create_from_java_class',
 '_create_params_from_java',
 '_defaultParamMap',
 '_dummy',
 '_empty_java_param_map',
 '_from_java',
 '_is_protocol',
 '_java_obj',
 '_make_java_param_pair',
 '_new_java_array',
 '_new_java_obj',
 '_paramMap',
 '_params',
 '_randomUID',
 '_resetUid',
 '_resolveParam',
 '_set',
 '_setDefault',
 '_shouldOwn',
 '_testOwnParam',
 '_to_java',
 '_transfer_param_map_from_java',
 '_transfer_param_