In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,DoubleType

In [None]:
# Entry point for the DataFrame API. getOrCreate() hands back the session that
# is already active in this kernel, if any, instead of starting a second one.
# SparkSession is supplied by `from pyspark.sql import SparkSession` in the
# imports cell; without that import this cell fails with NameError on a fresh kernel.
spark = (
    SparkSession.builder
    .appName("DecisionTreeRegression")
    .getOrCreate()
)

In [None]:
# Explicit column layout: eleven nullable fields, declared in positional order
# (index 0 = Rank ... index 10 = Global_Sales).
# NOTE(review): none of the visible cells pass `schema` to the CSV reader — the
# load cell relies on inferSchema=True — so this definition currently only
# documents the expected columns. Confirm whether it was meant to be applied.
schema = (
    StructType()
    .add('Rank', IntegerType(), True)
    .add('Name', StringType(), True)
    .add('Platform', StringType(), True)
    .add('Year', IntegerType(), True)
    .add('Genre', StringType(), True)
    .add('Publisher', StringType(), True)
    .add('NA_Sales', DoubleType(), True)
    .add('EU_Sales', DoubleType(), True)
    .add('JP_Sales', DoubleType(), True)
    .add('Other_Sales', DoubleType(), True)
    .add('Global_Sales', DoubleType(), True)
)

In [None]:
# Load the raw CSV. The first line is treated as a header and column types are
# inferred by Spark from the file contents.
df = spark.read.csv(
    path='vgsales.csv',
    inferSchema=True,
    header='true',
)

In [None]:
# Reshape every row into the (features, label) pair that Spark ML expects:
#   features = dense vector of the values at column positions 6, 7 and 8
#   label    = the value at column position 9
# If the file follows the schema declared in this notebook, positions 6-8 are
# NA_Sales / EU_Sales / JP_Sales and position 9 is Other_Sales.
# NOTE(review): confirm Other_Sales (9), not Global_Sales (10), is the intended target.
data = (
    df.rdd
    .map(lambda row: (Vectors.dense([row[6], row[7], row[8]]), row[9]))
    .toDF(["features", "label"])
)

In [None]:
# Fit a VectorIndexer over the assembled feature vectors. Features with at most
# `maxCategories` distinct values are treated as categorical and re-indexed;
# the rest stay continuous. handleInvalid='skip' drops rows with invalid values.
# NOTE(review): the indexer is fitted on the full dataset, i.e. before the
# train/test split performed in a later cell.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
    handleInvalid='skip',
).fit(data)

In [None]:
# 70/30 train/test split. The explicit seed pins the random assignment so that
# a Restart & Run All reproduces the same partition (and therefore the same
# RMSE), given the same input data and partitioning.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Decision-tree regressor that reads the indexed feature vectors produced by
# the VectorIndexer stage; every other parameter is left at its default.
dt = DecisionTreeRegressor(
    featuresCol="indexedFeatures",
)

In [None]:
# Chain feature indexing and the regressor so both run as a single unit.
pipeline = Pipeline(
    stages=[
        featureIndexer,  # result of VectorIndexer(...).fit(data), i.e. already fitted
        dt,              # regressor, trained when the pipeline is fitted
    ],
)

In [None]:
# Fit the pipeline on the training portion only.
model = pipeline.fit(dataset=trainingData)

In [None]:
# Run the held-out test portion through the fitted pipeline to get predictions.
predictions = model.transform(dataset=testData)

In [None]:
# Peek at the first five predictions next to their labels and input features.
(
    predictions
    .select(["prediction", "label", "features"])
    .show(n=5)
)

In [None]:
# Score the test-set predictions with root mean squared error.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % (rmse,))

In [None]:
# The pipeline was built with two stages (indexer, regressor), so the last
# stage of the fitted pipeline is the trained decision-tree model.
treeModel = model.stages[-1]

In [None]:
# Print the string representation of the fitted tree stage.
print(treeModel)