### Set up spark context and SparkSession

In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark RandomForest Regression example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/10/25 20:35:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/10/25 20:35:41 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
df = spark.read.format('csv').\
                               options(header='true', \
                               inferschema='true').\
                               load("data/Advertising.csv",header=True);

df.show(5,True)
df.printSchema()

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
|151.5| 41.3|     58.5| 18.5|
|180.8| 10.8|     58.4| 12.9|
+-----+-----+---------+-----+
only showing top 5 rows

root
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)



In [3]:
df.describe().show()

+-------+-----------------+------------------+------------------+------------------+
|summary|               TV|             Radio|         Newspaper|             Sales|
+-------+-----------------+------------------+------------------+------------------+
|  count|              200|               200|               200|               200|
|   mean|         147.0425|23.264000000000024|30.553999999999995|14.022500000000003|
| stddev|85.85423631490805|14.846809176168728| 21.77862083852283| 5.217456565710477|
|    min|              0.7|               0.0|               0.3|               1.6|
|    max|            296.4|              49.6|             114.0|              27.0|
+-------+-----------------+------------------+------------------+------------------+



### Convert the data to dense vector (features and label)

In [5]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
transformed=df.rdd.map(lambda r: [Vectors.dense(r[:-1]),r[-1]]).toDF(['features','label'])
transformed.show(5)

                                                                                

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[230.1,37.8,69.2]| 22.1|
| [44.5,39.3,45.1]| 10.4|
| [17.2,45.9,69.3]|  9.3|
|[151.5,41.3,58.5]| 18.5|
|[180.8,10.8,58.4]| 12.9|
+-----------------+-----+
only showing top 5 rows



### Deal with the Categorical variables, even they are numeric, if a feature columns has no more than 4 distinct values, it will be considered categorical and will be indexed to improve training model.

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

featureIndexer = VectorIndexer(inputCol="features", \
                               outputCol="indexedFeatures",\
                               maxCategories=4).fit(transformed)

data = featureIndexer.transform(transformed)
data.show(5,True)

                                                                                

+-----------------+-----+-----------------+
|         features|label|  indexedFeatures|
+-----------------+-----+-----------------+
|[230.1,37.8,69.2]| 22.1|[230.1,37.8,69.2]|
| [44.5,39.3,45.1]| 10.4| [44.5,39.3,45.1]|
| [17.2,45.9,69.3]|  9.3| [17.2,45.9,69.3]|
|[151.5,41.3,58.5]| 18.5|[151.5,41.3,58.5]|
|[180.8,10.8,58.4]| 12.9|[180.8,10.8,58.4]|
+-----------------+-----+-----------------+
only showing top 5 rows



In [8]:
# Import LinearRegression class
from pyspark.ml.regression import RandomForestRegressor

# Define LinearRegression algorithm
rf = RandomForestRegressor(featuresCol="indexedFeatures") # featuresCol="indexedFeatures",numTrees=2, maxDepth=2, seed=42


### Split the data into training and test sets (40% held out for testing)

In [9]:
(trainingData, testData) = data.randomSplit([0.6, 0.4])

trainingData.show(5)
testData.show(5)

[Stage 13:>                                                         (0 + 1) / 1]

+----------------+-----+----------------+
|        features|label| indexedFeatures|
+----------------+-----+----------------+
|  [4.1,11.6,5.7]|  3.2|  [4.1,11.6,5.7]|
|  [5.4,29.9,9.4]|  5.3|  [5.4,29.9,9.4]|
| [7.3,28.1,41.4]|  5.5| [7.3,28.1,41.4]|
|   [8.6,2.1,1.0]|  4.8|   [8.6,2.1,1.0]|
|[11.7,36.9,45.2]|  7.3|[11.7,36.9,45.2]|
+----------------+-----+----------------+
only showing top 5 rows

+---------------+-----+---------------+
|       features|label|indexedFeatures|
+---------------+-----+---------------+
| [0.7,39.6,8.7]|  1.6| [0.7,39.6,8.7]|
|[7.8,38.9,50.6]|  6.6|[7.8,38.9,50.6]|
| [8.4,27.2,2.1]|  5.7| [8.4,27.2,2.1]|
|[8.7,48.9,75.0]|  7.2|[8.7,48.9,75.0]|
|[13.1,0.4,25.6]|  5.3|[13.1,0.4,25.6]|
+---------------+-----+---------------+
only showing top 5 rows



                                                                                

### Fit RandomForest Regression Model

If you decide to use the indexedFeatures features, you need to add the parameter featuresCol="indexedFeatures".

Pipeline Architecture

In [10]:
pipeline = Pipeline(stages=[rf])
model = pipeline.fit(trainingData)

                                                                                

### Make test predictions with testdata

In [12]:
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("indexedFeatures","label", "prediction").show(5)

+---------------+-----+------------------+
|indexedFeatures|label|        prediction|
+---------------+-----+------------------+
| [0.7,39.6,8.7]|  1.6| 9.517884615384617|
|[7.8,38.9,50.6]|  6.6|10.795416415284063|
| [8.4,27.2,2.1]|  5.7|  8.13717948717949|
|[8.7,48.9,75.0]|  7.2|12.157750000000002|
|[13.1,0.4,25.6]|  5.3|6.7906031746031745|
+---------------+-----+------------------+
only showing top 5 rows



### Evaluation

In [13]:
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


Root Mean Squared Error (RMSE) on test data = 1.95272


In [15]:
evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="label",metricName="r2")
print("R Squared (R2) on test data = %g" % evaluator.evaluate(predictions))

R Squared (R2) on test data = 0.852397


In [16]:
type(model.stages[-1])

pyspark.ml.regression.RandomForestRegressionModel

In [18]:
model.stages[-1].featureImportances

SparseVector(3, {0: 0.5301, 1: 0.3361, 2: 0.1338})

### Feature importances, there are 3 features, index 0 is the root had highest feature importance value, 0.4736, .... This means, advertising on TV is most important feature

### Show all decision trees in the random forest

In [19]:
model.stages[-1].trees

[DecisionTreeRegressionModel: uid=dtr_1656e6241479, depth=5, numNodes=39, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_06aa52795b27, depth=5, numNodes=53, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_69455921b9e5, depth=5, numNodes=49, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_9253e47e697f, depth=5, numNodes=47, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_a514fba35635, depth=5, numNodes=45, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_dd8e737a09ef, depth=5, numNodes=45, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_47564fe881a6, depth=5, numNodes=47, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_6c6f14fcd965, depth=5, numNodes=49, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_64ca61f7fbfe, depth=5, numNodes=43, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_a60e12dd6f5b, depth=5, numNodes=41, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_4e7b948bb38d, depth=5, numNodes=55, numFeatures=3,
 DecisionTreeRegressi

In [20]:
rf_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="label",metricName="r2")
print("R Squared (R2) on test data = %g" % rf_evaluator.evaluate(predictions))

R Squared (R2) on test data = 0.852397


                                                                                

In [21]:
rf_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="label",metricName="rmse")
print("(RMSE) on test data = %g" % rf_evaluator.evaluate(predictions))

(RMSE) on test data = 1.95272
