In [2]:
# Enable the function form of print(); this must stay the first statement of the cell.
from __future__ import print_function
# `sc` is not assigned before this point in the visible notebook; it is presumably
# injected by the PySpark shell kernel (the recorded output reports
# appName=PySparkShell). NOTE(review): on a plain Python kernel this would raise
# NameError — confirm the intended launch method.
print(sc)

<SparkContext master=local[*] appName=PySparkShell>


In [14]:
# --- Notebook configuration -------------------------------------------------
# Location of the input CSV handed to spark.read.csv().
CSV_PATH = "../data/train_spark.csv"
# Application name passed to SparkSession.builder.appName().
APP_NAME = "Random Forest Example"
# Master URL passed to SparkSession.builder.master().
SPARK_URL = "local[*]"
# Seed shared by randomSplit() and RandomForest.trainRegressor().
RANDOM_SEED = 13579
# First weight given to randomSplit(); the second weight is 1.0 minus this value.
TRAINING_DATA_RATIO = 0.7
# numTrees / maxDepth / maxBins arguments for RandomForest.trainRegressor().
RF_NUM_TREES = 3
RF_MAX_DEPTH = 4
RF_NUM_BINS = 32

In [10]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Obtain the Spark session configured with the notebook's app name and master URL.
spark = (
    SparkSession.builder
    .master(SPARK_URL)
    .appName(APP_NAME)
    .getOrCreate()
)

# Read the CSV with the "header" and "inferschema" reader options switched on.
df = (
    spark.read
    .option("header", "true")
    .option("inferschema", "true")
    .csv(CSV_PATH)
)

print("Total number of rows: %d" % df.count())

Total number of rows: 4564


In [11]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

# Turn every row into a LabeledPoint: the last column is the label and all
# preceding columns form the dense feature vector.
transformed_df = df.rdd.map(
    lambda record: LabeledPoint(record[-1], Vectors.dense(record[:-1]))
)

# Seeded random split into a training part and a test part.
splits = [TRAINING_DATA_RATIO, 1.0 - TRAINING_DATA_RATIO]
training_data, test_data = transformed_df.randomSplit(weights=splits, seed=RANDOM_SEED)

print("Number of training set rows: %d" % training_data.count())
print("Number of test set rows: %d" % test_data.count())

Number of training set rows: 3234
Number of test set rows: 1330


In [17]:
from pyspark.mllib.tree import RandomForest
# Import only the clock function that is used; a wildcard import would dump
# every public name of the `time` module into the notebook namespace.
from time import time

# Wall-clock timestamp taken right before training starts.
start_time = time()

# Fit a random-forest regressor on the training split. The empty
# categoricalFeaturesInfo dict declares no categorical features.
model = RandomForest.trainRegressor(
    training_data,
    categoricalFeaturesInfo={},
    numTrees=RF_NUM_TREES,
    featureSubsetStrategy="auto",
    impurity="variance",
    maxDepth=RF_MAX_DEPTH,
    maxBins=RF_NUM_BINS,
    seed=RANDOM_SEED
)

# Report how long the trainRegressor() call took.
end_time = time()
elapsed_time = end_time - start_time
print("Time to train model: %.3f seconds" % elapsed_time)

Time to train model: 1.942 seconds


In [20]:
# Score the held-out rows, then pair each true label with its prediction.
predictions = model.predict(test_data.map(lambda point: point.features))
labelsAndPredictions = test_data.map(lambda point: point.label).zip(predictions)

# Mean squared error: summed squared residuals divided by the test-row count.
testMSE = (
    labelsAndPredictions
    .map(lambda pair: (pair[0] - pair[1]) * (pair[0] - pair[1]))
    .sum()
    / float(test_data.count())
)
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest model:')
print(model.toDebugString())

Test Mean Squared Error = 0.00151845009695
Learned regression forest model:
TreeEnsembleModel regressor with 3 trees

  Tree 0:
    If (feature 3 <= 1.036697247706422)
     If (feature 11 <= 110.7575)
      If (feature 1 <= 2.0)
       If (feature 3 <= 0.7931034482758621)
        Predict: 0.0011395890554280245
       Else (feature 3 > 0.7931034482758621)
        Predict: -0.005341331557936258
      Else (feature 1 > 2.0)
       If (feature 0 <= 4288.0)
        Predict: 0.003110957572336843
       Else (feature 0 > 4288.0)
        Predict: 0.04766924369779765
     Else (feature 11 > 110.7575)
      If (feature 9 <= 1.01644704E9)
       If (feature 7 <= 20.11)
        Predict: 0.06407683030820599
       Else (feature 7 > 20.11)
        Predict: 0.039259640196933177
      Else (feature 9 > 1.01644704E9)
       If (feature 2 <= 12.0)
        Predict: -0.009074877910198629
       Else (feature 2 > 12.0)
        Predict: -0.03492497894175619
    Else (feature 3 > 1.036697247706422)
     If (