In [20]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sklearn import datasets

In [3]:
# Obtain the Spark session for this notebook: reuse the active one if it
# already exists, otherwise create a new one with default settings.
spark = SparkSession.builder.getOrCreate()
# Handle on the session's underlying SparkContext.
sc = spark.sparkContext

In [4]:
# Show the session object as the cell output (a quick sanity check that Spark is up).
spark

In [5]:
# Load scikit-learn's bundled iris dataset; `iris.data` holds the feature rows
# and `iris.target` the class labels used when building the DataFrame below.
iris = datasets.load_iris()

In [12]:
# Convert the iris arrays into a Spark DataFrame with one row per sample:
# an integer class label plus a dense ML vector of that sample's features.
df = spark.createDataFrame(
    [
        (int(target), Vectors.dense(measurements))
        for target, measurements in zip(iris.target, iris.data)
    ],
    schema=['label', 'features'],
)

In [13]:
# Hold out roughly 30% of the rows for evaluation. The weights are sampling
# proportions, so the resulting sizes are approximate rather than exact.
# The seed is fixed so that re-running the notebook yields the same split
# (and therefore comparable metrics) instead of a fresh random one each time.
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

In [26]:
# Random forest estimator with 10 trees, reading labels from 'label' and
# feature vectors from 'features'. The seed is pinned explicitly so the
# forest's random sampling is repeatable from run to run.
rf = RandomForestClassifier(
    labelCol='label',
    featuresCol='features',
    numTrees=10,
    seed=42,
)

In [27]:
# Train the random forest on the training split; the result is the fitted model.
model = rf.fit(train_df)

In [28]:
# Apply the fitted model to the held-out split. The returned DataFrame keeps
# the input columns and adds rawPrediction, probability and prediction.
predicts = model.transform(test_df)

In [29]:
# Print the leading rows of the predictions for a quick visual check.
predicts.show()

+-----+-----------------+---------------+-----------------+----------+
|label|         features|  rawPrediction|      probability|prediction|
+-----+-----------------+---------------+-----------------+----------+
|    0|[4.7,3.2,1.6,0.2]| [10.0,0.0,0.0]|    [1.0,0.0,0.0]|       0.0|
|    0|[4.8,3.0,1.4,0.1]| [10.0,0.0,0.0]|    [1.0,0.0,0.0]|       0.0|
|    0|[5.0,3.0,1.6,0.2]| [10.0,0.0,0.0]|    [1.0,0.0,0.0]|       0.0|
|    0|[5.0,3.4,1.6,0.4]| [10.0,0.0,0.0]|    [1.0,0.0,0.0]|       0.0|
|    0|[5.1,3.5,1.4,0.2]| [10.0,0.0,0.0]|    [1.0,0.0,0.0]|       0.0|
|    0|[5.1,3.7,1.5,0.4]| [10.0,0.0,0.0]|    [1.0,0.0,0.0]|       0.0|
|    0|[5.1,3.8,1.5,0.3]| [10.0,0.0,0.0]|    [1.0,0.0,0.0]|       0.0|
|    0|[5.2,3.4,1.4,0.2]| [10.0,0.0,0.0]|    [1.0,0.0,0.0]|       0.0|
|    0|[5.2,3.5,1.5,0.2]| [10.0,0.0,0.0]|    [1.0,0.0,0.0]|       0.0|
|    0|[5.2,4.1,1.5,0.1]| [10.0,0.0,0.0]|    [1.0,0.0,0.0]|       0.0|
|    0|[5.7,3.8,1.7,0.3]|  [8.0,2.0,0.0]|    [0.8,0.2,0.0]|       0.0|
|    1

In [30]:
# Evaluator comparing the 'prediction' column against the true 'label'.
# metricName is set explicitly: the evaluator's default metric is F1, but the
# notebook stores and reports the result as accuracy, so request accuracy.
# (Variable name kept as-is because the evaluation cell refers to it.)
evaluater = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)

In [33]:
# Score the test-set predictions with the evaluator defined earlier.
# NOTE(review): evaluate() returns whichever metric the evaluator's metricName
# selects -- confirm it is 'accuracy' so that this variable name is truthful.
accuracy = evaluater.evaluate(predicts)
# Bare expression so the score is rendered as the cell output.
accuracy

1.0