In [None]:
import pandas as pd
import seaborn as sns
from pyspark.sql import SparkSession

# Column names used throughout the notebook: two petal measurements as
# features and the species name as the classification target.
X_labels = ['petal_length', 'petal_width']
Y_label = 'species'

# Load the iris dataset via seaborn and discard the sepal measurements.
iris = sns.load_dataset('iris')
df = iris.drop(columns=['sepal_length', 'sepal_width'])

# Create (or reuse) the SparkSession for this notebook.
spark = (
    SparkSession.builder
    .appName("RandomForest-Iris")
    .getOrCreate()
)

# Convert the pandas frame into a Spark DataFrame.
sdf = spark.createDataFrame(df)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

# Convert the species strings into numeric label indices. The indexer is
# fitted on `sdf` here, so the pipeline stage is an already-fitted model.
labelIndexer = StringIndexer(inputCol=Y_label, outputCol='label').fit(sdf)
# Assemble the feature columns into a single vector column named 'features'.
featureAssembler = VectorAssembler(inputCols=X_labels, outputCol='features')
# Random forest classifier. The seed is fixed explicitly so that the trained
# forest (and every metric computed from it) is reproducible across kernel
# restarts instead of depending on the library's default seed.
classifier = RandomForestClassifier(
    labelCol='label',
    featuresCol='features',
    seed=42,
)
# Chain the three stages: label indexing -> feature assembly -> classifier.
pipeline = Pipeline(stages=[labelIndexer, featureAssembler, classifier])

In [None]:
# Fit the whole pipeline on `sdf`, then score the very same rows.
# NOTE(review): fitting and prediction both use `sdf`, so anything computed
# from `predictions` reflects training-set performance, not held-out data.
model = pipeline.fit(dataset=sdf)
predictions = model.transform(dataset=sdf)
# Preview the first five prediction rows.
predictions.show(n=5)

In [None]:
# Collect every column of the Spark predictions into a pandas DataFrame so
# the label/prediction columns can be passed to scikit-learn metrics.
df = predictions.toPandas()
df

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
# Print accuracy together with macro-averaged precision and recall.
# Macro averaging is used instead of micro: for a single-label multiclass
# problem, micro precision and micro recall are both equal to accuracy, so
# micro averaging would print the same number three times.
print('正確度={:.3f}, 適合度={:.3f}, 再現率={:.3f}'.format(
    accuracy_score(df.label, df.prediction),
    precision_score(df.label, df.prediction, average='macro'),
    recall_score(df.label, df.prediction, average='macro')))

In [None]:
# Per-class precision: average=None yields one score per label instead of a
# single aggregated value.
precision_score(y_true=df['label'], y_pred=df['prediction'], average=None)

In [None]:
# Per-class recall: average=None yields one score per label instead of a
# single aggregated value.
recall_score(y_true=df['label'], y_pred=df['prediction'], average=None)