In [0]:
# Echo the pre-defined `sc` object so the cell output confirms it exists in this session.
# NOTE(review): presumably the SparkContext injected by the notebook runtime — confirm.
sc

In [0]:
from sklearn.datasets import load_iris
import pandas as pd

# Load the iris dataset and build a pandas frame that already carries its
# final column names, with the class target attached as the `label` column.
iris = load_iris()
iris_pandas_df = pd.DataFrame(
    iris.data,
    columns=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid'],
).assign(label=iris.target)

# Hand the pandas frame to Spark and render it.
df = spark.createDataFrame(iris_pandas_df)
display(df)

sepal_len,sepal_wid,petal_len,petal_wid,label
5.1,3.5,1.4,0.2,0
4.9,3.0,1.4,0.2,0
4.7,3.2,1.3,0.2,0
4.6,3.1,1.5,0.2,0
5.0,3.6,1.4,0.2,0
5.4,3.9,1.7,0.4,0
4.6,3.4,1.4,0.3,0
5.0,3.4,1.5,0.2,0
4.4,2.9,1.4,0.2,0
4.9,3.1,1.5,0.1,0


In [0]:
from pyspark.ml.feature import VectorAssembler

# Pick the feature columns by name instead of by position (`df.columns[:-1]`).
# Because this cell reassigns `df`, a positional slice on a second run would
# include `label` (target leakage) and skip nothing useful; excluding both
# `label` and the assembled `features` column keeps the input set stable.
feature_cols = [c for c in df.columns if c not in ("label", "features")]

assembler = VectorAssembler(
    inputCols = feature_cols,
    outputCol = "features"
)

# Drop a `features` column left over from a previous run of this cell so the
# assembler's output column never collides with an existing one; dropping a
# column that is not present leaves the frame unchanged, so the first run is
# unaffected and the cell can be re-executed safely.
df = assembler.transform(df.drop("features"))

# Peek at the first assembled feature vector.
df.select("features").head()

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier

# Seeded 70/30 split of the assembled frame into training and test sets.
df_train, df_test = df.randomSplit(weights=[0.7, 0.3], seed=42)

# Decision tree that reads the assembled vector column and the class label.
classifier = DecisionTreeClassifier(featuresCol="features", labelCol="label")

# Fit on the training split, then score the test split; from here on
# `df_test` also carries the columns added by the fitted model.
model = classifier.fit(df_train)
df_test = model.transform(df_test)

display(df_test)

sepal_len,sepal_wid,petal_len,petal_wid,label,features,rawPrediction,probability,prediction
4.6,3.1,1.5,0.2,0,"List(1, 4, List(), List(4.6, 3.1, 1.5, 0.2))","List(1, 3, List(), List(28.0, 0.0, 0.0))","List(1, 3, List(), List(1.0, 0.0, 0.0))",0.0
4.8,3.4,1.6,0.2,0,"List(1, 4, List(), List(4.8, 3.4, 1.6, 0.2))","List(1, 3, List(), List(28.0, 0.0, 0.0))","List(1, 3, List(), List(1.0, 0.0, 0.0))",0.0
4.9,3.1,1.5,0.1,0,"List(1, 4, List(), List(4.9, 3.1, 1.5, 0.1))","List(1, 3, List(), List(28.0, 0.0, 0.0))","List(1, 3, List(), List(1.0, 0.0, 0.0))",0.0
5.0,3.4,1.5,0.2,0,"List(1, 4, List(), List(5.0, 3.4, 1.5, 0.2))","List(1, 3, List(), List(28.0, 0.0, 0.0))","List(1, 3, List(), List(1.0, 0.0, 0.0))",0.0
5.4,3.7,1.5,0.2,0,"List(1, 4, List(), List(5.4, 3.7, 1.5, 0.2))","List(1, 3, List(), List(28.0, 0.0, 0.0))","List(1, 3, List(), List(1.0, 0.0, 0.0))",0.0
5.4,3.9,1.3,0.4,0,"List(1, 4, List(), List(5.4, 3.9, 1.3, 0.4))","List(1, 3, List(), List(28.0, 0.0, 0.0))","List(1, 3, List(), List(1.0, 0.0, 0.0))",0.0
5.4,3.9,1.7,0.4,0,"List(1, 4, List(), List(5.4, 3.9, 1.7, 0.4))","List(1, 3, List(), List(28.0, 0.0, 0.0))","List(1, 3, List(), List(1.0, 0.0, 0.0))",0.0
4.6,3.6,1.0,0.2,0,"List(1, 4, List(), List(4.6, 3.6, 1.0, 0.2))","List(1, 3, List(), List(28.0, 0.0, 0.0))","List(1, 3, List(), List(1.0, 0.0, 0.0))",0.0
5.0,3.0,1.6,0.2,0,"List(1, 4, List(), List(5.0, 3.0, 1.6, 0.2))","List(1, 3, List(), List(28.0, 0.0, 0.0))","List(1, 3, List(), List(1.0, 0.0, 0.0))",0.0
5.0,3.2,1.2,0.2,0,"List(1, 4, List(), List(5.0, 3.2, 1.2, 0.2))","List(1, 3, List(), List(28.0, 0.0, 0.0))","List(1, 3, List(), List(1.0, 0.0, 0.0))",0.0


In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy of the `prediction` column against `label` on the scored test split.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    predictionCol='prediction',
    labelCol='label',
)

evaluator.evaluate(df_test)

In [0]:

from sklearn.metrics import classification_report

# Fetch labels and predictions together in a single Spark action. Two separate
# collect() calls each re-execute the plan, and the row order of independent
# actions is not guaranteed to match, which could pair a label with the wrong
# prediction.
scored_pdf = df_test.select('label', 'prediction').toPandas()

# Pass plain 1-D columns (not lists of single-field Row objects) to sklearn.
# The prediction column is a float (0.0, 1.0, ...); cast it to int so the
# predicted classes use the same type as the labels in the report.
y_true = scored_pdf['label']
y_pred = scored_pdf['prediction'].astype(int)

print(classification_report(y_true, y_pred))