In [8]:
import pandas as pd
import pyspark
import pyspark.pandas as ps
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import SparkSession
from sklearn.datasets import load_iris

# Start a Spark session named "Iris", or reuse one that is already running
# in this kernel (getOrCreate never builds a second session).
spark = SparkSession.builder.appName("Iris").getOrCreate()
# `sc` is the SparkContext owned by the session above, so both handles refer
# to the same running application.
sc = spark.sparkContext
# Bare expression: the notebook displays the Spark version string.
spark.version

'3.3.0'

In [18]:
# Load Iris as pandas objects: X holds the measurement columns, y the class ids.
X, y = load_iris(return_X_y=True, as_frame=True)
# Build the combined frame with assign() so X itself is left untouched;
# `target` is appended as the last column.
iris_pdf = X.assign(target=y)
# Convert the pandas frame into a Spark DataFrame; `df` is the Spark-side
# copy that the following cells operate on.
df = spark.createDataFrame(iris_pdf)
df.show(10)

+-----------------+----------------+-----------------+----------------+------+
|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target|
+-----------------+----------------+-----------------+----------------+------+
|              5.1|             3.5|              1.4|             0.2|     0|
|              4.9|             3.0|              1.4|             0.2|     0|
|              4.7|             3.2|              1.3|             0.2|     0|
|              4.6|             3.1|              1.5|             0.2|     0|
|              5.0|             3.6|              1.4|             0.2|     0|
|              5.4|             3.9|              1.7|             0.4|     0|
|              4.6|             3.4|              1.4|             0.3|     0|
|              5.0|             3.4|              1.5|             0.2|     0|
|              4.4|             2.9|              1.4|             0.2|     0|
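|              4.9|             3.1|              1.5|             0.1|     0|
+-----------------+----------------+-----------------+----------------+------+
only showing top 10 rows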

In [22]:
# Vectorize all numerical columns into a single feature column.
# Every column except the label source (`target`) is a feature; selecting by
# name keeps this correct even if the column order of `df` changes.
feature_cols = [name for name in df.columns if name != 'target']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
# transform() returns a new DataFrame with the `features` vector column added.
data = assembler.transform(df)
data.show(5)

+-----------------+----------------+-----------------+----------------+------+-----------------+
|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target|         features|
+-----------------+----------------+-----------------+----------------+------+-----------------+
|              5.1|             3.5|              1.4|             0.2|     0|[5.1,3.5,1.4,0.2]|
|              4.9|             3.0|              1.4|             0.2|     0|[4.9,3.0,1.4,0.2]|
|              4.7|             3.2|              1.3|             0.2|     0|[4.7,3.2,1.3,0.2]|
|              4.6|             3.1|              1.5|             0.2|     0|[4.6,3.1,1.5,0.2]|
|              5.0|             3.6|              1.4|             0.2|     0|[5.0,3.6,1.4,0.2]|
+-----------------+----------------+-----------------+----------------+------+-----------------+
only showing top 5 rows



In [23]:
# Keep only the assembled vector and the class id, then derive a `label` column.
# `target` already holds integer class ids; StringIndexer maps each distinct
# value to a double-valued index stored in the new `label` column.
data = data.select(['features', 'target'])
# fit() learns the value -> index mapping; the fitted indexer is kept in
# `label_indexer` so the mapping stays available after the transform.
label_indexer = StringIndexer(inputCol='target', outputCol='label').fit(data)
data = label_indexer.transform(data)
data.show(5)


+-----------------+------+-----+
|         features|target|label|
+-----------------+------+-----+
|[5.1,3.5,1.4,0.2]|     0|  0.0|
|[4.9,3.0,1.4,0.2]|     0|  0.0|
|[4.7,3.2,1.3,0.2]|     0|  0.0|
|[4.6,3.1,1.5,0.2]|     0|  0.0|
|[5.0,3.6,1.4,0.2]|     0|  0.0|
+-----------------+------+-----+
only showing top 5 rows



In [27]:
# Only select the features and label columns; the raw `target` column is
# dropped from `data` at this point.
data = data.select(['features', 'label'])
print("Ready for machine learning")
data.show(5)

Reading for machine learning
+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|  0.0|
|[4.9,3.0,1.4,0.2]|  0.0|
|[4.7,3.2,1.3,0.2]|  0.0|
|[4.6,3.1,1.5,0.2]|  0.0|
|[5.0,3.6,1.4,0.2]|  0.0|
+-----------------+-----+
only showing top 5 rows



In [28]:
# 70/30 train/test split. randomSplit samples randomly, so the seed is pinned
# to make the split (and every metric computed from it) repeatable when the
# notebook is re-run on the same input data.
train, test = data.randomSplit([0.70, 0.30], seed=42)

In [37]:
# Multinomial logistic regression with an elastic-net penalty.
# NOTE(review): regParam=0.3 with elasticNetParam=0.8 is a strong, mostly-L1
# penalty and maxIter=10 is a small iteration budget -- presumably the reason
# for the low accuracy reported at the end of the notebook; confirm whether
# these values are intended.
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")

# The unfitted estimator (`lr`) is kept separate from the fitted model;
# `pipe` is the fitted model that the later cells call transform() on.
pipe = lr.fit(train)

In [42]:
# Score the held-out rows with the fitted model; the result keeps the input
# columns and adds the model's output columns.
prediction = pipe.transform(dataset=test)
# Bare expression so the notebook displays the resulting DataFrame's schema.
prediction

DataFrame[features: vector, label: double, rawPrediction: vector, probability: vector, prediction: double]

In [41]:
# Accuracy on the test predictions: the fraction of rows whose predicted class
# equals the true label. The column names are spelled out so the evaluator's
# inputs are visible here rather than implied.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
accuracy = evaluator.evaluate(prediction)
# Bare expression so the notebook displays the metric value.
accuracy

0.5813953488372093