# In [7]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession

# Train and evaluate a decision-tree classifier on the iris CSV.
#
# Steps: read "iris.csv", index the string "species" column into a numeric
# "label", assemble the four measurement columns into a "features" vector,
# split 80/20, fit a DecisionTreeClassifier on the training part, and print
# the accuracy (as a percentage) measured on the held-out part.

# Create a Spark session (reuses an already-running one if present).
spark = SparkSession.builder.appName("IrisClassification").getOrCreate()

# Everything that uses the session runs inside try/finally so the session is
# stopped even when loading, fitting, or evaluation raises (e.g. the CSV is
# missing or a column name does not match).
try:
    # Load the iris dataset; the first row is the header and column types
    # are inferred from the data.
    irisData = spark.read.csv("iris.csv", header=True, inferSchema=True)

    # Convert the string "species" column to a numeric "label" column.
    labelIndexer = StringIndexer(inputCol="species", outputCol="label")
    indexedData = labelIndexer.fit(irisData).transform(irisData)

    # Pack the four measurement columns into a single "features" vector.
    assembler = VectorAssembler(
        inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
        outputCol="features"
    )
    assembledData = assembler.transform(indexedData)

    # Split into training and testing sets with weights 0.8 / 0.2; the fixed
    # seed is there so repeated runs use the same random draw.
    (trainData, testData) = assembledData.randomSplit([0.8, 0.2], seed=42)

    # Initialize the decision tree classifier with default hyperparameters.
    dtClassifier = DecisionTreeClassifier(labelCol="label", featuresCol="features")

    # Train the model on the training set.
    model = dtClassifier.fit(trainData)

    # Make predictions on the testing set (adds a "prediction" column).
    predictions = model.transform(testData)

    # Evaluate accuracy on the held-out rows and scale it to a percentage.
    # NOTE(review): this is a single split of what is presumably a small
    # dataset, so the figure can vary a lot with the seed — confirm with
    # cross-validation before relying on it.
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions) * 100

    # Display the accuracy.
    print("Accuracy: {:.2f}%".format(accuracy))
finally:
    # Stop the Spark session on both the success and the failure path.
    spark.stop()


# Output:
# Accuracy: 100.00%
