In [1]:
# đặt biến môi trường 
%env SPARK_LOCAL_HOSTNAME=localhost

env: SPARK_LOCAL_HOSTNAME=localhost


In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import scipy
from pyspark.python.pyspark.shell import spark

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.2.0
      /_/

Using Python version 3.9.7 (default, Sep 16 2021 16:59:28)
Spark context Web UI available at http://localhost:4040
Spark context available as 'sc' (master = local[*], app id = local-1642964313358).
SparkSession available as 'spark'.


In [4]:
df = spark.read.load("C:/Users/Admin/Documents/mushrooms.csv", format="csv", header=True, delimiter=",")
df

DataFrame[class: string, cap-shape: string, cap-surface: string, cap-color: string, bruises: string, odor: string, gill-attachment: string, gill-spacing: string, gill-size: string, gill-color: string, stalk-shape: string, stalk-root: string, stalk-surface-above-ring: string, stalk-surface-below-ring: string, stalk-color-above-ring: string, stalk-color-below-ring: string, veil-type: string, veil-color: string, ring-number: string, ring-type: string, spore-print-color: string, population: string, habitat: string]

In [5]:
df.show()

+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+------------------------+------------------------+----------------------+----------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|class|cap-shape|cap-surface|cap-color|bruises|odor|gill-attachment|gill-spacing|gill-size|gill-color|stalk-shape|stalk-root|stalk-surface-above-ring|stalk-surface-below-ring|stalk-color-above-ring|stalk-color-below-ring|veil-type|veil-color|ring-number|ring-type|spore-print-color|population|habitat|
+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+------------------------+------------------------+----------------------+----------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|    p|        x|          s|        n|      t|   p|              f|           c|        n|   

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(df) for column in df.columns ]

stages = indexers
pipeline = Pipeline().setStages(stages)
df_r = pipeline.fit(df).transform(df)

clolumnsToDrop = ['class','cap-shape','cap-surface','cap-color','bruises','odor','gill-attachment','gill-spacing','gill-size','gill-color','stalk-shape','stalk-root','stalk-surface-above-ring','stalk-surface-below-ring','stalk-color-above-ring','stalk-color-below-ring','veil-type','veil-color','ring-number','ring-type','spore-print-color','population','habitat']
df_r = df_r.drop(*clolumnsToDrop)
df_r.show()

+-----------+---------------+-----------------+---------------+-------------+----------+---------------------+------------------+---------------+----------------+-----------------+----------------+------------------------------+------------------------------+----------------------------+----------------------------+---------------+----------------+-----------------+---------------+-----------------------+----------------+-------------+
|class_index|cap-shape_index|cap-surface_index|cap-color_index|bruises_index|odor_index|gill-attachment_index|gill-spacing_index|gill-size_index|gill-color_index|stalk-shape_index|stalk-root_index|stalk-surface-above-ring_index|stalk-surface-below-ring_index|stalk-color-above-ring_index|stalk-color-below-ring_index|veil-type_index|veil-color_index|ring-number_index|ring-type_index|spore-print-color_index|population_index|habitat_index|
+-----------+---------------+-----------------+---------------+-------------+----------+---------------------+----------

In [7]:
assem = VectorAssembler(inputCols=["class_index", "odor_index"], outputCol='features')
df_r = assem.transform(df_r)

In [8]:
labelIndexer = StringIndexer(inputCol="class_index", outputCol="indexedLabel").fit(df_r)

featureIndexer =\
                VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(df_r)

In [9]:
(trainingData, testData) = df_r.randomSplit([0.8, 0.2])

In [10]:
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")


pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])


model = pipeline.fit(trainingData)

predictions = model.transform(testData)

predictions.select("prediction", "indexedLabel", "features").show(5)

evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(predictions)

y_true = df_r.select("gill-color_index").rdd.flatMap(lambda x: x).collect()
y_pred = df_r.select("cap-color_index").rdd.flatMap(lambda x: x).collect()

confusionmatrix = confusion_matrix(y_true, y_pred)

precision = precision_score(y_true, y_pred, average='micro')

recall = recall_score(y_true, y_pred, average='micro')

treeModel = model.stages[2]

print(treeModel)
print("Decision Tree - Test Accuracy = %g" % (accuracy))
print("Decision Tree - Test Error = %g" % (1.0 - accuracy))

print("The Confusion Matrix for Decision Tree Model is :\n" + str(confusionmatrix))

print("The precision score for Decision Tree Model is: " + str(precision))

print("The recall score for Decision Tree Model is: " + str(recall))

+----------+------------+---------+
|prediction|indexedLabel| features|
+----------+------------+---------+
|       0.0|         0.0|(2,[],[])|
|       0.0|         0.0|(2,[],[])|
|       0.0|         0.0|(2,[],[])|
|       0.0|         0.0|(2,[],[])|
|       0.0|         0.0|(2,[],[])|
+----------+------------+---------+
only showing top 5 rows

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_e37a3b2257eb, depth=1, numNodes=3, numClasses=2, numFeatures=2
Decision Tree - Test Accuracy = 1
Decision Tree - Test Error = 0
The Confusion Matrix for Decision Tree Model is :
[[864   0 864   0   0   0   0   0   0   0   0   0]
 [284 532 144 264 212  32  16   0   4   4   0   0]
 [302 232 174 132 212  64  40  38   4   4   0   0]
 [348 236 144 112 192   0  16   0   0   0   0   0]
 [ 12 292   0 280 136   8  24   0   0   0   0   0]
 [ 64 312   0 216 100  32   0   0   4   4   0   0]
 [144 160 144   0  20   0  16   0   4   4   0   0]
 [108  76   0  64 160   0   0   0   0   0   0   0]
 [ 24