In [1]:
# đặt biến môi trường 
%env SPARK_LOCAL_HOSTNAME=localhost
#khởi động spark
import findspark
findspark.init()

env: SPARK_LOCAL_HOSTNAME=localhost


In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import scipy
from pyspark.python.pyspark.shell import spark

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.2.0
      /_/

Using Python version 3.9.7 (default, Sep 16 2021 16:59:28)
Spark context Web UI available at http://localhost:4042
Spark context available as 'sc' (master = local[*], app id = local-1639752180621).
SparkSession available as 'spark'.


In [5]:
data = spark.read.load("C:/Users/Admin/Documents/Absenteeism_at_work.csv", format="csv", header=True, delimiter=";")
data = data.withColumn("MOA", data["Month of absence"] - 0).withColumn("label", data['Height'] - 0). \
    withColumn("ROA", data["Reason for absence"] - 0). \
    withColumn("distance", data["Distance from Residence to Work"] - 0). \
    withColumn("BMI", data["Body mass index"] - 0)
#data.show()

assem = VectorAssembler(inputCols=["label", "distance"], outputCol='features')
data = assem.transform(data)


labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

featureIndexer =\VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)


(trainingData, testData) = data.randomSplit([0.7, 0.3])


dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")


pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])


model = pipeline.fit(trainingData)

predictions = model.transform(testData)

predictions.select("prediction", "indexedLabel", "features").show(5)

evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(predictions)

y_true = data.select("BMI").rdd.flatMap(lambda x: x).collect()
y_pred = data.select("ROA").rdd.flatMap(lambda x: x).collect()

confusionmatrix = confusion_matrix(y_true, y_pred)

precision = precision_score(y_true, y_pred, average='micro')

recall = recall_score(y_true, y_pred, average='micro')

treeModel = model.stages[2]

print(treeModel)
print("Decision Tree - Test Accuracy = %g" % (accuracy))
print("Decision Tree - Test Error = %g" % (1.0 - accuracy))

print("The Confusion Matrix for Decision Tree Model is :\n" + str(confusionmatrix))

print("The precision score for Decision Tree Model is: " + str(precision))

print("The recall score for Decision Tree Model is: " + str(recall))

+----------+------------+------------+
|prediction|indexedLabel|    features|
+----------+------------+------------+
|       1.0|         1.0|[172.0,11.0]|
|       1.0|         1.0|[172.0,11.0]|
|       1.0|         1.0|[172.0,52.0]|
|       1.0|         1.0|[172.0,52.0]|
|       1.0|         1.0|[172.0,52.0]|
+----------+------------+------------+
only showing top 5 rows

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3033818ce77f, depth=5, numNodes=25, numClasses=14, numFeatures=2
Decision Tree - Test Accuracy = 0.995327
Decision Tree - Test Error = 0.0046729
The Confusion Matrix for Decision Tree Model is :
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [2 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [5 0 0 ... 0 0 0]]
The precision score for Decision Tree Model is: 0.02972972972972973
The recall score for Decision Tree Model is: 0.02972972972972973
