In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from pyspark.python.pyspark.shell import spark

import scipy
import os
import numpy as np

from pyspark.ml.linalg import SparseVector
from pyspark.sql import SparkSession

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.1.3
      /_/

Using Python version 3.8.8 (default, Apr 13 2021 15:08:03)
Spark context Web UI available at http://LAPTOP-1IU0R79J:4040
Spark context available as 'sc' (master = local[*], app id = local-1651910685912).
SparkSession available as 'spark'.


In [3]:
import pandas as pd
pd.read_csv("Absenteeism_at_work.csv", header=0, delimiter=";").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740 entries, 0 to 739
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ID                               740 non-null    int64  
 1   Reason for absence               740 non-null    int64  
 2   Month of absence                 740 non-null    int64  
 3   Day of the week                  740 non-null    int64  
 4   Seasons                          740 non-null    int64  
 5   Transportation expense           740 non-null    int64  
 6   Distance from Residence to Work  740 non-null    int64  
 7   Service time                     740 non-null    int64  
 8   Age                              740 non-null    int64  
 9   Work load Average/day            740 non-null    float64
 10  Hit target                       740 non-null    int64  
 11  Disciplinary failure             740 non-null    int64  
 12  Education             

In [4]:
# Đọc dữ liệu và tạo ra các cột mới
data_df = spark.read.load("Absenteeism_at_work.csv", format="csv", header=True, delimiter=";",inferSchema =True)
data_df = data_df.withColumnRenamed('Absenteeism time in hours','label')
data_df.limit(5).toPandas()

Unnamed: 0,ID,Reason for absence,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,...,Disciplinary failure,Education,Son,Social drinker,Social smoker,Pet,Weight,Height,Body mass index,label
0,11,26,7,3,1,289,36,13,33,239.554,...,0,1,2,1,0,1,90,172,30,4
1,36,0,7,3,1,118,13,18,50,239.554,...,1,1,1,1,0,0,98,178,31,0
2,3,23,7,4,1,179,51,18,38,239.554,...,0,1,0,1,0,0,89,170,31,2
3,7,7,7,5,1,279,5,14,39,239.554,...,0,1,2,1,1,0,68,168,24,4
4,11,23,7,5,1,289,36,13,33,239.554,...,0,1,2,1,0,1,90,172,30,2


In [5]:
data_df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Reason for absence: integer (nullable = true)
 |-- Month of absence: integer (nullable = true)
 |-- Day of the week: integer (nullable = true)
 |-- Seasons: integer (nullable = true)
 |-- Transportation expense: integer (nullable = true)
 |-- Distance from Residence to Work: integer (nullable = true)
 |-- Service time: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Work load Average/day : double (nullable = true)
 |-- Hit target: integer (nullable = true)
 |-- Disciplinary failure: integer (nullable = true)
 |-- Education: integer (nullable = true)
 |-- Son: integer (nullable = true)
 |-- Social drinker: integer (nullable = true)
 |-- Social smoker: integer (nullable = true)
 |-- Pet: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Height: integer (nullable = true)
 |-- Body mass index: integer (nullable = true)
 |-- label: integer (nullable = true)



In [6]:
# Tạo cột vector đặc trưng từ các cột không phải label
assem = VectorAssembler(inputCols=data_df.columns[:-1], outputCol='features')
data = assem.transform(data_df)

# Đánh chỉ mục cho cột label, sau đó fit trên toàn bộ dữ liệu
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

# Chia tập dữ liệu train test với tỉ lệ 70-30
train, test = data.randomSplit([0.7, 0.3],seed=1234)

# Decision Tree

In [7]:
# Thiết lập mô hình cây quyết định 
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="features")

# Thiết lập Pipeline với các bước labelIndexer và dt
pipeline = Pipeline(stages=[labelIndexer, dt])

# Fit model với tập train
model = pipeline.fit(train)

# Thực hiện dự đoán
predictions = model.transform(test)

# In ra các dòng kết quả dự đoán
predictions.select("prediction", "indexedLabel", "features").show(5)

# Tính độ chính xác của mô hình
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(predictions)
treeModel = model.stages[1]

# In ra các kết quả
print(treeModel)
print("Decision Tree - Test Accuracy = %g" % (accuracy))
print("Decision Tree - Test Error = %g" % (1.0 - accuracy))

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       5.0|         5.0|[1.0,0.0,3.0,5.0,...|
|       0.0|         0.0|[1.0,1.0,5.0,2.0,...|
|       0.0|         3.0|[1.0,13.0,1.0,3.0...|
|       0.0|         4.0|[1.0,26.0,10.0,2....|
|       5.0|         5.0|[2.0,0.0,6.0,2.0,...|
+----------+------------+--------------------+
only showing top 5 rows

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_5065259f7c45, depth=5, numNodes=41, numClasses=19, numFeatures=20
Decision Tree - Test Accuracy = 0.418182
Decision Tree - Test Error = 0.581818


# Naive Bayes

In [8]:
# Khởi tạo mô hình Multinomial Naive Bayes
nb = NaiveBayes(featuresCol='features', labelCol='indexedLabel',smoothing=1.0, modelType="multinomial")

# Thiết lập pipeline gồm labelIndexer và Multinomial Naive Bayes
pipeline = Pipeline(stages=[labelIndexer, nb])

# Fit mô hình với tập train
model = pipeline.fit(train)

# Thực hiện dự đoán
predictions = model.transform(test)

# In ra các dòng kết quả dự đoán
predictions.select("prediction",'indexedLabel', "features").show(5)

# Tính toán độ chính xác của mô hình
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
nbModel = model.stages[1]

# In ra các kết quả
print(nbModel)  
print("Naive Bayes - Test set accuracy = " + str(accuracy))
print("Naive Bayes - Test Error = " + str(1.0 - accuracy))

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       5.0|         5.0|[1.0,0.0,3.0,5.0,...|
|       5.0|         0.0|[1.0,1.0,5.0,2.0,...|
|       6.0|         3.0|[1.0,13.0,1.0,3.0...|
|       0.0|         4.0|[1.0,26.0,10.0,2....|
|       5.0|         5.0|[2.0,0.0,6.0,2.0,...|
+----------+------------+--------------------+
only showing top 5 rows

NaiveBayesModel: uid=NaiveBayes_80bce635b28c, modelType=multinomial, numClasses=17, numFeatures=20
Naive Bayes - Test set accuracy = 0.2909090909090909
Naive Bayes - Test Error = 0.7090909090909091


# RandomForest

In [9]:
# Khởi tạo mô hình RandomForrest
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=10)

# Thiết lập pipeline gồm labelIndexer và RandomForest
pipeline = Pipeline(stages=[labelIndexer, rf])

# Fit mô hình với tập train
model = pipeline.fit(train)

# Thực hiện dự đoán
predictions = model.transform(test)

# In ra các dòng kết quả
predictions.select("prediction", "indexedLabel", "features").show(5)

# Tính độ chính xác của mô hình
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(predictions)
rfModel = model.stages[1]

# In ra các kết quả
print(rfModel)  
print("Random Forest - Test Accuracy = %g" % (accuracy))
print("Random Forest - Test Error = %g" % (1.0 - accuracy))

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       5.0|         5.0|[1.0,0.0,3.0,5.0,...|
|       0.0|         0.0|[1.0,1.0,5.0,2.0,...|
|       0.0|         3.0|[1.0,13.0,1.0,3.0...|
|       1.0|         4.0|[1.0,26.0,10.0,2....|
|       5.0|         5.0|[2.0,0.0,6.0,2.0,...|
+----------+------------+--------------------+
only showing top 5 rows

RandomForestClassificationModel: uid=RandomForestClassifier_9d15b7aec520, numTrees=10, numClasses=19, numFeatures=20
Random Forest - Test Accuracy = 0.481818
Random Forest - Test Error = 0.518182
