In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=1d2e66502042aef1da670d1888eb8750097e266df4ffa7c23a8923ec50bca80f
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
import pandas as pd

import pyspark

from pyspark.ml.classification import DecisionTreeClassifier

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline



from pyspark.ml.feature import StringIndexer, VectorIndexer, StringIndexerModel, IndexToString
from pyspark.sql import SparkSession
# Get the Spark session for this notebook, creating one if none exists yet.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [5]:
# Read the training and test CSV files; the first row is used as the header
# and column types are inferred from the data.
train = (
    spark.read.format("csv")
    .options(header="True", inferSchema="True")
    .load("train.csv")
)
test = (
    spark.read.format("csv")
    .options(header="True", inferSchema="True")
    .load("test.csv")
)

In [6]:
# Peek at the first five training rows.
train.show(n=5)

+------+------------+-----------------+---------------+--------------+----------+------+-------------+
|car_id|buying_price|maintenance_price|number_of_doors|carry_capacity|trunk_size|safety|acceptability|
+------+------------+-----------------+---------------+--------------+----------+------+-------------+
| 10001|         low|             high|              4|             4|       med|   med|          acc|
| 10002|         med|              med|              2|             4|     small|   med|          acc|
| 10003|         low|            vhigh|              4|             2|       med|   low|        unacc|
| 10004|        high|             high|              2|          more|     small|   med|        unacc|
| 10005|         low|             high|              3|             2|       med|   low|        unacc|
+------+------------+-----------------+---------------+--------------+----------+------+-------------+
only showing top 5 rows



In [7]:
# Indexers that turn string columns into numeric indices:
# the target column "acceptability" becomes "label",
# and the "buying_price" column becomes "indexedBuyPrice".
labelIndexer = StringIndexer(
    inputCol="acceptability",
    outputCol="label",
)
buyIndexer = StringIndexer(
    inputCol="buying_price",
    outputCol="indexedBuyPrice",
)

In [15]:
# Fit the label indexer on the training data and append the numeric "label" column.
indexedLabelTrain = (
    labelIndexer
    .fit(train)
    .transform(train)
)

In [16]:
# Check the first five rows, now carrying the "label" column.
indexedLabelTrain.show(n=5)

+------+------------+-----------------+---------------+--------------+----------+------+-------------+-----+
|car_id|buying_price|maintenance_price|number_of_doors|carry_capacity|trunk_size|safety|acceptability|label|
+------+------------+-----------------+---------------+--------------+----------+------+-------------+-----+
| 10001|         low|             high|              4|             4|       med|   med|          acc|  1.0|
| 10002|         med|              med|              2|             4|     small|   med|          acc|  1.0|
| 10003|         low|            vhigh|              4|             2|       med|   low|        unacc|  0.0|
| 10004|        high|             high|              2|          more|     small|   med|        unacc|  0.0|
| 10005|         low|             high|              3|             2|       med|   low|        unacc|  0.0|
+------+------------+-----------------+---------------+--------------+----------+------+-------------+-----+
only showing top 5 

In [17]:
# Fit the buying-price indexer on the label-indexed training data and
# append the "indexedBuyPrice" column.
df1 = (
    buyIndexer
    .fit(indexedLabelTrain)
    .transform(indexedLabelTrain)
)
df1.show(n=2)

+------+------------+-----------------+---------------+--------------+----------+------+-------------+-----+---------------+
|car_id|buying_price|maintenance_price|number_of_doors|carry_capacity|trunk_size|safety|acceptability|label|indexedBuyPrice|
+------+------------+-----------------+---------------+--------------+----------+------+-------------+-----+---------------+
| 10001|         low|             high|              4|             4|       med|   med|          acc|  1.0|            2.0|
| 10002|         med|              med|              2|             4|     small|   med|          acc|  1.0|            1.0|
+------+------------+-----------------+---------------+--------------+----------+------+-------------+-----+---------------+
only showing top 2 rows



In [18]:
# Anything used for computation must first be converted to vector form,
# so pack the input column(s) into a single "features" vector column.
# NOTE(review): only "indexedBuyPrice" is assembled as a feature here — confirm that is intended.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=["indexedBuyPrice"],
)
df2 = assembler.transform(df1)
df2.show(n=2)

+------+------------+-----------------+---------------+--------------+----------+------+-------------+-----+---------------+--------+
|car_id|buying_price|maintenance_price|number_of_doors|carry_capacity|trunk_size|safety|acceptability|label|indexedBuyPrice|features|
+------+------------+-----------------+---------------+--------------+----------+------+-------------+-----+---------------+--------+
| 10001|         low|             high|              4|             4|       med|   med|          acc|  1.0|            2.0|   [2.0]|
| 10002|         med|              med|              2|             4|     small|   med|          acc|  1.0|            1.0|   [1.0]|
+------+------------+-----------------+---------------+--------------+----------+------+-------------+-----+---------------+--------+
only showing top 2 rows



In [19]:
# Choose the model, then run it: fit a decision tree on the assembled
# training frame and show its predictions on that same frame.
decission_tree_classifier_model2 = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    maxDepth=10,
)
(
    decission_tree_classifier_model2
    .fit(df2)
    .transform(df2)
    .show(n=5)
)

+------+------------+-----------------+---------------+--------------+----------+------+-------------+-----+---------------+--------+--------------------+--------------------+----------+
|car_id|buying_price|maintenance_price|number_of_doors|carry_capacity|trunk_size|safety|acceptability|label|indexedBuyPrice|features|       rawPrediction|         probability|prediction|
+------+------------+-----------------+---------------+--------------+----------+------+-------------+-----+---------------+--------+--------------------+--------------------+----------+
| 10001|         low|             high|              4|             4|       med|   med|          acc|  1.0|            2.0|   [2.0]|[1015.0,306.0,56....|[0.71078431372549...|       0.0|
| 10002|         med|              med|              2|             4|     small|   med|          acc|  1.0|            1.0|   [1.0]|[1015.0,306.0,56....|[0.71078431372549...|       0.0|
| 10003|         low|            vhigh|              4|          

In [20]:
# Build the pipeline like this: buying-price indexer -> vector assembler -> decision tree.
pipeline = Pipeline(
    stages=[
        buyIndexer,
        assembler,
        decission_tree_classifier_model2,
    ]
)

In [21]:
# Fit every pipeline stage on the label-indexed training data, then show
# the fitted pipeline's predictions on that same data (first 20 rows).
model = pipeline.fit(indexedLabelTrain)
(
    model
    .transform(indexedLabelTrain)
    .show(n=20)
)

+------+------------+-----------------+---------------+--------------+----------+------+-------------+-----+---------------+--------+--------------------+--------------------+----------+
|car_id|buying_price|maintenance_price|number_of_doors|carry_capacity|trunk_size|safety|acceptability|label|indexedBuyPrice|features|       rawPrediction|         probability|prediction|
+------+------------+-----------------+---------------+--------------+----------+------+-------------+-----+---------------+--------+--------------------+--------------------+----------+
| 10001|         low|             high|              4|             4|       med|   med|          acc|  1.0|            2.0|   [2.0]|[1015.0,306.0,56....|[0.71078431372549...|       0.0|
| 10002|         med|              med|              2|             4|     small|   med|          acc|  1.0|            1.0|   [1.0]|[1015.0,306.0,56....|[0.71078431372549...|       0.0|
| 10003|         low|            vhigh|              4|          

In [22]:
# Build the submission so that it looks like the sample file: car_id + acceptability.

# Score the test set with the pipeline model that was already fitted on the
# training data, instead of fitting the whole pipeline a second time.
testPredictions = model.transform(test).select("car_id", "prediction")
testPredictions.show()

# Take the index -> label mapping from the fitted label indexer rather than a
# hand-written list: StringIndexer assigns indices by label frequency by default,
# so a hard-coded order silently mislabels predictions whenever the class
# frequencies in the training data differ from what was assumed.
labelsArray = list(labelIndexer.fit(train).labelsArray[0])
testSolutions = IndexToString(
    inputCol="prediction",
    outputCol="acceptability",
    labels=labelsArray,
).transform(testPredictions)
testSolutions.show()

# Keep only the columns required in the output and write them to CSV
# (collected to the driver through pandas).
solutions = testSolutions.select("car_id", "acceptability")
solutions.show()
solutions.toPandas().to_csv("dumpsolutions.csv", header=True, index=False)

+------+----------+
|car_id|prediction|
+------+----------+
| 11429|       0.0|
| 11430|       0.0|
| 11431|       0.0|
| 11432|       0.0|
| 11433|       0.0|
| 11434|       0.0|
| 11435|       0.0|
| 11436|       0.0|
| 11437|       0.0|
| 11438|       0.0|
| 11439|       0.0|
| 11440|       0.0|
| 11441|       0.0|
| 11442|       0.0|
| 11443|       0.0|
| 11444|       0.0|
| 11445|       0.0|
| 11446|       0.0|
| 11447|       0.0|
| 11448|       0.0|
+------+----------+
only showing top 20 rows

+------+----------+-------------+
|car_id|prediction|acceptability|
+------+----------+-------------+
| 11429|       0.0|        unacc|
| 11430|       0.0|        unacc|
| 11431|       0.0|        unacc|
| 11432|       0.0|        unacc|
| 11433|       0.0|        unacc|
| 11434|       0.0|        unacc|
| 11435|       0.0|        unacc|
| 11436|       0.0|        unacc|
| 11437|       0.0|        unacc|
| 11438|       0.0|        unacc|
| 11439|       0.0|        unacc|
| 11440|       0.0