In [1]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel

In [20]:
indexed = spark.read.csv("playtennis.csv", header = True)

In [3]:
type(df)

pyspark.sql.dataframe.DataFrame

In [21]:
indexed.show()

+--------+-----------+--------+------+----------+
| Outlook|Temperature|Humidity|  Wind|PlayTennis|
+--------+-----------+--------+------+----------+
|   Sunny|        Hot|    High|  Weak|        No|
|   Sunny|        Hot|    High|Strong|        No|
|Overcast|        Hot|    High|  Weak|       Yes|
|    Rain|       Mild|    High|  Weak|       Yes|
|    Rain|       Cool|  Normal|  Weak|       Yes|
|    Rain|       Cool|  Normal|Strong|        No|
|Overcast|       Cool|  Normal|Strong|       Yes|
|   Sunny|       Mild|    High|  Weak|        No|
|   Sunny|       Cool|  Normal|  Weak|       Yes|
|    Rain|       Mild|  Normal|  Weak|       Yes|
|   Sunny|       Mild|  Normal|Strong|       Yes|
|Overcast|       Mild|    High|Strong|       Yes|
|Overcast|        Hot|  Normal|  Weak|       Yes|
|    Rain|       Mild|    High|Strong|        No|
+--------+-----------+--------+------+----------+



In [5]:
from pyspark.sql import functions as F

In [6]:
df.printSchema()

root
 |-- Outlook: string (nullable = true)
 |-- Temperature: string (nullable = true)
 |-- Humidity: string (nullable = true)
 |-- Wind: string (nullable = true)
 |-- PlayTennis: string (nullable = true)



In [46]:
indexed.columns

['Outlook',
 'Temperature',
 'Humidity',
 'Wind',
 'PlayTennis',
 'Outlook_indexed',
 'Temperature_indexed',
 'Humidity_indexed',
 'Wind_indexed',
 'PlayTennis_indexed']

In [23]:

from pyspark.ml.feature import StringIndexer

# 이름을 하나하나 안넣어주고 indexed.columns 해주면 됩니다.
categorical_columns= ['Outlook','Temperature', 'Humidity','Wind', 'PlayTennis']

indexers = [
    StringIndexer(inputCol=c, outputCol="{}_indexed".format(c))
    for c in categorical_columns
]

for indexer in indexers :
    indexed = indexer.fit(indexed).transform(indexed)
    
indexed.show()

+--------+-----------+--------+------+----------+---------------+-------------------+----------------+------------+------------------+
| Outlook|Temperature|Humidity|  Wind|PlayTennis|Outlook_indexed|Temperature_indexed|Humidity_indexed|Wind_indexed|PlayTennis_indexed|
+--------+-----------+--------+------+----------+---------------+-------------------+----------------+------------+------------------+
|   Sunny|        Hot|    High|  Weak|        No|            0.0|                1.0|             0.0|         0.0|               1.0|
|   Sunny|        Hot|    High|Strong|        No|            0.0|                1.0|             0.0|         1.0|               1.0|
|Overcast|        Hot|    High|  Weak|       Yes|            2.0|                1.0|             0.0|         0.0|               0.0|
|    Rain|       Mild|    High|  Weak|       Yes|            1.0|                0.0|             0.0|         0.0|               0.0|
|    Rain|       Cool|  Normal|  Weak|       Yes|      

In [26]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols = ['Outlook_indexed','Temperature_indexed', 'Humidity_indexed','Wind_indexed'], outputCol = 'features')

In [27]:
assemble_indexer = assembler.transform(indexed)
assemble_indexer.show()

+--------+-----------+--------+------+----------+---------------+-------------------+----------------+------------+------------------+-----------------+
| Outlook|Temperature|Humidity|  Wind|PlayTennis|Outlook_indexed|Temperature_indexed|Humidity_indexed|Wind_indexed|PlayTennis_indexed|         features|
+--------+-----------+--------+------+----------+---------------+-------------------+----------------+------------+------------------+-----------------+
|   Sunny|        Hot|    High|  Weak|        No|            0.0|                1.0|             0.0|         0.0|               1.0|    (4,[1],[1.0])|
|   Sunny|        Hot|    High|Strong|        No|            0.0|                1.0|             0.0|         1.0|               1.0|[0.0,1.0,0.0,1.0]|
|Overcast|        Hot|    High|  Weak|       Yes|            2.0|                1.0|             0.0|         0.0|               0.0|[2.0,1.0,0.0,0.0]|
|    Rain|       Mild|    High|  Weak|       Yes|            1.0|                0

In [35]:
df = assemble_indexer['Outlook_indexed','Temperature_indexed', 'Humidity_indexed','Wind_indexed', 'PlayTennis_indexed', 'features']

In [36]:
df.show()

+---------------+-------------------+----------------+------------+------------------+-----------------+
|Outlook_indexed|Temperature_indexed|Humidity_indexed|Wind_indexed|PlayTennis_indexed|         features|
+---------------+-------------------+----------------+------------+------------------+-----------------+
|            0.0|                1.0|             0.0|         0.0|               1.0|    (4,[1],[1.0])|
|            0.0|                1.0|             0.0|         1.0|               1.0|[0.0,1.0,0.0,1.0]|
|            2.0|                1.0|             0.0|         0.0|               0.0|[2.0,1.0,0.0,0.0]|
|            1.0|                0.0|             0.0|         0.0|               0.0|    (4,[0],[1.0])|
|            1.0|                2.0|             1.0|         0.0|               0.0|[1.0,2.0,1.0,0.0]|
|            1.0|                2.0|             1.0|         1.0|               1.0|[1.0,2.0,1.0,1.0]|
|            2.0|                2.0|             1.0| 

In [37]:
(train_data, test_data) = df.randomSplit([0.7, 0.3])

In [38]:
train_data.show()

+---------------+-------------------+----------------+------------+------------------+-----------------+
|Outlook_indexed|Temperature_indexed|Humidity_indexed|Wind_indexed|PlayTennis_indexed|         features|
+---------------+-------------------+----------------+------------+------------------+-----------------+
|            0.0|                0.0|             0.0|         0.0|               1.0|        (4,[],[])|
|            0.0|                0.0|             1.0|         1.0|               0.0|[0.0,0.0,1.0,1.0]|
|            1.0|                0.0|             0.0|         0.0|               0.0|    (4,[0],[1.0])|
|            1.0|                0.0|             0.0|         1.0|               1.0|[1.0,0.0,0.0,1.0]|
|            1.0|                0.0|             1.0|         0.0|               0.0|[1.0,0.0,1.0,0.0]|
|            1.0|                2.0|             1.0|         0.0|               0.0|[1.0,2.0,1.0,0.0]|
|            1.0|                2.0|             1.0| 

In [39]:
from pyspark.ml.classification import DecisionTreeClassifier

In [41]:
dt = DecisionTreeClassifier(featuresCol = "features", labelCol = "PlayTennis_indexed")

In [42]:
dtmodel = dt.fit(train_data)

In [43]:
dtmodel

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_9e1cebc2a5d0) of depth 4 with 13 nodes

In [44]:
prediction = dtmodel.transform(test_data)

In [45]:
prediction.show()

+---------------+-------------------+----------------+------------+------------------+-----------------+-------------+-----------+----------+
|Outlook_indexed|Temperature_indexed|Humidity_indexed|Wind_indexed|PlayTennis_indexed|         features|rawPrediction|probability|prediction|
+---------------+-------------------+----------------+------------+------------------+-----------------+-------------+-----------+----------+
|            0.0|                1.0|             0.0|         0.0|               1.0|    (4,[1],[1.0])|    [0.0,1.0]|  [0.0,1.0]|       1.0|
|            0.0|                1.0|             0.0|         1.0|               1.0|[0.0,1.0,0.0,1.0]|    [0.0,1.0]|  [0.0,1.0]|       1.0|
|            0.0|                2.0|             1.0|         0.0|               0.0|[0.0,2.0,1.0,0.0]|    [1.0,0.0]|  [1.0,0.0]|       0.0|
|            2.0|                1.0|             0.0|         0.0|               0.0|[2.0,1.0,0.0,0.0]|    [3.0,0.0]|  [1.0,0.0]|       0.0|
+-----

In [47]:
right_predict = prediction.filter(prediction.PlayTennis_indexed == prediction.prediction)

In [48]:
right_predict.show()

+---------------+-------------------+----------------+------------+------------------+-----------------+-------------+-----------+----------+
|Outlook_indexed|Temperature_indexed|Humidity_indexed|Wind_indexed|PlayTennis_indexed|         features|rawPrediction|probability|prediction|
+---------------+-------------------+----------------+------------+------------------+-----------------+-------------+-----------+----------+
|            0.0|                1.0|             0.0|         0.0|               1.0|    (4,[1],[1.0])|    [0.0,1.0]|  [0.0,1.0]|       1.0|
|            0.0|                1.0|             0.0|         1.0|               1.0|[0.0,1.0,0.0,1.0]|    [0.0,1.0]|  [0.0,1.0]|       1.0|
|            0.0|                2.0|             1.0|         0.0|               0.0|[0.0,2.0,1.0,0.0]|    [1.0,0.0]|  [1.0,0.0]|       0.0|
|            2.0|                1.0|             0.0|         0.0|               0.0|[2.0,1.0,0.0,0.0]|    [3.0,0.0]|  [1.0,0.0]|       0.0|
+-----

In [49]:
right_predict.count()

4

In [50]:
prediction.count()

4

In [51]:
# 정답률
right_predict.count() / prediction.count()

1.0