In [None]:
# https://spark.apache.org/docs/latest/ml-classification-regression.html#decision-tree-classifier

In [1]:
import findspark

findspark.init()

In [2]:
# spark 생성

import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

In [3]:
data = spark.read.csv("hdfs://localhost:19000/data/model_data.csv", header="true", inferSchema="true")

In [4]:
data.show()

+---+-----+-----+-----+-----+-----+------+---------+---------+
|win|topwr| jgwr|midwr|adcwr|supwr|teamwr|recentwr1|recentwr2|
+---+-----+-----+-----+-----+-----+------+---------+---------+
|  0| 50.0| 50.0| 41.2| 50.0| 45.7|  38.5|     66.7|    100.0|
|  0| 44.8| 33.3| 47.0| 45.8| 43.3|  36.7|     57.1|     71.4|
|  0| 28.6| 29.2| 66.7| 22.2| 22.2|  22.2|     50.0|    100.0|
|  0| 35.3| 66.7| 85.7| 50.0| 58.3|  35.3|     50.0|     57.1|
|  0| 55.6| 42.1| 50.0| 40.0| 69.2|  53.3|     85.7|     33.3|
|  1| 92.3| 88.2| 66.7| 92.3| 84.6|  92.3|     37.5|     57.1|
|  0| 85.7| 50.0| 60.0| 66.7| 90.0|  70.1|     75.0|     14.3|
|  1| 47.6| 60.0| 61.9| 50.0| 60.0|  50.0|     71.4|     40.0|
|  1| 75.0|100.0| 92.9|100.0| 78.9|  70.6|    100.0|     54.5|
|  0| 40.0| 40.0| 50.0| 50.0| 50.0|  57.1|     30.0|     50.0|
|  1| 80.0| 83.3| 62.1| 75.0| 71.4|  60.6|     42.9|     25.0|
|  0| 38.9| 40.0| 66.7| 50.0| 50.0|  53.6|     62.5|     28.6|
|  0| 80.0| 50.0| 58.6| 40.0| 40.5|  54.0|     42.9|   

In [21]:
import numpy as np
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [8]:
# VectorAssembler
assembler = VectorAssembler(
    inputCols=data.columns[1:],
    outputCol="features")

# 데이터 분리
train_data, test_data = data.randomSplit(weights=[0.8, 0.2],  seed=13)

# DecisionTree
dt = DecisionTreeClassifier(labelCol="win", featuresCol="features")

# Pipeline
pipeline = Pipeline(stages=[assembler, dt])

# 모델 훈련
model = pipeline.fit(train_data)

# 모델 예측
pred = model.transform(test_data)

In [9]:
pred.select('win','rawPrediction','probability','prediction').toPandas()

Unnamed: 0,win,rawPrediction,probability,prediction
0,0,"[19.0, 1.0]","[0.95, 0.05]",0.0
1,0,"[19.0, 1.0]","[0.95, 0.05]",0.0
2,0,"[0.0, 6.0]","[0.0, 1.0]",1.0
3,0,"[38.0, 14.0]","[0.7307692307692307, 0.2692307692307692]",0.0
4,0,"[0.0, 1.0]","[0.0, 1.0]",1.0
5,0,"[19.0, 1.0]","[0.95, 0.05]",0.0
6,0,"[19.0, 1.0]","[0.95, 0.05]",0.0
7,0,"[38.0, 14.0]","[0.7307692307692307, 0.2692307692307692]",0.0
8,0,"[38.0, 14.0]","[0.7307692307692307, 0.2692307692307692]",0.0
9,0,"[0.0, 6.0]","[0.0, 1.0]",1.0


In [13]:
# ROC
 
bcEvaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='win', metricName="areaUnderROC")
bcEvaluator.evaluate(pred)

0.4900793650793651

In [10]:
# 정확도

mcEvaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='win', metricName="accuracy")
mcEvaluator.evaluate(pred)

0.5

In [12]:
model.stages

[VectorAssembler_4666f0b76067,
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d5df6ba4f461, depth=5, numNodes=31, numClasses=2, numFeatures=8]

In [211]:
# 모델 튜닝

paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [2, 5, 10, 20, 30])
             .addGrid(dt.maxBins, [10, 20, 40, 80, 100])
             .build())

cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=bcEvaluator,
    numFolds=3
)

In [212]:
# Run cross validations.

# maxdepth=5, maxbins=32 >> 0.5
# maxdepth=4, maxbins=32 >> 0.585
# maxdepth=4, maxbins=34 >> 0.591
# maxdepth=10, maxbins=20 >> 0.617

cvModel = cv.fit(train_data)
lr_cv_predictions = cvModel.transform(test_data)
bcEvaluator.evaluate(lr_cv_predictions)

0.6170634920634921

In [213]:
cvModel.getEstimatorParamMaps()

[{Param(parent='DecisionTreeClassifier_d5df6ba4f461', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 2,
  Param(parent='DecisionTreeClassifier_d5df6ba4f461', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.'): 10},
 {Param(parent='DecisionTreeClassifier_d5df6ba4f461', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 2,
  Param(parent='DecisionTreeClassifier_d5df6ba4f461', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.'): 20},
 {Param(parent='DecisionTreeClassifier_d5df6ba4f461', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.')

In [214]:
# Best Model

cvModel.getEstimatorParamMaps()[ np.argmax(cvModel.avgMetrics) ]

{Param(parent='DecisionTreeClassifier_d5df6ba4f461', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 10,
 Param(parent='DecisionTreeClassifier_d5df6ba4f461', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.'): 20}