In [None]:
# https://techblog-history-younghunjo1.tistory.com/500
# https://parkaparka.tistory.com/30
# https://docs.microsoft.com/ko-kr/azure/hdinsight/spark/apache-spark-creating-ml-pipelines

In [None]:
# Assemble the independent variables (features)
# Split into training and test data
# Build the model
# Evaluate the model


# Use a pipeline

In [1]:
import findspark

# Must run before the pyspark import in the next cell.
# NOTE(review): presumably this locates the local Spark installation and makes
# pyspark importable — confirm against the findspark documentation.
findspark.init()

In [2]:
# Create the Spark session

import pyspark
from pyspark.sql import SparkSession

# getOrCreate() reuses an already-active session when there is one, so this
# cell can be re-run without spawning a second session.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [3]:
# Load the modelling table from HDFS: first line is the header, column types are inferred.
model_data_path = "hdfs://localhost:19000/data/model_data.csv"
data = spark.read.csv(model_data_path, header="true", inferSchema="true")

In [4]:
# Preview the loaded rows to sanity-check the parsed columns.
data.show()

+---+-----+-----+-----+-----+-----+------+---------+---------+
|win|topwr| jgwr|midwr|adcwr|supwr|teamwr|recentwr1|recentwr2|
+---+-----+-----+-----+-----+-----+------+---------+---------+
|  0| 50.0| 50.0| 41.2| 50.0| 45.7|  38.5|     66.7|    100.0|
|  0| 44.8| 33.3| 47.0| 45.8| 43.3|  36.7|     57.1|     71.4|
|  0| 28.6| 29.2| 66.7| 22.2| 22.2|  22.2|     50.0|    100.0|
|  0| 35.3| 66.7| 85.7| 50.0| 58.3|  35.3|     50.0|     57.1|
|  0| 55.6| 42.1| 50.0| 40.0| 69.2|  53.3|     85.7|     33.3|
|  1| 92.3| 88.2| 66.7| 92.3| 84.6|  92.3|     37.5|     57.1|
|  0| 85.7| 50.0| 60.0| 66.7| 90.0|  70.1|     75.0|     14.3|
|  1| 47.6| 60.0| 61.9| 50.0| 60.0|  50.0|     71.4|     40.0|
|  1| 75.0|100.0| 92.9|100.0| 78.9|  70.6|    100.0|     54.5|
|  0| 40.0| 40.0| 50.0| 50.0| 50.0|  57.1|     30.0|     50.0|
|  1| 80.0| 83.3| 62.1| 75.0| 71.4|  60.6|     42.9|     25.0|
|  0| 38.9| 40.0| 66.7| 50.0| 50.0|  53.6|     62.5|     28.6|
|  0| 80.0| 50.0| 58.6| 40.0| 40.5|  54.0|     42.9|   

In [5]:
# Inspect the inferred column types (per the output below: `win` is an integer,
# every other column is a double).
data.printSchema()

root
 |-- win: integer (nullable = true)
 |-- topwr: double (nullable = true)
 |-- jgwr: double (nullable = true)
 |-- midwr: double (nullable = true)
 |-- adcwr: double (nullable = true)
 |-- supwr: double (nullable = true)
 |-- teamwr: double (nullable = true)
 |-- recentwr1: double (nullable = true)
 |-- recentwr2: double (nullable = true)



In [12]:
# Assemble the independent variables into a single vector column

# https://spark.apache.org/docs/latest/ml-features.html#vectorassembler

from pyspark.ml.feature import VectorAssembler

LABEL_COL = "win"
FEATURES_COL = "features"

# Select predictors by name rather than by position (data.columns[1:]).
# This cell rebinds `data`, so after one run `data` already carries
# FEATURES_COL and a positional slice would feed the assembled vector back in
# as one of its own inputs.
feature_cols = [c for c in data.columns if c not in (LABEL_COL, FEATURES_COL)]

assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol=FEATURES_COL)

# Drop any FEATURES_COL left over from a previous run so the cell can be
# re-executed; otherwise transform() fails because the output column already
# exists. DataFrame.drop is a no-op when the column is absent, so the first
# run behaves exactly as before.
data = assembler.transform(data.drop(FEATURES_COL))

In [13]:
# Check that the assembled `features` vector column was appended.
data.show()

+---+-----+-----+-----+-----+-----+------+---------+---------+--------------------+
|win|topwr| jgwr|midwr|adcwr|supwr|teamwr|recentwr1|recentwr2|            features|
+---+-----+-----+-----+-----+-----+------+---------+---------+--------------------+
|  0| 50.0| 50.0| 41.2| 50.0| 45.7|  38.5|     66.7|    100.0|[50.0,50.0,41.2,5...|
|  0| 44.8| 33.3| 47.0| 45.8| 43.3|  36.7|     57.1|     71.4|[44.8,33.3,47.0,4...|
|  0| 28.6| 29.2| 66.7| 22.2| 22.2|  22.2|     50.0|    100.0|[28.6,29.2,66.7,2...|
|  0| 35.3| 66.7| 85.7| 50.0| 58.3|  35.3|     50.0|     57.1|[35.3,66.7,85.7,5...|
|  0| 55.6| 42.1| 50.0| 40.0| 69.2|  53.3|     85.7|     33.3|[55.6,42.1,50.0,4...|
|  1| 92.3| 88.2| 66.7| 92.3| 84.6|  92.3|     37.5|     57.1|[92.3,88.2,66.7,9...|
|  0| 85.7| 50.0| 60.0| 66.7| 90.0|  70.1|     75.0|     14.3|[85.7,50.0,60.0,6...|
|  1| 47.6| 60.0| 61.9| 50.0| 60.0|  50.0|     71.4|     40.0|[47.6,60.0,61.9,5...|
|  1| 75.0|100.0| 92.9|100.0| 78.9|  70.6|    100.0|     54.5|[75.0,100.0,92

In [14]:
# Split the data into training and test sets

# 80 % train / 20 % test, with a fixed seed for the random split.
split_weights = [0.8, 0.2]
train_data, test_data = data.randomSplit(split_weights, seed=13)

In [22]:
# Number of rows that landed in the training split.
train_data.count()

158

In [23]:
# Number of rows that landed in the test split.
test_data.count()

32

In [15]:
# Logistic regression

from pyspark.ml.classification import LogisticRegression

# Classifier that predicts the `win` label from the assembled `features` vector.
lr = LogisticRegression(
    labelCol='win',
    featuresCol='features',
)

# Fit on the training split only; the test split is used for prediction and evaluation later.
lr_model = lr.fit(train_data)

In [17]:
# Predict on the test data.
# The resulting frame carries rawPrediction, probability and prediction
# columns next to the inputs (see the tables printed by the following cells).

pred = lr_model.transform(test_data)

In [27]:
# Show the true label side by side with the model's raw scores, class
# probabilities and predicted class.
pred.select('win','rawPrediction','probability','prediction').toPandas()

Unnamed: 0,win,rawPrediction,probability,prediction
0,0,"[1.3683959656235989, -1.3683959656235989]","[0.7971208740610419, 0.20287912593895804]",0.0
1,0,"[0.6985993605712739, -0.6985993605712739]","[0.6678771592555396, 0.3321228407444604]",0.0
2,0,"[0.6178171766138001, -0.6178171766138001]","[0.649721936831523, 0.35027806316847704]",0.0
3,0,"[1.9303388118306966, -1.9303388118306966]","[0.8732869164792559, 0.12671308352074404]",0.0
4,0,"[1.497266231006423, -1.497266231006423]","[0.817166390211472, 0.18283360978852806]",0.0
5,0,"[0.5510452150224172, -0.5510452150224172]","[0.6343780548691779, 0.3656219451308222]",0.0
6,0,"[0.7738908314563719, -0.7738908314563719]","[0.6843619570929256, 0.3156380429070744]",0.0
7,0,"[0.48141872397671914, -0.48141872397671914]","[0.6180828297658795, 0.38191717023412053]",0.0
8,0,"[1.0910490169098521, -1.0910490169098521]","[0.7485792068569689, 0.25142079314303106]",0.0
9,0,"[0.40753504669753493, -0.40753504669753493]","[0.600496682273926, 0.3995033177260739]",0.0


In [28]:
# Full prediction table, including the input columns.
# NOTE(review): toPandas() pulls every row into local memory — fine for this
# 32-row test split, but prefer a limited preview on larger frames.
pred.toPandas()

Unnamed: 0,win,topwr,jgwr,midwr,adcwr,supwr,teamwr,recentwr1,recentwr2,features,rawPrediction,probability,prediction
0,0,0.0,7.1,50.0,0.0,10.0,0.0,66.7,75.0,"[0.0, 7.1, 50.0, 0.0, 10.0, 0.0, 66.7, 75.0]","[1.3683959656235989, -1.3683959656235989]","[0.7971208740610419, 0.20287912593895804]",0.0
1,0,0.0,33.3,40.0,0.0,40.0,42.1,11.1,85.7,"[0.0, 33.3, 40.0, 0.0, 40.0, 42.1, 11.1, 85.7]","[0.6985993605712739, -0.6985993605712739]","[0.6678771592555396, 0.3321228407444604]",0.0
2,0,0.0,66.7,57.1,66.7,44.4,26.1,0.0,20.0,"[0.0, 66.7, 57.1, 66.7, 44.4, 26.1, 0.0, 20.0]","[0.6178171766138001, -0.6178171766138001]","[0.649721936831523, 0.35027806316847704]",0.0
3,0,25.7,15.4,12.5,65.2,18.8,23.8,50.0,75.0,"[25.7, 15.4, 12.5, 65.2, 18.8, 23.8, 50.0, 75.0]","[1.9303388118306966, -1.9303388118306966]","[0.8732869164792559, 0.12671308352074404]",0.0
4,0,27.8,0.0,8.3,0.0,23.5,31.3,58.3,54.5,"[27.8, 0.0, 8.3, 0.0, 23.5, 31.3, 58.3, 54.5]","[1.497266231006423, -1.497266231006423]","[0.817166390211472, 0.18283360978852806]",0.0
5,0,28.6,29.2,66.7,22.2,22.2,22.2,50.0,100.0,"[28.6, 29.2, 66.7, 22.2, 22.2, 22.2, 50.0, 100.0]","[0.5510452150224172, -0.5510452150224172]","[0.6343780548691779, 0.3656219451308222]",0.0
6,0,28.6,50.0,41.7,0.0,50.0,19.0,50.0,100.0,"[28.6, 50.0, 41.7, 0.0, 50.0, 19.0, 50.0, 100.0]","[0.7738908314563719, -0.7738908314563719]","[0.6843619570929256, 0.3156380429070744]",0.0
7,0,31.6,55.2,44.0,64.1,47.6,50.0,100.0,83.3,"[31.6, 55.2, 44.0, 64.1, 47.6, 50.0, 100.0, 83.3]","[0.48141872397671914, -0.48141872397671914]","[0.6180828297658795, 0.38191717023412053]",0.0
8,0,38.9,37.5,28.6,50.0,50.0,34.4,45.5,50.0,"[38.9, 37.5, 28.6, 50.0, 50.0, 34.4, 45.5, 50.0]","[1.0910490169098521, -1.0910490169098521]","[0.7485792068569689, 0.25142079314303106]",0.0
9,0,44.4,44.4,50.0,50.0,50.0,37.8,45.5,11.1,"[44.4, 44.4, 50.0, 50.0, 50.0, 37.8, 45.5, 11.1]","[0.40753504669753493, -0.40753504669753493]","[0.600496682273926, 0.3995033177260739]",0.0


In [34]:
# Accuracy

In [31]:
# ROC

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve for the test predictions, scored from the
# rawPrediction column against the `win` label.
bcEvaluator = BinaryClassificationEvaluator(
    labelCol='win',
    rawPredictionCol='rawPrediction',
    metricName="areaUnderROC",
)
bcEvaluator.evaluate(pred)

0.6468253968253969

In [30]:
# Accuracy

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy of the `prediction` column against the `win` label on the test predictions.
mcEvaluator = MulticlassClassificationEvaluator(
    labelCol='win',
    predictionCol='prediction',
    metricName="accuracy",
)
mcEvaluator.evaluate(pred)

0.6875