# 前置作業

In [1]:
import pyspark
from pyspark.sql import SparkSession
sc = pyspark.SparkContext('local[*]')
spark = SparkSession.builder.appName("PredictPrice").getOrCreate()

In [2]:
global Path
if sc.master[0:5] == "local":
    Path = "file:/home/jovyan/work/csvData/"
else:
    Path = "hdfs:/user/zeppelin/csvData/"

In [4]:
from operator import add
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, FloatType
import time
import math
import datetime
from pyspark.sql.functions import monotonically_increasing_id 

# 準備資料

In [5]:
productSchema = StructType([
    StructField("product_id", StringType(), True),
    StructField("category", StringType(), True),
    StructField("name", StringType(), True),
    StructField("price", IntegerType(), True),
    StructField("sale", IntegerType(), True),
    StructField("score", FloatType(), True),
    StructField("url", StringType(), True),
    StructField("imgurl", StringType(), True),
    StructField("update_time", DateType(), True)])

productDf = spark.read.csv(Path+"3c_product.csv",header=False,schema=productSchema)

In [6]:
productDf.select("product_id", "category", "name", "price", "sale", "score", "update_time").show(5)

+----------+--------+--------------------+-----+----+-----+-----------+
|product_id|category|                name|price|sale|score|update_time|
+----------+--------+--------------------+-----+----+-----+-----------+
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-23|
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-24|
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-25|
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-26|
| 100000411| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1|  5.0| 2017-12-23|
+----------+--------+--------------------+-----+----+-----+-----------+
only showing top 5 rows



In [7]:
dataDF = productDf.filter('category="iPhone充電傳輸"')
dataDF.show(5)

+----------+----------+--------------+-----+----+-----+--------------------+--------------------+-----------+
|product_id|  category|          name|price|sale|score|                 url|              imgurl|update_time|
+----------+----------+--------------+-----+----+-----+--------------------+--------------------+-----------+
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-23|
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-24|
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-25|
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-26|
|  10003468|iPhone充電傳輸|    iPhone6手機殼|  250|  -1| -1.0|https://goo.gl/7y...|https://goo.gl/Ss...| 2017-12-23|
+----------+----------+--------------+-----+----+-----+--------------------+--------------------+-----------+
only showi

In [8]:
dataDF.count()

3479

In [10]:
dataDF = dataDF.filter('sale > 0')

In [11]:
dataDF.count()

724

In [12]:
dataDF.show(5)

+----------+----------+--------------------+-----+----+-----+--------------------+--------------------+-----------+
|product_id|  category|                name|price|sale|score|                 url|              imgurl|update_time|
+----------+----------+--------------------+-----+----+-----+--------------------+--------------------+-----------+
| 100069636|iPhone充電傳輸|rockspace一拖三充電線B款...|  290|   1|  4.9|https://goo.gl/Hh...|https://goo.gl/Sm...| 2017-12-25|
| 100069636|iPhone充電傳輸|rockspace一拖三充電線B款...|  290|   1|  4.9|https://goo.gl/Hh...|https://goo.gl/Sm...| 2017-12-26|
| 100142268|iPhone充電傳輸|USB數據線充電線適用於iPodS...|   61|   1|  4.8|https://goo.gl/QX...|https://goo.gl/zc...| 2017-12-25|
| 100142268|iPhone充電傳輸|USB數據線充電線適用於iPodS...|   61|   1|  4.8|https://goo.gl/QX...|https://goo.gl/zc...| 2017-12-26|
| 100206701|iPhone充電傳輸|現貨當天寄出保證原廠apple傳輸...|  250|   3|  4.9|https://goo.gl/vQ...|https://goo.gl/BS...| 2017-12-23|
+----------+----------+--------------------+-----+----+-----+-----------

# 第一種資料準備

In [28]:
def createDataRDD(data):
    price = data[3]
    score = round(data[5], 1)
    year = str(data[8])[0:4]
    month =str(data[8])[5:7]
    day = str(data[8])[8:10]
    
    return (year, month, day, score, price)

In [29]:
dataRDD = dataDF.rdd.map(createDataRDD)
dataRDD.take(5)

[('2017', '12', '25', 4.9, 290),
 ('2017', '12', '26', 4.9, 290),
 ('2017', '12', '25', 4.8, 61),
 ('2017', '12', '26', 4.8, 61),
 ('2017', '12', '23', 4.9, 250)]

In [30]:
def extract_label(r):
    label = (r[-1])
    return label

In [33]:
from pyspark.mllib.regression import LabeledPoint
labelpointRDD = dataRDD.map(lambda r:
    LabeledPoint(
        extract_label(r),
        r
    ))

print(labelpointRDD.first())

(290.0,[2017.0,12.0,25.0,4.9,290.0])


In [34]:
(trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])

In [35]:
trainData.count()

576

# 第二種資料準備

In [84]:
def createData2RDD(data):
    price = data[3]
    score = round(data[5], 1)
    month =str(data[8])[5:7]
    day = str(data[8])[8:10]
    
    return (month, day, score, price)

In [85]:
data2RDD = dataDF.rdd.map(createData2RDD)
data2RDD.take(5)

[('12', '25', 4.9, 290),
 ('12', '26', 4.9, 290),
 ('12', '25', 4.8, 61),
 ('12', '26', 4.8, 61),
 ('12', '23', 4.9, 250)]

In [86]:
def extract_label(r):
    label = (r[-1])
    return label

In [91]:
def extract_features(r):
    r = r[0:3]
    return r

In [92]:
labelpoint2RDD = data2RDD.map(lambda r:
    LabeledPoint(
        extract_label(r),
        extract_features(r)
    ))

print(labelpoint2RDD.first())

(290.0,[12.0,25.0,4.9])


In [95]:
(train2Data, validation2Data, test2Data) = labelpoint2RDD.randomSplit([8, 1, 1])

In [124]:
train2Data.take(5)

[LabeledPoint(290.0, [12.0,25.0,4.9]),
 LabeledPoint(290.0, [12.0,26.0,4.9]),
 LabeledPoint(61.0, [12.0,25.0,4.8]),
 LabeledPoint(61.0, [12.0,26.0,4.8]),
 LabeledPoint(250.0, [12.0,23.0,4.9])]

In [125]:
validation2Data.take(5)

[LabeledPoint(49.0, [12.0,24.0,5.0]),
 LabeledPoint(199.0, [12.0,23.0,4.4]),
 LabeledPoint(4.0, [12.0,23.0,5.0]),
 LabeledPoint(269.0, [12.0,23.0,4.8]),
 LabeledPoint(75.0, [12.0,25.0,5.0])]

# 第三種資料準備

In [69]:
data3RDD = dataDF.rdd.map(lambda x: (x[0], x[3], x[4], x[5], x[8]))
data3RDD.take(5)

[('100069636', 290, 1, 4.900000095367432, datetime.date(2017, 12, 25)),
 ('100069636', 290, 1, 4.900000095367432, datetime.date(2017, 12, 26)),
 ('100142268', 61, 1, 4.800000190734863, datetime.date(2017, 12, 25)),
 ('100142268', 61, 1, 4.800000190734863, datetime.date(2017, 12, 26)),
 ('100206701', 250, 3, 4.900000095367432, datetime.date(2017, 12, 23))]

In [71]:
data3IdRDD = dataDF.rdd.map(lambda x: x[0]).distinct()
data3IdRDD.take(5)

['100069636', '10022757', '100252125', '100299841', '100319526']

# Model 訓練

In [108]:
from pyspark.mllib.tree import DecisionTree

def trainModel(trainData, validationData, impurityParm, maxDepthParm, maxBinsParm):
    startTime = time.time()
    model = DecisionTree.trainRegressor(trainData, 
                                        categoricalFeaturesInfo={}, \
                                        impurity=impurityParm,
                                        maxDepth=maxDepthParm,
                                        maxBins=maxBinsParm
                                       )
    (RMSE, score) = evaluateModel(model, validationData)
    duration = time.time() - startTime
    
    print( "訓練評估：使用參數"+ \
           " impurityParm=%s"%impurityParm+ \
           " maxDepthParm=%s"%maxDepthParm+ \
           " maxBinsParm=%s"%maxBinsParm+ \
           " 所需時間=%d"%duration+ \
           " RMSE = %f"%RMSE
         )
    
    return (RMSE, duration, impurityParm, maxDepthParm, maxBinsParm, model)

In [114]:
def trainModel2(trainData, validationData, impurityParm, maxDepthParm, maxBinsParm):
    startTime = time.time()
    model = DecisionTree.trainRegressor(trainData, 
                                        categoricalFeaturesInfo={}, \
                                        impurity=impurityParm,
                                        maxDepth=maxDepthParm,
                                        maxBins=maxBinsParm
                                       )
    (RMSE, score) = evaluateModel(model, validationData)
    duration = time.time() - startTime
    
    print( "訓練評估：使用參數"+ \
           " impurityParm=%s"%impurityParm+ \
           " maxDepthParm=%s"%maxDepthParm+ \
           " maxBinsParm=%s"%maxBinsParm+ \
           " 所需時間=%d"%duration+ \
           " RMSE = %f"%RMSE
         )
    
    return (score, model)

In [50]:
from pyspark.mllib.evaluation import RegressionMetrics

def evaluateModel(model, validationData):
    score = model.predict(validationData.map(lambda p: p.features))
    scoreAndLabels = score.zip(validationData.map(lambda p: p.label))
    metrics = RegressionMetrics(scoreAndLabels)
    RMSE = metrics.rootMeanSquaredError
    return (RMSE, score)

In [145]:
def evalAllParameter(trainRDD, validationRDD, impurityList, maxDepthList, maxBinsList):
    metrics = [trainModel(trainData, validationData, impurity, maxdepth, maxBins)
        for impurity in impurityList
        for maxdepth in maxDepthList
        for maxBins in maxBinsList] 
    Smetrics = sorted(metrics, key=lambda k:k[0])
    bestParameter = Smetrics[0]
    
    print("最佳參數: ")
    print("impurity:" + str(bestParameter[2]))
    print("maxDepth:" + str(bestParameter[3]))
    print("maxBins:" + str(bestParameter[4]))
    print("RMSE:" + str(bestParameter[0]))
    
    return bestParameter[5]

# 測試data 1

In [98]:
(score, model) = trainModel(trainData, validationData, "variance", 10, 100)

訓練評估：使用參數impurityParm=variancemaxDepthParm=10maxBinsParm=100所需時間=2RMSE = 34.668808


In [49]:
validationData.take(5)[0].features

DenseVector([2017.0, 12.0, 24.0, 4.9, 250.0])

In [54]:
score.take(1)

[250.0]

# 測試data 2

In [102]:
def evaluate2model(model, validationData):
    score = model.predict(validationData.map(lambda p: p.features))
    scoreAndLabels = score.zip(validationData.map(lambda p: p.label))
    metrics = RegressionMetrics(scoreAndLabels)
    RMSE = metrics.rootMeanSquaredError
    return (RMSE, score)

In [134]:
(score2, model2) = trainModel2(train2Data, validation2Data, "variance", 10, 200)

訓練評估：使用參數 impurityParm=variance maxDepthParm=10 maxBinsParm=200 所需時間=2 RMSE = 191.595483


In [135]:
model2.depth()

9

In [136]:
model2.numNodes()

75

In [None]:
model2.predict(testRDD.map(lambda p: p.features)).take(1)

In [143]:
def tryModel(trainRDD, validationRDD, impurityList, maxDepthList, maxBinsList):
    [trainModel2(train2Data, validation2Data, impurity, maxdepth, maxBins)
        for impurity in impurityList
        for maxdepth in maxDepthList
        for maxBins in maxBinsList] 

In [144]:
tryModel(train2Data, validation2Data, ["variance"], [3, 5, 10, 15, 20, 25], [3, 5, 10, 50, 100, 200])

訓練評估：使用參數 impurityParm=variance maxDepthParm=3 maxBinsParm=3 所需時間=2 RMSE = 156.944856
訓練評估：使用參數 impurityParm=variance maxDepthParm=3 maxBinsParm=5 所需時間=2 RMSE = 156.886495
訓練評估：使用參數 impurityParm=variance maxDepthParm=3 maxBinsParm=10 所需時間=1 RMSE = 156.886495
訓練評估：使用參數 impurityParm=variance maxDepthParm=3 maxBinsParm=50 所需時間=1 RMSE = 156.881430
訓練評估：使用參數 impurityParm=variance maxDepthParm=3 maxBinsParm=100 所需時間=2 RMSE = 156.881430
訓練評估：使用參數 impurityParm=variance maxDepthParm=3 maxBinsParm=200 所需時間=2 RMSE = 156.881430
訓練評估：使用參數 impurityParm=variance maxDepthParm=5 maxBinsParm=3 所需時間=1 RMSE = 156.944856
訓練評估：使用參數 impurityParm=variance maxDepthParm=5 maxBinsParm=5 所需時間=1 RMSE = 155.926716
訓練評估：使用參數 impurityParm=variance maxDepthParm=5 maxBinsParm=10 所需時間=1 RMSE = 159.480387
訓練評估：使用參數 impurityParm=variance maxDepthParm=5 maxBinsParm=50 所需時間=1 RMSE = 164.763956
訓練評估：使用參數 impurityParm=variance maxDepthParm=5 maxBinsParm=100 所需時間=1 RMSE = 164.763956
訓練評估：使用參數 impurityParm=variance maxDepthParm

In [146]:
bestModel = evalAllParameter(train2Data, validation2Data, ["variance"], [3, 5, 10, 15, 20, 25], [3, 5, 10, 50, 100, 200])

訓練評估：使用參數 impurityParm=variance maxDepthParm=3 maxBinsParm=3 所需時間=2 RMSE = 130.950588
訓練評估：使用參數 impurityParm=variance maxDepthParm=3 maxBinsParm=5 所需時間=2 RMSE = 125.133881
訓練評估：使用參數 impurityParm=variance maxDepthParm=3 maxBinsParm=10 所需時間=1 RMSE = 114.186917
訓練評估：使用參數 impurityParm=variance maxDepthParm=3 maxBinsParm=50 所需時間=2 RMSE = 117.329505
訓練評估：使用參數 impurityParm=variance maxDepthParm=3 maxBinsParm=100 所需時間=1 RMSE = 53.407289
訓練評估：使用參數 impurityParm=variance maxDepthParm=3 maxBinsParm=200 所需時間=2 RMSE = 48.844097
訓練評估：使用參數 impurityParm=variance maxDepthParm=5 maxBinsParm=3 所需時間=1 RMSE = 131.352081
訓練評估：使用參數 impurityParm=variance maxDepthParm=5 maxBinsParm=5 所需時間=1 RMSE = 126.171582
訓練評估：使用參數 impurityParm=variance maxDepthParm=5 maxBinsParm=10 所需時間=1 RMSE = 111.725905
訓練評估：使用參數 impurityParm=variance maxDepthParm=5 maxBinsParm=50 所需時間=1 RMSE = 106.528881
訓練評估：使用參數 impurityParm=variance maxDepthParm=5 maxBinsParm=100 所需時間=1 RMSE = 36.055280
訓練評估：使用參數 impurityParm=variance maxDepthParm=5 

In [None]:
(score, bestTestModel2) = trainModel2(train2Data, validation2Data, "variance", 5, 5)

In [None]:
#--------------預測與結果----------------


In [113]:
bestModel.predict(testRDD.map(lambda p: p.features)).take(1)

[1.0]

In [126]:
bestModel.depth()

10

In [127]:
bestModel.numNodes()

353

In [55]:
dataDF.select("price").distinct().count()

200

In [78]:
testRDD = labelpointRDD = dataRDD.map(lambda r:
    LabeledPoint(
        0,
        (2017, 12, 24, 4.9, 0)
    ))


In [79]:
testRDD.take(1)[0].features

DenseVector([2017.0, 12.0, 24.0, 4.9, 0.0])

In [80]:
model.predict(testRDD.map(lambda p: p.features)).take(1)

[1.0]

In [105]:
def parametersEval(trainingRDD, validationRDD):
    evalParameter(trainingRDD, validationRDD, "maxDepth",
                    impurityList=["variance"],
                    maxDepthList=[3, 5, 10, 15, 20, 25],
                    maxBinsList=[10])
    
    evalParameter(trainingRDD, validationRDD, "maxBins",
                    impurityList=["variance"],
                    maxDepthList=[10],
                    maxBinsList=[3, 5, 10, 50, 100, 200])
    

In [None]:
def evalParameter(trainData, validationData, evaparm, impurityList, maxDepthList, maxBinsList):
    metrics = [trainModel(trainData, validationData, impurity, maxdepth, maxBins)
                  for impurity in impurityList
                  for maxdepth in maxDepthList
                  for maxBins in maxBinsList ]
    if evaparm == "impurity":
        IndexList = impurityList[:]
    elif evaparm == "maxDepth":
        IndexList = maxDepthList[:]
    elif evaparm == "maxBins":
        IndexList = maxBinsList[:]
    
    df = pd.DataFram(metrics, index=IndexList,
                    columns=['RMSE', 'duration', 'impurityParm', 'maxDepthParm', 'maxBinsParm', 'model'])
    ax = df['RMSE'.plot(kind="bar")]