# Initialize Spark

In [1]:
import pyspark
from pyspark.sql import SparkSession
# Build (or reuse) the session first and take the context from it.
# Constructing pyspark.SparkContext directly fails when this cell is re-run,
# because only one SparkContext may be active at a time; getOrCreate() makes
# the cell safe to execute repeatedly.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("PredictPrice")
    .getOrCreate()
)
sc = spark.sparkContext

In [2]:
# Choose the CSV root from the Spark master: a local master reads from the
# notebook container's filesystem, anything else reads from HDFS.
# (A module-level `global` statement has no effect, so none is used here.)
if sc.master.startswith("local"):
    Path = "file:/home/jovyan/work/csvData/"
else:
    Path = "hdfs:/user/zeppelin/csvData/"

# Import libraries

In [7]:
from operator import add
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, FloatType
import time
import math
import datetime
from pyspark.sql.functions import monotonically_increasing_id 
import jieba
import numpy as np
import random

# Load and explore the product data

In [4]:
# Column layout of 3c_product.csv as (name, type) pairs; every column is nullable.
productSchema = StructType([
    StructField(columnName, columnType, True)
    for columnName, columnType in [
        ("product_id", StringType()),
        ("category", StringType()),
        ("name", StringType()),
        ("price", IntegerType()),
        ("sale", IntegerType()),
        ("score", FloatType()),
        ("url", StringType()),
        ("imgurl", StringType()),
        ("update_time", DateType()),
    ]
])

# Read with header=False (the first line is treated as data) and the explicit
# schema above, so no type inference is performed.
productDf = spark.read.csv(Path + "3c_product.csv", schema=productSchema, header=False)

In [5]:
# Preview the first five rows, leaving out the url and imgurl columns.
productDf.select("product_id", "category", "name", "price", "sale", "score", "update_time").show(5)

+----------+--------+--------------------+-----+----+-----+-----------+
|product_id|category|                name|price|sale|score|update_time|
+----------+--------+--------------------+-----+----+-----+-----------+
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-23|
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-24|
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-25|
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-26|
| 100000411| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1|  5.0| 2017-12-23|
+----------+--------+--------------------+-----+----+-----+-----------+
only showing top 5 rows



In [74]:
# Restrict the data to a single category; the cells below build on this subset.
dataDF = productDf.filter('category="iPhone充電傳輸"')
dataDF.show(5)

+----------+----------+--------------+-----+----+-----+--------------------+--------------------+-----------+
|product_id|  category|          name|price|sale|score|                 url|              imgurl|update_time|
+----------+----------+--------------+-----+----+-----+--------------------+--------------------+-----------+
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-23|
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-24|
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-25|
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-26|
|  10003468|iPhone充電傳輸|    iPhone6手機殼|  250|  -1| -1.0|https://goo.gl/7y...|https://goo.gl/Ss...| 2017-12-23|
+----------+----------+--------------+-----+----+-----+--------------------+--------------------+-----------+
only showing top 5 rows

In [8]:
# Distinct product names of the selected category, still wrapped in Row objects.
productNameRDD = dataDF.select("name").rdd.distinct()
productNameRDD.take(5)

[Row(name='現貨蘋果安卓兩用USB數據線'),
 Row(name='rockspace一拖三充電線B款三合一2A快速充電傳輸線充電線TYPE安卓蘋果A00232'),
 Row(name='Lightning對35公釐耳機插孔轉接器'),
 Row(name='iPhone66s66splus47寸55寸氣囊手機套手機殼'),
 Row(name='BASEUS倍思機械時代蘋果iPhoneiOS304不銹鋼數據傳輸線21A快速充電線金屬線1米長')]

In [9]:
# Unwrap each Row into its plain name string.
# The RDD is derived from dataDF rather than from productNameRDD itself:
# mapping the variable onto itself is not idempotent, and running the cell a
# second time would reduce every name to its first character.
productNameRDD = dataDF.select("name").rdd.distinct().map(lambda row: row[0])
productNameRDD.take(5)

['現貨蘋果安卓兩用USB數據線',
 'rockspace一拖三充電線B款三合一2A快速充電傳輸線充電線TYPE安卓蘋果A00232',
 'Lightning對35公釐耳機插孔轉接器',
 'iPhone66s66splus47寸55寸氣囊手機套手機殼',
 'BASEUS倍思機械時代蘋果iPhoneiOS304不銹鋼數據傳輸線21A快速充電線金屬線1米長']

In [10]:
def split_jieba(line, cutMode):
    """Lower-case a product name and segment it into words with jieba.

    Args:
        line: the product name (a string).
        cutMode: passed to jieba.cut as ``cut_all``.

    Returns:
        list of the segments produced by jieba, in order.
    """
    # Load the custom dictionary only once per Python process instead of once
    # per line. The marker lives on the jieba module, so it is tracked by
    # whichever process actually imported jieba (driver or Spark worker).
    if not getattr(jieba, "_productDictLoaded", False):
        jieba.load_userdict("jieba_dict/productDict.txt")
        jieba._productDictLoaded = True

    # Lower-case first, then segment.
    return list(jieba.cut(line.lower(), cut_all=cutMode))

In [11]:
# Segment every product name into a list of words (jieba with cut_all=False).
splitData = productNameRDD.map(lambda x: split_jieba(x, False))
splitData.take(5)

[['現貨', '蘋果', '安卓', '兩用', 'usb', '數據線'],
 ['rockspace',
  '一拖',
  '三',
  '充電線',
  'b',
  '款',
  '三合一',
  '2a',
  '快速',
  '充電',
  '傳輸線',
  '充電線',
  'type',
  '安卓',
  '蘋果',
  'a00232'],
 ['lightning', '對', '35', '公釐', '耳機', '插孔', '轉接器'],
 ['iphone6', '6s', '6', '6splus', '47', '寸', '55', '寸', '氣囊', '手機套', '手機殼'],
 ['baseus',
  '倍思',
  '機械',
  '時代',
  '蘋果',
  'iphone',
  'ios',
  '304',
  '不銹鋼',
  '數據傳輸線',
  '21a',
  '快速',
  '充電線',
  '金屬',
  '線',
  '1',
  '米長']]

# Build the token vocabulary

In [17]:
def wordCountData(line):
    """Pair every word of `line` with the count 1.

    Args:
        line: an iterable of words.

    Returns:
        list of (word, 1) tuples, in the order the words appear.
    """
    return [(token, 1) for token in line]

In [22]:
# Flatten the per-name word lists into a single RDD of (word, 1) pairs.
tokenTempRDD = splitData.flatMap(wordCountData)
tokenTempRDD.take(3)

[('現貨', 1), ('蘋果', 1), ('安卓', 1)]

In [23]:
# Sum the 1s per word to get each word's total number of occurrences.
tfTempRDD = tokenTempRDD.reduceByKey(add)
tfTempRDD.take(5)

[('現貨', 282), ('蘋果', 843), ('安卓', 298), ('兩用', 34), ('usb', 452)]

In [24]:
# Write the (word, count) pairs as text under "tfTemp".
# NOTE(review): saveAsTextFile normally refuses to write into an existing
# output directory, so re-running this cell will likely fail — confirm.
tfTempRDD.saveAsTextFile("tfTemp")

In [25]:
# Number of distinct words found across all product names.
tfTempRDD.count()

4872

In [28]:
# Number of words that occur more than 10 times.
tfTempRDD.filter(lambda x: x[1]>10).count()

447

In [47]:
# Write only the words occurring more than 10 times under "tfTempFilter".
# NOTE(review): as with any saveAsTextFile call, an existing output directory
# will likely make a re-run fail — confirm.
tfTempRDD.filter(lambda x: x[1]>10).saveAsTextFile("tfTempFilter")

In [38]:
def seqToken(line):
    """Repeat a word as many times as its count.

    Args:
        line: a (word, count) pair.

    Returns:
        tuple containing `word` repeated `count` times (empty when count <= 0).
    """
    token, count = line[0], line[1]
    return tuple(token for _ in range(count))

In [43]:
# Keep words occurring more than 10 times; seqToken then emits each kept word
# once per occurrence.
tokenRDD = tfTempRDD.filter(lambda x: x[1]>10).flatMap(seqToken)

In [45]:
# Number of distinct tokens that survive the frequency filter.
tokenRDD.distinct().count()

447

In [50]:
# Collect the vocabulary to the driver as a Python list; saveToken and
# createDataRDD test word membership against it.
tokenList = tokenRDD.distinct().collect()

### Import the TF-IDF library

In [46]:
from pyspark.mllib.feature import HashingTF, IDF

In [66]:
def saveToken(line):
    """Keep only the words of `line` that are in the global `tokenList`.

    Args:
        line: an iterable of words.

    Returns:
        list of the retained words, with order and duplicates preserved.
    """
    return [word for word in line if word in tokenList]

In [102]:
# Drop out-of-vocabulary words from every name and attach a running index,
# giving (tokens, index) pairs.
textData = splitData.map(saveToken).zipWithIndex()

In [103]:
# Inspect the first three (tokens, index) pairs.
textData.take(3)

[(['現貨', '蘋果', '安卓', '兩用', 'usb', '數據線'], 0),
 (['充電線', '款', '三合一', '2a', '快速', '充電', '傳輸線', '充電線', '安卓', '蘋果'], 1),
 (['lightning', '對', '耳機', '轉接器'], 2)]

In [108]:
# Hashing term-frequency transformer with 2**10 = 1024 feature buckets.
hashingTF = HashingTF(2**10)

In [109]:
def docToTF(line):
    """Convert an indexed token list into an indexed term-frequency vector.

    Args:
        line: a (tokens, index) pair.

    Returns:
        (index, vector) where vector is hashingTF.transform(tokens).
    """
    tokens, docIndex = line[0], line[1]
    return (docIndex, hashingTF.transform(tokens))


In [112]:
# Term-frequency vector for every indexed name: (index, vector) pairs.
textDataTfRDD = textData.map(docToTF)

In [113]:
# Inspect the first three (index, vector) pairs.
textDataTfRDD.take(3)

[(0,
  SparseVector(1024, {28: 1.0, 82: 1.0, 278: 1.0, 562: 1.0, 822: 1.0, 920: 1.0})),
 (1,
  SparseVector(1024, {28: 1.0, 262: 1.0, 452: 1.0, 506: 2.0, 685: 1.0, 754: 1.0, 822: 1.0, 839: 1.0, 1018: 1.0})),
 (2, SparseVector(1024, {276: 1.0, 298: 1.0, 637: 1.0, 679: 1.0}))]

In [116]:
# Vector part of the first (index, vector) pair.
textDataTfRDD.take(3)[0][1]

SparseVector(1024, {28: 1.0, 82: 1.0, 278: 1.0, 562: 1.0, 822: 1.0, 920: 1.0})

In [114]:
# Mark the RDD to be cached.
textDataTfRDD.cache()

PythonRDD[141] at RDD at PythonRDD.scala:48

In [119]:
# NOTE(review): this cell raises AttributeError — the RDD object has no
# `.value` attribute. `.value` is read from the broadcast variable that is
# created from this RDD in the following cells.
textDataTfRDD.value

AttributeError: 'PipelinedRDD' object has no attribute 'value'

In [117]:
# Collect the (index, vector) pairs into a dict on the driver and broadcast it.
bcTextDataTf = sc.broadcast(textDataTfRDD.collectAsMap())

In [118]:
# Show the broadcast dict.
# NOTE(review): this displays every entry; consider inspecting a few keys only.
bcTextDataTf.value

{0: SparseVector(1024, {28: 1.0, 82: 1.0, 278: 1.0, 562: 1.0, 822: 1.0, 920: 1.0}),
 1: SparseVector(1024, {28: 1.0, 262: 1.0, 452: 1.0, 506: 2.0, 685: 1.0, 754: 1.0, 822: 1.0, 839: 1.0, 1018: 1.0}),
 2: SparseVector(1024, {276: 1.0, 298: 1.0, 637: 1.0, 679: 1.0}),
 3: SparseVector(1024, {175: 1.0, 189: 1.0, 197: 1.0, 343: 1.0, 508: 1.0, 571: 1.0, 753: 1.0, 853: 1.0}),
 4: SparseVector(1024, {28: 1.0, 97: 1.0, 255: 1.0, 321: 1.0, 452: 1.0, 506: 1.0, 581: 1.0, 590: 1.0, 720: 1.0, 832: 1.0, 978: 1.0}),
 5: SparseVector(1024, {822: 1.0, 949: 1.0}),
 6: SparseVector(1024, {278: 1.0, 424: 1.0, 506: 1.0, 507: 1.0, 553: 1.0, 629: 1.0, 633: 1.0, 685: 1.0, 790: 1.0, 1018: 1.0}),
 7: SparseVector(1024, {51: 1.0, 165: 1.0, 287: 1.0, 424: 1.0, 506: 1.0, 507: 1.0, 679: 1.0, 978: 1.0}),
 8: SparseVector(1024, {51: 1.0, 84: 1.0, 441: 1.0, 626: 1.0, 679: 1.0, 685: 1.0, 707: 1.0, 1018: 1.0}),
 9: SparseVector(1024, {28: 1.0, 97: 1.0, 287: 1.0, 424: 1.0, 441: 1.0, 506: 2.0, 553: 1.0, 665: 1.0, 832: 1.0,

In [None]:
# `tf` was used here without being defined anywhere in the notebook, so the
# cell only worked with leftover kernel state. Define it explicitly as the
# vector part of the (index, vector) pairs in textDataTfRDD.
tf = textDataTfRDD.values()

# Fit the inverse-document-frequency weights on the term-frequency vectors,
# then rescale those same vectors into TF-IDF vectors.
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

In [70]:
# Number of TF-IDF vectors.
tfidf.count()

2954

In [71]:
# Number of distinct TF-IDF vectors.
tfidf.distinct().count()

2712

In [79]:
# Inspect the first three TF-IDF vectors.
tfidf.take(3)

[SparseVector(1024, {28: 1.3501, 82: 4.5573, 278: 2.3855, 562: 1.9654, 822: 2.2612, 920: 2.3529}),
 SparseVector(1024, {28: 1.3501, 262: 4.4359, 452: 2.974, 506: 2.464, 685: 1.4023, 754: 3.8324, 822: 2.2612, 839: 4.1626, 1018: 1.6997}),
 SparseVector(1024, {276: 5.1009, 298: 3.1872, 637: 4.0994, 679: 2.3318})]

# Prepare the training data

In [58]:
def createDataRDD(data):
    """Turn one product row into a (tokens, year, month, day, score, price) tuple.

    Args:
        data: a row laid out like productSchema; index 2 is the name, 3 the
            price, 5 the score and 8 the update_time.

    Returns:
        (tokens, year, month, day, score, price) where
        - tokens is the lower-cased, jieba-segmented name restricted to the
          words in the global `tokenList`;
        - year, month and day are string slices of str(update_time);
        - score is rounded to one decimal (None when the row has no score);
        - price is passed through unchanged.
    """
    name = data[2]
    price = data[3]

    # score is nullable in the schema; round(None, 1) would raise TypeError.
    rawScore = data[5]
    score = round(rawScore, 1) if rawScore is not None else None

    # The date is sliced by character position from its string form.
    updateTime = str(data[8])
    year = updateTime[0:4]
    month = updateTime[5:7]
    day = updateTime[8:10]

    # Reuse the notebook's own helpers instead of repeating their logic:
    # split_jieba lower-cases and segments the name (cut_all=False),
    # saveToken keeps only the words present in tokenList.
    ls = saveToken(split_jieba(name, False))

    return (ls, year, month, day, score, price)

In [75]:
# Keep only the rows whose sale value is greater than 0.
dataDF2 = dataDF.filter('sale > 0')

In [95]:
# Convert every remaining row into a
# (tokens, year, month, day, score, price) tuple.
dataRDD = dataDF2.rdd.map(createDataRDD)
dataRDD.take(5)

[(['充電線', '款', '三合一', '2a', '快速', '充電', '傳輸線', '充電線', '安卓', '蘋果'],
  '2017',
  '12',
  '25',
  4.9,
  290),
 (['充電線', '款', '三合一', '2a', '快速', '充電', '傳輸線', '充電線', '安卓', '蘋果'],
  '2017',
  '12',
  '26',
  4.9,
  290),
 (['usb', '數據線', '充電線', '適用', '於', 'ipod', '代', '6', '代'],
  '2017',
  '12',
  '25',
  4.8,
  61),
 (['usb', '數據線', '充電線', '適用', '於', 'ipod', '代', '6', '代'],
  '2017',
  '12',
  '26',
  4.8,
  61),
 (['現貨',
   '保證',
   '原廠',
   'apple',
   '傳輸線',
   '充電頭',
   '耳機',
   'iphone8',
   '7',
   '6splus',
   '5s',
   'lightning'],
  '2017',
  '12',
  '23',
  4.9,
  250)]

In [106]:
def productTokenToVector(line):
    """Compute TF and TF-IDF vectors for a record's tokens; return the record as-is.

    Args:
        line: a record whose first element is the token list.

    Returns:
        `line`, unchanged.

    NOTE(review): the vectors computed here are discarded, so mapping this
    function over an RDD yields the input records unmodified. This looks
    unfinished — confirm what the return value is meant to contain.
    NOTE(review): the pyspark.mllib documentation says IDFModel.transform
    cannot be used inside an RDD transformation; verify before relying on
    the `idf.transform` call below.
    """
    tokens = line[0]
    termFreq = hashingTF.transform(tokens)
    weightedTermFreq = idf.transform(termFreq)  # currently unused

    return line

In [107]:
# Apply productTokenToVector to every record. That function returns its input
# unchanged, so the records shown here are the same as in dataRDD.
dataRDD2 = dataRDD.map(productTokenToVector)
dataRDD2.take(3)

[(['充電線', '款', '三合一', '2a', '快速', '充電', '傳輸線', '充電線', '安卓', '蘋果'],
  '2017',
  '12',
  '25',
  4.9,
  290),
 (['充電線', '款', '三合一', '2a', '快速', '充電', '傳輸線', '充電線', '安卓', '蘋果'],
  '2017',
  '12',
  '26',
  4.9,
  290),
 (['usb', '數據線', '充電線', '適用', '於', 'ipod', '代', '6', '代'],
  '2017',
  '12',
  '25',
  4.8,
  61)]

In [77]:
def extract_label(r):
    """Return the last element of record `r`, used as the label."""
    return r[-1]

In [78]:
def extract_features(r):
    """Return the first three elements of record `r`.

    NOTE(review): for the six-field tuples built by createDataRDD this keeps
    (tokens, year, month) and leaves out day and score — confirm that is
    intended.
    """
    return r[:3]