# Initialize Spark

In [1]:
import pyspark
from pyspark.sql import SparkSession
# Build (or reuse) the session first and take the context from it, so that
# re-running this cell does not try to start a second SparkContext in the
# same kernel. When no session exists yet, a local one is created.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("PredictPrice")
    .getOrCreate()
)
sc = spark.sparkContext

In [2]:
# Pick the base location of the CSV files from the Spark master: a master
# starting with "local" reads from the local work directory, anything else
# reads from HDFS.
Path = (
    "file:/home/jovyan/work/csvData/"
    if sc.master.startswith("local")
    else "hdfs:/user/zeppelin/csvData/"
)

# Import libraries

In [212]:
from operator import add
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, FloatType
import time
import math
import datetime
from pyspark.sql.functions import monotonically_increasing_id 
import jieba
import numpy as np
import random
from pyspark.mllib.regression import LabeledPoint

# Load the product data

In [4]:
# Explicit schema for the header-less product CSV; every column is nullable.
productSchema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("product_id", StringType()),
        ("category", StringType()),
        ("name", StringType()),
        ("price", IntegerType()),
        ("sale", IntegerType()),
        ("score", FloatType()),
        ("url", StringType()),
        ("imgurl", StringType()),
        ("update_time", DateType()),
    )
])

# Read the product file from the base location chosen above.
productDf = spark.read.csv(Path + "3c_product.csv", header=False, schema=productSchema)

In [5]:
productDf.select("product_id", "category", "name", "price", "sale", "score", "update_time").show(5)

+----------+--------+--------------------+-----+----+-----+-----------+
|product_id|category|                name|price|sale|score|update_time|
+----------+--------+--------------------+-----+----+-----+-----------+
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-23|
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-24|
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-25|
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-26|
| 100000411| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1|  5.0| 2017-12-23|
+----------+--------+--------------------+-----+----+-----+-----------+
only showing top 5 rows



In [74]:
# Restrict the data to the "iPhone充電傳輸" category and preview five rows.
dataDF = productDf.filter('category="iPhone充電傳輸"')
dataDF.show(5)

+----------+----------+--------------+-----+----+-----+--------------------+--------------------+-----------+
|product_id|  category|          name|price|sale|score|                 url|              imgurl|update_time|
+----------+----------+--------------+-----+----+-----+--------------------+--------------------+-----------+
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-23|
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-24|
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-25|
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-26|
|  10003468|iPhone充電傳輸|    iPhone6手機殼|  250|  -1| -1.0|https://goo.gl/7y...|https://goo.gl/Ss...| 2017-12-23|
+----------+----------+--------------+-----+----+-----+--------------------+--------------------+-----------+
only showi

In [8]:
# Distinct product names of the selected category, as an RDD of one-field Rows.
productNameRDD = dataDF.select("name").rdd.distinct()
productNameRDD.take(5)

[Row(name='現貨蘋果安卓兩用USB數據線'),
 Row(name='rockspace一拖三充電線B款三合一2A快速充電傳輸線充電線TYPE安卓蘋果A00232'),
 Row(name='Lightning對35公釐耳機插孔轉接器'),
 Row(name='iPhone66s66splus47寸55寸氣囊手機套手機殼'),
 Row(name='BASEUS倍思機械時代蘋果iPhoneiOS304不銹鋼數據傳輸線21A快速充電線金屬線1米長')]

In [9]:
# Unwrap each Row into its plain name string. The RDD is rebuilt from dataDF
# rather than from productNameRDD itself, so re-running this cell cannot apply
# the unwrap twice (which would reduce every name to its first character).
productNameRDD = dataDF.select("name").rdd.distinct().map(lambda row: row[0])
productNameRDD.take(5)

['現貨蘋果安卓兩用USB數據線',
 'rockspace一拖三充電線B款三合一2A快速充電傳輸線充電線TYPE安卓蘋果A00232',
 'Lightning對35公釐耳機插孔轉接器',
 'iPhone66s66splus47寸55寸氣囊手機套手機殼',
 'BASEUS倍思機械時代蘋果iPhoneiOS304不銹鋼數據傳輸線21A快速充電線金屬線1米長']

# Prepare training data

In [286]:
def _ensureProductDictLoaded():
    """Load the product user dictionary into jieba once per Python process."""
    # The marker is stored on the imported jieba module, so each process that
    # runs this function reads the dictionary file the first time only instead
    # of once for every row.
    if not getattr(jieba, "_productDictLoaded", False):
        jieba.load_userdict("jieba_dict/productDict.txt")
        jieba._productDictLoaded = True


def createDataRDD(data):
    """Turn one product row into ``(tokens, year, month, day, score, price)``.

    Fields are read by position: 2 = name, 3 = price, 5 = score and
    8 = update date (the same order as the columns of productSchema).

    Args:
        data: indexable row with at least nine fields.

    Returns:
        tuple: the jieba tokens of the lower-cased name (list of str), the
        year, month and day taken as substrings of ``str(data[8])``, the
        score rounded to one decimal, and the price unchanged.
    """
    price = data[3]
    score = round(data[5], 1)

    # str() of the date is sliced as "YYYY-MM-DD".
    stamp = str(data[8])
    year = stamp[0:4]
    month = stamp[5:7]
    day = stamp[8:10]

    _ensureProductDictLoaded()

    # Lower-case before segmenting so tokens are case-insensitive.
    tokens = list(jieba.cut(data[2].lower(), cut_all=False))

    return (tokens, year, month, day, score, price)

In [301]:
# Convert every row of the selected category into
# (tokens, year, month, day, score, price) and preview five records.
dataRDD = dataDF.rdd.map(createDataRDD)
dataRDD.take(5)

[(['現貨', '蘋果', '安卓', '兩用', 'usb', '數據線'], '2017', '12', '23', 5.0, 99),
 (['現貨', '蘋果', '安卓', '兩用', 'usb', '數據線'], '2017', '12', '24', 5.0, 99),
 (['現貨', '蘋果', '安卓', '兩用', 'usb', '數據線'], '2017', '12', '25', 5.0, 99),
 (['現貨', '蘋果', '安卓', '兩用', 'usb', '數據線'], '2017', '12', '26', 5.0, 99),
 (['iphone6', '手機殼'], '2017', '12', '23', -1.0, 250)]

# Make Token

In [289]:
def wordCountData(line):
    """Pair every word of ``line`` with a count of 1.

    Order and duplicates are preserved, so a word occurring twice yields
    two ``(word, 1)`` pairs.
    """
    return [(token, 1) for token in line]

In [302]:
# Reduce every record to its token sequence and de-duplicate the sequences;
# each one is turned into a tuple first (presumably so distinct() can hash it).
splitData = dataRDD.map(lambda x: tuple(x[0])).distinct()
splitData.take(3)

[('現貨', '蘋果', '安卓', '兩用', 'usb', '數據線'),
 ('rockspace',
  '一拖',
  '三',
  '充電線',
  'b',
  '款',
  '三合一',
  '2a',
  '快速',
  '充電',
  '傳輸線',
  '充電線',
  'type',
  '安卓',
  '蘋果',
  'a00232'),
 ('粗線',
  'apple',
  '原廠',
  'iphone8',
  'iphone7plus',
  '原廠',
  '充電線',
  '原廠',
  '傳輸線',
  '6s',
  '6splus',
  'ipx')]

In [303]:
splitData.count()

2943

In [304]:
# Flatten every distinct token sequence into (token, 1) pairs.
tokenTempRDD = splitData.flatMap(wordCountData)
tokenTempRDD.take(3)

[('現貨', 1), ('蘋果', 1), ('安卓', 1)]

In [305]:
# Sum the 1s per token: the number of occurrences of each token across the
# distinct token sequences.
tfTempRDD = tokenTempRDD.reduceByKey(add)
tfTempRDD.take(5)

[('現貨', 282), ('蘋果', 843), ('安卓', 297), ('兩用', 34), ('usb', 452)]

In [306]:
# Write the (token, count) pairs as text to the "tfTemp" output path.
# NOTE(review): re-running may fail if that path already exists — confirm.
tfTempRDD.saveAsTextFile("tfTemp")

In [307]:
tfTempRDD.count()

4860

In [308]:
tfTempRDD.filter(lambda x: x[1]>10).count()

446

In [309]:
# Write only the tokens counted more than 10 times to "tfTempFilter".
# NOTE(review): re-running may fail if that path already exists — confirm.
tfTempRDD.filter(lambda x: x[1]>10).saveAsTextFile("tfTempFilter")

In [38]:
def seqToken(line):
    """Repeat the token ``line[0]`` ``line[1]`` times and return it as a tuple."""
    token, times = line[0], line[1]
    return tuple(token for _ in range(times))

In [310]:
# Keep tokens counted more than 10 times; seqToken then repeats each kept
# token as many times as its count.
tokenRDD = tfTempRDD.filter(lambda x: x[1]>10).flatMap(seqToken)

In [311]:
tokenRDD.distinct().count()

446

In [312]:
# Bring the distinct kept tokens to the driver as a Python list.
tokenList = tokenRDD.distinct().collect()

### import tf-idf lib

In [46]:
from pyspark.mllib.feature import HashingTF, IDF

In [66]:
def saveToken(line):
    """Return the words of ``line`` that appear in ``tokenList``.

    Order and duplicates of the kept words are preserved.
    """
    return [word for word in line if word in tokenList]

In [313]:
# Filter each token sequence down to the kept vocabulary and attach a document
# index. The result is cached because several later cells run separate actions
# on textData (take, collectAsMap, the TF mapping); the index given by
# zipWithIndex is not guaranteed to be the same if the upstream distinct()
# is re-evaluated, so all of them should read the same materialized data.
textData = splitData.map(saveToken).zipWithIndex().cache()

In [314]:
textData.take(3)

[(['現貨', '蘋果', '安卓', '兩用', 'usb', '數據線'], 0),
 (['充電線', '款', '三合一', '2a', '快速', '充電', '傳輸線', '充電線', '安卓', '蘋果'], 1),
 (['apple',
   '原廠',
   'iphone8',
   'iphone7plus',
   '原廠',
   '充電線',
   '原廠',
   '傳輸線',
   '6s',
   '6splus'],
  2)]

In [315]:
def textPair(line):
    """Convert ``(words, index)`` into ``(text, index)``.

    ``text`` is the words joined by single spaces, with leading and trailing
    space characters removed.
    """
    joined = ' '.join(line[0])
    return (joined.strip(' '), line[1])

In [316]:
textData.map(textPair).take(3)

[('現貨 蘋果 安卓 兩用 usb 數據線', 0),
 ('充電線 款 三合一 2a 快速 充電 傳輸線 充電線 安卓 蘋果', 1),
 ('apple 原廠 iphone8 iphone7plus 原廠 充電線 原廠 傳輸線 6s 6splus', 2)]

In [317]:
# Collect a {space-joined text: document index} dict and broadcast it.
# NOTE(review): documents whose filtered text is identical share one key, so
# the dict holds a single index for them (the output below shows '' mapped
# only to 2577) — confirm this is acceptable for the later lookups.
bcTextData = sc.broadcast(textData.map(textPair).collectAsMap())

In [319]:
bcTextData.value['充電線 款 三合一 2a 快速 充電 傳輸線 充電線 安卓 蘋果']

1

In [321]:
bcTextData.value

{'現貨 蘋果 安卓 兩用 usb 數據線': 0,
 '充電線 款 三合一 2a 快速 充電 傳輸線 充電線 安卓 蘋果': 1,
 'apple 原廠 iphone8 iphone7plus 原廠 充電線 原廠 傳輸線 6s 6splus': 2,
 'iphone6 6s 6 6splus 47 55 手機套 手機殼': 3,
 '現貨 蘋果 安卓 高質 感 手機 座充': 4,
 '': 2577,
 'lightning microusb 快充 24a 鋁合金 充電 傳輸線 100cm': 6,
 'baseus 倍思 蘋果 iphone ios 21a 快速 充電線 金屬 線 1': 7,
 'iphone7 二合一 轉接線 聽歌 充電 充電線 傳輸線 數據線 耳機線 apple': 8,
 '三合一 快速 充電 iphone 三星 充電線 傳輸線': 9,
 'lightning 線 iphone7 7plus 專用 手機 耳機 車載': 10,
 '15 米 蘋果 ios golf 高速 傳輸 充電線 二合一 充電線 21a': 11,
 '2 米 蘋果 ios golf 高速 傳輸 充電線 二合一 充電線 21a': 12,
 '15 米 款 蘋果 ios golf 高速 傳輸 21a 充電線 二合一': 13,
 '傳輸線 充電線 安卓 ios microusb 蘋果 lighting 二合一 快速 充電 iphone5 6s': 14,
 '安卓 microusb 蘋果 lighting 二合一 iphone5 6s 傳輸線 充電線 ios': 15,
 'iphone 4 原廠 頭 原廠 充電器 原廠 旅充頭 豆腐頭 小白頭': 16,
 'iphone5 原廠 頭 原廠 充電器 原廠 旅充頭 豆腐頭 小白頭': 17,
 'iphone5s 原廠 頭 原廠 充電器 原廠 旅充頭 豆腐頭 小白頭': 18,
 'iphone6s 原廠 頭 原廠 充電器 原廠 旅充頭 豆腐頭 小白頭 6s i6s': 19,
 'iphone 6plus 原廠 頭 原廠 充電器 原廠 旅充頭 豆腐頭 小白頭 i6': 20,
 'iphone7 原廠 頭 原廠 充電器 原廠 旅充頭 豆腐頭 小白頭 i7': 21,
 'iphone 4 原廠 傳輸線 充電線 

In [322]:
# Term-frequency hasher with 2**10 = 1024 feature buckets.
hashingTF = HashingTF(2**10)

In [178]:
def docToTF(line):
    """Map ``(words, index)`` to ``(index, term-frequency vector)``.

    The vector is produced by the notebook-level ``hashingTF``.
    """
    return (line[1], hashingTF.transform(line[0]))


In [323]:
# Hash every document's tokens into a (document index, TF vector) pair.
textDataTfRDD = textData.map(docToTF)

In [324]:
textDataTfRDD.take(3)

[(0,
  SparseVector(1024, {28: 1.0, 82: 1.0, 278: 1.0, 562: 1.0, 822: 1.0, 920: 1.0})),
 (1,
  SparseVector(1024, {28: 1.0, 262: 1.0, 452: 1.0, 506: 2.0, 685: 1.0, 754: 1.0, 822: 1.0, 839: 1.0, 1018: 1.0})),
 (2,
  SparseVector(1024, {89: 1.0, 165: 3.0, 343: 1.0, 506: 1.0, 507: 1.0, 585: 1.0, 685: 1.0, 753: 1.0}))]

In [325]:
# Mark the TF vectors for caching; both IDF().fit and idf.transform below
# read them.
textDataTfRDD.cache()

PythonRDD[520] at RDD at PythonRDD.scala:48

In [326]:
# Fit the inverse-document-frequency model on the TF vectors (the document
# indices are dropped via .values()); the bare `idf` displays the fitted model.
idf = IDF().fit(textDataTfRDD.values())
idf

<pyspark.mllib.feature.IDFModel at 0x7fec6c2d6320>

In [327]:
# Re-weight the TF vectors with the fitted IDF model.
tfidf = idf.transform(textDataTfRDD.values())

In [328]:
tfidf.take(3)

[SparseVector(1024, {28: 1.3463, 82: 4.5535, 278: 2.3817, 562: 1.9617, 822: 2.2607, 920: 2.3492}),
 SparseVector(1024, {28: 1.3463, 262: 4.4322, 452: 2.9702, 506: 2.4705, 685: 1.4041, 754: 3.8286, 822: 2.2607, 839: 4.1589, 1018: 1.696}),
 SparseVector(1024, {89: 4.0557, 165: 6.5112, 343: 2.1211, 506: 1.2353, 507: 1.8117, 585: 3.768, 685: 1.4041, 753: 2.5494})]

In [330]:
# Attach a positional index to every TF-IDF vector. The RDD is rebuilt from
# the IDF model instead of from `tfidf` itself, so the cell stays idempotent
# (zipping `tfidf` with itself again on a re-run would nest the pairs).
tfidf = idf.transform(textDataTfRDD.values()).zipWithIndex()
tfidf.take(3)

[(SparseVector(1024, {28: 1.3463, 82: 4.5535, 278: 2.3817, 562: 1.9617, 822: 2.2607, 920: 2.3492}),
  0),
 (SparseVector(1024, {28: 1.3463, 262: 4.4322, 452: 2.9702, 506: 2.4705, 685: 1.4041, 754: 3.8286, 822: 2.2607, 839: 4.1589, 1018: 1.696}),
  1),
 (SparseVector(1024, {89: 4.0557, 165: 6.5112, 343: 2.1211, 506: 1.2353, 507: 1.8117, 585: 3.768, 685: 1.4041, 753: 2.5494}),
  2)]

In [331]:
# Produce (index, vector) pairs. Built from the IDF model rather than by
# swapping `tfidf` in place, so re-running this cell cannot flip the pairs
# back to (vector, index).
tfidf = (
    idf.transform(textDataTfRDD.values())
    .zipWithIndex()
    .map(lambda pair: (pair[1], pair[0]))
)

In [332]:
tfidf.take(3)

[(0,
  SparseVector(1024, {28: 1.3463, 82: 4.5535, 278: 2.3817, 562: 1.9617, 822: 2.2607, 920: 2.3492})),
 (1,
  SparseVector(1024, {28: 1.3463, 262: 4.4322, 452: 2.9702, 506: 2.4705, 685: 1.4041, 754: 3.8286, 822: 2.2607, 839: 4.1589, 1018: 1.696})),
 (2,
  SparseVector(1024, {89: 4.0557, 165: 6.5112, 343: 2.1211, 506: 1.2353, 507: 1.8117, 585: 3.768, 685: 1.4041, 753: 2.5494}))]

In [333]:
# Collect the {index: TF-IDF vector} dict on the driver and broadcast it so
# functions running inside RDD operations can look vectors up by index.
tfidf_pairs = sc.broadcast(tfidf.collectAsMap())

In [334]:
tfidf_pairs.value

{0: SparseVector(1024, {28: 1.3463, 82: 4.5535, 278: 2.3817, 562: 1.9617, 822: 2.2607, 920: 2.3492}),
 1: SparseVector(1024, {28: 1.3463, 262: 4.4322, 452: 2.9702, 506: 2.4705, 685: 1.4041, 754: 3.8286, 822: 2.2607, 839: 4.1589, 1018: 1.696}),
 2: SparseVector(1024, {89: 4.0557, 165: 6.5112, 343: 2.1211, 506: 1.2353, 507: 1.8117, 585: 3.768, 685: 1.4041, 753: 2.5494}),
 3: SparseVector(1024, {175: 3.269, 189: 3.768, 197: 4.2033, 343: 2.1211, 508: 1.9665, 571: 4.8965, 753: 2.5494, 853: 5.2795}),
 4: SparseVector(1024, {19: 4.9918, 28: 1.3463, 383: 4.0755, 822: 2.2607, 869: 2.0713, 920: 2.3492, 962: 4.3499}),
 5: SparseVector(1024, {}),
 6: SparseVector(1024, {52: 3.7248, 124: 3.0177, 335: 3.4877, 584: 3.0972, 626: 5.1543, 679: 2.328, 685: 1.4041, 1018: 1.696}),
 7: SparseVector(1024, {28: 1.3463, 97: 3.0675, 255: 1.1196, 321: 4.274, 452: 2.9702, 506: 1.2353, 581: 3.3924, 590: 4.2033, 720: 2.9636, 832: 4.2499, 978: 3.2601}),
 8: SparseVector(1024, {278: 2.3817, 424: 2.0931, 506: 1.2353, 

In [335]:
tfidf.count()

2943

In [336]:
tfidf.distinct().count()

2943

In [337]:
tfidf.take(3)

[(0,
  SparseVector(1024, {28: 1.3463, 82: 4.5535, 278: 2.3817, 562: 1.9617, 822: 2.2607, 920: 2.3492})),
 (1,
  SparseVector(1024, {28: 1.3463, 262: 4.4322, 452: 2.9702, 506: 2.4705, 685: 1.4041, 754: 3.8286, 822: 2.2607, 839: 4.1589, 1018: 1.696})),
 (2,
  SparseVector(1024, {89: 4.0557, 165: 6.5112, 343: 2.1211, 506: 1.2353, 507: 1.8117, 585: 3.768, 685: 1.4041, 753: 2.5494}))]

# Prepare the training data

In [350]:
def productNameToVector(line):
    """Replace a record's name key with its dense TF-IDF components.

    Args:
        line: sequence of at least six items. ``line[0]`` is the space-joined
            token text used as a key of ``bcTextData``; ``line[1]`` to
            ``line[5]`` are passed through unchanged (presumably year, month,
            day, score and price as produced by changeDataRDD — verify
            against the caller).

    Returns:
        tuple: the components of the document's TF-IDF vector followed by
        ``line[1]`` ... ``line[5]``.

    Raises:
        KeyError: if ``line[0]`` is not a key of ``bcTextData`` or its index
            is missing from ``tfidf_pairs``.
    """
    # text -> document index -> TF-IDF vector, both via the broadcast dicts.
    doc_index = bcTextData.value[line[0]]
    dense = tfidf_pairs.value[doc_index].toArray()
    return tuple(dense) + (line[1], line[2], line[3], line[4], line[5])

In [339]:
def changeDataRDD(data):
    """Replace a record's token list with its vocabulary-filtered text key.

    Args:
        data: sequence of at least six items; ``data[0]`` is the iterable of
            tokens and ``data[1]`` to ``data[5]`` are passed through unchanged.

    Returns:
        tuple: ``(key, data[1], data[2], data[3], data[4], data[5])`` where
        ``key`` is the tokens found in ``tokenList`` joined by single spaces.
    """
    kept = [w for w in data[0] if w in tokenList]

    # Build the key exactly the way textPair builds the dictionary keys
    # (single-space join, only space characters stripped) so that a later
    # lookup by this key cannot miss because of a different strip rule.
    key = " ".join(kept).strip(" ")

    return (key, data[1], data[2], data[3], data[4], data[5])

# Model training