In [1]:
import pyspark
from pyspark.sql import SparkSession
sc = pyspark.SparkContext('local[*]')
spark = SparkSession.builder.appName("PredictPrice").getOrCreate()

In [2]:
global Path
if sc.master[0:5] == "local":
    Path = "file:/home/jovyan/work/csvData/"
else:
    Path = "hdfs:/user/zeppelin/csvData/"

In [3]:
from operator import add
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, FloatType
import time
import math
import datetime
from pyspark.sql.functions import monotonically_increasing_id 
import jieba
import numpy as np
import random

In [4]:
productSchema = StructType([
    StructField("product_id", StringType(), True),
    StructField("category", StringType(), True),
    StructField("name", StringType(), True),
    StructField("price", IntegerType(), True),
    StructField("sale", IntegerType(), True),
    StructField("score", FloatType(), True),
    StructField("url", StringType(), True),
    StructField("imgurl", StringType(), True),
    StructField("update_time", DateType(), True)])

productDf = spark.read.csv(Path+"3c_product.csv",header=False,schema=productSchema)

In [5]:
productDf.select("product_id", "category", "name", "price", "sale", "score", "update_time").show(5)

+----------+--------+--------------------+-----+----+-----+-----------+
|product_id|category|                name|price|sale|score|update_time|
+----------+--------+--------------------+-----+----+-----+-----------+
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-23|
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-24|
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-25|
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-26|
| 100000411| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1|  5.0| 2017-12-23|
+----------+--------+--------------------+-----+----+-----+-----------+
only showing top 5 rows



In [6]:
dataDF = productDf.filter('category="iPhone充電傳輸"')
dataDF.show(5)

+----------+----------+--------------+-----+----+-----+--------------------+--------------------+-----------+
|product_id|  category|          name|price|sale|score|                 url|              imgurl|update_time|
+----------+----------+--------------+-----+----+-----+--------------------+--------------------+-----------+
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-23|
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-24|
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-25|
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-26|
|  10003468|iPhone充電傳輸|    iPhone6手機殼|  250|  -1| -1.0|https://goo.gl/7y...|https://goo.gl/Ss...| 2017-12-23|
+----------+----------+--------------+-----+----+-----+--------------------+--------------------+-----------+
only showi

In [7]:
productNameRDD = dataDF.select("name").rdd.distinct()
productNameRDD.take(5)

[Row(name='現貨蘋果安卓兩用USB數據線'),
 Row(name='rockspace一拖三充電線B款三合一2A快速充電傳輸線充電線TYPE安卓蘋果A00232'),
 Row(name='Lightning對35公釐耳機插孔轉接器'),
 Row(name='iPhone66s66splus47寸55寸氣囊手機套手機殼'),
 Row(name='BASEUS倍思機械時代蘋果iPhoneiOS304不銹鋼數據傳輸線21A快速充電線金屬線1米長')]

In [8]:
productNameRDD = productNameRDD.map(lambda x: x[0])
productNameRDD.take(5)

['現貨蘋果安卓兩用USB數據線',
 'rockspace一拖三充電線B款三合一2A快速充電傳輸線充電線TYPE安卓蘋果A00232',
 'Lightning對35公釐耳機插孔轉接器',
 'iPhone66s66splus47寸55寸氣囊手機套手機殼',
 'BASEUS倍思機械時代蘋果iPhoneiOS304不銹鋼數據傳輸線21A快速充電線金屬線1米長']

In [9]:
def split_jieba(line, cutMode):
    #轉小寫
    line = line.lower()
    #結巴切字
    jieba.load_userdict("jieba_dict/productDict.txt")
    seg_list = jieba.cut(line, cut_all=cutMode)
    ls = []
    for w in seg_list:
        ls.append(w)
    
    return ls

In [10]:
splitData = productNameRDD.map(lambda x: split_jieba(x, False))
splitData.take(5)

[['現貨', '蘋果', '安卓', '兩用', 'usb', '數據線'],
 ['rockspace',
  '一拖',
  '三',
  '充電線',
  'b',
  '款',
  '三合一',
  '2a',
  '快速',
  '充電',
  '傳輸線',
  '充電線',
  'type',
  '安卓',
  '蘋果',
  'a00232'],
 ['lightning', '對', '35', '公釐', '耳機', '插孔', '轉接器'],
 ['iphone6', '6s', '6', '6splus', '47', '寸', '55', '寸', '氣囊', '手機套', '手機殼'],
 ['baseus',
  '倍思',
  '機械',
  '時代',
  '蘋果',
  'iphone',
  'ios',
  '304',
  '不銹鋼',
  '數據傳輸線',
  '21a',
  '快速',
  '充電線',
  '金屬',
  '線',
  '1',
  '米長']]

In [11]:
splitData.saveAsTextFile("jieba_text")

Py4JJavaError: An error occurred while calling o105.saveAsTextFile.
: org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory file:/home/jovyan/work/jieba_text already exists
	at org.apache.hadoop.mapred.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:131)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1119)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1096)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1096)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1096)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:1070)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1035)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1035)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1035)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:961)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:961)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:961)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:960)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1489)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1468)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1468)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1468)
	at org.apache.spark.api.java.JavaRDDLike$class.saveAsTextFile(JavaRDDLike.scala:550)
	at org.apache.spark.api.java.AbstractJavaRDDLike.saveAsTextFile(JavaRDDLike.scala:45)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)


In [14]:
splitData2 = productNameRDD.map(lambda x: split_jieba(x, False))
splitData2.take(5)

[['現貨', '蘋果', '安卓', '兩用', 'usb', '數據線'],
 ['rockspace',
  '一拖',
  '三',
  '充電線',
  'b',
  '款',
  '三合一',
  '2a',
  '快速',
  '充電',
  '傳輸線',
  '充電線',
  'type',
  '安卓',
  '蘋果',
  'a00232'],
 ['lightning', '對', '35', '公釐', '耳機', '插孔', '轉接器'],
 ['iphone6', '6s', '6', '6splus', '47', '寸', '55', '寸', '氣囊', '手機套', '手機殼'],
 ['baseus',
  '倍思',
  '機械',
  '時代',
  '蘋果',
  'iphone',
  'ios',
  '304',
  '不銹鋼',
  '數據傳輸線',
  '21a',
  '快速',
  '充電線',
  '金屬',
  '線',
  '1',
  '米長']]

In [15]:
splitData2.count()

2954

In [None]:
splitData2.saveAsTextFile("jieba_text2")

# doc2Vec Testing

In [12]:
import gensim

In [16]:
doc2VecData = splitData2.zipWithIndex()

In [17]:
doc2VecData.take(3)

[(['現貨', '蘋果', '安卓', '兩用', 'usb', '數據線'], 0),
 (['rockspace',
   '一拖',
   '三',
   '充電線',
   'b',
   '款',
   '三合一',
   '2a',
   '快速',
   '充電',
   '傳輸線',
   '充電線',
   'type',
   '安卓',
   '蘋果',
   'a00232'],
  1),
 (['lightning', '對', '35', '公釐', '耳機', '插孔', '轉接器'], 2)]

In [18]:
def word2LabledSentence(line):
    
    #sentences = []
    index = line[-1]
    wordList = line[0]
    
    sentence = gensim.models.doc2vec.TaggedDocument(list(wordList),[int(index)])
    #sentences.append(sentence)
    
    return sentence

In [19]:
doc2VecLabledRes = doc2VecData.map(lambda line: word2LabledSentence(line))

In [20]:
doc2Vec_train, doc2Vec_test = doc2VecLabledRes.randomSplit(weights=[0.7, 0.3])

In [21]:
doc2Vec_train.take(5)

[TaggedDocument(words=['現貨', '蘋果', '安卓', '兩用', 'usb', '數據線'], tags=[0]),
 TaggedDocument(words=['baseus', '倍思', '機械', '時代', '蘋果', 'iphone', 'ios', '304', '不銹鋼', '數據傳輸線', '21a', '快速', '充電線', '金屬', '線', '1', '米長'], tags=[4]),
 TaggedDocument(words=['安卓', '旅充'], tags=[5]),
 TaggedDocument(words=['iphone7', '二合一', '轉接線', '聽歌', '充電', '23cm', '充電線', '傳輸線', '數據線', '耳機線', 'apple', '餅乾', '盒子'], tags=[6]),
 TaggedDocument(words=['apple', '原廠', 'lightning', '1', '米', '傳輸', '充電線', '盒', '白色'], tags=[7])]

In [23]:
sentences = doc2VecLabledRes.collect()

In [None]:
sentences[0]

In [None]:
sentences[0].words

In [None]:
len(sentences)

In [25]:
sentences_train = doc2Vec_train.collect()
sentences_test = doc2Vec_test.collect()

In [None]:
sentences_train[5]

In [None]:
sentences_test[5]

In [24]:
doc2VecModel = gensim.models.Doc2Vec(vector_size = 100, window = 300, min_count = 10)
doc2VecModel.build_vocab(sentences)

In [26]:
doc2VecModel.train(sentences_train, total_examples=doc2VecModel.corpus_count, epochs=doc2VecModel.epochs)

In [None]:
doc2VecModel.save('doc2VecModel.txt')

## 多次訓練model

In [None]:
for i in range(doc2VecModel.epochs):
    random.shuffle(sentences_train)
    doc2VecModel.train(sentences_train, total_examples=doc2VecModel.corpus_count, epochs=doc2VecModel.epochs)

## 測試model

In [27]:
test_text = ['iphone6', '原廠頭', '原廠', '充電器', '原廠', '旅充頭', '豆腐頭', '小白頭'] 
test_text = [ttext.lower() for ttext in test_text]
print(test_text)
inferred_vector_dm = doc2VecModel.infer_vector(test_text)  
print(inferred_vector_dm)

['iphone6', '原廠頭', '原廠', '充電器', '原廠', '旅充頭', '豆腐頭', '小白頭']
[ 0.00104064  0.00875346 -0.00494684  0.01582915 -0.01129576 -0.00538387
 -0.00010502 -0.00941696 -0.01537951  0.00453918  0.00573319  0.00335362
  0.01418851 -0.00730898 -0.00172687  0.01586983  0.00114245 -0.01478707
 -0.0040171  -0.00699868 -0.00513742  0.0133495  -0.00651791  0.00659733
 -0.03482292 -0.00317227 -0.01277168 -0.01642768 -0.00223777 -0.01886079
  0.01690168  0.00183342 -0.0074257   0.00464228 -0.00178078  0.01182218
 -0.01808977  0.01031072 -0.00455862 -0.00278528  0.00655268 -0.00729899
 -0.01468212 -0.0115329  -0.00599601  0.00398471  0.01064988  0.03122716
  0.01174357 -0.00863213 -0.00787032 -0.01046648  0.00331481  0.00478667
  0.00074468 -0.01148947  0.01398122 -0.01073315 -0.01232549 -0.00488984
 -0.00170364 -0.00326912  0.00800699 -0.00570825  0.00383571  0.00604523
  0.01162691 -0.0072771   0.01250805  0.01721676  0.00900854  0.01622242
  0.00634637  0.01333471 -0.00365328 -0.00724058  0.01011912  0.0

In [28]:
sims = doc2VecModel.docvecs.most_similar([inferred_vector_dm], topn=10)

In [29]:
for count, sim in sims:  
    print(count)
    sentence = sentences[count]  
    words = ''  
    for word in sentence.words:  
        words = words + word + ' '  
    print (words, sim, len(sentence[0]))

672
台灣 代理 benks 小時代 黑 白色 雙 usb 口 快速 充電器 雙孔 usb 轉 接插 頭 插座 ac 充電器  0.9659518003463745 19
592
急 出貨 玫瑰金 電鍍 邊框 手機殼 透明 47 吋 55 吋 保護殼 保護套 手機套 for iphone6 6splus  0.9655072689056396 17
998
創意 手機 耳機線 保護套 糖果 色 耳機 收納 收線 繞線器 數據線 保護套 蘋果 專用  0.9646084308624268 14
1505
保證 原廠 29w 盒裝 快充頭 lightning to usb c 快充線 原廠線 iphonex 8plus  0.9644762277603149 13
1386
倍思 豌豆 數莢 磁吸線 夾據線 收納 集線器 便捷 創意 禮品 桌面 整理 卡扣  0.9643889665603638 13
2689
just mobile alu cable flat 鋁質 接頭 12 米 傳輸 扁線 lightning 蘋果 認證 充電線 4 色選擇  0.9643799066543579 17
967
479 款 iphone kitty 卡娜 赫拉 角落 生物 line 熊大兔 兔 指紋 辨識 貼 按鍵 貼 home 貼  0.9639114141464233 18
2282
公司 貨 garmmahellokitty 入耳式 麥克風 耳機 拉 鍊 式 耳機 可講 電話 線控 功能 來電 接聽  0.9639077186584473 16
1093
韓國 原裝 進口 正品 小叮鈴 line 熊大兔 兔莎莉 充電線 傳輸線 i7 iphone7plus i6s  0.9632796049118042 13
1563
蘋果 iphone6 6s 6plus 手機殼 防 摔 全包 超薄 矽膠 套 可愛 女 新款 47 特價 66  0.963187575340271 17


In [30]:
print ('examing')
doc_id = np.random.randint(doc2VecModel.docvecs.count)  # pick random doc; re-run cell for more examples
print('for doc %d...' % doc_id)
inferred_docvec = doc2VecModel.infer_vector(sentences[doc_id].words)
print('%s:\n %s' % (doc2VecModel, doc2VecModel.docvecs.most_similar([inferred_docvec], topn=3)))

examing
for doc 2572...
Doc2Vec(dm/m,d100,n5,w300,mc10,s0.001,t3):
 [(967, 0.9653408527374268), (212, 0.9621968865394592), (1249, 0.961844801902771)]


In [None]:
sentences[1112]

In [None]:
sentences[2283]