In [1]:
import pyspark
from pyspark.sql import SparkSession
sc = pyspark.SparkContext('local[*]')
spark = SparkSession.builder.appName("PredictPrice").getOrCreate()

In [2]:
global Path
if sc.master[0:5] == "local":
    Path = "file:/home/jovyan/work/csvData/"
else:
    Path = "hdfs:/user/zeppelin/csvData/"

In [169]:
from operator import add
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, FloatType
import time
import math
import datetime
from pyspark.sql.functions import monotonically_increasing_id 
import jieba
import numpy as np
import random

In [4]:
productSchema = StructType([
    StructField("product_id", StringType(), True),
    StructField("category", StringType(), True),
    StructField("name", StringType(), True),
    StructField("price", IntegerType(), True),
    StructField("sale", IntegerType(), True),
    StructField("score", FloatType(), True),
    StructField("url", StringType(), True),
    StructField("imgurl", StringType(), True),
    StructField("update_time", DateType(), True)])

productDf = spark.read.csv(Path+"3c_product.csv",header=False,schema=productSchema)

In [5]:
productDf.select("product_id", "category", "name", "price", "sale", "score", "update_time").show(5)

+----------+--------+--------------------+-----+----+-----+-----------+
|product_id|category|                name|price|sale|score|update_time|
+----------+--------+--------------------+-----+----+-----+-----------+
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-23|
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-24|
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-25|
| 100000238| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1| -1.0| 2017-12-26|
| 100000411| Apple空機|迪士尼手機殼愛麗絲維尼史迪奇電鍍保...|  299|  -1|  5.0| 2017-12-23|
+----------+--------+--------------------+-----+----+-----+-----------+
only showing top 5 rows



In [6]:
dataDF = productDf.filter('category="iPhone充電傳輸"')
dataDF.show(5)

+----------+----------+--------------+-----+----+-----+--------------------+--------------------+-----------+
|product_id|  category|          name|price|sale|score|                 url|              imgurl|update_time|
+----------+----------+--------------+-----+----+-----+--------------------+--------------------+-----------+
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-23|
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-24|
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-25|
| 100033527|iPhone充電傳輸|現貨蘋果安卓兩用USB數據線|   99|  -1|  5.0|https://goo.gl/ea...|https://goo.gl/4k...| 2017-12-26|
|  10003468|iPhone充電傳輸|    iPhone6手機殼|  250|  -1| -1.0|https://goo.gl/7y...|https://goo.gl/Ss...| 2017-12-23|
+----------+----------+--------------+-----+----+-----+--------------------+--------------------+-----------+
only showi

In [7]:
productNameRDD = dataDF.select("name").rdd.distinct()
productNameRDD.take(5)

[Row(name='現貨蘋果安卓兩用USB數據線'),
 Row(name='rockspace一拖三充電線B款三合一2A快速充電傳輸線充電線TYPE安卓蘋果A00232'),
 Row(name='Lightning對35公釐耳機插孔轉接器'),
 Row(name='iPhone66s66splus47寸55寸氣囊手機套手機殼'),
 Row(name='BASEUS倍思機械時代蘋果iPhoneiOS304不銹鋼數據傳輸線21A快速充電線金屬線1米長')]

In [8]:
productNameRDD = productNameRDD.map(lambda x: x[0])
productNameRDD.take(5)

['現貨蘋果安卓兩用USB數據線',
 'rockspace一拖三充電線B款三合一2A快速充電傳輸線充電線TYPE安卓蘋果A00232',
 'Lightning對35公釐耳機插孔轉接器',
 'iPhone66s66splus47寸55寸氣囊手機套手機殼',
 'BASEUS倍思機械時代蘋果iPhoneiOS304不銹鋼數據傳輸線21A快速充電線金屬線1米長']

In [20]:
def split_jieba(line, cutMode):
    jieba.load_userdict("jieba_dict/productDict.txt")
    seg_list = jieba.cut(line, cut_all=cutMode)
    ls = []
    for w in seg_list:
        ls.append(w)
    
    return ls

In [11]:
splitData = productNameRDD.map(lambda x: split_jieba(x, False))
splitData.take(5)

[['現貨', '蘋果', '安卓', '兩用', 'USB', '數據線'],
 ['rockspace',
  '一拖',
  '三充',
  '電線',
  'B',
  '款',
  '三合一',
  '2A',
  '快速',
  '充電傳',
  '輸線',
  '充電線',
  'TYPE',
  '安卓',
  '蘋果',
  'A00232'],
 ['Lightning', '對', '35', '公釐耳機', '插孔', '轉', '接器'],
 ['iPhone66s66splus47', '寸', '55', '寸氣囊', '手機', '套手', '機殼'],
 ['BASEUS',
  '倍',
  '思機械',
  '時代',
  '蘋果',
  'iPhoneiOS304',
  '不銹鋼',
  '數據',
  '傳輸線',
  '21A',
  '快速',
  '充電線',
  '金屬線',
  '1',
  '米長']]

In [15]:
splitData.saveAsTextFile("jieba_text")

In [34]:
splitData2 = productNameRDD.map(lambda x: split_jieba(x, False))
splitData2.take(5)

[['現貨', '蘋果', '安卓', '兩用', 'USB', '數據線'],
 ['rockspace',
  '一拖',
  '三',
  '充電線',
  'B',
  '款',
  '三合一',
  '2A',
  '快速',
  '充電',
  '傳輸線',
  '充電線',
  'TYPE',
  '安卓',
  '蘋果',
  'A00232'],
 ['Lightning', '對', '35', '公釐', '耳機', '插孔', '轉接器'],
 ['iPhone6', '6s6', '6splus', '47', '寸', '55', '寸', '氣囊', '手機套', '手機殼'],
 ['BASEUS',
  '倍思',
  '機械',
  '時代',
  '蘋果',
  'iPhone',
  'iOS',
  '304',
  '不銹鋼',
  '數據',
  '傳輸線',
  '21A',
  '快速',
  '充電線',
  '金屬線',
  '1',
  '米長']]

In [64]:
splitData2.count()

2954

In [25]:
splitData2.saveAsTextFile("jieba_text2")

# doc2Vec Testing

In [31]:
import gensim

In [36]:
doc2VecData = splitData2.zipWithIndex()

In [49]:
doc2VecData.take(3)[0][0]

['現貨', '蘋果', '安卓', '兩用', 'USB', '數據線']

In [103]:
def word2LabledSentence(line):
    
    #sentences = []
    index = line[-1]
    wordList = line[0]
    
    sentence = gensim.models.doc2vec.TaggedDocument(list(wordList),[int(index)])
    #sentences.append(sentence)
    
    return sentence

In [104]:
doc2VecLabledRes = doc2VecData.map(lambda line: word2LabledSentence(line))

In [165]:
doc2Vec_train, doc2Vec_test = doc2VecLabledRes.randomSplit(weights=[0.7, 0.3])

In [166]:
doc2Vec_train.take(5)

[TaggedDocument(words=['現貨', '蘋果', '安卓', '兩用', 'USB', '數據線'], tags=[0]),
 TaggedDocument(words=['rockspace', '一拖', '三', '充電線', 'B', '款', '三合一', '2A', '快速', '充電', '傳輸線', '充電線', 'TYPE', '安卓', '蘋果', 'A00232'], tags=[1]),
 TaggedDocument(words=['Lightning', '對', '35', '公釐', '耳機', '插孔', '轉接器'], tags=[2]),
 TaggedDocument(words=['iPhone6', '6s6', '6splus', '47', '寸', '55', '寸', '氣囊', '手機套', '手機殼'], tags=[3]),
 TaggedDocument(words=['BASEUS', '倍思', '機械', '時代', '蘋果', 'iPhone', 'iOS', '304', '不銹鋼', '數據', '傳輸線', '21A', '快速', '充電線', '金屬線', '1', '米長'], tags=[4])]

In [105]:
sentences = doc2VecLabledRes.collect()

In [108]:
sentences[0]

TaggedDocument(words=['現貨', '蘋果', '安卓', '兩用', 'USB', '數據線'], tags=[0])

In [134]:
sentences[0].words

['現貨', '蘋果', '安卓', '兩用', 'USB', '數據線']

In [91]:
len(sentences)

2954

In [167]:
sentences_train = doc2Vec_train.collect()
sentences_test = doc2Vec_test.collect()

In [170]:
doc2VecModel = gensim.models.Doc2Vec(vector_size = 100, window = 300, min_count = 10)
doc2VecModel.build_vocab(sentences)

In [None]:
doc2VecModel.train(sentence_train, total_examples=doc2VecModel.corpus_count, epochs=doc2VecModel.epochs)

In [164]:
doc2VecModel.save('doc2VecModel.txt')

## 多次訓練model

In [168]:
for epochs in doc2VecModel.epochs:
    random.shuffle(sentences_train)
    doc2VecModel.train(sentence_train, total_examples=doc2VecModel.corpus_count, epochs=doc2VecModel.epochs)

5


## 測試model

In [144]:
test_text = ['IPhone6', '原廠頭', '原廠', '充電器', '原廠', '旅充頭', '豆腐頭', '小白頭'] 
inferred_vector_dm = doc2VecModel.infer_vector(test_text)  
print(inferred_vector_dm)

[ -8.19119625e-03   8.98340251e-03  -1.45160183e-02   3.52894366e-02
   2.78613176e-02   4.19940799e-02   1.90387610e-02   1.86485760e-02
  -2.77819391e-03  -6.45066379e-03   2.13023126e-02  -5.28194336e-03
  -4.22797129e-02  -7.30575575e-03   1.62484832e-02  -2.30166949e-02
   1.88346263e-02  -8.35506897e-03  -1.64519809e-02  -2.08902601e-02
  -3.12363449e-02  -1.37819545e-02  -1.34840915e-02  -1.28541040e-04
   1.60880201e-02   5.04784146e-03   1.08779809e-02   1.06303785e-02
  -1.30325472e-02   2.33874493e-03   1.53745078e-02  -1.62833985e-02
   9.21956077e-03   2.98496019e-02   2.61471011e-02  -1.29526863e-02
  -4.54749800e-02   8.97833332e-03   1.82300285e-02   2.14700494e-02
   6.71666069e-03  -1.29926195e-02  -7.81702809e-03  -1.51374061e-02
   2.27618702e-02   5.08051645e-03   1.83797926e-02   2.10271291e-02
  -8.51967931e-03   1.06016472e-02  -3.14428993e-02   2.46486701e-02
  -1.09694125e-02  -2.51980079e-03  -8.19945801e-03   1.20796952e-02
   1.32218553e-02  -1.08749662e-02

In [149]:
sims = doc2VecModel.docvecs.most_similar([inferred_vector_dm], topn=10)

In [146]:
for count, sim in sims:  
    sentence = sentences[count]  
    words = ''  
    for word in sentence.words:  
        words = words + word + ' '  
    print (words, sim, len(sentence[0]))

i4 硬殼  0.975570797920227 2
MARVEL iPhoneSE i5i5s 復 仇者 聯盟 時尚 電鍍 保護 軟套 DN1AM5  0.9748216271400452 11
MARVEL iPhone6 6s 復 仇者 聯盟 2D 立體 保護套 DN1AM6  0.9737220406532288 10
miki 醬 手機 水鑽殼 防塵塞 福袋 35mm iphone 344S 三星 S2notei9220 nokia N8 Sony 蝴蝶 結 珍珠 水 鑽 可愛 浪漫 手機套  0.9713416695594788 22
MELODY 按鍵 貼 指紋 按鍵 貼 iPhone 系列  0.9697436094284058 8
艾瑪 防水 布包 直立 手機 袋 經典 紅格紋 ZakkaUmahana 防水 包 防水 斜 背包 防水 手機 袋  0.9696629643440247 17
Iphone 透明 鋼化 玻璃 膜 9H  0.9690421223640442 6
MARVEL iPhone 6Plus 6sPlus 復 仇者 聯盟 2D 立體 保護套 DN1AMP  0.9675761461257935 11
優惠價 Apple WatchSeries1 軍規 防 摔 防水 42mm 黑黑  0.9674739837646484 9
HAO 授權 代理 HAO 鏡頭 框 iPhone6 S 6Plus iPhone6 S6 鏡頭 保護框  0.9673609733581543 13


In [158]:
print ('examing')
doc_id = np.random.randint(doc2VecModel.docvecs.count)  # pick random doc; re-run cell for more examples
print('for doc %d...' % doc_id)
inferred_docvec = doc2VecModel.infer_vector(sentences[doc_id].words)
print('%s:\n %s' % (doc2VecModel, doc2VecModel.docvecs.most_similar([inferred_docvec], topn=3)))

examing
for doc 2639...
Doc2Vec(dm/m,d100,n5,w300,mc10,s0.001,t3):
 [(665, 0.9527255296707153), (2719, 0.9526031613349915), (271, 0.9502585530281067)]


In [160]:
sentences[2639]

TaggedDocument(words=['PQI', '蘋果', '認證', 'MFi', '圓線', 'Lightning', '傳輸', '100cm', '充電', 'iPhone6', '78X'], tags=[2639])

In [159]:
sentences[665]

TaggedDocument(words=['西屯彩殼', 'AvierLinePro', 'Lightning', '極速', '鋅', '合金', '編織線', '快速', '充電', '鋅', '合金', '1M'], tags=[665])