# 前置作業

In [1]:
import pyspark
from pyspark.sql import SparkSession
# Start a Spark context on the 'local[*]' master, then obtain a SparkSession
# whose builder is configured with the application name "HotSale".
# NOTE(review): the context already exists when the builder runs, so
# getOrCreate() presumably reuses it — confirm the app name is applied.
sc = pyspark.SparkContext('local[*]')
spark = SparkSession.builder.appName("HotSale").getOrCreate()

In [2]:
# Root location of the CSV files: a "file:" URI when the Spark master string
# starts with "local", otherwise an "hdfs:" URI.
Path = (
    "file:/home/jovyan/work/csvData/CSV/"
    if sc.master.startswith("local")
    else "hdfs:/user/zeppelin/csvData/CSV/"
)

In [3]:
from operator import add
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, FloatType
import time
import math
import datetime
from pyspark.sql.functions import monotonically_increasing_id, array_contains
import re

In [4]:
# Schema of one product record: column name paired with its Spark type, in
# column order. Every field is declared nullable.
productSchema = StructType([
    StructField(column, dtype(), True)
    for column, dtype in (
        ("item_id", IntegerType),
        ("shop_id", IntegerType),
        ("shop_name", StringType),
        ("category_name", StringType),
        ("name", StringType),
        ("hashtag", StringType),
        ("trade_in", FloatType),
        ("price", IntegerType),
        ("sale", IntegerType),
        ("score_avg", FloatType),
        ("score_num", IntegerType),
        ("stock", IntegerType),
        ("need_day", IntegerType),
        ("url", StringType),
        ("content", StringType),
        ("update_time", DateType),
    )
])

#productDf = spark.read.csv(Path+"2018-04-01 shopee_product.csv",header=False,schema=productSchema,quote='')

In [5]:
def makeRDD(line):
    """Parse one comma-separated product line into a list of typed values.

    The header line (first field equal to "item_id") is returned as the plain
    list of split strings. For any other line, every empty field is replaced
    by the integer 0 (text columns included) and the first 15 fields are
    returned: columns 0, 1, 7, 8, 10, 11 and 12 cast to int, columns 6 and 9
    cast to float, and the remaining columns passed through unchanged.

    Splitting is a plain ``str.split(",")``, so commas inside a field are not
    treated specially.

    Raises:
        IndexError: if a data line has fewer than 15 fields.
        ValueError: if a numeric column cannot be parsed.
    """
    fields = line.split(",")

    # Header rows are handed back as-is.
    if fields[0] == "item_id":
        return fields

    # One entry per output column; `keep` marks columns returned without a cast.
    keep = None
    casts = (int, int, keep, keep, keep, keep, float, int, int, float, int, int, int, keep, keep)

    # Blank cells become the integer 0 before any casting happens.
    cleaned = [0 if cell == "" else cell for cell in fields]

    return [
        cleaned[pos] if cast is keep else cast(cleaned[pos])
        for pos, cast in enumerate(casts)
    ]

In [6]:
# Start from an empty RDD; the loading loop unions each daily file onto it.
productRDD = sc.parallelize([])

In [7]:
# Load the daily product files for 2018-04-01 through 2018-04-11 and union
# them onto productRDD, appending each file's date as the last element of
# every row.
for i in range(1, 12):
    # Zero-padded day number: one expression covers both 1-9 and 10+, so no
    # value of i can fall through and reuse the previous iteration's values.
    day_text = "2018-04-{:02d}".format(i)
    filename = Path + day_text + " shopee_product.csv"
    date = datetime.datetime.strptime(day_text, '%Y-%m-%d')
    # Bind the current date as a default argument so each file's rows carry
    # their own date regardless of when the closure is evaluated or pickled.
    readRDD = sc.textFile(filename).map(makeRDD).map(lambda line, date=date: tuple(line) + (date,))
    productRDD = productRDD.union(readRDD)

In [8]:
# Drop the header rows (makeRDD passes them through with "item_id" as the
# first field), then preview three parsed records.
productRDD = productRDD.filter(lambda line: line[0]!="item_id")
productRDD.take(3)

[(290925436,
  2022130,
  'yiting1211',
  'iPhone保護殼',
  '可超取iphone6__iphone6S_47吋_迪士尼_米奇放大鏡_軟殼_手機殼',
  0,
  9.0,
  100,
  0,
  0.0,
  0,
  0,
  0,
  'https://shopee.tw/%E3%80%90%E5%8F%AF%E8%B6%85%E5%8F%96%E3%80%91iphone6-iphone6S-(4.7%E5%90%8B)-%E8%BF%AA%E5%A3%AB%E5%B0%BC-%E7%B1%B3%E5%A5%87%E6%94%BE%E5%A4%A7%E9%8F%A1-%E8%BB%9F%E6%AE%BC-%E6%89%8B%E6%A9%9F%E6%AE%BC-i.2022130.290925436',
  '可超取iphone6__iphone6S_47吋_迪士尼_米奇放大鏡_軟殼_手機殼售價150元',
  datetime.datetime(2018, 4, 1, 0, 0)),
 (1008696376,
  59323996,
  'kaixina',
  'Android保護殼',
  '潮流女神欧美爆款日韩名媛潮英伦学院风摇滚庞克仿旧百搭厚底坡跟涼鞋夏2017韓版新款羅馬鞋超高跟防水臺鏤空甜美露趾女鞋',
  '#女鞋#歐美#羅馬風格#簡約#小白鞋#英倫#韓版#休閒鞋#粗跟#綁帶#高跟鞋#復古#運動鞋#洞洞鞋#平底鞋#帆布鞋#拖鞋#涼鞋',
  4.3,
  800,
  0,
  0.0,
  0,
  120,
  -1,
  'https://shopee.tw/%E6%BD%AE%E6%B5%81%E5%A5%B3%E7%A5%9E%E6%AC%A7%E7%BE%8E%E7%88%86%E6%AC%BE%E6%97%A5%E9%9F%A9%E5%90%8D%E5%AA%9B%E6%BD%AE%E8%8B%B1%E4%BC%A6%E5%AD%A6%E9%99%A2%E9%A3%8E%E6%91%87%E6%BB%9A%E5%BA%9E%E5%85%8B%E4%BB%BF%E6%97%A7%E7%99%BE%E6%90%AD%E5%8E%9A%E5%BA%95%E5%9D%A1%E8%

In [9]:
# Build a DataFrame from the parsed rows using productSchema and preview it.
productDF = spark.createDataFrame(productRDD, productSchema)
productDF.show(3)

+----------+--------+----------+-------------+--------------------+--------------------+--------+-----+----+---------+---------+-----+--------+--------------------+--------------------+-----------+
|   item_id| shop_id| shop_name|category_name|                name|             hashtag|trade_in|price|sale|score_avg|score_num|stock|need_day|                 url|             content|update_time|
+----------+--------+----------+-------------+--------------------+--------------------+--------+-----+----+---------+---------+-----+--------+--------------------+--------------------+-----------+
| 290925436| 2022130|yiting1211|    iPhone保護殼|可超取iphone6__iphon...|                   0|     9.0|  100|   0|      0.0|        0|    0|       0|https://shopee.tw...|可超取iphone6__iphon...| 2018-04-01|
|1008696376|59323996|   kaixina|   Android保護殼|潮流女神欧美爆款日韩名媛潮英伦学院...|#女鞋#歐美#羅馬風格#簡約#小白...|     4.3|  800|   0|      0.0|        0|  120|      -1|https://shopee.tw...|                   _| 2018-04-01|
| 24493544

In [10]:
# Total number of product rows across all loaded files.
productDF.count()

169992

In [11]:
# Keep only the columns used by the analysis below.
# NOTE(review): despite the "RDD" suffix this is the DataFrame returned by
# select(); later cells reach the underlying RDD through dataRDD.rdd.
dataRDD = productDF.select("item_id", "category_name", "name", "price", "sale", "score_avg", "score_num", "stock", "update_time")

In [12]:
# Preview the selected columns as Row objects.
dataRDD.rdd.take(3)

[Row(item_id=290925436, category_name='iPhone保護殼', name='可超取iphone6__iphone6S_47吋_迪士尼_米奇放大鏡_軟殼_手機殼', price=100, sale=0, score_avg=0.0, score_num=0, stock=0, update_time=datetime.date(2018, 4, 1)),
 Row(item_id=1008696376, category_name='Android保護殼', name='潮流女神欧美爆款日韩名媛潮英伦学院风摇滚庞克仿旧百搭厚底坡跟涼鞋夏2017韓版新款羅馬鞋超高跟防水臺鏤空甜美露趾女鞋', price=800, sale=0, score_avg=0.0, score_num=0, stock=120, update_time=datetime.date(2018, 4, 1)),
 Row(item_id=244935440, category_name='iPhone充電傳輸', name='微型商店Jetart_Lightning_USB_強化傳輸線15m_IPHONE_充電線_CAA220', price=499, sale=0, score_avg=0.0, score_num=0, stock=99, update_time=datetime.date(2018, 4, 1))]

# 依據 item_id 找出每個商品的最大月銷量與最小月銷量

In [13]:
# Pair each row's item_id with its sale value so sales can be aggregated per
# item, then count the pairs.
itemSaleDataRDD = dataRDD.rdd.map(lambda row: (row["item_id"], row["sale"]))
itemSaleDataRDD.count()

169992

In [14]:
# Sample of the (item_id, sale) pairs.
itemSaleDataRDD.take(3)

[(290925436, 0), (1008696376, 0), (244935440, 0)]

In [15]:
# Largest sale value seen for each item_id; the count is the number of
# distinct item ids.
itemMaxSaleDataRDD = itemSaleDataRDD.reduceByKey(max)
itemMaxSaleDataRDD.count()

70408

In [16]:
# Smallest sale value seen for each item_id; the count is the number of
# distinct item ids.
itemMinSaleDataRDD = itemSaleDataRDD.reduceByKey(min)
itemMinSaleDataRDD.count()

70408

In [17]:
# Join both aggregates on item_id, giving (item_id, (max_sale, min_sale)).
itemMonSaleDataRDD = itemMaxSaleDataRDD.join(itemMinSaleDataRDD)
itemMonSaleDataRDD.count()

70408

In [18]:
# Sample of the joined (item_id, (max_sale, min_sale)) records.
itemMonSaleDataRDD.take(3)

[(469012152, (0, 0)), (545299512, (0, 0)), (862832544, (0, 0))]

In [None]:
# Distinct (item_id, name) pairs and how many there are.
# NOTE(review): the variable says "Cat" but the selected column is "name";
# this cell also has no execution count, so it appears never to have been run.
itemCatDataRDD = dataRDD.select("item_id", "name").rdd.distinct()
itemCatDataRDD.count()

In [13]:
# Distinct (item_id, category_name) pairs and how many there are.
# NOTE(review): the variable says "Name" but the selected column is
# "category_name" — the names of this cell and the previous one look swapped.
itemNameDataRDD = dataRDD.select("item_id", "category_name").distinct().rdd
itemNameDataRDD.count()

70412

In [16]:
# Items listed under more than one category: gather the category names per
# item_id and keep only the ids that have several.
(
    itemNameDataRDD
    .groupByKey()
    .mapValues(list)
    .filter(lambda pair: len(pair[1]) > 1)
    .collect()
)

[(975556617, ['掌上型電玩', '行動電源']),
 (1007626624, ['行動電源', '汽車百貨']),
 (1018676515, ['影音設備', '耳機喇叭']),
 (29332776, ['平板電腦', '其他'])]