# CHAPTER 9 - EXERCISE 5

In [1]:
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession

## Nhập dữ liệu

In [2]:
ss= SparkSession.builder.appName('Chapter 9 - Exercise 5').getOrCreate()

In [3]:
path= '/Users/vovanthuong/Desktop/9 - Big Data in Machine Learning/Data/Chapter9/LDS9_Data_Day_7/instacart_2017_05_01/order_products__train.csv'
df= ss.read.csv(path, header= True, inferSchema= True)

In [4]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- add_to_cart_order: integer (nullable = true)
 |-- reordered: integer (nullable = true)



In [5]:
df.show(10)

+--------+----------+-----------------+---------+
|order_id|product_id|add_to_cart_order|reordered|
+--------+----------+-----------------+---------+
|       1|     49302|                1|        1|
|       1|     11109|                2|        1|
|       1|     10246|                3|        0|
|       1|     49683|                4|        0|
|       1|     43633|                5|        1|
|       1|     13176|                6|        0|
|       1|     47209|                7|        0|
|       1|     22035|                8|        1|
|      36|     39612|                1|        0|
|      36|     19660|                2|        1|
+--------+----------+-----------------+---------+
only showing top 10 rows



## Xử lý dữ liệu

In [11]:
from pyspark.sql.functions import collect_set
df_final= df.groupBy('order_id').agg(collect_set('product_id').alias('product_id'))

In [12]:
df_final.show(5)

+--------+--------------------+
|order_id|          product_id|
+--------+--------------------+
|    1342|[30827, 3798, 149...|
|    1591|[48246, 44116, 24...|
|    4519|             [29270]|
|    4935|             [45190]|
|    6357|[33731, 14669, 43...|
+--------+--------------------+
only showing top 5 rows



## Tạo mô hình

In [13]:
from pyspark.ml.fpm import FPGrowth
fpGrowth = FPGrowth(itemsCol="product_id", minSupport=0.003, minConfidence=0.003)
model = fpGrowth.fit(df_final)

In [33]:
freq_item_sets= model.freqItemsets
freq_item_sets.show()

+--------------------+-----+
|               items| freq|
+--------------------+-----+
|             [13629]|  772|
|              [5194]|  475|
|             [24852]|18726|
|             [13176]|15480|
|             [35921]|  769|
|             [20345]|  473|
|             [21137]|10894|
|      [21137, 13176]| 3074|
|      [21137, 24852]| 2174|
|             [23165]|  764|
|             [13380]|  473|
|              [7969]|  472|
|             [21903]| 9784|
|      [21903, 21137]| 1639|
|[21903, 21137, 13...|  587|
|      [21903, 13176]| 2236|
|      [21903, 24852]| 2000|
|             [32478]|  763|
|             [47626]| 8135|
|      [47626, 21137]| 1017|
+--------------------+-----+
only showing top 20 rows



In [29]:
prediction_items_set= model.transform(df_final)
prediction_items_set.show()

+--------+--------------------+--------------------+
|order_id|          product_id|          prediction|
+--------+--------------------+--------------------+
|    1342|[30827, 3798, 149...|[21903, 47626, 47...|
|    1591|[48246, 44116, 24...|[21137, 21903, 47...|
|    4519|             [29270]|                  []|
|    4935|             [45190]|                  []|
|    6357|[33731, 14669, 43...|[21137, 21903, 47...|
|   10362|[28522, 43789, 12...|[21137, 47626, 47...|
|   19204|[45255, 37285, 48...|                  []|
|   29601|[2716, 48057, 219...|[21137, 21903, 47...|
|   31035|[40723, 8174, 131...|[21137, 21903, 47...|
|   40011|[27292, 35213, 21...|[21137, 13176, 24...|
|   46266|[38558, 48642, 13...|[47626, 47766, 47...|
|   51607|[41390, 42752, 17...|                  []|
|   58797|[30827, 8803, 326...|[21137, 21903, 47...|
|   61793|[26348, 6184, 433...|[21137, 16797, 39...|
|   67089|[47766, 29388, 21...|[47626, 21137, 47...|
|   70863|[34791, 2618, 173...|      [13176, 2

## Sử dụng tên product thay thế cho product id

Thay thế id bằng name ở bảng data nguồn

In [16]:
path_product_names= '/Users/vovanthuong/Desktop/9 - Big Data in Machine Learning/Data/Chapter9/LDS9_Data_Day_7/instacart_2017_05_01/products.csv'
product_names= ss.read.csv(path_product_names, header= True, inferSchema= True)

In [17]:
product_names.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- aisle_id: string (nullable = true)
 |-- department_id: string (nullable = true)



In [18]:
product_names.show(5)

+----------+--------------------+--------+-------------+
|product_id|        product_name|aisle_id|department_id|
+----------+--------------------+--------+-------------+
|         1|Chocolate Sandwic...|      61|           19|
|         2|    All-Seasons Salt|     104|           13|
|         3|Robust Golden Uns...|      94|            7|
|         4|Smart Ones Classi...|      38|            1|
|         5|Green Chile Anyti...|       5|           13|
+----------+--------------------+--------+-------------+
only showing top 5 rows



In [19]:
df_new= df.join(other= product_names.select('product_id', 'product_name'),
                how= 'left', on= 'product_id')

In [20]:
df_new.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- add_to_cart_order: integer (nullable = true)
 |-- reordered: integer (nullable = true)
 |-- product_name: string (nullable = true)



In [21]:
df_new.show(5)

+----------+--------+-----------------+---------+--------------------+
|product_id|order_id|add_to_cart_order|reordered|        product_name|
+----------+--------+-----------------+---------+--------------------+
|     49302|       1|                1|        1|    Bulgarian Yogurt|
|     11109|       1|                2|        1|Organic 4% Milk F...|
|     10246|       1|                3|        0|Organic Celery He...|
|     49683|       1|                4|        0|      Cucumber Kirby|
|     43633|       1|                5|        1|Lightly Smoked Sa...|
+----------+--------+-----------------+---------+--------------------+
only showing top 5 rows



In [22]:
df_new_final= df_new.groupBy('order_id').agg(collect_set('product_name').alias('product_name'))

In [24]:
# Tạo mô hình
fpGrowth_new = FPGrowth(itemsCol="product_name", minSupport=0.003, minConfidence=0.003)
model_new = fpGrowth_new.fit(df_new_final)

In [32]:
freq_item_sets_by_name= model_new.freqItemsets
freq_item_sets_by_name.show()

+--------------------+-----+
|               items| freq|
+--------------------+-----+
|[Organic Tomato B...|  772|
|[Organic Spinach ...|  475|
|            [Banana]|18726|
|[Bag of Organic B...|15480|
|[Organic Large Gr...|  769|
|[Organic Blue Cor...|  473|
|[Organic Strawber...|10894|
|[Organic Strawber...| 3074|
|[Organic Strawber...| 2174|
|      [Organic Leek]|  764|
|[Thin Crust Peppe...|  473|
|              [Lime]|  472|
|[Organic Baby Spi...| 9784|
|[Organic Baby Spi...| 1639|
|[Organic Baby Spi...|  587|
|[Organic Baby Spi...| 2236|
|[Organic Baby Spi...| 2000|
|[Reduced Fat 2% M...|  763|
|       [Large Lemon]| 8135|
|[Large Lemon, Org...| 1017|
+--------------------+-----+
only showing top 20 rows



In [47]:
freq_item_sets_by_name.printSchema()

root
 |-- items: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- freq: long (nullable = false)



In [27]:
prediction_items_set_by_name= model_new.transform(df_new_final)
prediction_items_set_by_name.show()

+--------+--------------------+--------------------+
|order_id|        product_name|          prediction|
+--------+--------------------+--------------------+
|    1342|[Raw Shrimp, Seed...|[Organic Baby Spi...|
|    1591|[Cracked Wheat, S...|[Organic Strawber...|
|    4519|[Beet Apple Carro...|                  []|
|    4935|             [Vodka]|                  []|
|    6357|[Globe Eggplant, ...|[Organic Strawber...|
|   10362|[Organic Baby Spi...|[Organic Strawber...|
|   19204|[Reduced Fat Crac...|                  []|
|   29601|[Organic Red Onio...|[Organic Strawber...|
|   31035|[Organic Cripps P...|[Organic Strawber...|
|   40011|[Organic Baby Spi...|[Organic Strawber...|
|   46266|[Uncured Beef Hot...|[Large Lemon, Org...|
|   51607|[Donut House Choc...|                  []|
|   58797|[Concentrated But...|[Organic Strawber...|
|   61793|[Raspberries, Gre...|[Organic Strawber...|
|   67089|[Original Tofurky...|[Organic Strawber...|
|   70863|[Extra Hold Non-A...|[Bag of Organic

In [46]:
prediction_items_set_by_name.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- product_name: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- prediction: array (nullable = true)
 |    |-- element: string (containsNull = true)

