In [1]:
import findspark
# Locate the local Spark installation and put pyspark on sys.path, so that the
# pyspark imports in the following cell can be resolved.
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf

In [3]:
# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('bakery')
    .getOrCreate()
)

In [4]:
from pyspark.ml.fpm import FPGrowth

In [5]:
# Load the raw transaction lines. header=False means the first row is data, so
# Spark assigns the positional column names _c0, _c1, _c2.
df = spark.read.csv(
    path='75000i.csv',
    header=False,
    inferSchema=True,
)
df.show(n=5)

+---+---+---+
|_c0|_c1|_c2|
+---+---+---+
|  1|  1| 21|
|  1|  5| 11|
|  2|  1|  7|
|  2|  3| 11|
|  2|  4| 37|
+---+---+---+
only showing top 5 rows



In [6]:
# Discard the middle column; only _c0 and _c2 are used from here on.
df = df.drop('_c1')
df.show(n=5)

+---+---+
|_c0|_c2|
+---+---+
|  1| 21|
|  1| 11|
|  2|  7|
|  2| 11|
|  2| 37|
+---+---+
only showing top 5 rows



In [7]:
 from pyspark.sql.functions import col
 # Give the two remaining positional columns descriptive names.
 df = df.select(*[
     col(source).alias(target)
     for source, target in (("_c0", "order_id"), ("_c2", "product_id"))
 ])
 df.show(n=5)

+--------+----------+
|order_id|product_id|
+--------+----------+
|       1|        21|
|       1|        11|
|       2|         7|
|       2|        11|
|       2|        37|
+--------+----------+
only showing top 5 rows



In [8]:
from pyspark.sql.functions import collect_list, col, count, collect_set

In [9]:
# Expose the order lines to Spark SQL under the name 'order_product'.
df.createOrReplaceTempView(name='order_product')

In [10]:
# Count the distinct product ids that occur in the order lines.
products = (
    spark.table('order_product')
    .select('product_id')
    .distinct()
)
products.count()

50

In [11]:
# Build one basket per order: the set of product ids bought together.
# collect_set (rather than collect_list) keeps each product at most once per
# basket.
rawData = spark.table('order_product')
baskets = (
    rawData
    .groupBy('order_id')
    .agg(collect_set('product_id').alias('items'))
)
baskets.createOrReplaceTempView('baskets')
baskets.show(n=5, truncate=False)

+--------+----------------------+
|order_id|items                 |
+--------+----------------------+
|148     |[33, 27, 9, 46, 28, 4]|
|463     |[17, 14]              |
|471     |[9, 37, 34, 20]       |
|496     |[15, 6, 47, 26]       |
|833     |[12, 5, 21]           |
+--------+----------------------+
only showing top 5 rows



In [12]:
# Mine frequent itemsets over the id-based baskets.
#   minSupport    - an itemset is kept when it occurs in at least 0.3% of baskets.
#   minConfidence - threshold for the association rules derived from the
#                   itemsets; it does not change which itemsets are mined.
fpGrowth = FPGrowth(
    itemsCol='items',
    minSupport=0.003,
    minConfidence=0.003,
)
model = fpGrowth.fit(baskets)

In [13]:
# Display the first 20 frequent itemsets with their absolute frequencies.
model.freqItemsets.show(n=20, truncate=True)

+--------+----+
|   items|freq|
+--------+----+
|     [7]|8193|
|    [45]|7700|
| [45, 7]|2367|
|    [28]|7556|
|[28, 45]| 387|
| [28, 7]| 383|
|    [18]|6987|
|[18, 28]| 393|
|[18, 45]| 318|
| [18, 7]| 321|
|     [4]|6948|
| [4, 28]| 465|
| [4, 45]| 372|
| [4, 18]| 402|
|  [4, 7]| 378|
|    [35]|6943|
|[35, 28]| 381|
| [35, 4]| 388|
|[35, 45]| 309|
|[35, 18]|3982|
+--------+----+
only showing top 20 rows



In [14]:
# Apply the mined association rules to every basket; the result carries an
# extra 'prediction' column next to 'order_id' and 'items'.
predict_id = model.transform(baskets)
predict_id.show(n=20, truncate=True)

+--------+--------------------+--------------------+
|order_id|               items|          prediction|
+--------+--------------------+--------------------+
|     148|[33, 27, 9, 46, 2...|[35, 22, 45, 18, ...|
|     463|            [17, 14]|[27, 28, 35, 4, 2...|
|     471|     [9, 37, 34, 20]|[19, 33, 27, 17, ...|
|     496|     [15, 6, 47, 26]|[27, 33, 1, 28, 3...|
|     833|         [12, 5, 21]|[19, 33, 27, 17, ...|
|    1088| [27, 35, 3, 18, 40]|[28, 4, 22, 45, 4...|
|    1238|        [19, 32, 18]|[28, 45, 7, 4, 35...|
|    1342|         [49, 17, 8]|[19, 27, 33, 1, 2...|
|    1580|        [12, 31, 36]|[48, 19, 33, 27, ...|
|    1591|             [1, 19]|[27, 33, 28, 37, ...|
|    1645|         [15, 49, 7]|[27, 33, 1, 28, 3...|
|    1829|[15, 49, 38, 6, 7...|[27, 33, 1, 28, 3...|
|    1959|[9, 1, 18, 4, 22,...|[28, 45, 7, 35, 4...|
|    2122|             [5, 22]|[27, 1, 28, 35, 1...|
|    2142|        [14, 44, 41]|[27, 28, 35, 4, 2...|
|    2366|         [0, 27, 29]|[28, 35, 4, 22,

In [15]:
# Load the product catalogue; this file does carry a header row.
product_data = spark.read.csv(
    path='goods.csv',
    header=True,
    inferSchema=True,
)
product_data.show(n=5, truncate=False)

+---+------------+------+-----+------+
|Id |Flavor      |Food  |Price|Type  |
+---+------------+------+-----+------+
|0  |'Chocolate' |'Cake'|8.95 |'Food'|
|1  |'Lemon'     |'Cake'|8.95 |'Food'|
|2  |'Casino'    |'Cake'|15.95|'Food'|
|3  |'Opera'     |'Cake'|15.95|'Food'|
|4  |'Strawberry'|'Cake'|11.95|'Food'|
+---+------------+------+-----+------+
only showing top 5 rows



In [16]:
# Print the column names and the types that inferSchema picked for goods.csv.
product_data.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- Flavor: string (nullable = true)
 |-- Food: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Type: string (nullable = true)



In [17]:
import pyspark.sql.functions as f
# Flavor and Food arrive wrapped in single quotes (e.g. 'Chocolate'). Splitting
# on the quote character and taking element 1 yields the text between them.
# A value without quotes would have no element at index 1.
# The Type column is left as it is and therefore keeps its quotes.
product_data = (
    product_data
    .withColumn("Flavor", f.split(f.col("Flavor"), "'")[1])
    .withColumn("Food", f.split(f.col("Food"), "'")[1])
)
product_data.show(n=5, truncate=False)

+---+----------+----+-----+------+
|Id |Flavor    |Food|Price|Type  |
+---+----------+----+-----+------+
|0  |Chocolate |Cake|8.95 |'Food'|
|1  |Lemon     |Cake|8.95 |'Food'|
|2  |Casino    |Cake|15.95|'Food'|
|3  |Opera     |Cake|15.95|'Food'|
|4  |Strawberry|Cake|11.95|'Food'|
+---+----------+----+-----+------+
only showing top 5 rows



In [18]:
from pyspark.sql.functions import concat_ws
# Build a readable label such as "Chocolate Cake" by joining Flavor and Food
# with a single space.
product_data = product_data.withColumn(
    'product_name',
    concat_ws(" ", "Flavor", "Food"),
)
product_data.show(n=5, truncate=False)

+---+----------+----+-----+------+---------------+
|Id |Flavor    |Food|Price|Type  |product_name   |
+---+----------+----+-----+------+---------------+
|0  |Chocolate |Cake|8.95 |'Food'|Chocolate Cake |
|1  |Lemon     |Cake|8.95 |'Food'|Lemon Cake     |
|2  |Casino    |Cake|15.95|'Food'|Casino Cake    |
|3  |Opera     |Cake|15.95|'Food'|Opera Cake     |
|4  |Strawberry|Cake|11.95|'Food'|Strawberry Cake|
+---+----------+----+-----+------+---------------+
only showing top 5 rows



In [19]:
# Expose the cleaned product catalogue to Spark SQL under the name 'products'.
product_data.createOrReplaceTempView(name='products')

In [20]:
# Attach the readable product name to every order line.
# The join condition is written as an explicit ON clause so the statement is
# never parsed as an unconditioned (cartesian) join that is filtered afterwards.
# The selected columns and the resulting rows are unchanged.
rawData1 = spark.sql('''
    select p.product_name, o.order_id
    from products p
    inner join order_product o
        on o.product_id = p.Id
''')

In [21]:
# Build one basket per order again, this time holding product names instead of
# product ids.
# NOTE(review): the view is registered under the name 'baskets', which replaces
# the id-based view of the same name created earlier in the notebook.
baskets1 = (
    rawData1
    .groupBy('order_id')
    .agg(collect_set('product_name').alias('items'))
)
baskets1.createOrReplaceTempView('baskets')
baskets1.head(3)

[Row(order_id=148, items=['Tuile Cookie', 'Strawberry Cake', 'Napoleon Cake', 'Chocolate Coffee', 'Cheese Croissant', 'Marzipan Cookie']),
 Row(order_id=463, items=['Berry Tart', 'Chocolate Tart']),
 Row(order_id=471, items=['Almond Twist', 'Chocolate Croissant', 'Pecan Tart', 'Napoleon Cake'])]

In [22]:
# Mine frequent itemsets over the name-based baskets, using the same thresholds
# as the id-based model.
#   minSupport    - an itemset is kept when it occurs in at least 0.3% of baskets.
#   minConfidence - threshold for the association rules derived from the
#                   itemsets; it does not change which itemsets are mined.
fpGrowth1 = FPGrowth(
    itemsCol='items',
    minSupport=0.003,
    minConfidence=0.003,
)
model1 = fpGrowth1.fit(baskets1)

In [23]:
# Display the first 20 frequent itemsets by product name, without truncating
# the item lists.
model1.freqItemsets.show(n=20, truncate=False)

+---------------------------------+----+
|items                            |freq|
+---------------------------------+----+
|[Coffee Eclair]                  |8193|
|[Hot Coffee]                     |7700|
|[Hot Coffee, Coffee Eclair]      |2367|
|[Tuile Cookie]                   |7556|
|[Tuile Cookie, Hot Coffee]       |387 |
|[Tuile Cookie, Coffee Eclair]    |383 |
|[Cherry Tart]                    |6987|
|[Cherry Tart, Tuile Cookie]      |393 |
|[Cherry Tart, Hot Coffee]        |318 |
|[Cherry Tart, Coffee Eclair]     |321 |
|[Strawberry Cake]                |6948|
|[Strawberry Cake, Tuile Cookie]  |465 |
|[Strawberry Cake, Hot Coffee]    |372 |
|[Strawberry Cake, Cherry Tart]   |402 |
|[Strawberry Cake, Coffee Eclair] |378 |
|[Apricot Danish]                 |6943|
|[Apricot Danish, Tuile Cookie]   |381 |
|[Apricot Danish, Strawberry Cake]|388 |
|[Apricot Danish, Hot Coffee]     |309 |
|[Apricot Danish, Cherry Tart]    |3982|
+---------------------------------+----+
only showing top

In [24]:
# Apply the name-based association rules to every basket; the result carries an
# extra 'prediction' column next to 'order_id' and 'items'.
predict_id1 = model1.transform(baskets1)
predict_id1.show(n=20, truncate=True)

+--------+--------------------+--------------------+
|order_id|               items|          prediction|
+--------+--------------------+--------------------+
|     148|[Tuile Cookie, St...|[Hot Coffee, Cher...|
|     463|[Berry Tart, Choc...|[Lemon Tart, Marz...|
|     471|[Almond Twist, Ch...|[Marzipan Cookie,...|
|     496|[Vanilla Meringue...|[Lemon Tart, Marz...|
|     833|[Ganache Cookie, ...|[Lemon Tart, Chee...|
|    1088|[Cherry Tart, Ope...|[Cheese Croissant...|
|    1238|[Cherry Tart, Apr...|[Marzipan Cookie,...|
|    1342|[Single Espresso,...|[Lemon Tart, Marz...|
|    1580|[Apple Croissant,...|[Lemon Tart, Chee...|
|    1591|[Lemon Cake, Lemo...|[Marzipan Cookie,...|
|    1645|[Coffee Eclair, B...|[Hot Coffee, Tuil...|
|    1829|[Coffee Eclair, A...|[Lemon Tart, Rasp...|
|    1959|[Cherry Tart, Lem...|[Tuile Cookie, Ho...|
|    2122|[Truffle Cake, Go...|[Tuile Cookie, Ap...|
|    2142|[Bottled Water, B...|[Lemon Tart, Chee...|
|    2366|[Walnut Cookie, C...|[Tuile Cookie, 