## Dataset
https://www.kaggle.com/carrie1/ecommerce-data/home

## Load Session

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName("try fp-growth").getOrCreate()

In [4]:
print(spark)

<pyspark.sql.session.SparkSession object at 0x000001F0BCD858D0>


## Load Dataset

In [5]:
df = spark.read.csv("F:/Wahyu Ivan Satyagraha/BigData/tugas frequent itemset/data.csv", header=True, inferSchema=True)

In [6]:
df.show()

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|12/1/2010 8:26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|12/1/2010 8:26|     4.

In [7]:
#create backup for later purpose
df_backup = df.selectExpr(['InvoiceNo as ID','StockCode as items', 'Description'])

In [8]:
#select only 2 column
df = df.selectExpr(['InvoiceNo as ID','StockCode as items'])

In [9]:
df.show()

+------+------+
|    ID| items|
+------+------+
|536365|85123A|
|536365| 71053|
|536365|84406B|
|536365|84029G|
|536365|84029E|
|536365| 22752|
|536365| 21730|
|536366| 22633|
|536366| 22632|
|536367| 84879|
|536367| 22745|
|536367| 22748|
|536367| 22749|
|536367| 22310|
|536367| 84969|
|536367| 22623|
|536367| 22622|
|536367| 21754|
|536367| 21755|
|536367| 21777|
+------+------+
only showing top 20 rows



In [10]:
df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- items: string (nullable = true)



In [11]:
from pyspark.sql.functions import collect_list

In [12]:
#aggregate into 1 column
df2 = df.groupby("ID").agg(collect_list('items').alias('items'))

In [13]:
df2.show()

+------+--------------------+
|    ID|               items|
+------+--------------------+
|536596|[21624, 22900, 22...|
|536938|[22386, 85099C, 2...|
|537252|             [22197]|
|537691|[22791, 22171, 82...|
|538041|             [22145]|
|538184|[22585, 21481, 22...|
|538517|[22491, 21232, 21...|
|538879|[84819, 22150, 21...|
|539275|[22909, 22423, 22...|
|539630|[21484, 85099B, 2...|
|540499|[21868, 22697, 22...|
|540540|[21877, 21868, 21...|
|540976|[22394, 21890, 22...|
|541432|[21485, 22457, 84...|
|541518|[21880, 21881, 21...|
|541783|[22423, 22854, 22...|
|542026|[21754, 82600, 22...|
|542375|[21731, 22367, 22...|
|543641|[85123A, 21833, 2...|
|544303|[22660, 48138, 48...|
+------+--------------------+
only showing top 20 rows



In [14]:
from pyspark.sql.types import ArrayType, StringType

In [15]:
from pyspark.sql.functions import udf

In [16]:
#remove a duplicate
remove_duplicate = udf(lambda row: list(set(row)), ArrayType(StringType()))
df2 = df2.withColumn("without_Duplicate", remove_duplicate("items"))

In [17]:
df2.show()

+------+--------------------+--------------------+
|    ID|               items|   without_Duplicate|
+------+--------------------+--------------------+
|536596|[21624, 22900, 22...|[84926A, 21624, 2...|
|536938|[22386, 85099C, 2...|[21479, 84997B, 2...|
|537252|             [22197]|             [22197]|
|537691|[22791, 22171, 82...|[20975, 22149, 21...|
|538041|             [22145]|             [22145]|
|538184|[22585, 21481, 22...|[22492, 22561, 48...|
|538517|[22491, 21232, 21...|[22197, 22844, 22...|
|538879|[84819, 22150, 21...|[22593, 22983, 22...|
|539275|[22909, 22423, 22...|[22423, 21914, 22...|
|539630|[21484, 85099B, 2...|[22988, 84347, 22...|
|540499|[21868, 22697, 22...|[21755, 84978, 22...|
|540540|[21877, 21868, 21...|[22555, 22551, 22...|
|540976|[22394, 21890, 22...|[22207, 21110, 84...|
|541432|[21485, 22457, 84...|[22113, 22457, 21...|
|541518|[21880, 21881, 21...|[20724, 21982, 20...|
|541783|[22423, 22854, 22...|[22197, 84978, 22...|
|542026|[21754, 82600, 22...|[2

In [18]:
df2.printSchema()

root
 |-- ID: string (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- without_Duplicate: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [19]:
df2=df2.selectExpr(['ID','without_Duplicate as items'])

In [20]:
df2.show()

+------+--------------------+
|    ID|               items|
+------+--------------------+
|536596|[84926A, 21624, 2...|
|536938|[21479, 84997B, 2...|
|537252|             [22197]|
|537691|[20975, 22149, 21...|
|538041|             [22145]|
|538184|[22492, 22561, 48...|
|538517|[22197, 22844, 22...|
|538879|[22593, 22983, 22...|
|539275|[22423, 21914, 22...|
|539630|[22988, 84347, 22...|
|540499|[21755, 84978, 22...|
|540540|[22555, 22551, 22...|
|540976|[22207, 21110, 84...|
|541432|[22113, 22457, 21...|
|541518|[20724, 21982, 20...|
|541783|[22197, 84978, 22...|
|542026|[22197, 22398, 22...|
|542375|[22367, 22629, 21...|
|543641|[22371, 44265, 21...|
|544303|[20856, 22197, 20...|
+------+--------------------+
only showing top 20 rows



## FP - Growth

In [21]:
from pyspark.ml.fpm import FPGrowth

In [22]:
#first model
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.05, minConfidence=0.6)
model = fpGrowth.fit(df2)

In [23]:
model.freqItemsets.show()

+--------+----+
|   items|freq|
+--------+----+
|[85123A]|2246|
| [22423]|2172|
|[85099B]|2135|
| [47566]|1706|
| [20725]|1608|
| [84879]|1468|
| [22720]|1462|
| [22197]|1442|
| [21212]|1334|
| [22383]|1306|
| [20727]|1295|
+--------+----+



In [24]:
model.associationRules.show()

+----------+----------+----------+
|antecedent|consequent|confidence|
+----------+----------+----------+
+----------+----------+----------+



In [25]:
model.transform(df2).show()

+------+--------------------+----------+
|    ID|               items|prediction|
+------+--------------------+----------+
|536596|[84926A, 21624, 2...|        []|
|536938|[21479, 84997B, 2...|        []|
|537252|             [22197]|        []|
|537691|[20975, 22149, 21...|        []|
|538041|             [22145]|        []|
|538184|[22492, 22561, 48...|        []|
|538517|[22197, 22844, 22...|        []|
|538879|[22593, 22983, 22...|        []|
|539275|[22423, 21914, 22...|        []|
|539630|[22988, 84347, 22...|        []|
|540499|[21755, 84978, 22...|        []|
|540540|[22555, 22551, 22...|        []|
|540976|[22207, 21110, 84...|        []|
|541432|[22113, 22457, 21...|        []|
|541518|[20724, 21982, 20...|        []|
|541783|[22197, 84978, 22...|        []|
|542026|[22197, 22398, 22...|        []|
|542375|[22367, 22629, 21...|        []|
|543641|[22371, 44265, 21...|        []|
|544303|[20856, 22197, 20...|        []|
+------+--------------------+----------+
only showing top

In [26]:
#second model
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.03, minConfidence=0.7)
model2 = fpGrowth.fit(df2)

In [27]:
model2.freqItemsets.show()

+---------------+----+
|          items|freq|
+---------------+----+
|       [85123A]|2246|
|        [22423]|2172|
|       [85099B]|2135|
|        [47566]|1706|
|        [20725]|1608|
|        [84879]|1468|
|        [22720]|1462|
|        [22197]|1442|
|        [21212]|1334|
|        [22383]|1306|
|        [20727]|1295|
|        [22457]|1266|
|         [POST]|1254|
|        [23203]|1249|
|        [22386]|1231|
|[22386, 85099B]| 833|
|        [22960]|1220|
|        [22469]|1214|
|        [21931]|1201|
|        [22411]|1187|
+---------------+----+
only showing top 20 rows



In [28]:
model2.associationRules.show()

+----------+----------+------------------+
|antecedent|consequent|        confidence|
+----------+----------+------------------+
|   [22699]|   [22697]|               0.7|
|   [22697]|   [22699]|0.7417218543046358|
+----------+----------+------------------+



In [29]:
model2.transform(df2).show()

+------+--------------------+----------+
|    ID|               items|prediction|
+------+--------------------+----------+
|536596|[84926A, 21624, 2...|        []|
|536938|[21479, 84997B, 2...|        []|
|537252|             [22197]|        []|
|537691|[20975, 22149, 21...|        []|
|538041|             [22145]|        []|
|538184|[22492, 22561, 48...|        []|
|538517|[22197, 22844, 22...|        []|
|538879|[22593, 22983, 22...|        []|
|539275|[22423, 21914, 22...|        []|
|539630|[22988, 84347, 22...|        []|
|540499|[21755, 84978, 22...|        []|
|540540|[22555, 22551, 22...|        []|
|540976|[22207, 21110, 84...|        []|
|541432|[22113, 22457, 21...|        []|
|541518|[20724, 21982, 20...|        []|
|541783|[22197, 84978, 22...|        []|
|542026|[22197, 22398, 22...|        []|
|542375|[22367, 22629, 21...|        []|
|543641|[22371, 44265, 21...|        []|
|544303|[20856, 22197, 20...|        []|
+------+--------------------+----------+
only showing top

In [30]:
#third model
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.01, minConfidence=0.6)
model3 = fpGrowth.fit(df2)

In [31]:
model3.freqItemsets.show()

+----------------+----+
|           items|freq|
+----------------+----+
|         [22633]| 487|
|         [23236]| 344|
|        [85123A]|2246|
|         [22423]|2172|
| [22423, 85123A]| 355|
|         [22667]| 486|
|         [22579]| 343|
|  [22579, 22578]| 282|
|        [85099B]|2135|
| [85099B, 22423]| 288|
|[85099B, 85123A]| 404|
|         [22620]| 486|
|        [84536A]| 342|
|         [71053]| 342|
|         [47566]|1706|
| [47566, 85099B]| 332|
|  [47566, 22423]| 398|
| [47566, 85123A]| 391|
|         [85150]| 483|
|         [20725]|1608|
+----------------+----+
only showing top 20 rows



In [32]:
model3.associationRules.show()

+--------------------+----------+------------------+
|          antecedent|consequent|        confidence|
+--------------------+----------+------------------+
|      [20726, 22382]|   [20725]|0.6356107660455487|
|             [22699]|   [22697]|               0.7|
|      [20723, 22355]|   [20724]|0.8038277511961722|
|      [20723, 22355]|   [20719]|0.7272727272727273|
|      [20723, 22355]|   [22356]|  0.65311004784689|
|             [22866]|   [22865]| 0.600358422939068|
|             [20723]|   [20724]| 0.667574931880109|
|      [22356, 20719]|   [22355]|0.7405541561712846|
|      [22356, 20719]|   [20724]|0.8211586901763224|
|      [22356, 20719]|   [20723]|0.6649874055415617|
|        [DOT, 22411]|  [85099B]|0.7713498622589532|
|             [22746]|   [22748]| 0.796969696969697|
|             [22746]|   [22745]| 0.793939393939394|
|      [20726, 22384]|   [20725]| 0.713216957605985|
|             [22386]|  [85099B]|0.6766856214459789|
|[21931, 22386, 85...|   [22411]|0.64663461538

In [33]:
model3.transform(df2).show()

+------+--------------------+--------------------+
|    ID|               items|          prediction|
+------+--------------------+--------------------+
|536596|[84926A, 21624, 2...|                  []|
|536938|[21479, 84997B, 2...|[85099B, 22355, 2...|
|537252|             [22197]|                  []|
|537691|[20975, 22149, 21...|                  []|
|538041|             [22145]|                  []|
|538184|[22492, 22561, 48...|                  []|
|538517|[22197, 22844, 22...|                  []|
|538879|[22593, 22983, 22...|                  []|
|539275|[22423, 21914, 22...|                  []|
|539630|[22988, 84347, 22...|                  []|
|540499|[21755, 84978, 22...|      [22698, 20724]|
|540540|[22555, 22551, 22...|                  []|
|540976|[22207, 21110, 84...|[22355, 22356, 20...|
|541432|[22113, 22457, 21...|                  []|
|541518|[20724, 21982, 20...|[21931, 22386, 20...|
|541783|[22197, 84978, 22...|             [22698]|
|542026|[22197, 22398, 22...|  

In [40]:
#select the items we want to see
df_final2=spark.createDataFrame([('0',['22698'])],['ID','items'])

In [41]:
df_final2.show()

+---+-------+
| ID|  items|
+---+-------+
|  0|[22698]|
+---+-------+



In [42]:
model3.transform(df_final2).show()

+---+-------+--------------+
| ID|  items|    prediction|
+---+-------+--------------+
|  0|[22698]|[22697, 22699]|
+---+-------+--------------+



In [44]:
#do a SQL query to get the description of the items
df_backup.createOrReplaceTempView("Final")

In [47]:
final=spark.sql("SELECT DISTINCT Description FROM Final WHERE items = '22698'")

In [48]:
final.show()

+--------------------+
|         Description|
+--------------------+
|                null|
|PINK REGENCY TEAC...|
+--------------------+



In [49]:
final2=spark.sql("SELECT DISTINCT Description FROM Final WHERE items = '22697'")

In [50]:
final2.show()

+--------------------+
|         Description|
+--------------------+
|GREEN REGENCY TEA...|
+--------------------+



In [51]:
final3=spark.sql("SELECT DISTINCT Description FROM Final WHERE items = '22699'")

In [52]:
final3.show()

+--------------------+
|         Description|
+--------------------+
|ROSES REGENCY TEA...|
+--------------------+



## Conclusion
from the transaction above, we can take a conclusion that if a customer buy a pink regency teacup and saucer, then she/he is likely to buy the same one but in different colour (green and roses)