In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.fpm import FPGrowth
import random

# Mine association rules from randomly generated market-basket data
# using Spark ML's FP-Growth implementation.
spark = SparkSession.builder.appName("FP-GrowthSpark").getOrCreate()

# Catalogue of items a synthetic transaction can draw from.
items_list = ['Milk', 'Eggs', 'Bread', 'Diapers', 'Beer', 'Cola', 'Juice', 'Chips', 'Soda', 'Chicken']
num_transactions = 1000

# Each transaction is (1-based id, basket). random.sample draws without
# replacement, so a basket never repeats an item -- FPGrowth requires the
# items within a single transaction to be unique.
# NOTE: the generator is unseeded, so the mined rules differ between runs.
data = [
    (i + 1, random.sample(items_list, random.randint(1, len(items_list))))
    for i in range(num_transactions)
]
df = spark.createDataFrame(data, ["TransactionID", "Items"])

# Keep only the basket column. A plain column projection stays inside the
# DataFrame API, avoiding a round trip through the RDD layer and Python
# lambdas just to drop TransactionID.
df_items = df.select("Items")

# minSupport: minimum fraction of transactions an itemset must appear in.
# minConfidence: minimum confidence for a generated rule to be kept.
fp_growth = FPGrowth(itemsCol="Items", minSupport=0.3, minConfidence=0.6)

model = fp_growth.fit(df_items)

# Collect the rules to the driver as a pandas DataFrame for sorting/display.
association_rules = model.associationRules.toPandas()

# Strongest associations (highest lift) first.
sorted_rules = association_rules.sort_values(by="lift", ascending=False)

# Last expression of the cell: rendered by the notebook as the cell output.
sorted_rules[['antecedent', 'consequent', 'support', 'confidence', 'lift']]

Unnamed: 0,antecedent,consequent,support,confidence,lift
48,"[Chicken, Eggs]",[Juice],0.300,0.797872,1.412163
95,"[Chicken, Juice]",[Eggs],0.300,0.802139,1.409735
19,"[Milk, Beer]",[Soda],0.300,0.783290,1.403745
58,"[Soda, Milk]",[Beer],0.300,0.791557,1.396044
47,"[Juice, Eggs]",[Chicken],0.300,0.761421,1.386924
...,...,...,...,...,...
44,[Cola],[Soda],0.359,0.648014,1.161316
43,[Cola],[Juice],0.363,0.655235,1.159707
24,[Juice],[Cola],0.363,0.642478,1.159707
63,[Chicken],[Beer],0.355,0.646630,1.140441
