In [1]:
import numpy as np
import pandas as pd 
from apyori import apriori

In [2]:
data = pd.read_csv('Datasets/Groceries_dataset.csv')
data.head(5)

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [3]:
grouped_data = data.groupby(["Member_number", "Date"])["itemDescription"].apply(list)
transactions_df = grouped_data.reset_index()
transactions_df.head(5)

Unnamed: 0,Member_number,Date,itemDescription
0,1000,15-03-2015,"[sausage, whole milk, semi-finished bread, yog..."
1,1000,24-06-2014,"[whole milk, pastry, salty snack]"
2,1000,24-07-2015,"[canned beer, misc. beverages]"
3,1000,25-11-2015,"[sausage, hygiene articles]"
4,1000,27-05-2015,"[soda, pickled vegetables]"


In [4]:
transactions_df.drop(["Member_number", "Date"], axis=1, inplace=True)
transactions_df.head(5)

Unnamed: 0,itemDescription
0,"[sausage, whole milk, semi-finished bread, yog..."
1,"[whole milk, pastry, salty snack]"
2,"[canned beer, misc. beverages]"
3,"[sausage, hygiene articles]"
4,"[soda, pickled vegetables]"


In [5]:
transactions_list = transactions_df["itemDescription"].tolist()
transactions_list[444]

['root vegetables', 'frozen vegetables']

In [6]:
min_support_values = [0.001, 0.002, 0.003]
min_confidence_values = [0.1, 0.2, 0.3]
min_lift_values = [1.5, 2.0, 2.5]
max_len_values = [2, 3]
ordering_values = ['support', 'confidence', 'lift']

In [7]:
processed_rules = set()

results_df = pd.DataFrame(columns=['Left Hand Side', 'Right Hand Side', 'Support', 'Confidence', 'Lift'])

for min_support in min_support_values:
    for min_confidence in min_confidence_values:
        for min_lift in min_lift_values:
            for max_len in max_len_values:
                for ordering in ordering_values:
                    
                    frequent_itemsets = apriori(
                        transactions_list,
                        min_support=min_support,
                        min_confidence=min_confidence,
                        min_lift=min_lift,
                        max_len=max_len,
                        ordering=ordering,
                        use_colnames=True
                    )
                    
                    
                    for relation in frequent_itemsets:
                        items = relation.items
                        support = relation.support
                        ordered_stats = relation.ordered_statistics[0]
                        antecedent = tuple(ordered_stats.items_base)
                        consequent = tuple(ordered_stats.items_add)
                        confidence = ordered_stats.confidence
                        lift = ordered_stats.lift

                        
                        rule_repr = (items, antecedent, consequent)

                        
                        if rule_repr not in processed_rules:
                            
                            processed_rules.add(rule_repr)

                            
                            results_df = pd.concat([results_df, pd.DataFrame([[antecedent, consequent, support, confidence, lift]],
                                                                             columns=['Left Hand Side', 'Right Hand Side', 'Support', 'Confidence', 'Lift'])])


results_df = results_df.reset_index(drop=True)
results_df.nlargest(n = 10, columns = 'Lift')

Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift
3,"(whole milk, yogurt)","(sausage,)",0.00147,0.131737,2.182917
2,"(whole milk, sausage)","(yogurt,)",0.00147,0.164179,1.91176
4,"(sausage, yogurt)","(whole milk,)",0.00147,0.255814,1.619866
0,"(flour,)","(tropical fruit,)",0.001069,0.109589,1.617141
1,"(processed cheese,)","(root vegetables,)",0.001069,0.105263,1.513019
