In [10]:
import numpy as np
import pandas as pd 
from apyori import apriori

In [11]:
data = pd.read_csv('Datasets/Groceries_dataset.csv')
data.head(5)

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [12]:
grouped_data = data.groupby(["Member_number", "Date"])["itemDescription"].apply(list)
transactions_df = grouped_data.reset_index()
transactions_df.head(5)

Unnamed: 0,Member_number,Date,itemDescription
0,1000,15-03-2015,"[sausage, whole milk, semi-finished bread, yog..."
1,1000,24-06-2014,"[whole milk, pastry, salty snack]"
2,1000,24-07-2015,"[canned beer, misc. beverages]"
3,1000,25-11-2015,"[sausage, hygiene articles]"
4,1000,27-05-2015,"[soda, pickled vegetables]"


In [13]:
transactions_df.drop(["Member_number", "Date"], axis=1, inplace=True)
transactions_df.head(5)

Unnamed: 0,itemDescription
0,"[sausage, whole milk, semi-finished bread, yog..."
1,"[whole milk, pastry, salty snack]"
2,"[canned beer, misc. beverages]"
3,"[sausage, hygiene articles]"
4,"[soda, pickled vegetables]"


In [14]:
transactions_list = transactions_df["itemDescription"].tolist()
transactions_list[444]

['root vegetables', 'frozen vegetables']

In [15]:
min_support_values = [0.001, 0.002, 0.003]
min_confidence_values = [0.1, 0.2, 0.3]
min_lift_values = [1.5, 2.0, 2.5]
max_len_values = [2, 3]
ordering_values = ['support', 'confidence', 'lift']
max_length_values = [2, 3]

In [16]:

results_df = pd.DataFrame(columns=["Product 1", "Product 2", "Support"])


processed_pairs = set()


for min_support in min_support_values:
    for max_length in max_length_values:
        
        frequent_itemsets = apriori(
            transactions_list,
            min_support=min_support,
            max_length=max_length,
            use_colnames=True
        )
        
        for relation in frequent_itemsets:
            items = list(relation.items)  
            if len(items) > 1:
                product_1 = items[0]
                product_2 = items[1]
                support = relation.support
                pair_repr = tuple(sorted([product_1, product_2]))  

                
                if pair_repr not in processed_pairs:
                    processed_pairs.add(pair_repr)

                    new_row = pd.DataFrame({"Product 1": [product_1], "Product 2": [product_2], "Support": [support]})
                    results_df = pd.concat([results_df, new_row], ignore_index=True)

results_df.nlargest(n = 10, columns = 'Support')

Unnamed: 0,Product 1,Product 2,Support
476,other vegetables,whole milk,0.014837
528,rolls/buns,whole milk,0.013968
568,whole milk,soda,0.011629
591,yogurt,whole milk,0.011161
460,other vegetables,rolls/buns,0.010559
467,other vegetables,soda,0.009691
552,sausage,whole milk,0.008955
581,tropical fruit,whole milk,0.00822
477,other vegetables,yogurt,0.008087
520,rolls/buns,soda,0.008087
