Import Libraries

In [75]:
import pandas as pd
import numpy as np
# !pip install mlxtend
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules


Import Data

In [76]:
# Read the grocery purchase records from the project-relative CSV file.
df = pd.read_csv(filepath_or_buffer='Market_Basket_Analysis/Groceries.csv')
# Preview the first five rows as the cell's output.
df.head(n=5)


Unnamed: 0,Member_number,Date,itemDescription,year,month,day,day_of_week
0,1064,7/11/2023,apples,2023,7,11,1
1,1064,7/11/2023,butter,2023,7,11,1
2,1064,7/11/2023,cereal,2023,7,11,1
3,1028,1/15/2023,spinach,2023,1,15,6
4,1028,1/15/2023,butter,2023,1,15,6


Clean Data

In [77]:
# Count missing values per column; every count in the output below is 0,
# so no imputation or row dropping is needed.
df.isna().sum()


Member_number      0
Date               0
itemDescription    0
year               0
month              0
day                0
day_of_week        0
dtype: int64

Transform Data

In [79]:
# Build one basket per (Member_number, Date) pair: every item a member bought
# on the same date is collected into a single list, and the baskets are
# returned as a plain list of lists.
items_by_basket = df.groupby(['Member_number', 'Date'])['itemDescription']
transactions = items_by_basket.apply(list).tolist()

# One-hot encode the baskets: learn the item vocabulary first, then encode
# every basket against it and label the columns with the learned item names.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)
df_encoded = pd.DataFrame(data=te_ary, columns=te.columns_)


Perform Market Basket Analysis

In [80]:
# Thresholds used throughout this cell (tune here, not inline).
MIN_SUPPORT = 0.01      # itemset/rule must appear in at least 1% of all transactions
MIN_CONFIDENCE = 0.3    # at least a 30% chance of the follow-up purchase (consequent)
MIN_LIFT = 1.1          # rule must be stronger than random co-occurrence
TOP_N = 25              # number of rules to display

# Frequent itemsets mined from the one-hot encoded transactions
frequent_itemsets = apriori(df_encoded, min_support=MIN_SUPPORT, use_colnames=True)

# Generate the candidate rules from the frequent itemsets.
# Previously `rules` was filtered without ever being created in this notebook,
# so the cell raised NameError on a fresh kernel (Restart & Run All) and only
# worked when a stale `rules` variable happened to be left in the kernel.
# NOTE(review): some mlxtend releases additionally expect
# `num_itemsets=len(df_encoded)` here -- confirm against the installed version.
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=MIN_LIFT)

# Association rule thresholds
rules = rules[
    (rules['lift'] > MIN_LIFT) &                # stronger chance than random
    (rules['confidence'] >= MIN_CONFIDENCE) &   # likely enough follow-up purchase
    (rules['support'] >= MIN_SUPPORT)           # seen often enough to matter
]

# Drop rules whose consequent mentions an overly generic item.
# The match is a regex substring search on the string form of the consequent
# set, so e.g. 'milk' also matches 'whole milk'. Antecedents are not filtered.
generic_items = ['milk', 'whole milk', 'bread', 'soda']
generic_pattern = '|'.join(generic_items)
rules = rules[~rules['consequents'].astype(str).str.contains(generic_pattern)]

# Show all rows (no truncation)
pd.set_option('display.max_rows', None)

# Display the top rules ordered by lift.
# Sort BEFORE truncating: taking head() first and sorting afterwards only
# reorders the first TOP_N rows in their arbitrary original order instead of
# selecting the TOP_N rules with the highest lift.
display_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
top_25 = rules.sort_values(by='lift', ascending=False)[display_columns].head(TOP_N)
print(top_25)


            antecedents consequents   support  confidence      lift
67      (bread, cereal)       (jam)  0.025208    0.363077  1.230676
29    (apples, spinach)   (chicken)  0.024354    0.331395  1.223392
23      (apples, pasta)    (cheese)  0.024995    0.334286  1.220586
108     (butter, pasta)   (spinach)  0.025636    0.336134  1.217837
49      (butter, beans)   (oranges)  0.025636    0.338983  1.211282
116       (cheese, jam)    (cereal)  0.025422    0.319892  1.188426
62        (rice, beans)   (oranges)  0.024995    0.332386  1.187710
178       (jam, yogurt)   (oranges)  0.026704    0.328084  1.172337
165        (rice, eggs)   (oranges)  0.022431    0.327103  1.168831
57         (beans, jam)   (oranges)  0.026063    0.326203  1.165616
109   (butter, spinach)     (pasta)  0.025636    0.326975  1.161284
141     (pasta, yogurt)    (cheese)  0.024995    0.316216  1.154609
140    (cheese, yogurt)     (pasta)  0.024995    0.325000  1.154268
47    (oranges, butter)     (beans)  0.025636   

Refer to the report to view the analysis tied to this notebook.