In [7]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
groceries_df = pd.read_csv('Groceries.csv')
groceries_df

Unnamed: 0,Member_number,Date,itemDescription,year,month,day,day_of_week
0,1808,2015-07-21,tropical fruit,2015,7,21,1
1,2552,2015-05-01,whole milk,2015,5,1,4
2,2300,2015-09-19,pip fruit,2015,9,19,5
3,1187,2015-12-12,other vegetables,2015,12,12,5
4,3037,2015-01-02,whole milk,2015,1,2,4
...,...,...,...,...,...,...,...
38760,4471,2014-08-10,sliced cheese,2014,8,10,6
38761,2022,2014-02-23,candy,2014,2,23,6
38762,1097,2014-04-16,cake bar,2014,4,16,2
38763,1510,2014-03-12,fruit/vegetable juice,2014,3,12,2


In [8]:
groceries_df.isnull().sum()

Member_number      0
Date               0
itemDescription    0
year               0
month              0
day                0
day_of_week        0
dtype: int64

In [9]:
top_items = groceries_df['itemDescription'].value_counts().head(10)

print("Top 10 most frequently purchased items:")
print(top_items)

Top 10 most frequently purchased items:
itemDescription
whole milk          2502
other vegetables    1898
rolls/buns          1716
soda                1514
yogurt              1334
root vegetables     1071
tropical fruit      1032
bottled water        933
sausage              924
citrus fruit         812
Name: count, dtype: int64


In [10]:
# Create a function to get the top 5 most frequent items for a given month
def top_items_per_month(month):
    # Filter the dataset for the given month
    month_data = groceries_df[groceries_df['month'] == month]
    # Get the top 5 most frequent items
    top_items = month_data['itemDescription'].value_counts().head(5)
    return top_items

# Get the list of unique months in the dataset
unique_months = groceries_df['month'].unique()

# Dictionary to store the top items for each month
top_items_dict = {}

# Iterate through each unique month and get the top items
for month in unique_months:
    top_items_dict[month] = top_items_per_month(month)

# Print the top 5 most frequent items for each month
for month, items in top_items_dict.items():
    print(f"Month: {month}")
    print(items)
    print("\n")

Month: 7
itemDescription
whole milk          222
other vegetables    160
rolls/buns          135
soda                116
yogurt              111
Name: count, dtype: int64


Month: 5
itemDescription
whole milk          218
other vegetables    145
rolls/buns          142
soda                115
yogurt              103
Name: count, dtype: int64


Month: 9
itemDescription
whole milk          200
other vegetables    150
rolls/buns          147
soda                119
yogurt              113
Name: count, dtype: int64


Month: 12
itemDescription
whole milk          167
other vegetables    161
soda                130
rolls/buns          125
root vegetables     104
Name: count, dtype: int64


Month: 1
itemDescription
whole milk          219
other vegetables    166
rolls/buns          149
soda                134
yogurt              112
Name: count, dtype: int64


Month: 2
itemDescription
whole milk          173
rolls/buns          159
other vegetables    144
soda                107
yogurt       

In [22]:
# Categorize days into weekends(1) and weekdays(0)
groceries_df['is_weekend'] = groceries_df['day_of_week'].apply(lambda x: '1' if x >= 5 else '0')
groceries_df

import scipy.stats as stats

is_weekend = groceries_df[groceries_df['is_weekend'] == '1']['itemDescription'].count()
isnt_weekend = groceries_df[groceries_df['is_weekend']== '0']['itemDescription'].count()

t_statistic, p_value = stats.ttest_ind(a=is_weekend,b=isnt_weekend)

if p_value < 0.05:
  print("There is a significant difference in the purchase frequency of certain items on weekends versus weekdays.")
else:
  print("There is no significant difference in the purchase frequency of certain items on weekends versus weekdays.")

There is no significant difference in the purchase frequency of certain items on weekends versus weekdays.


  svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / df


In [27]:
# Create a basket format
basket = (groceries_df.groupby(['Member_number', 'itemDescription'])['itemDescription'].count().unstack().reset_index().fillna(0).set_index('Member_number'))
# basket.head(15)

def encode_units(x):
    if x <=0:
       return 0
    if x > 0:
       return 1
basket = basket.applymap(encode_units)
# basket.head(15)

# Step 2: Apply the Apriori algorithm
frequent_itemsets = apriori(basket, min_support=0.01, use_colnames=True)

# Step 3: Generate the association rules
rules = association_rules(frequent_itemsets, metric="support", min_threshold=0.01)
rules

# Step 4: Filter to find the most common item pairs
common_item_pairs = rules[(rules['antecedents'].apply(lambda x: len(x) == 1)) & 
                          (rules['consequents'].apply(lambda x: len(x) == 1))]
common_item_pairs = common_item_pairs.sort_values(by='support', ascending=False).head(10)

print("Most common item pairs purchased together:")
print(common_item_pairs[['antecedents', 'consequents', 'support']])

Most common item pairs purchased together:
             antecedents         consequents   support
1869  (other vegetables)        (whole milk)  0.191380
1868        (whole milk)  (other vegetables)  0.191380
2049        (rolls/buns)        (whole milk)  0.178553
2048        (whole milk)        (rolls/buns)  0.178553
2187              (soda)        (whole milk)  0.151103
2186        (whole milk)              (soda)  0.151103
2251            (yogurt)        (whole milk)  0.150590
2250        (whole milk)            (yogurt)  0.150590
1830        (rolls/buns)  (other vegetables)  0.146742
1831  (other vegetables)        (rolls/buns)  0.146742


In [42]:
import pandas
rules_df = pandas.DataFrame(rules)
rules_df
rules_df = rules_df.sort_values(by='support', ascending=False)

print("Association rules with the highest support:")
rules_df.head(10)


Association rules with the highest support:


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1868,(whole milk),(other vegetables),0.458184,0.376603,0.19138,0.417693,1.109106,0.018827,1.070564,0.181562
1869,(other vegetables),(whole milk),0.376603,0.458184,0.19138,0.508174,1.109106,0.018827,1.101643,0.157802
2048,(whole milk),(rolls/buns),0.458184,0.349666,0.178553,0.389698,1.114484,0.018342,1.065592,0.189591
2049,(rolls/buns),(whole milk),0.349666,0.458184,0.178553,0.510638,1.114484,0.018342,1.10719,0.157955
2187,(soda),(whole milk),0.313494,0.458184,0.151103,0.481997,1.051973,0.007465,1.045971,0.071966
2186,(whole milk),(soda),0.458184,0.313494,0.151103,0.329787,1.051973,0.007465,1.02431,0.091184
2250,(whole milk),(yogurt),0.458184,0.282966,0.15059,0.328667,1.16151,0.02094,1.068076,0.25664
2251,(yogurt),(whole milk),0.282966,0.458184,0.15059,0.532185,1.16151,0.02094,1.158185,0.193926
1830,(rolls/buns),(other vegetables),0.349666,0.376603,0.146742,0.419663,1.114335,0.015056,1.074197,0.157772
1831,(other vegetables),(rolls/buns),0.376603,0.349666,0.146742,0.389646,1.114335,0.015056,1.065502,0.164589


In [43]:
rules_df = rules_df.sort_values(by='confidence', ascending=False)

print("Association rules with the highest confidence:")
rules_df.head(10)

Association rules with the highest confidence:


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
7197,"(domestic eggs, meat)",(whole milk),0.013084,0.458184,0.010262,0.784314,1.711789,0.004267,2.512057,0.421328
5866,"(chocolate, fruit/vegetable juice)",(whole milk),0.014366,0.458184,0.010775,0.75,1.636898,0.004192,2.167265,0.39476
15364,"(bottled water, rolls/buns, yogurt, other vege...",(whole milk),0.01411,0.458184,0.010518,0.745455,1.626978,0.004053,2.128564,0.390879
11849,"(bottled water, pip fruit, yogurt)",(whole milk),0.013853,0.458184,0.010262,0.740741,1.616689,0.003914,2.089863,0.386811
12213,"(rolls/buns, yogurt, brown bread)",(whole milk),0.017445,0.458184,0.012827,0.735294,1.604802,0.004834,2.046862,0.383561
11345,"(bottled water, other vegetables, brown bread)",(whole milk),0.016675,0.458184,0.012057,0.723077,1.578138,0.004417,1.956559,0.372554
11317,"(rolls/buns, bottled beer, yogurt)",(whole milk),0.019241,0.458184,0.013853,0.72,1.571422,0.005038,1.935058,0.370768
13025,"(curd, soda, yogurt)",(whole milk),0.015136,0.458184,0.010775,0.711864,1.553666,0.00384,1.880421,0.361838
15424,"(rolls/buns, shopping bags, yogurt, other vege...",(whole milk),0.01411,0.458184,0.010005,0.709091,1.547613,0.00354,1.862494,0.358908
14551,"(pastry, pip fruit, yogurt)",(whole milk),0.014879,0.458184,0.010518,0.706897,1.542823,0.003701,1.84855,0.357152


In [44]:
rules_df = rules_df.sort_values(by='lift', ascending=False)

print("Association rules with the highest lift:")
rules_df.head(10)

Association rules with the highest lift:


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
15398,"(whole milk, sausage, other vegetables)","(rolls/buns, yogurt)",0.050282,0.111339,0.013597,0.270408,2.428689,0.007998,1.218025,0.6194
15411,"(rolls/buns, yogurt)","(whole milk, sausage, other vegetables)",0.111339,0.050282,0.013597,0.12212,2.428689,0.007998,1.081831,0.661957
15403,"(rolls/buns, yogurt, other vegetables)","(whole milk, sausage)",0.052335,0.106978,0.013597,0.259804,2.428575,0.007998,1.206467,0.620721
15406,"(whole milk, sausage)","(rolls/buns, yogurt, other vegetables)",0.106978,0.052335,0.013597,0.127098,2.428575,0.007998,1.08565,0.658702
13013,"(whole milk, sausage)","(curd, yogurt)",0.106978,0.040277,0.010005,0.093525,2.322046,0.005696,1.058742,0.637549
13016,"(curd, yogurt)","(whole milk, sausage)",0.040277,0.106978,0.010005,0.248408,2.322046,0.005696,1.188173,0.593239
15399,"(whole milk, sausage, rolls/buns)","(yogurt, other vegetables)",0.048743,0.120318,0.013597,0.278947,2.318415,0.007732,1.219997,0.59781
15410,"(yogurt, other vegetables)","(whole milk, sausage, rolls/buns)",0.120318,0.048743,0.013597,0.113006,2.318415,0.007732,1.072451,0.646451
15396,"(whole milk, yogurt, other vegetables)","(sausage, rolls/buns)",0.071832,0.08235,0.013597,0.189286,2.298554,0.007681,1.131903,0.608665
15413,"(sausage, rolls/buns)","(whole milk, yogurt, other vegetables)",0.08235,0.071832,0.013597,0.165109,2.298554,0.007681,1.111724,0.615642


In [46]:
# Filter the itemsets to find those that include "whole milk"
other_vegetables_rules = rules[rules['antecedents'].apply(lambda x: 'whole milk' in x) | 
                               rules['consequents'].apply(lambda x: 'whole milk' in x)]
top_other_vegetables_rules = other_vegetables_rules.sort_values(by='support', ascending=False).head(10)

print("Top 10 most frequent items purchased together with 'other vegetables':")
top_other_vegetables_rules

Top 10 most frequent items purchased together with 'other vegetables':


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1868,(whole milk),(other vegetables),0.458184,0.376603,0.19138,0.417693,1.109106,0.018827,1.070564,0.181562
1869,(other vegetables),(whole milk),0.376603,0.458184,0.19138,0.508174,1.109106,0.018827,1.101643,0.157802
2049,(rolls/buns),(whole milk),0.349666,0.458184,0.178553,0.510638,1.114484,0.018342,1.10719,0.157955
2048,(whole milk),(rolls/buns),0.458184,0.349666,0.178553,0.389698,1.114484,0.018342,1.065592,0.189591
2187,(soda),(whole milk),0.313494,0.458184,0.151103,0.481997,1.051973,0.007465,1.045971,0.071966
2186,(whole milk),(soda),0.458184,0.313494,0.151103,0.329787,1.051973,0.007465,1.02431,0.091184
2251,(yogurt),(whole milk),0.282966,0.458184,0.15059,0.532185,1.16151,0.02094,1.158185,0.193926
2250,(whole milk),(yogurt),0.458184,0.282966,0.15059,0.328667,1.16151,0.02094,1.068076,0.25664
2226,(whole milk),(tropical fruit),0.458184,0.23371,0.11647,0.254199,1.087672,0.009388,1.027473,0.148768
2227,(tropical fruit),(whole milk),0.23371,0.458184,0.11647,0.498353,1.087672,0.009388,1.080076,0.105189
