In [1]:
import numpy as np
import pandas as pd

import plotly.express as px


In [2]:
data = pd.read_csv("Market_Basket_Optimisation.csv", header=None)
print(data.shape)
data.head(10)


(7501, 20)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,
5,low fat yogurt,,,,,,,,,,,,,,,,,,,
6,whole wheat pasta,french fries,,,,,,,,,,,,,,,,,,
7,soup,light cream,shallot,,,,,,,,,,,,,,,,,
8,frozen vegetables,spaghetti,green tea,,,,,,,,,,,,,,,,,
9,french fries,,,,,,,,,,,,,,,,,,,


In [3]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7501 entries, 0 to 7500
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       7501 non-null   object
 1   1       5747 non-null   object
 2   2       4389 non-null   object
 3   3       3345 non-null   object
 4   4       2529 non-null   object
 5   5       1864 non-null   object
 6   6       1369 non-null   object
 7   7       981 non-null    object
 8   8       654 non-null    object
 9   9       395 non-null    object
 10  10      256 non-null    object
 11  11      154 non-null    object
 12  12      87 non-null     object
 13  13      47 non-null     object
 14  14      25 non-null     object
 15  15      8 non-null      object
 16  16      4 non-null      object
 17  17      4 non-null      object
 18  18      3 non-null      object
 19  19      1 non-null      object
dtypes: object(20)
memory usage: 1.1+ MB


{bread} => {butter}


### Apriori Steps

- Create a frequency table of all items
- Identify items that are significant, i.e support(item) $\geq$ support threshold
  $$ \text{support}(A \Rightarrow B) = P(A U B)$$
  $$ \text{support} = \frac{\text{number of transaction with item(s)}}{\text{total number of transactions}}$$
- Make all possible combinations of items that are significant
- Take frequency of each combination
- Pass only sigificant combinations to the next iteration
- Take frequency of three item set (self join rule)


In [4]:
type(str(np.nan))

str

In [5]:
all_items = data.values.astype(str).tolist()
print(all_items[2])


['chutney', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan']


In [6]:
# flatten all_items
all_items_list = [item for sublist in all_items for item in sublist if item != "nan"]
print(all_items_list[:10])


['shrimp', 'almonds', 'avocado', 'vegetables mix', 'green grapes', 'whole weat flour', 'yams', 'cottage cheese', 'energy drink', 'tomato juice']


In [7]:
# count the frequency of each item
from collections import Counter

item_counts = Counter(all_items_list)
print(item_counts)


Counter({'mineral water': 1788, 'eggs': 1348, 'spaghetti': 1306, 'french fries': 1282, 'chocolate': 1230, 'green tea': 991, 'milk': 972, 'ground beef': 737, 'frozen vegetables': 715, 'pancakes': 713, 'burgers': 654, 'cake': 608, 'cookies': 603, 'escalope': 595, 'low fat yogurt': 574, 'shrimp': 536, 'tomatoes': 513, 'olive oil': 494, 'frozen smoothie': 475, 'turkey': 469, 'chicken': 450, 'whole wheat rice': 439, 'grated cheese': 393, 'cooking oil': 383, 'soup': 379, 'herb & pepper': 371, 'honey': 356, 'champagne': 351, 'fresh bread': 323, 'salmon': 319, 'brownies': 253, 'avocado': 250, 'hot dogs': 243, 'cottage cheese': 239, 'tomato juice': 228, 'butter': 226, 'whole wheat pasta': 221, 'red wine': 211, 'yogurt cake': 205, 'light mayo': 204, 'energy bar': 203, 'ham': 203, 'energy drink': 200, 'pepper': 199, 'vegetables mix': 193, 'cereals': 193, 'muffins': 181, 'oil': 173, 'french wine': 169, 'fresh tuna': 167, 'strawberries': 160, 'meatballs': 157, 'almonds': 153, 'parmesan cheese': 149

In [8]:
items_freq = pd.DataFrame(item_counts.items(), columns=["item", "count"]).sort_values(
    "count", ascending=False
)
items_freq.head()


Unnamed: 0,item,count
14,mineral water,1788
22,eggs,1348
34,spaghetti,1306
29,french fries,1282
39,chocolate,1230


In [9]:
# plot the top 40 most frequent items using plotly express

fig = px.bar(
    items_freq[:40],
    x="item",
    y="count",
    color="count",
    title="Top 40 most frequent items",
    text_auto=True,
)
fig.show()


In [10]:
items_freq.iloc[-41:-1,:].shape

(40, 2)

In [11]:
# plot the top 40 least frequent items using plotly express

fig = px.bar(
    items_freq.tail(40),
    x="item",
    y="count",
    color="count",
    title="Top 40 least frequent items",
    text_auto=True,
)
fig.show()

In [12]:
print(f"Total number of transactions: {len(all_items)}")
print(f"Total number of unique items: {len(item_counts)}")


Total number of transactions: 7501
Total number of unique items: 120


In [13]:
print(all_items[0])


['shrimp', 'almonds', 'avocado', 'vegetables mix', 'green grapes', 'whole weat flour', 'yams', 'cottage cheese', 'energy drink', 'tomato juice', 'low fat yogurt', 'green tea', 'honey', 'salad', 'mineral water', 'salmon', 'antioxydant juice', 'frozen smoothie', 'spinach', 'olive oil']


In [14]:
# one hot encode the data
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_data = te.fit(all_items).transform(all_items)
freq_data = pd.DataFrame(te_data, columns=te.columns_)
freq_data.head()


Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,True,True,False,True,False,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [15]:
# calculate the support of each item

support = (
    freq_data.mean()
    .reset_index(name="support")
    .rename(columns={"index": "item"})
    .sort_values(by="support", ascending=False)
    .set_index("item")
    .drop("nan", axis=0)
    .reset_index()
)
support.head(10)


Unnamed: 0,item,support
0,mineral water,0.238368
1,eggs,0.179709
2,spaghetti,0.17411
3,french fries,0.170911
4,chocolate,0.163845
5,green tea,0.132116
6,milk,0.129583
7,ground beef,0.098254
8,frozen vegetables,0.095321
9,pancakes,0.095054


In [16]:
support_items_thresh = support[support['support'] >= 0.05]['item'].to_list()
print(support_items_thresh)

['mineral water', 'eggs', 'spaghetti', 'french fries', 'chocolate', 'green tea', 'milk', 'ground beef', 'frozen vegetables', 'pancakes', 'burgers', 'cake', 'cookies', 'escalope', 'low fat yogurt', 'shrimp', 'tomatoes', 'olive oil', 'frozen smoothie', 'turkey', 'chicken', 'whole wheat rice', 'grated cheese', 'cooking oil', 'soup']


In [17]:
# make all the combinations of items

from itertools import combinations

# rules = list(combinations(set(all_items_list), 2))
rules = list(combinations(support_items_thresh, 2))
print(rules[:5])

[('mineral water', 'eggs'), ('mineral water', 'spaghetti'), ('mineral water', 'french fries'), ('mineral water', 'chocolate'), ('mineral water', 'green tea')]


In [18]:
freq_data[list(('mineral water', 'eggs'))]

Unnamed: 0,mineral water,eggs
0,True,False
1,False,True
2,False,False
3,False,False
4,True,False
...,...,...
7496,False,False
7497,False,True
7498,False,False
7499,False,False


In [19]:
freq_data[list(('mineral water', 'eggs'))].all(axis=1)

0       False
1       False
2       False
3       False
4       False
        ...  
7496    False
7497    False
7498    False
7499    False
7500    False
Length: 7501, dtype: bool

In [20]:
freq_data[list(('mineral water', 'eggs'))].all(axis=1).sum()

382

In [21]:
382 / 7501

0.05092654312758299

In [22]:
rules[:2]

[('mineral water', 'eggs'), ('mineral water', 'spaghetti')]

In [23]:
# create support and freq table for each rule

rules_freq_support = pd.DataFrame(columns=["item", "freq_AB", "support_AB"])


for i, rule in enumerate(rules):
    freq_AB = freq_data[list(rule)].all(axis=1).sum()
    rules_freq_support.loc[i, "freq_AB"] = freq_AB
    
    support_AB = freq_data[list(rule)].all(axis=1).mean()
    rules_freq_support.loc[i, "support_AB"] = support_AB

rules_freq_support["item"] = rules
rules_freq_support.head()


Unnamed: 0,item,freq_AB,support_AB
0,"(mineral water, eggs)",382,0.050927
1,"(mineral water, spaghetti)",448,0.059725
2,"(mineral water, french fries)",253,0.033729
3,"(mineral water, chocolate)",395,0.05266
4,"(mineral water, green tea)",233,0.031063


### Other Metrics

- Confidence: How likely is it if A is bought, B will be bought?
  $$ \text{confidence}(A \Rightarrow B) = \frac{\text{support}(A \Rightarrow B)}{\text{support}(A)}$$
  $$ \text{confidence}(B \Rightarrow A) = \frac{\text{support}(A \Rightarrow B)}{\text{support}(B)}$$

- Lift: How likely is it if A is bought, B will be bought? (Normalized by support of B)
  $$ \text{lift}(A \Rightarrow B) = \frac{\text{support}(A \Rightarrow B)}{\text{support}(A) \times \text{support}(B)}$$
  
  Note: Lift is symmetric, i.e. $\text{lift}(A \Rightarrow B) = \text{lift}(B \Rightarrow A)$

- Conviction: How likely is it if A is bought, B will be bought? (Normalized by support of B and confidence of A)
  $$ \text{conviction}(A \Rightarrow B) = \frac{1 - \text{support}(B)}{1 - \text{confidence}(A \Rightarrow B)}$$
  $$ \text{conviction}(B \Rightarrow A) = \frac{1 - \text{support}(A)}{1 - \text{confidence}(A \Rightarrow B)}$$
  
  Note: Conviction is asymmetric, i.e. $\text{conviction}(A \Rightarrow B) \neq \text{conviction}(B \Rightarrow A)$

In [24]:
# rules_freq_support['item'].head(10)

In [25]:
# freq_data.head()

In [26]:
# freq_data['mineral water'].sum()

In [27]:
metrics_df = rules_freq_support.copy()

In [28]:
metrics_df["freq_A"] = metrics_df["item"].apply(lambda x: freq_data[x[0]].sum())
metrics_df["freq_B"] = metrics_df["item"].apply(lambda x: freq_data[x[1]].sum())

metrics_df["support_A"] = metrics_df["item"].apply(lambda x: freq_data[x[0]].mean())
metrics_df["support_B"] = metrics_df["item"].apply(lambda x: freq_data[x[1]].mean())

metrics_df.head()


Unnamed: 0,item,freq_AB,support_AB,freq_A,freq_B,support_A,support_B
0,"(mineral water, eggs)",382,0.050927,1788,1348,0.238368,0.179709
1,"(mineral water, spaghetti)",448,0.059725,1788,1306,0.238368,0.17411
2,"(mineral water, french fries)",253,0.033729,1788,1282,0.238368,0.170911
3,"(mineral water, chocolate)",395,0.05266,1788,1229,0.238368,0.163845
4,"(mineral water, green tea)",233,0.031063,1788,991,0.238368,0.132116


In [29]:
# confidence
metrics_df['confidence_AB'] = metrics_df['support_AB'] / metrics_df['support_A']
metrics_df['confidence_BA'] = metrics_df['support_AB'] / metrics_df['support_B']

# lift
metrics_df['lift'] = metrics_df['support_AB'] / (metrics_df['support_A'] * metrics_df['support_B'])

# conviction
metrics_df['conviction_AB'] = (1 - metrics_df['support_B']) / (1 - metrics_df['confidence_AB'])
metrics_df['conviction_BA'] = (1 - metrics_df['support_A']) / (1 - metrics_df['confidence_BA'])

metrics_df.head()

Unnamed: 0,item,freq_AB,support_AB,freq_A,freq_B,support_A,support_B,confidence_AB,confidence_BA,lift,conviction_AB,conviction_BA
0,"(mineral water, eggs)",382,0.050927,1788,1348,0.238368,0.179709,0.213647,0.283383,1.188845,1.043158,1.062815
1,"(mineral water, spaghetti)",448,0.059725,1788,1306,0.238368,0.17411,0.250559,0.343032,1.439085,1.102008,1.159314
2,"(mineral water, french fries)",253,0.033729,1788,1282,0.238368,0.170911,0.141499,0.197348,0.827912,0.965741,0.948894
3,"(mineral water, chocolate)",395,0.05266,1788,1229,0.238368,0.163845,0.220917,0.3214,1.348332,1.073256,1.122357
4,"(mineral water, green tea)",233,0.031063,1788,991,0.238368,0.132116,0.130313,0.235116,0.986357,0.997927,0.995748


In [30]:
metrics_df.sort_values(by='lift', ascending=False).head(10)

Unnamed: 0,item,freq_AB,support_AB,freq_A,freq_B,support_A,support_B,confidence_AB,confidence_BA,lift,conviction_AB,conviction_BA
278,"(olive oil, soup)",67,0.008932,494,379,0.065858,0.050527,0.135628,0.176781,2.68428,1.098454,1.134743
171,"(frozen vegetables, tomatoes)",121,0.016131,715,513,0.095321,0.068391,0.169231,0.235867,2.474464,1.121381,1.18393
170,"(frozen vegetables, shrimp)",125,0.016664,715,536,0.095321,0.071457,0.174825,0.233209,2.446574,1.125268,1.179825
146,"(milk, soup)",114,0.015198,972,379,0.129583,0.050527,0.117284,0.300792,2.321232,1.075627,1.244861
255,"(shrimp, tomatoes)",84,0.011199,536,513,0.071457,0.068391,0.156716,0.163743,2.291481,1.10474,1.110355
51,"(spaghetti, ground beef)",294,0.039195,1306,737,0.17411,0.098254,0.225115,0.398915,2.291162,1.163716,1.373997
292,"(chicken, cooking oil)",51,0.006799,450,383,0.059992,0.05106,0.113333,0.133159,2.219617,1.070233,1.084407
161,"(ground beef, grated cheese)",85,0.011332,737,393,0.098254,0.052393,0.115332,0.216285,2.201294,1.071145,1.150605
156,"(ground beef, olive oil)",106,0.014131,737,494,0.098254,0.065858,0.143826,0.214575,2.183889,1.091066,1.1481
297,"(grated cheese, cooking oil)",43,0.005733,393,383,0.052393,0.05106,0.109415,0.112272,2.142872,1.065524,1.067451


In [50]:
metrics_df[metrics_df['item'] == ('mineral water', 'spaghetti')]

Unnamed: 0,item,freq_AB,support_AB,freq_A,freq_B,support_A,support_B,confidence_AB,confidence_BA,lift,conviction_AB,conviction_BA
1,"(mineral water, spaghetti)",448,0.059725,1788,1306,0.238368,0.17411,0.250559,0.343032,1.439085,1.102008,1.159314


In [31]:
from mlxtend.frequent_patterns import apriori 

In [33]:
# help(apriori)

In [46]:
frequent_itemsets = apriori(
    freq_data.drop("nan", axis=1), min_support=0.05, use_colnames=True, max_len=3
)
frequent_itemsets.sort_values(by="support", ascending=False).sample(10)


Unnamed: 0,support,itemsets
8,0.170911,(french fries)
18,0.095054,(pancakes)
25,0.05266,"(mineral water, chocolate)"
16,0.238368,(mineral water)
22,0.068391,(tomatoes)
20,0.050527,(soup)
3,0.163845,(chocolate)
0,0.087188,(burgers)
27,0.059725,"(spaghetti, mineral water)"
23,0.062525,(turkey)


In [47]:
from mlxtend.frequent_patterns import association_rules

# help(association_rules)

In [49]:
association_rules(frequent_itemsets, metric="lift", min_threshold=1).sort_values(
    by="lift", ascending=False
).head(10)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
4,(spaghetti),(mineral water),0.17411,0.238368,0.059725,0.343032,1.439085,0.018223,1.159314
5,(mineral water),(spaghetti),0.238368,0.17411,0.059725,0.250559,1.439085,0.018223,1.102008
1,(chocolate),(mineral water),0.163845,0.238368,0.05266,0.3214,1.348332,0.013604,1.122357
0,(mineral water),(chocolate),0.238368,0.163845,0.05266,0.220917,1.348332,0.013604,1.073256
2,(eggs),(mineral water),0.179709,0.238368,0.050927,0.283383,1.188845,0.00809,1.062815
3,(mineral water),(eggs),0.238368,0.179709,0.050927,0.213647,1.188845,0.00809,1.043158
