In [1]:
import pandas as pd

import plotly.express as px


In [2]:
data = pd.read_csv("Market_Basket_Optimisation.csv", header=None)
print(data.shape)
data.head(10)


(7501, 20)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,
5,low fat yogurt,,,,,,,,,,,,,,,,,,,
6,whole wheat pasta,french fries,,,,,,,,,,,,,,,,,,
7,soup,light cream,shallot,,,,,,,,,,,,,,,,,
8,frozen vegetables,spaghetti,green tea,,,,,,,,,,,,,,,,,
9,french fries,,,,,,,,,,,,,,,,,,,


In [3]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7501 entries, 0 to 7500
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       7501 non-null   object
 1   1       5747 non-null   object
 2   2       4389 non-null   object
 3   3       3345 non-null   object
 4   4       2529 non-null   object
 5   5       1864 non-null   object
 6   6       1369 non-null   object
 7   7       981 non-null    object
 8   8       654 non-null    object
 9   9       395 non-null    object
 10  10      256 non-null    object
 11  11      154 non-null    object
 12  12      87 non-null     object
 13  13      47 non-null     object
 14  14      25 non-null     object
 15  15      8 non-null      object
 16  16      4 non-null      object
 17  17      4 non-null      object
 18  18      3 non-null      object
 19  19      1 non-null      object
dtypes: object(20)
memory usage: 1.1+ MB


{bread} => {butter}


- Create a frequency table of all items
- Identify items that are significant, i.e support(item) $\geq$ support threshold
  $$ \text{support}(A \Rightarrow B) = P(A U B)$$
  $$ \text{support} = \frac{\text{number of transaction with item(s)}}{\text{total number of transactions}}$$
- Make all possible combinations of items that are significant
- Take frequency of each combination
- Pass only sigificant combinations to the next iteration
- Take frequency of three item set (self join rule)


In [4]:
all_items = data.values.astype(str).tolist()
print(all_items[2])


['chutney', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan']


In [5]:
# flatten all_items
all_items_list = [item for sublist in all_items for item in sublist if item != "nan"]
print(all_items_list[:10])


['shrimp', 'almonds', 'avocado', 'vegetables mix', 'green grapes', 'whole weat flour', 'yams', 'cottage cheese', 'energy drink', 'tomato juice']


In [6]:
# count the frequency of each item
from collections import Counter

item_counts = Counter(all_items_list)
print(item_counts)


Counter({'mineral water': 1788, 'eggs': 1348, 'spaghetti': 1306, 'french fries': 1282, 'chocolate': 1230, 'green tea': 991, 'milk': 972, 'ground beef': 737, 'frozen vegetables': 715, 'pancakes': 713, 'burgers': 654, 'cake': 608, 'cookies': 603, 'escalope': 595, 'low fat yogurt': 574, 'shrimp': 536, 'tomatoes': 513, 'olive oil': 494, 'frozen smoothie': 475, 'turkey': 469, 'chicken': 450, 'whole wheat rice': 439, 'grated cheese': 393, 'cooking oil': 383, 'soup': 379, 'herb & pepper': 371, 'honey': 356, 'champagne': 351, 'fresh bread': 323, 'salmon': 319, 'brownies': 253, 'avocado': 250, 'hot dogs': 243, 'cottage cheese': 239, 'tomato juice': 228, 'butter': 226, 'whole wheat pasta': 221, 'red wine': 211, 'yogurt cake': 205, 'light mayo': 204, 'energy bar': 203, 'ham': 203, 'energy drink': 200, 'pepper': 199, 'vegetables mix': 193, 'cereals': 193, 'muffins': 181, 'oil': 173, 'french wine': 169, 'fresh tuna': 167, 'strawberries': 160, 'meatballs': 157, 'almonds': 153, 'parmesan cheese': 149

In [7]:
items_freq = pd.DataFrame(item_counts.items(), columns=["item", "count"]).sort_values(
    "count", ascending=False
)
items_freq.head()


Unnamed: 0,item,count
14,mineral water,1788
22,eggs,1348
34,spaghetti,1306
29,french fries,1282
39,chocolate,1230


In [8]:
# plot the top 40 most frequent items using plotly express

fig = px.bar(
    items_freq[:40],
    x="item",
    y="count",
    color="count",
    title="Top 40 most frequent items",
    text_auto=True,
)
fig.show()


In [9]:
print(f"Total number of transactions: {len(all_items)}")
print(f"Total number of unique items: {len(item_counts)}")


Total number of transactions: 7501
Total number of unique items: 120


In [10]:
print(all_items[0])


['shrimp', 'almonds', 'avocado', 'vegetables mix', 'green grapes', 'whole weat flour', 'yams', 'cottage cheese', 'energy drink', 'tomato juice', 'low fat yogurt', 'green tea', 'honey', 'salad', 'mineral water', 'salmon', 'antioxydant juice', 'frozen smoothie', 'spinach', 'olive oil']


In [11]:
# one hot encode the data
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_data = te.fit(all_items).transform(all_items)
df = pd.DataFrame(te_data, columns=te.columns_)
df.head()


Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,True,True,False,True,False,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [12]:
# calculate the support of each item

support = (
    df.mean()
    .reset_index(name="support")
    .rename(columns={"index": "item"})
    .sort_values(by="support", ascending=False)
    .set_index("item")
    .drop("nan", axis=0)
    .reset_index()
)
support.head(10)


Unnamed: 0,item,support
0,mineral water,0.238368
1,eggs,0.179709
2,spaghetti,0.17411
3,french fries,0.170911
4,chocolate,0.163845
5,green tea,0.132116
6,milk,0.129583
7,ground beef,0.098254
8,frozen vegetables,0.095321
9,pancakes,0.095054


In [13]:
# make all the combinations of items

from itertools import combinations

# rules = list(combinations(set(all_items_list), 2))
rules = list(combinations(items_freq['item'], 2))
print(rules[:5])

[('mineral water', 'eggs'), ('mineral water', 'spaghetti'), ('mineral water', 'french fries'), ('mineral water', 'chocolate'), ('mineral water', 'green tea')]


In [53]:
list(combinations(['veronica', 'emmanuel', 'uthman', 'esther'], 3))

[('veronica', 'emmanuel', 'uthman'),
 ('veronica', 'emmanuel', 'esther'),
 ('veronica', 'uthman', 'esther'),
 ('emmanuel', 'uthman', 'esther')]