# Exercise 2: Implement the A-Priori algorithm

Implement a version of the A-Priori algorithm on your own. You may assume your data is given as a list of baskets.

In [1]:
import pandas as pd
import numpy as np

In [2]:
### read a csv file and extract the shopping items list

df = pd.read_csv('Groceries_dataset.csv')
baskets = [list(set(a[1]['itemDescription'].tolist())) for a in list(df.groupby('Member_number'))]
baskets

### total size of baskets

# len(list(np.concatenate(baskets).flat))

[['sausage',
  'canned beer',
  'soda',
  'salty snack',
  'pickled vegetables',
  'hygiene articles',
  'whole milk',
  'yogurt',
  'pastry',
  'semi-finished bread',
  'misc. beverages'],
 ['white bread',
  'sausage',
  'curd',
  'frankfurter',
  'soda',
  'rolls/buns',
  'whipped/sour cream',
  'beef',
  'whole milk'],
 ['butter',
  'butter milk',
  'other vegetables',
  'tropical fruit',
  'frozen vegetables',
  'whole milk',
  'specialty chocolate',
  'sugar'],
 ['sausage',
  'detergent',
  'dental care',
  'root vegetables',
  'rolls/buns',
  'frozen meals'],
 ['pastry',
  'pip fruit',
  'other vegetables',
  'frozen fish',
  'tropical fruit',
  'canned beer',
  'chocolate',
  'root vegetables',
  'rolls/buns',
  'packaged fruit/vegetables',
  'hygiene articles',
  'shopping bags',
  'whole milk',
  'cling film/bags',
  'red/blush wine',
  'dish cleaner'],
 ['whipped/sour cream', 'margarine', 'rolls/buns'],
 ['chicken',
  'rice',
  'bottled water',
  'frankfurter',
  'chocolate',

In [3]:
### unique #items

items = set(list(np.concatenate(baskets).flat))
len(items)

167

In [4]:
### hash all singletons
df_item_hash = pd.DataFrame(range(len(items)), index = list(items), columns =['hashcode'], dtype=int)
df_item_hash

Unnamed: 0,hashcode
UHT-milk,0
condensed milk,1
cling film/bags,2
baby cosmetics,3
female sanitary products,4
...,...
cream,162
yogurt,163
chewing gum,164
flower (seeds),165


In [5]:
### count the items, store the count into the hashed array index

# item_count = pd.DataFrame(np.zeros((len(items),1)), index = list(items), columns =['count'], dtype=int)
item_count_arr = np.zeros((len(items),1))

for b in baskets:
    for item in b:
            idx = df_item_hash.loc[item,'hashcode']
            item_count_arr[idx] += 1
            
### find frequent items with support > s1 (here s1 = 0.02), and hash back from array index to items           
freq_items  = [df_item_hash[df_item_hash['hashcode']==x].index[0] for x in np.where(item_count_arr > 0.02*len(baskets))[0]] 
freq_items

# item_count_arr[item_count['count']>0.02*len(baskets)]
#freq_items['hashcode'] = list(range(1,len(freq_items)+1))

['UHT-milk',
 'condensed milk',
 'semi-finished bread',
 'butter milk',
 'grapes',
 'brown bread',
 'frozen fish',
 'ham',
 'hard cheese',
 'canned beer',
 'berries',
 'citrus fruit',
 'pastry',
 'frankfurter',
 'coffee',
 'packaged fruit/vegetables',
 'pasta',
 'whipped/sour cream',
 'dessert',
 'frozen meals',
 'domestic eggs',
 'turkey',
 'ice cream',
 'white wine',
 'oil',
 'frozen dessert',
 'misc. beverages',
 'napkins',
 'cake bar',
 'cat food',
 'tropical fruit',
 'flour',
 'soft cheese',
 'hamburger meat',
 'whole milk',
 'margarine',
 'specialty chocolate',
 'pet care',
 'other vegetables',
 'seasonal products',
 'rolls/buns',
 'shopping bags',
 'onions',
 'butter',
 'herbs',
 'long life bakery product',
 'newspapers',
 'salty snack',
 'bottled beer',
 'pork',
 'red/blush wine',
 'chicken',
 'detergent',
 'beverages',
 'bottled water',
 'specialty bar',
 'roll products ',
 'canned vegetables',
 'root vegetables',
 'pot plants',
 'beef',
 'curd',
 'baking powder',
 'sliced che

In [6]:
### hash the frequent items (starting from 1)

df_freq_item_hash = pd.DataFrame(range(1,len(freq_items)+1), index=freq_items, columns=['hashcode'])
df_freq_item_hash

Unnamed: 0,hashcode
UHT-milk,1
condensed milk,2
semi-finished bread,3
butter milk,4
grapes,5
...,...
pickled vegetables,83
fruit/vegetable juice,84
yogurt,85
chewing gum,86


In [7]:
### triangular array encode function, (not used)
# def triangular_encode(i,j,n):
#     return int((i-1)*(n-i/2)+j-i)

In [8]:
### count the pairs using only frequent items, store the count into the (triangular) matrix.

# pair_mat = pd.DataFrame(np.zeros((len(freq_items.index),len(freq_items.index))), 
#                         columns=freq_items.index, index=freq_items.index,
#                        dtype=int)

pair_mat_hashed = np.zeros((len(freq_items)+1,len(freq_items)+1))
# n = len(freq_items)
# triangular_arr = np.zeros((n*n,))


for b in baskets:
    cand_list = [item for item in b if item in freq_items]
    if len(cand_list)<2:
        continue
    for idx, item1 in enumerate(cand_list):
        for item2 in cand_list[idx+1:]:
            i = df_freq_item_hash.loc[item1,'hashcode'] 
            j = df_freq_item_hash.loc[item2,'hashcode'] 
            #triangular_arr[triangular_encode(i,j,n)] +=1
            #pair_mat.loc[item1, item2] += 1
            pair_mat_hashed[max(i,j),min(i,j)]+=1

# pair_mat
pair_mat_hashed

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  5.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0., 91., 34., ...,  0.,  0.,  0.],
       [ 0., 15., 10., ..., 69.,  0.,  0.],
       [ 0., 25.,  6., ..., 82., 15.,  0.]])

In [9]:
### extract frequent pairs that exceed support s2 (assume s2 = 0.02), and hash back.

freq_pairs = [[df_freq_item_hash[df_freq_item_hash['hashcode']==x].index[0], df_freq_item_hash[df_freq_item_hash['hashcode']==y].index[0]] for x, y in zip(*np.where(pair_mat_hashed > 0.02*len(baskets)))]
freq_pairs
# freq_pairs = [[freq_itemset[x], freq_itemset[y]] for x, y in zip(*np.where(pair_mat.values > 0.02*len(baskets)))]

[['canned beer', 'brown bread'],
 ['citrus fruit', 'brown bread'],
 ['citrus fruit', 'canned beer'],
 ['pastry', 'brown bread'],
 ['pastry', 'canned beer'],
 ['pastry', 'citrus fruit'],
 ['frankfurter', 'brown bread'],
 ['frankfurter', 'canned beer'],
 ['frankfurter', 'citrus fruit'],
 ['frankfurter', 'pastry'],
 ['coffee', 'canned beer'],
 ['coffee', 'citrus fruit'],
 ['coffee', 'pastry'],
 ['coffee', 'frankfurter'],
 ['whipped/sour cream', 'brown bread'],
 ['whipped/sour cream', 'canned beer'],
 ['whipped/sour cream', 'citrus fruit'],
 ['whipped/sour cream', 'pastry'],
 ['whipped/sour cream', 'frankfurter'],
 ['domestic eggs', 'brown bread'],
 ['domestic eggs', 'canned beer'],
 ['domestic eggs', 'citrus fruit'],
 ['domestic eggs', 'pastry'],
 ['domestic eggs', 'frankfurter'],
 ['domestic eggs', 'coffee'],
 ['domestic eggs', 'whipped/sour cream'],
 ['tropical fruit', 'UHT-milk'],
 ['tropical fruit', 'brown bread'],
 ['tropical fruit', 'canned beer'],
 ['tropical fruit', 'berries'],
 [

In [10]:
len(freq_pairs)

499

# Exercise 3: Use built in tools
Use/import the following Python packages: Pandas and MLxtend.  
Especially, have a look at apriori and association rules from mlxtend.frequent patterns.  
For documentation see: http://rasbt.github.io/mlxtend/

If helpful / desirable you might also use TransactionEncoder from mlxtend.preprocessing to clean / prepare your data.

The task: determine:
1. the frequent pairs of items.
2. the association rules of high confidence with or w/o high lift.
3. (optional) the association rules of high confidence with or w/o high interest. (optional)

In [11]:
# ! pip install mlxtend

In [12]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

**Solutions**  
1. The frequent pairs of items.  
**I'm mainly refering to this documentation examples:** [reference](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/#apriori-frequent-itemsets-via-the-apriori-algorithm)

In [13]:
te = TransactionEncoder()
te_ary = te.fit(baskets).transform(baskets)
df_one_hot = pd.DataFrame(te_ary, columns=te.columns_)
df_one_hot

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,False,False,False,False,False,False,False,False,True,False,...,False,False,False,True,False,True,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3893,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3894,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,False,False
3895,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3896,False,False,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,True,False


In [14]:
frq_items = apriori(df_one_hot, min_support = 0.02, use_colnames = True)
frq_items['length'] = frq_items['itemsets'].apply(lambda x: len(x))
frq_items

Unnamed: 0,support,itemsets,length
0,0.078502,(UHT-milk),1
1,0.031042,(baking powder),1
2,0.119548,(beef),1
3,0.079785,(berries),1
4,0.062083,(beverages),1
...,...,...,...
889,0.027963,"(whole milk, soda, other vegetables, yogurt)",4
890,0.021293,"(tropical fruit, whole milk, other vegetables,...",4
891,0.021036,"(sausage, soda, whole milk, rolls/buns)",4
892,0.022832,"(sausage, yogurt, whole milk, rolls/buns)",4


In [15]:
### reformat a little, to put the frozenset into lists

ml_freq_items = []
for i in frq_items[frq_items['length']==1].itemsets.values:
    ml_frq_items.extend(list(i))
    
ml_freq_pairs = []
for i in frq_items[frq_items['length']==2].itemsets.values:
    ml_frq_pairs.append(list(i))

### check if the frequent itemsets found by ourselves and mlxtend are the same
for i in ml_frq_items:
    if i not in freq_items:
        print(i)
        
len(ml_frq_pairs)==len(freq_pairs)

NameError: name 'ml_frq_items' is not defined

2. the association rules of high confidence with or w/o high lift.  
[doc example](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/#association_rules-association-rules-generation-from-frequent-itemsets)

In [None]:
association_rules(frq_items, metric="confidence", min_threshold=0.6)

In [None]:
association_rules(frq_items, metric="lift", min_threshold=1.2)