## Introduction
For small datasets, this notebook explores two packages:
* efficient_apriori: the input is a list of tuples (one tuple of items per transaction), so no data transformation is needed.
* mlxtend: the input must be one-hot encoded, i.e. a DataFrame indexed by transaction that contains only item columns.
* For big datasets, we could use FP-Growth instead.

In [91]:
from efficient_apriori import apriori
from icecream import ic
import pandas as pd
pd.options.display.max_columns=100

## Case1: Simple basket show case

In [23]:
# Five toy baskets; each tuple lists the items of one transaction.
transactions = [
    ('milk', 'bread', 'diaper'),
    ('coca', 'bread', 'diaper', 'beer'),
    ('milk', 'diaper', 'beer', 'egg'),
    ('bread', 'milk', 'diaper', 'beer'),
    ('milk', 'bread', 'diaper', 'coca'),
]
# Keep itemsets with support >= 0.5 and rules with confidence 1.
itemsets, rules = apriori(
    transactions,
    min_support=0.5,
    min_confidence=1,
)
print(f'Frequent Itemset:\r\n{itemsets}')
print(f'Association rules:\r\n{rules}')

Frequent Itemset:
{1: {('bread',): 4, ('diaper',): 5, ('milk',): 4, ('beer',): 3}, 2: {('bread', 'diaper'): 4, ('bread', 'milk'): 3, ('diaper', 'milk'): 4, ('beer', 'diaper'): 3}, 3: {('bread', 'diaper', 'milk'): 3}}
Association rules:
[{bread} -> {diaper}, {milk} -> {diaper}, {beer} -> {diaper}, {bread, milk} -> {diaper}]


## Case2: BreadBasket
The raw file has a header row with the columns `Date`, `Time`, `Transaction` and `Item`, so `read_csv` can use its default header handling.

In [48]:
# Forward slashes keep the relative path portable (Windows and POSIX) and avoid
# the invalid "\D" / "\B" escape sequences of the backslash spelling.
data = pd.read_csv('./Data/BreadBasket_DMS.csv')
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly rather than wrapped in print(), which would also print "None".
data.info()
print(data.head())
# Number of distinct item names in the raw data (before any cleaning).
data.Item.nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21293 entries, 0 to 21292
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Date         21293 non-null  object
 1   Time         21293 non-null  object
 2   Transaction  21293 non-null  int64 
 3   Item         21293 non-null  object
dtypes: int64(1), object(3)
memory usage: 665.5+ KB
None
         Date      Time  Transaction           Item
0  2016/10/30   9:58:11            1           NONE
1  2016/10/30  10:05:34            2   Scandinavian
2  2016/10/30  10:05:34            2   Scandinavian
3  2016/10/30  10:07:57            3  Hot chocolate
4  2016/10/30  10:07:57            3            Jam


95

In [53]:
# Lower-case every item name so items are compared case-insensitively.
data['Item'] = data['Item'].str.lower()
# Drop the rows whose item is the 'none' placeholder.
none_rows = data.index[data['Item'] == 'none']
data = data.drop(none_rows)

### Efficient_apriori
#### Create data array to meet the raw data format of efficient_apriori

In [60]:
# One-dimensional Series: the index is the transaction id, the values are item names.
items_by_transaction = data.set_index('Transaction')['Item']
# Sort by transaction id so rows of the same transaction are adjacent.
orders = items_by_transaction.sort_index()
ic(type(orders))
orders.head(7)

ic| type(orders): <class 'pandas.core.series.Series'>


Transaction
2     scandinavian
2     scandinavian
3    hot chocolate
3              jam
3          cookies
4           muffin
5           coffee
Name: Item, dtype: object

In [75]:
# Build one set of distinct items per transaction id, ordered by transaction id.
# Grouping on the index replaces the manual "previous id" loop, which started
# from a sentinel id of 0 (and so failed if a transaction was numbered 0) and
# only worked when `orders` was already sorted by transaction.
transactions = [set(basket) for _, basket in orders.groupby(level=0)]
    


In [78]:
# Mine the bakery baskets: itemsets with support >= 0.02, rules with confidence >= 0.5.
itemSet, rules = apriori(
    transactions,
    min_support=0.02,
    min_confidence=0.5,
)
print(f'Frequent Itemset:\r\n{itemSet}')
print(f'Association rules:\r\n{rules}')

Frequent Itemset:
{1: {('scandinavian',): 275, ('cookies',): 515, ('hot chocolate',): 552, ('muffin',): 364, ('pastry',): 815, ('bread',): 3096, ('coffee',): 4528, ('medialuna',): 585, ('tea',): 1350, ('farm house',): 371, ('juice',): 365, ('soup',): 326, ('cake',): 983, ('sandwich',): 680, ('alfajores',): 344, ('brownie',): 379, ('truffles',): 192, ('toast',): 318, ('scone',): 327}, 2: {('bread', 'coffee'): 852, ('bread', 'pastry'): 276, ('coffee', 'pastry'): 450, ('coffee', 'medialuna'): 333, ('coffee', 'tea'): 472, ('bread', 'tea'): 266, ('coffee', 'juice'): 195, ('coffee', 'hot chocolate'): 280, ('coffee', 'cookies'): 267, ('cake', 'coffee'): 518, ('cake', 'tea'): 225, ('bread', 'cake'): 221, ('coffee', 'sandwich'): 362, ('coffee', 'toast'): 224}}
Association rules:
[{pastry} -> {coffee}, {medialuna} -> {coffee}, {juice} -> {coffee}, {hot chocolate} -> {coffee}, {cookies} -> {coffee}, {cake} -> {coffee}, {sandwich} -> {coffee}, {toast} -> {coffee}]


### mlxtend

In [100]:
# NOTE: this rebinds the name `apriori`, shadowing the efficient_apriori function
# imported at the top of the notebook. Re-running an earlier efficient_apriori
# cell after this one would call the mlxtend implementation instead.
from mlxtend.frequent_patterns import apriori, association_rules

In [97]:
def encode_units(x):
    """Convert an item count into a 0/1 presence flag.

    Parameters
    ----------
    x : int or float
        Number of times an item appears in a transaction.

    Returns
    -------
    int
        1 when ``x >= 1``, otherwise 0. Values strictly between 0 and 1 and
        NaN map to 0 as well, so the function never falls through and
        returns ``None``.
    """
    return 1 if x >= 1 else 0

In [122]:
# Create one hot encoding: one row per transaction, one column per item,
# holding 1 when the item occurs in the transaction and 0 otherwise.
# unstack(fill_value=0) fills the missing (transaction, item) pairs directly.
item_counts = data.groupby(['Transaction', 'Item'])['Item'].count().unstack(fill_value=0)
# Vectorised threshold instead of the element-wise DataFrame.applymap call,
# which is deprecated in recent pandas releases.
hot_encoded_df = item_counts.ge(1).astype('int64')
hot_encoded_df.head()

Item,adjustment,afternoon with the baker,alfajores,argentina night,art tray,bacon,baguette,bakewell,bare popcorn,basket,bowl nic pitt,bread,bread pudding,brioche and salami,brownie,cake,caramel bites,cherry me dried fruit,chicken sand,chicken stew,chimichurri oil,chocolates,christmas common,coffee,coffee granules,coke,cookies,crepes,crisps,drinking chocolate spoons,duck egg,dulce de leche,eggs,ella's kitchen pouches,empanadas,extra salami or feta,fairy doors,farm house,focaccia,frittata,fudge,gift voucher,gingerbread syrup,granola,hack the stack,half slice monster,hearty & seasonal,honey,hot chocolate,jam,jammie dodgers,juice,keeping it local,kids biscuit,lemon and coconut,medialuna,mighty protein,mineral water,mortimer,muesli,muffin,my-5 fruit shoot,nomad bag,olum & polenta,panatone,pastry,pick and mix bowls,pintxos,polenta,postcard,raspberry shortbread sandwich,raw bars,salad,sandwich,scandinavian,scone,siblings,smoothies,soup,spanish brunch,spread,tacos/fajita,tartine,tea,the bart,the nomad,tiffin,toast,truffles,tshirt,valentine's card,vegan feast,vegan mincepie,victorian sponge
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [107]:
# Frequent itemsets with support >= 0.02, reported with item names rather than column positions.
itemSet = apriori(
    hot_encoded_df,
    min_support=0.02,
    use_colnames=True,
)
print(f'Frequent Itemset:\r\n{itemSet}')
# Candidate rules with lift >= 1.
rules = association_rules(itemSet, metric='lift', min_threshold=1)
# Show only the rules that also reach confidence >= 0.5.
strong_rules = (rules['lift'] >= 1) & (rules['confidence'] >= 0.5)
print(f'Association rules:\r\n', rules[strong_rules])

Frequent Itemset:
     support                 itemsets
0   0.036348              (alfajores)
1   0.327134                  (bread)
2   0.040046                (brownie)
3   0.103867                   (cake)
4   0.478445                 (coffee)
5   0.054417                (cookies)
6   0.039201             (farm house)
7   0.058326          (hot chocolate)
8   0.038567                  (juice)
9   0.061813              (medialuna)
10  0.038462                 (muffin)
11  0.086116                 (pastry)
12  0.071851               (sandwich)
13  0.029057           (scandinavian)
14  0.034552                  (scone)
15  0.034446                   (soup)
16  0.142646                    (tea)
17  0.033601                  (toast)
18  0.020287               (truffles)
19  0.023352            (bread, cake)
20  0.090025          (bread, coffee)
21  0.029163          (pastry, bread)
22  0.028107             (tea, bread)
23  0.054734           (cake, coffee)
24  0.023774              (tea, 

## Case3: MovieLens

In [112]:
movie = pd.read_csv('./Data/MovieLens/movies.csv')

# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in display() only adds a stray "None" to the cell output.
movie.info()
movie.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB


None

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [121]:
# Expand the '|'-separated genres string into one 0/1 indicator column per genre.
genre_flags = movie.genres.str.get_dummies()
# Replace the original genres column with the indicator columns.
movies_hot_encoded = movie.drop(columns='genres').join(genre_flags)
movies_hot_encoded.head()

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [125]:
# Move the identifying columns into the index so only the genre indicator columns remain.
id_columns = ['movieId', 'title']
movies_hot_encoded = movies_hot_encoded.set_index(id_columns)
movies_hot_encoded.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,title,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,Jumanji (1995),0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [133]:
# Genre itemsets with support >= 0.02, most frequent first.
itemSet = (
    apriori(movies_hot_encoded, min_support=0.02, use_colnames=True)
    .sort_values(by='support', ascending=False)
)
itemSet

Unnamed: 0,support,itemsets
7,0.489185,(Drama)
4,0.306987,(Comedy)
14,0.153164,(Thriller)
12,0.151294,(Romance)
0,0.129042,(Action)
5,0.107743,(Crime)
9,0.095718,(Horror)
31,0.094325,"(Romance, Drama)"
26,0.093335,"(Comedy, Drama)"
6,0.090586,(Documentary)


In [137]:
# Rules with lift >= 2, strongest lift first.
rules = (
    association_rules(itemSet, metric='lift', min_threshold=2)
    .sort_values(by='lift', ascending=False)
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
9,(Thriller),(Mystery),0.153164,0.055503,0.029144,0.190282,3.428352,0.020643,1.166453
8,(Mystery),(Thriller),0.055503,0.153164,0.029144,0.525099,3.428352,0.020643,1.783185
15,(Crime),"(Thriller, Drama)",0.107743,0.06848,0.024965,0.231711,3.383632,0.017587,1.212461
12,"(Thriller, Drama)",(Crime),0.06848,0.107743,0.024965,0.364561,3.383632,0.017587,1.404159
7,(Action),(Adventure),0.129042,0.08538,0.035633,0.276136,3.234198,0.024616,1.263525
6,(Adventure),(Action),0.08538,0.129042,0.035633,0.417347,3.234198,0.024616,1.494813
16,(Sci-Fi),(Action),0.063898,0.129042,0.023499,0.367757,2.849906,0.015253,1.377568
17,(Action),(Sci-Fi),0.129042,0.063898,0.023499,0.182102,2.849906,0.015253,1.144523
0,(Thriller),(Crime),0.153164,0.107743,0.045165,0.294878,2.736877,0.028662,1.265394
1,(Crime),(Thriller),0.107743,0.153164,0.045165,0.41919,2.736877,0.028662,1.458027
