## mlxtend apriori algorithm for association rule mining

In [1]:
# http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
# this site has a great description of the API at the bottom

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sys

from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

### overview

In [3]:
# Apriori is a popular algorithm [1] for extracting frequent itemsets with applications in association rule learning. 
# The apriori algorithm has been designed to operate on databases containing transactions, such as purchases by 
# customers of a store. An itemset is considered as "frequent" if it meets a user-specified support threshold. For 
# instance, if the support threshold is set to 0.5 (50%), a frequent itemset is defined as a set of items that occur 
# together in at least 50% of all transactions in the database.

### example 1 - generating frequent itemsets

#### the transaction data

In [2]:
# five example market-basket transactions, one inner list per transaction
# (the last transaction lists 'Onion' twice)
dataset = [
    ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
    ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
    ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs'],
]
dataset

[['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
 ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
 ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
 ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
 ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

In [6]:
# print the built-in documentation of the TransactionEncoder class
help(TransactionEncoder)

Help on class TransactionEncoder in module mlxtend.preprocessing.transactionencoder:

class TransactionEncoder(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin)
 |  Encoder class for transaction data in Python lists
 |  
 |  Parameters
 |  ------------
 |  None
 |  
 |  Attributes
 |  ------------
 |  columns_: list
 |    List of unique names in the `X` input list of lists
 |  
 |  Examples
 |  ------------
 |  For usage examples, please see
 |  http://rasbt.github.io/mlxtend/user_guide/preprocessing/TransactionEncoder/
 |  
 |  Method resolution order:
 |      TransactionEncoder
 |      sklearn.base.BaseEstimator
 |      sklearn.base.TransformerMixin
 |      sklearn.utils._set_output._SetOutputMixin
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  fit(self, X)
 |      Learn unique column names from transaction DataFrame
 |      
 |      Parameters
 |      ------------

#### transform the data to asymmetric binary encoding and load into a pandas data frame

In [7]:
# create the mlxtend transaction encoder, let it learn the unique item names,
# and encode every transaction in one chained call
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)

# wrap the encoded array in a pandas data frame with one boolean column per item
df = pd.DataFrame(data=te_ary, columns=te.columns_)
df

Unnamed: 0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,False,False,False,True,False,True,True,True,True,False,True
1,False,False,True,True,False,True,False,True,True,False,True
2,True,False,False,True,False,True,True,False,False,False,False
3,False,True,False,False,False,True,True,False,False,True,True
4,False,True,False,True,True,True,False,False,True,False,False


#### By default, apriori returns the column indices of the items, which may be useful in downstream operations such as association rule mining. For better readability, we can set use_colnames=True to convert these integer values into the respective item names:

#### get the itemsets with at least 60% support (return column indices)

In [6]:

# minimum fraction of transactions an itemset must appear in
min_support = 0.6

# use_colnames=False is the default, spelled out here: itemsets are reported
# by column index rather than by item name
apriori(df, min_support=min_support, use_colnames=False)

Unnamed: 0,support,itemsets
0,0.8,(3)
1,1.0,(5)
2,0.6,(6)
3,0.6,(8)
4,0.6,(10)
5,0.8,"(3, 5)"
6,0.6,"(8, 3)"
7,0.6,"(5, 6)"
8,0.6,"(8, 5)"
9,0.6,"(10, 5)"


#### get the itemsets with at least 60% support (return column names)

In [9]:
# same 60% support threshold, but report each itemset by item name
min_support = 0.6
apriori(df, use_colnames=True, min_support=min_support)

Unnamed: 0,support,itemsets
0,0.8,(Eggs)
1,1.0,(Kidney Beans)
2,0.6,(Milk)
3,0.6,(Onion)
4,0.6,(Yogurt)
5,0.8,"(Kidney Beans, Eggs)"
6,0.6,"(Onion, Eggs)"
7,0.6,"(Kidney Beans, Milk)"
8,0.6,"(Kidney Beans, Onion)"
9,0.6,"(Yogurt, Kidney Beans)"


### example 2 - selecting and filtering results

In [8]:
# The advantage of working with pandas DataFrames is that we can use its convenient features to filter the results. 
# For instance, let's assume we are only interested in itemsets of length 2 that have a support of at least 80 percent. 
# First, we create the frequent itemsets via apriori and add a new column that stores the length of each itemset.

In [12]:
# get the frequent itemsets (support >= 0.6), reported by item name
frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)

# add a column holding the number of items in each itemset; `len` is passed
# directly because wrapping it in a lambda adds nothing
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

# note: every entry in the "itemsets" column is an immutable frozenset
# (check with type(frequent_itemsets['itemsets'][0]))
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.8,(Eggs),1
1,1.0,(Kidney Beans),1
2,0.6,(Milk),1
3,0.6,(Onion),1
4,0.6,(Yogurt),1
5,0.8,"(Kidney Beans, Eggs)",2
6,0.6,"(Onion, Eggs)",2
7,0.6,"(Kidney Beans, Milk)",2
8,0.6,"(Kidney Beans, Onion)",2
9,0.6,"(Yogurt, Kidney Beans)",2


In [10]:
# keep only the itemsets that contain exactly 2 items and have support >= 0.8
is_pair = frequent_itemsets['length'] == 2
is_well_supported = frequent_itemsets['support'] >= 0.8
frequent_itemsets[is_pair & is_well_supported]

Unnamed: 0,support,itemsets,length
5,0.8,"(Eggs, Kidney Beans)",2


In [11]:
# select the row whose itemset is exactly {Onion, Eggs}; item order is irrelevant
frequent_itemsets[frequent_itemsets['itemsets'] == frozenset(('Onion', 'Eggs'))]

Unnamed: 0,support,itemsets,length
6,0.6,"(Eggs, Onion)",2


#### Python frozenset type

In [12]:
# Note that the entries in the "itemsets" column are of type frozenset, which is a built-in Python type that is
# similar to a Python set but immutable, which makes it more efficient for certain query or comparison operations
# (https://docs.python.org/3.6/library/stdtypes.html#frozenset).

# Since frozensets are sets, the item order does not matter. i.e., the query

# frequent_itemsets[ frequent_itemsets['itemsets'] == {'Onion', 'Eggs'} ]

# is equivalent to any of the following three

#    frequent_itemsets[ frequent_itemsets['itemsets'] == {'Eggs', 'Onion'} ]
#    frequent_itemsets[ frequent_itemsets['itemsets'] == frozenset(('Eggs', 'Onion')) ]
#    frequent_itemsets[ frequent_itemsets['itemsets'] == frozenset(('Onion', 'Eggs')) ]

# Note that this comparison is case sensitive
#    frequent_itemsets[ frequent_itemsets['itemsets'] == frozenset(('onion', 'Eggs')) ] returns an empty data frame

### example 3 - working with sparse representations

In [13]:
# To save memory, you may want to represent your transaction data in the sparse format. This is especially useful if 
# you have lots of products and small transactions.

In [26]:
# refit the existing encoder, then request the encoding as a scipy sparse matrix
te.fit(dataset)
oht_ary = te.transform(dataset, sparse=True)
print('oht_ary:\n', oht_ary, sep='')
print('\ntype(oht_ary):', type(oht_ary))

# build a pandas data frame with sparse columns directly from the sparse matrix
sparse_df = pd.DataFrame.sparse.from_spmatrix(
    oht_ary,
    columns=te.columns_,
)
print('\ntype(sparse_df):', type(sparse_df))
print('\nsparse_df.dtypes:\n', sparse_df.dtypes, sep='')

sparse_df

oht_ary:
  (0, 8)	True
  (0, 6)	True
  (0, 10)	True
  (0, 7)	True
  (0, 5)	True
  (0, 3)	True
  (1, 2)	True
  (1, 8)	True
  (1, 10)	True
  (1, 7)	True
  (1, 5)	True
  (1, 3)	True
  (2, 5)	True
  (2, 3)	True
  (2, 0)	True
  (2, 6)	True
  (3, 1)	True
  (3, 6)	True
  (3, 10)	True
  (3, 5)	True
  (3, 9)	True
  (4, 8)	True
  (4, 1)	True
  (4, 5)	True
  (4, 3)	True
  (4, 4)	True

type(oht_ary): <class 'scipy.sparse._csr.csr_matrix'>

type(sparse_df): <class 'pandas.core.frame.DataFrame'>

sparse_df.dtypes:
Apple           Sparse[bool, 0]
Corn            Sparse[bool, 0]
Dill            Sparse[bool, 0]
Eggs            Sparse[bool, 0]
Ice cream       Sparse[bool, 0]
Kidney Beans    Sparse[bool, 0]
Milk            Sparse[bool, 0]
Nutmeg          Sparse[bool, 0]
Onion           Sparse[bool, 0]
Unicorn         Sparse[bool, 0]
Yogurt          Sparse[bool, 0]
dtype: object


Unnamed: 0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,0,0,0,1,0,True,1,1,1,0,1
1,0,0,1,1,0,True,0,1,1,0,1
2,1,0,0,1,0,True,1,0,0,0,0
3,0,1,0,0,0,True,1,0,0,1,1
4,0,1,0,1,1,True,0,0,1,0,0


In [20]:
# show the sparse matrix summary (shape, element type, number of stored elements)
oht_ary

<5x11 sparse matrix of type '<class 'numpy.bool_'>'
	with 26 stored elements in Compressed Sparse Row format>

In [15]:
# mine the sparse data frame: itemsets with support >= 0.6, reported by item name
min_support = 0.6
apriori(sparse_df, use_colnames=True, min_support=min_support)

Unnamed: 0,support,itemsets
0,0.8,(Eggs)
1,1.0,(Kidney Beans)
2,0.6,(Milk)
3,0.6,(Onion)
4,0.6,(Yogurt)
5,0.8,"(Eggs, Kidney Beans)"
6,0.6,"(Eggs, Onion)"
7,0.6,"(Milk, Kidney Beans)"
8,0.6,"(Onion, Kidney Beans)"
9,0.6,"(Kidney Beans, Yogurt)"


## does mlxtend work on sets?

### let's put the data in a data frame format that uses sets and has a TID column

In [16]:
# the same five transactions used in example 1, one inner list per transaction
# (the last transaction lists 'Onion' twice)
dataset = [
    ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
    ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
    ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs'],
]
dataset

[['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
 ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
 ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
 ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
 ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

In [17]:
# one record per transaction: its position in `dataset` as the transaction id
# (TID) and its items collapsed into a set, which drops repeated items
df_row_list = [
    {'TID': tid, 'Items': set(items)}
    for tid, items in enumerate(dataset)
]

trans_df = pd.DataFrame(df_row_list)
trans_df

Unnamed: 0,TID,Items
0,0,"{Milk, Kidney Beans, Yogurt, Eggs, Onion, Nutmeg}"
1,1,"{Kidney Beans, Yogurt, Eggs, Onion, Nutmeg, Dill}"
2,2,"{Milk, Eggs, Kidney Beans, Apple}"
3,3,"{Corn, Milk, Unicorn, Yogurt, Kidney Beans}"
4,4,"{Corn, Ice cream, Kidney Beans, Onion, Eggs}"


### now transform the data frame for use with the mlxtend library

In [18]:
# instantiate the transaction encoder
te = TransactionEncoder()

# encode the item sets and store the result in a pandas data frame
# (one boolean column per item, one row per transaction)
te_ary = te.fit(trans_df['Items']).transform(trans_df['Items'])
transf_trans_df = pd.DataFrame(te_ary, columns=te.columns_)

# carry over the real transaction ids as the first column instead of
# re-numbering the rows, so the ids stay correct even if trans_df does not
# use 0..n-1; .to_numpy() copies the values by position, ignoring the index
transf_trans_df.insert(0, 'TID', trans_df['TID'].to_numpy())

# display the encoded transaction data frame
transf_trans_df

Unnamed: 0,TID,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,0,False,False,False,True,False,True,True,True,True,False,True
1,1,False,False,True,True,False,True,False,True,True,False,True
2,2,True,False,False,True,False,True,True,False,False,False,False
3,3,False,True,False,False,False,True,True,False,False,True,True
4,4,False,True,False,True,True,True,False,False,True,False,False


In [19]:
# show the same table with every column cast to int (True/False become 1/0);
# the result is only displayed, transf_trans_df itself is not modified
transf_trans_df.astype(int)

Unnamed: 0,TID,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,0,0,0,0,1,0,1,1,1,1,0,1
1,1,0,0,1,1,0,1,0,1,1,0,1
2,2,1,0,0,1,0,1,1,0,0,0,0
3,3,0,1,0,0,0,1,1,0,0,1,1
4,4,0,1,0,1,1,1,0,0,1,0,0


### now use the mlxtend library

#### get the frequent itemsets

In [20]:
# parameter definitions:
# https://github.com/rasbt/mlxtend/blob/master/mlxtend/frequent_patterns/apriori.py

# TID is an identifier rather than an item, so it is dropped before mining
min_support = 0.7
freq_itemsets = apriori(
    transf_trans_df.drop(columns=['TID']),
    use_colnames=True,
    min_support=min_support,
)
freq_itemsets

Unnamed: 0,support,itemsets
0,0.8,(Eggs)
1,1.0,(Kidney Beans)
2,0.8,"(Eggs, Kidney Beans)"


#### get association rules

In [21]:
# for parameter definitions
# https://github.com/rasbt/mlxtend/blob/master/mlxtend/frequent_patterns/association_rules.py

# For usage examples see
# http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

In [22]:
# Rule generation is a common task in the mining of frequent patterns. An association rule is an implication expression
# of the form X→Y, where X and Y are disjoint itemsets. A more concrete example based on consumer behaviour would be 
# {Diapers}→{Beer} suggesting that people who buy diapers are also likely to buy beer. To evaluate the "interest" of 
# such an association rule, different metrics have been developed. The current implementation makes use of the
# confidence and lift metrics.

### generate association rules

In [23]:
# keep only the rules whose confidence is at least 0.8
min_threshold = 0.8
rules = association_rules(
    freq_itemsets,
    metric="confidence",
    min_threshold=min_threshold,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Eggs),(Kidney Beans),0.8,1.0,0.8,1.0,1.0,0.0,inf
1,(Kidney Beans),(Eggs),1.0,0.8,0.8,0.8,1.0,0.0,1.0
