# Apriori: Association Rule Mining, ARM

mlxtend (machine learning extensions)
First, install it: pip install mlxtend


In [1]:
import pandas as pd
import numpy as np
import mlxtend
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

transaction data
Onion洋葱   Nutmeg肉豆蔻   Kidney Beans芸豆    Yogurt酸奶   Dill莳萝  Unicorn独角兽  Corn玉米

In [2]:
# Five market-basket transactions; each inner list holds the item names of one basket.
dataset = [
    ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
    ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
    # 'Onion' is listed twice in this basket; the boolean encoding produced
    # later only records presence, so the repeat does not change the result.
    ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs'],
]

用 TransactionEncoder 转换为 Array 格式
apriori function 要求 data 使用 pandas DataFrame 格式；如果 raw data 是其他格式，
则需要用 TransactionEncoder 的 fit 和 transform 转换。TransactionEncoder 可以从 dataset 中学习 unique items，并将每一个 transaction（Python 的 list）转变成独热编码布尔数组
（one-hot encoded boolean NumPy array）。

In [3]:
# Learn the set of distinct items and one-hot encode every transaction
# into a boolean array in a single call.
te = TransactionEncoder()
dataset_te = te.fit_transform(dataset)

In [4]:
dataset_te

array([[False, False, False,  True, False,  True,  True,  True,  True,
        False,  True],
       [False, False,  True,  True, False,  True, False,  True,  True,
        False,  True],
       [ True, False, False,  True, False,  True,  True, False, False,
        False, False],
       [False,  True, False, False, False,  True,  True, False, False,
         True,  True],
       [False,  True, False,  True,  True,  True, False, False,  True,
        False, False]])

transform 完的 dataset 变成 NumPy array 格式，只有 True 和 False (boolean)，在处理大型 datasets 时更有效率。

In [5]:
te.columns_

['Apple',
 'Corn',
 'Dill',
 'Eggs',
 'Ice cream',
 'Kidney Beans',
 'Milk',
 'Nutmeg',
 'Onion',
 'Unicorn',
 'Yogurt']

In [6]:
# Wrap the boolean array in a DataFrame whose columns are the item names
# learned by the encoder.
dataset_df = pd.DataFrame(data=dataset_te, columns=te.columns_)

还原：假如我们想将 one-hot encoded array 还原为 transaction list，则可以用 inverse_transform function：



In [7]:
# Take the first three encoded rows and decode them back into lists of item names.
first3 = dataset_te[0:3]
te.inverse_transform(first3)

[['Eggs', 'Kidney Beans', 'Milk', 'Nutmeg', 'Onion', 'Yogurt'],
 ['Dill', 'Eggs', 'Kidney Beans', 'Nutmeg', 'Onion', 'Yogurt'],
 ['Apple', 'Eggs', 'Kidney Beans', 'Milk']]

apriori进行频繁项集挖掘

In [8]:
# Mine the itemsets that meet the 0.6 support threshold on the encoded frame;
# use_colnames=True reports items by their column names.
frequent_itemsets = apriori(
    dataset_df,
    min_support=0.6,
    use_colnames=True,
)

In [9]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.8,(Eggs)
1,1.0,(Kidney Beans)
2,0.6,(Milk)
3,0.6,(Onion)
4,0.6,(Yogurt)
5,0.8,"(Kidney Beans, Eggs)"
6,0.6,"(Onion, Eggs)"
7,0.6,"(Kidney Beans, Milk)"
8,0.6,"(Kidney Beans, Onion)"
9,0.6,"(Kidney Beans, Yogurt)"


In [10]:
# Record how many items each frequent itemset contains, then display the table.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.8,(Eggs),1
1,1.0,(Kidney Beans),1
2,0.6,(Milk),1
3,0.6,(Onion),1
4,0.6,(Yogurt),1
5,0.8,"(Kidney Beans, Eggs)",2
6,0.6,"(Onion, Eggs)",2
7,0.6,"(Kidney Beans, Milk)",2
8,0.6,"(Kidney Beans, Onion)",2
9,0.6,"(Kidney Beans, Yogurt)",2


In [11]:
# Show only the two-item itemsets whose support is at least 0.8.
is_pair = frequent_itemsets['length'] == 2
is_common = frequent_itemsets['support'] >= 0.8
frequent_itemsets[is_pair & is_common]

Unnamed: 0,support,itemsets,length
5,0.8,"(Kidney Beans, Eggs)",2


In [None]:
%timeit  apriori(df,min_support=0.6)

从frequent_itemsets中产生关联规则

support(A→C)=support(A∪C),range: [0,1]
confidence(A→C)=support(A→C)/support(A),range: [0,1]
lift(A→C)=confidence(A→C)/support(C),range: [0,∞]
leverage(A→C)=support(A→C)−support(A)×support(C),range: [−1,1]
conviction(A→C)=(1−support(C))/(1−confidence(A→C)),range: [0,∞]

In [12]:
# Derive association rules from the frequent itemsets, using confidence
# as the selection metric with a threshold of 0.7.
res = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.7,
)

In [13]:
res

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Kidney Beans),(Eggs),1.0,0.8,0.8,0.8,1.0,0.0,1.0
1,(Eggs),(Kidney Beans),0.8,1.0,0.8,1.0,1.0,0.0,inf
2,(Onion),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
3,(Eggs),(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6
4,(Milk),(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
5,(Onion),(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
6,(Yogurt),(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
7,"(Kidney Beans, Onion)",(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
8,"(Kidney Beans, Eggs)",(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6
9,"(Onion, Eggs)",(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf


In [14]:
# Same rule generation, but selecting on lift with a threshold of 1.2.
res0 = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1.2,
)

In [15]:
res0

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Onion),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
1,(Eggs),(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6
2,"(Kidney Beans, Onion)",(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
3,"(Kidney Beans, Eggs)",(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6
4,(Onion),"(Kidney Beans, Eggs)",0.6,0.8,0.6,1.0,1.25,0.12,inf
5,(Eggs),"(Kidney Beans, Onion)",0.8,0.6,0.6,0.75,1.25,0.12,1.6


In [16]:
# Narrow the rule table to the rule itself plus its three headline metrics.
summary_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
res1 = res[summary_columns]

In [17]:
res1

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(Kidney Beans),(Eggs),0.8,0.8,1.0
1,(Eggs),(Kidney Beans),0.8,1.0,1.0
2,(Onion),(Eggs),0.6,1.0,1.25
3,(Eggs),(Onion),0.6,0.75,1.25
4,(Milk),(Kidney Beans),0.6,1.0,1.0
5,(Onion),(Kidney Beans),0.6,1.0,1.0
6,(Yogurt),(Kidney Beans),0.6,1.0,1.0
7,"(Kidney Beans, Onion)",(Eggs),0.6,1.0,1.25
8,"(Kidney Beans, Eggs)",(Onion),0.6,0.75,1.25
9,"(Onion, Eggs)",(Kidney Beans),0.6,1.0,1.0


In [18]:
# Retain only the rules whose confidence is at least 0.9.
res2 = res1.loc[res1['confidence'] >= 0.9]

In [19]:
res2

Unnamed: 0,antecedents,consequents,support,confidence,lift
1,(Eggs),(Kidney Beans),0.8,1.0,1.0
2,(Onion),(Eggs),0.6,1.0,1.25
4,(Milk),(Kidney Beans),0.6,1.0,1.0
5,(Onion),(Kidney Beans),0.6,1.0,1.0
6,(Yogurt),(Kidney Beans),0.6,1.0,1.0
7,"(Kidney Beans, Onion)",(Eggs),0.6,1.0,1.25
9,"(Onion, Eggs)",(Kidney Beans),0.6,1.0,1.0
10,(Onion),"(Kidney Beans, Eggs)",0.6,1.0,1.25


# example from Jiawei Han 例5-3

In [20]:
# Nine transactions over the items I1..I5; each inner list is one transaction.
transactions = [
    ['I1', 'I2', 'I5'],
    ['I2', 'I4'],
    ['I2', 'I3'],
    ['I1', 'I2', 'I4'],
    ['I1', 'I3'],
    ['I2', 'I3'],
    ['I1', 'I3'],
    ['I1', 'I2', 'I3', 'I5'],
    ['I1', 'I2', 'I3'],
]

In [21]:
# Fit a fresh encoder on the transactions, one-hot encode them, and wrap
# the boolean array in a DataFrame with one column per item.
te = TransactionEncoder().fit(transactions)
transactions_te = te.transform(transactions)
transactions_df = pd.DataFrame(data=transactions_te, columns=te.columns_)

In [22]:
transactions_df

Unnamed: 0,I1,I2,I3,I4,I5
0,True,True,False,False,True
1,False,True,False,True,False
2,False,True,True,False,False
3,True,True,False,True,False
4,True,False,True,False,False
5,False,True,True,False,False
6,True,False,True,False,False
7,True,True,True,False,True
8,True,True,True,False,False


In [28]:
# min_support=0.22 sits just below 2/9, so an itemset has to occur in at
# least 2 of the 9 transactions. (An earlier trial used min_support=0.6.)
# use_colnames=True labels itemsets with the item names; the default (False)
# uses column indices instead.
frequent_itemsets = apriori(
    transactions_df,
    min_support=0.22,
    use_colnames=True,
)

In [29]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.666667,(I1)
1,0.777778,(I2)
2,0.666667,(I3)
3,0.222222,(I4)
4,0.222222,(I5)
5,0.444444,"(I1, I2)"
6,0.444444,"(I3, I1)"
7,0.222222,"(I5, I1)"
8,0.444444,"(I3, I2)"
9,0.222222,"(I4, I2)"


In [30]:
# Add the size of each frequent itemset as a column, then display the table.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.666667,(I1),1
1,0.777778,(I2),1
2,0.666667,(I3),1
3,0.222222,(I4),1
4,0.222222,(I5),1
5,0.444444,"(I1, I2)",2
6,0.444444,"(I3, I1)",2
7,0.222222,"(I5, I1)",2
8,0.444444,"(I3, I2)",2
9,0.222222,"(I4, I2)",2


In [32]:
# Show the itemsets with more than two items whose support is at least 0.22.
has_many_items = frequent_itemsets['length'] > 2
meets_support = frequent_itemsets['support'] >= 0.22
frequent_itemsets[has_many_items & meets_support]


Unnamed: 0,support,itemsets,length
11,0.222222,"(I3, I1, I2)",3
12,0.222222,"(I5, I1, I2)",3


In [33]:
# Generate rules selected on confidence with a threshold of 0.7.
# Alternatives tried earlier and left disabled:
#   - metric="confidence", min_threshold=0.5, support_only=True
#   - support_only=True, min_threshold=0.1
res = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.7,
)

In [34]:
res

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(I5),(I1),0.222222,0.666667,0.222222,1.0,1.5,0.074074,inf
1,(I4),(I2),0.222222,0.777778,0.222222,1.0,1.285714,0.049383,inf
2,(I5),(I2),0.222222,0.777778,0.222222,1.0,1.285714,0.049383,inf
3,"(I5, I1)",(I2),0.222222,0.777778,0.222222,1.0,1.285714,0.049383,inf
4,"(I5, I2)",(I1),0.222222,0.666667,0.222222,1.0,1.5,0.074074,inf
5,(I5),"(I1, I2)",0.222222,0.444444,0.222222,1.0,2.25,0.123457,inf


In [35]:
# Keep only the rule sides and the support / confidence / lift columns.
res1 = res.loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']]


In [36]:
res1

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(I5),(I1),0.222222,1.0,1.5
1,(I4),(I2),0.222222,1.0,1.285714
2,(I5),(I2),0.222222,1.0,1.285714
3,"(I5, I1)",(I2),0.222222,1.0,1.285714
4,"(I5, I2)",(I1),0.222222,1.0,1.5
5,(I5),"(I1, I2)",0.222222,1.0,2.25


In [37]:
# Keep the rules whose lift is at least 1.
res2 = res1.loc[res1['lift'] >= 1]

In [38]:
res2

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(I5),(I1),0.222222,1.0,1.5
1,(I4),(I2),0.222222,1.0,1.285714
2,(I5),(I2),0.222222,1.0,1.285714
3,"(I5, I1)",(I2),0.222222,1.0,1.285714
4,"(I5, I2)",(I1),0.222222,1.0,1.5
5,(I5),"(I1, I2)",0.222222,1.0,2.25
