# 연관규칙: 빈발 항목집합 - A-Priori Algorithm

In [1]:
!pip install mlxtend # Machine Learning Extension

Collecting mlxtend
  Downloading mlxtend-0.22.0-py2.py3-none-any.whl (1.4 MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.22.0


In [2]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth

##### transaction data

In [3]:
# Five toy market-basket transactions; each inner list holds the item names
# of one basket.
# NOTE(review): 'Onion' is listed twice in the last basket; the encoded
# matrix printed further down still has a single Onion column.
dataset = [
    ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
    ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
    ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs'],
]

In [4]:
# Sanity check: the raw transactions are held in a plain Python list.
print(type(dataset))

<class 'list'>


In [5]:
# Keep the TransactionEncoder instance in `te`: after fitting, it carries the
# learned item labels (te.columns_) that are needed to name the columns.
te = TransactionEncoder()
# Learn the distinct items and one-hot encode every basket in a single call.
te_ary = te.fit_transform(dataset)
print(type(te_ary), '\n')
print(te_ary)

# The encoded array has one boolean column per distinct item (11 in total).
# Column index -> item:
#    0 Apple         1 Corn      2 Dill      3 Eggs
#    4 Ice cream     5 Kidney Beans          6 Milk
#    7 Nutmeg        8 Onion     9 Unicorn  10 Yogurt

<class 'numpy.ndarray'> 

[[False False False  True False  True  True  True  True False  True]
 [False False  True  True False  True False  True  True False  True]
 [ True False False  True False  True  True False False False False]
 [False  True False False False  True  True False False  True  True]
 [False  True False  True  True  True False False  True False False]]


In [6]:
print(te_ary.astype("int")) # cast the boolean matrix to 0/1 integers for display

[[0 0 0 1 0 1 1 1 1 0 1]
 [0 0 1 1 0 1 0 1 1 0 1]
 [1 0 0 1 0 1 1 0 0 0 0]
 [0 1 0 0 0 1 1 0 0 1 1]
 [0 1 0 1 1 1 0 0 1 0 0]]


In [7]:
# Item labels learned by the encoder; they are used as the column names of
# the encoded matrix when the DataFrame is built.
print(te.columns_)

['Apple', 'Corn', 'Dill', 'Eggs', 'Ice cream', 'Kidney Beans', 'Milk', 'Nutmeg', 'Onion', 'Unicorn', 'Yogurt']


In [8]:
# Wrap the boolean matrix in a DataFrame so that every column is labelled
# with its item name; this is the table handed to the itemset miner.
df = pd.DataFrame(data=te_ary, columns=te.columns_)
print(type(df))
print(df)

<class 'pandas.core.frame.DataFrame'>
   Apple   Corn   Dill   Eggs  Ice cream  Kidney Beans   Milk  Nutmeg  Onion  \
0  False  False  False   True      False          True   True    True   True   
1  False  False   True   True      False          True  False    True   True   
2   True  False  False   True      False          True   True   False  False   
3  False   True  False  False      False          True   True   False  False   
4  False   True  False   True       True          True  False   False   True   

   Unicorn  Yogurt  
0    False    True  
1    False    True  
2    False   False  
3     True    True  
4    False   False  


### A-Priori Algorithm 적용하기

##### 지지도(support)가 0.6 이상인 빈발 항목집합을 찾아봅시다. 

In [9]:
# Mine every itemset whose support is at least 0.6 (i.e. present in at least
# 3 of the 5 transactions). use_colnames=True makes the result list item
# names instead of column positions.
frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)
# Alternative miners imported at the top of the notebook, kept for experimentation.
# NOTE(review): fpmax presumably returns only maximal itemsets, so the
# association_rules calls further down may need support_only=True with it
# — confirm against the mlxtend documentation before switching.
#frequent_itemsets = fpgrowth(df, min_support=0.6, use_colnames=True)
#frequent_itemsets = fpmax(df, min_support=0.6, use_colnames=True)

In [10]:
# Display the mined itemsets together with their support values.
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.8,(Eggs)
1,1.0,(Kidney Beans)
2,0.6,(Milk)
3,0.6,(Onion)
4,0.6,(Yogurt)
5,0.8,"(Kidney Beans, Eggs)"
6,0.6,"(Onion, Eggs)"
7,0.6,"(Milk, Kidney Beans)"
8,0.6,"(Onion, Kidney Beans)"
9,0.6,"(Yogurt, Kidney Beans)"


In [11]:
from mlxtend.frequent_patterns import association_rules

##### 신뢰도(confidence)가 0.7 이상인 연관규칙을 찾아봅시다. 

In [12]:
# Derive association rules from the frequent itemsets and keep those whose
# confidence reaches the 0.7 threshold; the result is only displayed, not stored.
association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Kidney Beans),(Eggs),1.0,0.8,0.8,0.8,1.0,0.0,1.0,0.0
1,(Eggs),(Kidney Beans),0.8,1.0,0.8,1.0,1.0,0.0,inf,0.0
2,(Onion),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5
3,(Eggs),(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6,1.0
4,(Milk),(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf,0.0
5,(Onion),(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf,0.0
6,(Yogurt),(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf,0.0
7,"(Onion, Kidney Beans)",(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5
8,"(Onion, Eggs)",(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf,0.0
9,"(Kidney Beans, Eggs)",(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6,1.0


##### 향상도(lift)가 1.2 이상인 연관규칙을 찾아봅시다.

In [13]:
# Same rule generation, but filtered on lift (threshold 1.2) and stored in
# `rules` so that the following cells can extend and query the table.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Onion),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5
1,(Eggs),(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6,1.0
2,"(Onion, Kidney Beans)",(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5
3,"(Kidney Beans, Eggs)",(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6,1.0
4,(Onion),"(Kidney Beans, Eggs)",0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5
5,(Eggs),"(Onion, Kidney Beans)",0.8,0.6,0.6,0.75,1.25,0.12,1.6,1.0


In [14]:
# Add a helper column holding the number of items in each rule's antecedent,
# so that rules can be filtered by antecedent size afterwards.
# `len` is passed to apply() directly; wrapping it in a lambda adds nothing.
rules["antecedent_len"] = rules["antecedents"].apply(len)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,antecedent_len
0,(Onion),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5,1
1,(Eggs),(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6,1.0,1
2,"(Onion, Kidney Beans)",(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5,2
3,"(Kidney Beans, Eggs)",(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6,1.0,2
4,(Onion),"(Kidney Beans, Eggs)",0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5,1
5,(Eggs),"(Onion, Kidney Beans)",0.8,0.6,0.6,0.75,1.25,0.12,1.6,1.0,1


##### 조건부(antecedents)에 두 개 이상의 항목이 있고, 신뢰도(confidence)가 0.75보다 크고, 향상도(lift)가 1.2보다 큰 연관규칙을 찾아봅시다.

In [15]:
# Keep only the rules whose antecedent has at least two items and whose
# confidence and lift are strictly greater than 0.75 and 1.2 respectively.
rules[
    rules['antecedent_len'].ge(2)
    & rules['confidence'].gt(0.75)
    & rules['lift'].gt(1.2)
]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,antecedent_len
2,"(Onion, Kidney Beans)",(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5,2


##### 연관규칙 테이블의 칼럼명 'antecedents'의 값이 {'Eggs', 'Kidney Beans'}인 연관규칙을 찾아봅시다.

In [16]:
# Select the rule(s) whose antecedent is exactly the item set
# {'Eggs', 'Kidney Beans'}; the match is by set equality, so the order in
# which the items are written does not matter.
rules[rules['antecedents'] == {'Eggs', 'Kidney Beans'}]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,antecedent_len
3,"(Kidney Beans, Eggs)",(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6,1.0,2
