In [None]:
!pip install --upgrade mlxtend

In [1]:
import mlxtend
import numpy as np
import pandas as pd

In [2]:
data = np.array([
                 ['우유', '기저귀', '쥬스'], # 0번 거래
                 ['양상추', '기저귀', '맥주'], # 1번 거래
                 ['우유', '양상추', '기저귀', '맥주'], # 2번 거래
                 ['양상추', '맥주'] # 3번 거래
], dtype = 'object')

# **Apriori**

TransactionEncoder는 fit한 데이터에 대해 사전순으로 데이터 유니크 값을 0부터 순서대로 맵핑한다.<br>

이후 transform을 진행하면 맵핑한 순서대로 컬럼을 만들며, transform에 사용된 데이터를 확인하여 컬럼에 해당하는 값이 있으면 True 아니면 False를 반환하여 row는 transform에 사용된 데이터의 row, column은 fit에 사용된 데이터의 유니크한 값의 수를 갖는 행렬(Sparse Matrix: 희소행렬)을 만든다.<br>

이때, fit에 사용한 데이터에 존재하지 않는 유니크값은 tranform할 때 에러가 발생하니 주의.

In [3]:
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
te_arr = te.fit_transform(data)
te.fit(data).columns_mapping_, te_arr

({'기저귀': 0, '맥주': 1, '양상추': 2, '우유': 3, '쥬스': 4},
 array([[ True, False, False,  True,  True],
        [ True,  True,  True, False, False],
        [ True,  True,  True,  True, False],
        [False,  True,  True, False, False]]))

In [4]:
te_arr_df = pd.DataFrame(te_arr, columns=te.columns_)
te_arr_df

Unnamed: 0,기저귀,맥주,양상추,우유,쥬스
0,True,False,False,True,True
1,True,True,True,False,False
2,True,True,True,True,False
3,False,True,True,False,False


In [52]:
%%time
from mlxtend.frequent_patterns import apriori
apri = apriori(te_arr_df, min_support=0.5, use_colnames=True, verbose=1)
apri = apri.sort_values(by=['support', 'itemsets'], ascending=False)
apri.reset_index(drop = True, inplace = True)

Processing 12 combinations | Sampling itemset size 2Processing 12 combinations | Sampling itemset size 3
CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 14.2 ms


In [53]:
apri

Unnamed: 0,support,itemsets
0,0.75,"(양상추, 맥주)"
1,0.75,(양상추)
2,0.75,(맥주)
3,0.75,(기저귀)
4,0.5,"(맥주, 기저귀, 양상추)"
5,0.5,"(우유, 기저귀)"
6,0.5,"(양상추, 기저귀)"
7,0.5,"(맥주, 기저귀)"
8,0.5,(우유)


# **FP-Growth**

In [54]:
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
te_fit = te.fit(data)
te_arr = te_fit.transform(data)
te_arr_df = pd.DataFrame(te_arr, columns=te_fit.columns_)
te_arr_df

Unnamed: 0,기저귀,맥주,양상추,우유,쥬스
0,True,False,False,True,True
1,True,True,True,False,False
2,True,True,True,True,False
3,False,True,True,False,False


In [55]:
%%time
from mlxtend.frequent_patterns import fpgrowth
fpgrow = fpgrowth(te_arr_df, min_support = .5, use_colnames=True, verbose=1)

4 itemset(s) from tree conditioned on items ()
3 itemset(s) from tree conditioned on items (기저귀)
1 itemset(s) from tree conditioned on items (우유)
0 itemset(s) from tree conditioned on items (양상추)
1 itemset(s) from tree conditioned on items (맥주)
CPU times: user 3.51 ms, sys: 33 µs, total: 3.54 ms
Wall time: 3.41 ms


In [56]:
fpgrow = fpgrow.sort_values(by=['support', 'itemsets'], ascending=False)
fpgrow.reset_index(drop = True, inplace = True)
fpgrow

Unnamed: 0,support,itemsets
0,0.75,"(양상추, 맥주)"
1,0.75,(맥주)
2,0.75,(양상추)
3,0.75,(기저귀)
4,0.5,"(우유, 기저귀)"
5,0.5,"(맥주, 기저귀, 양상추)"
6,0.5,"(양상추, 기저귀)"
7,0.5,"(맥주, 기저귀)"
8,0.5,(우유)


In [57]:
from mlxtend.frequent_patterns import association_rules
association_rules(fpgrow, metric="confidence", min_threshold=.5, support_only=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(양상추),(맥주),0.75,0.75,0.75,1.0,1.333333,0.1875,inf
1,(맥주),(양상추),0.75,0.75,0.75,1.0,1.333333,0.1875,inf
2,(우유),(기저귀),0.5,0.75,0.5,1.0,1.333333,0.125,inf
3,(기저귀),(우유),0.75,0.5,0.5,0.666667,1.333333,0.125,1.5
4,"(기저귀, 맥주)",(양상추),0.5,0.75,0.5,1.0,1.333333,0.125,inf
5,"(양상추, 맥주)",(기저귀),0.75,0.75,0.5,0.666667,0.888889,-0.0625,0.75
6,"(양상추, 기저귀)",(맥주),0.5,0.75,0.5,1.0,1.333333,0.125,inf
7,(맥주),"(양상추, 기저귀)",0.75,0.5,0.5,0.666667,1.333333,0.125,1.5
8,(기저귀),"(양상추, 맥주)",0.75,0.75,0.5,0.666667,0.888889,-0.0625,0.75
9,(양상추),"(기저귀, 맥주)",0.75,0.5,0.5,0.666667,1.333333,0.125,1.5
