In [35]:
# mlxtend 모듈을 이용해 분석

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

In [36]:
dataset=[['사과','치즈','생수'],
         ['생수','호두','치즈','고등어'],
         ['수박','사과','생수'],
         ['생수','호두','치즈','옥수수']]

In [37]:
tren = TransactionEncoder()
tren_ary = tren.fit(dataset).transform(dataset)
print(tren.columns_)

['고등어', '사과', '생수', '수박', '옥수수', '치즈', '호두']


In [38]:
print(tren_ary)

[[False  True  True False False  True False]
 [ True False  True False False  True  True]
 [False  True  True  True False False False]
 [False False  True False  True  True  True]]


In [39]:
df = pd.DataFrame(tren_ary, columns = tren.columns_)
df

Unnamed: 0,고등어,사과,생수,수박,옥수수,치즈,호두
0,False,True,True,False,False,True,False
1,True,False,True,False,False,True,True
2,False,True,True,True,False,False,False
3,False,False,True,False,True,True,True


## apriori: 지지도 계산

In [40]:
print(tren.columns_)
fre_items = apriori(df, min_support=0.01)
fre_items

['고등어', '사과', '생수', '수박', '옥수수', '치즈', '호두']


Unnamed: 0,support,itemsets
0,0.25,(0)
1,0.5,(1)
2,1.0,(2)
3,0.25,(3)
4,0.25,(4)
5,0.75,(5)
6,0.5,(6)
7,0.25,"(0, 2)"
8,0.25,"(0, 5)"
9,0.25,"(0, 6)"


In [41]:
fre_items = apriori(df, min_support=0.01, use_colnames=True)
fre_items

Unnamed: 0,support,itemsets
0,0.25,(고등어)
1,0.5,(사과)
2,1.0,(생수)
3,0.25,(수박)
4,0.25,(옥수수)
5,0.75,(치즈)
6,0.5,(호두)
7,0.25,"(생수, 고등어)"
8,0.25,"(고등어, 치즈)"
9,0.25,"(고등어, 호두)"


#### 람다식(lambda)
- lambda 매개변수: 표현식
- df.apply() 함수와 같이 사용
- map(함수, 리스트)

In [42]:
def ss(a, b):
    return a + b

In [43]:
ss(5,2)

7

In [44]:
# 람다 표현
(lambda a, b: a + b)(5, 2)

7

In [45]:
list(map(lambda x: x**2, range(5)))

[0, 1, 4, 9, 16]

In [46]:
# df.apply()을 이용해 fre_items에 "length" 필드 추가
fre_items['length'] = fre_items['itemsets'].apply(lambda x: len(x))
fre_items.tail()

Unnamed: 0,support,itemsets,length
26,0.25,"(생수, 호두, 옥수수)",3
27,0.5,"(생수, 치즈, 호두)",3
28,0.25,"(치즈, 호두, 옥수수)",3
29,0.25,"(생수, 호두, 고등어, 치즈)",4
30,0.25,"(생수, 치즈, 호두, 옥수수)",4


In [47]:
## support가 0.3이상, length가 2 이상인 데이터만 추출 => fre_items에 저장
# 결과값은 support 내림차순 정렬
fre_items = fre_items[(fre_items['support'] >= 0.3) & (fre_items['length'] >= 2)]
fre_items.sort_values(by = 'support', ascending = False)
fre_items


Unnamed: 0,support,itemsets,length
10,0.5,"(사과, 생수)",2
15,0.75,"(생수, 치즈)",2
16,0.5,"(생수, 호두)",2
19,0.5,"(치즈, 호두)",2
27,0.5,"(생수, 치즈, 호두)",3


In [49]:
# apriori 함수를 이용한 지지도/신뢰도/향상도 확인
from mlxtend.frequent_patterns import association_rules

fre_items=apriori(df, min_support = 0.3)
association_rules(fre_items, min_threshold = 0.3)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(1),(2),0.5,1.0,0.5,1.0,1.0,0.0,inf
1,(2),(1),1.0,0.5,0.5,0.5,1.0,0.0,1.0
2,(2),(5),1.0,0.75,0.75,0.75,1.0,0.0,1.0
3,(5),(2),0.75,1.0,0.75,1.0,1.0,0.0,inf
4,(2),(6),1.0,0.5,0.5,0.5,1.0,0.0,1.0
5,(6),(2),0.5,1.0,0.5,1.0,1.0,0.0,inf
6,(5),(6),0.75,0.5,0.5,0.666667,1.333333,0.125,1.5
7,(6),(5),0.5,0.75,0.5,1.0,1.333333,0.125,inf
8,"(2, 5)",(6),0.75,0.5,0.5,0.666667,1.333333,0.125,1.5
9,"(2, 6)",(5),0.5,0.75,0.5,1.0,1.333333,0.125,inf


In [50]:
association_rules(fre_items, metric="lift",min_threshold=1.3)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(5),(6),0.75,0.5,0.5,0.666667,1.333333,0.125,1.5
1,(6),(5),0.5,0.75,0.5,1.0,1.333333,0.125,inf
2,"(2, 5)",(6),0.75,0.5,0.5,0.666667,1.333333,0.125,1.5
3,"(2, 6)",(5),0.5,0.75,0.5,1.0,1.333333,0.125,inf
4,(5),"(2, 6)",0.75,0.5,0.5,0.666667,1.333333,0.125,1.5
5,(6),"(2, 5)",0.5,0.75,0.5,1.0,1.333333,0.125,inf
