### apriori 함수에 들어갈 트랜잭션 데이터를 만드는 함수

> from mlxtend.preprocessing import TransactionEncoder<br>
> te = TransactionEncoder()<br>
> te_result = te.fit(dataset).transform(dataset)

<br>


In [1]:
# # ## 설치 (시험장 설치버전 : 0.17.0)
# !conda config --add channels conda-forge
# !conda install mlxtend==0.17.0
# !pip install mlxtend==0.17.0

<br><b>[예제]</b>

dataset을 트랜잭션(거래) 형태로 변경하고 결과를 확인하라

In [2]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

# Eight toy baskets; each inner list is one transaction.
dataset = [
    ['Apple', 'Beer', 'Rice', 'Chicken'],
    ['Apple', 'Beer', 'Rice'],
    ['Apple', 'Beer'],
    ['Apple', 'Bananas'],
    ['Milk', 'Beer', 'Rice', 'Chicken'],
    ['Milk', 'Beer', 'Rice'],
    ['Milk', 'Beer'],
    ['Apple', 'Bananas'],
]

# Learn the item vocabulary, then one-hot encode every transaction
# into a boolean array (one column per item).
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)

# columns_ holds the item names used as column labels for te_ary.
print(te.columns_)
te_ary

['Apple', 'Bananas', 'Beer', 'Chicken', 'Milk', 'Rice']


array([[ True, False,  True,  True, False,  True],
       [ True, False,  True, False, False,  True],
       [ True, False,  True, False, False, False],
       [ True,  True, False, False, False, False],
       [False, False,  True,  True,  True,  True],
       [False, False,  True, False,  True,  True],
       [False, False,  True, False,  True, False],
       [ True,  True, False, False, False, False]])

In [3]:
# Wrap the boolean array in a DataFrame labelled with the encoder's item names.
df = pd.DataFrame(data=te_ary, columns=te.columns_)
df

Unnamed: 0,Apple,Bananas,Beer,Chicken,Milk,Rice
0,True,False,True,True,False,True
1,True,False,True,False,False,True
2,True,False,True,False,False,False
3,True,True,False,False,False,False
4,False,False,True,True,True,True
5,False,False,True,False,True,True
6,False,False,True,False,True,False
7,True,True,False,False,False,False


<br><br>

###### mlxtend의 apriori 함수

One-Hot 형식의 DataFrame에서 빈번 항목 집합을 출력하는 함수

> from mlxtend.frequent_patterns import apriori<br>
> apriori(df)

- df : <span style='color: #CD5C5C'>값이 0/1 또는 True / False인 pandas DataFrame</span>
- min_support : 최소 지지도 (default = 0.5)
- use_colnames : True이면 반환되는 DataFrame의 itemsets에 열 인덱스 대신 열 이름을 사용 (default = False)
- max_len : 생성 된 항목 세트의 최대 길이로 None인 경우 가능한 모든 항목 세트 길이를 평가 (default = None)
- low_memory : 메모리 리소스가 제한된 경우에 사용할 수 있으나 속도가 기존의 3~6배 느림 (default = False)



In [4]:
## Keep only the itemsets whose support is at least 60%

from mlxtend.frequent_patterns import apriori

apriori(df, min_support=0.6)

Unnamed: 0,support,itemsets
0,0.625,(0)
1,0.75,(2)


In [5]:
# Same threshold, but itemsets are reported with column names instead of column indices.
apriori(df, min_support=0.6, use_colnames=True)

Unnamed: 0,support,itemsets
0,0.625,(Apple)
1,0.75,(Beer)


In [6]:
## Add a column holding the number of items in each frequent itemset

frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True)
# Each itemset is a frozenset, so the built-in len can be applied directly
# (no wrapping lambda needed).
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.625,(Apple),1
1,0.75,(Beer),1
2,0.375,(Milk),1
3,0.5,(Rice),1
4,0.375,"(Beer, Apple)",2
5,0.375,"(Beer, Milk)",2
6,0.5,"(Beer, Rice)",2


In [7]:
# Inspect the first itemset; entries of the 'itemsets' column are frozensets.
frequent_itemsets['itemsets'][0]

frozenset({'Apple'})

In [8]:
# Number of items in the first itemset.
len(frequent_itemsets['itemsets'][0])

1

<br>

<b>[예제]</b>

Groceries 데이터셋으로 연관규칙분석을 실시하라

In [9]:
# Load the groceries transactions: one comma-separated basket per row, in a single column.
# NOTE(review): read_csv treats the file's first line as the header here, so the first
# transaction becomes the column name (see the output below) instead of a data row.
# Passing header=None would likely avoid that, but the following cells depend on the
# current column name — confirm before changing.
df= pd.read_csv('./data/groceries.csv')
df

Unnamed: 0,"citrus fruit,semi-finished bread,margarine,ready soups"
0,"tropical fruit,yogurt,coffee"
1,whole milk
2,"pip fruit,yogurt,cream cheese,meat spreads"
3,"other vegetables,whole milk,condensed milk,lon..."
4,"whole milk,butter,yogurt,rice,abrasive cleaner"
...,...
9829,"sausage,chicken,beef,hamburger meat,citrus fru..."
9830,cooking chocolate
9831,"chicken,citrus fruit,other vegetables,butter,y..."
9832,"semi-finished bread,bottled water,soda,bottled..."


In [10]:
# read_csv consumed the first transaction as the header, so it lives in the column
# name rather than in the data. Put it back as an extra row at the end.
first_transaction = df.columns[0]
# Derive the new row label from the frame (default 0..N-1 index from read_csv) instead
# of hard-coding it, and skip the append when the row is already present so that
# re-running this cell does not add a duplicate.
if df.empty or df.iloc[-1, 0] != first_transaction:
    df.loc[len(df), first_transaction] = first_transaction
df

Unnamed: 0,"citrus fruit,semi-finished bread,margarine,ready soups"
0,"tropical fruit,yogurt,coffee"
1,whole milk
2,"pip fruit,yogurt,cream cheese,meat spreads"
3,"other vegetables,whole milk,condensed milk,lon..."
4,"whole milk,butter,yogurt,rice,abrasive cleaner"
...,...
9830,cooking chocolate
9831,"chicken,citrus fruit,other vegetables,butter,y..."
9832,"semi-finished bread,bottled water,soda,bottled..."
9833,"chicken,tropical fruit,other vegetables,vinega..."


In [11]:
# Split each comma-separated basket into one item per column;
# shorter baskets are padded with missing values.
df_split = df.iloc[:, 0].str.split(pat=',', expand=True)
df_split

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,tropical fruit,yogurt,coffee,,,,,,,,...,,,,,,,,,,
1,whole milk,,,,,,,,,,...,,,,,,,,,,
2,pip fruit,yogurt,cream cheese,meat spreads,,,,,,,...,,,,,,,,,,
3,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,,...,,,,,,,,,,
4,whole milk,butter,yogurt,rice,abrasive cleaner,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9830,cooking chocolate,,,,,,,,,,...,,,,,,,,,,
9831,chicken,citrus fruit,other vegetables,butter,yogurt,frozen dessert,domestic eggs,rolls/buns,rum,cling film/bags,...,,,,,,,,,,
9832,semi-finished bread,bottled water,soda,bottled beer,,,,,,,...,,,,,,,,,,
9833,chicken,tropical fruit,other vegetables,vinegar,shopping bags,,,,,,...,,,,,,,,,,


In [12]:
# View the split table as an object array; the padding appears as None.
df_split.values

array([['tropical fruit', 'yogurt', 'coffee', ..., None, None, None],
       ['whole milk', None, None, ..., None, None, None],
       ['pip fruit', 'yogurt', 'cream cheese', ..., None, None, None],
       ...,
       ['semi-finished bread', 'bottled water', 'soda', ..., None, None,
        None],
       ['chicken', 'tropical fruit', 'other vegetables', ..., None, None,
        None],
       ['citrus fruit', 'semi-finished bread', 'margarine', ..., None,
        None, None]], dtype=object)

In [13]:
# Keep the underlying array so the rows can be processed one by one below.
df_split_ary = df_split.values

In [14]:
# Keep only the truthy entries of the first row, i.e. drop the None padding.
[item for item in df_split_ary[0] if item]

['tropical fruit', 'yogurt', 'coffee']

In [15]:
# Build the list of transactions: one list of item names per row,
# with the None padding (falsy entries) removed.
groceries = [list(filter(None, row)) for row in df_split_ary]

In [None]:
# Inspect the resulting list of transactions.
groceries

In [17]:
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd

# One-hot encode the grocery transactions and label the columns with the item names.
te = TransactionEncoder()
groceries_tr = pd.DataFrame(te.fit_transform(groceries), columns=te.columns_)
groceries_tr

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,baby food,bags,baking powder,bathroom cleaner,beef,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9830,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9831,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
9832,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9833,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False


In [18]:
from mlxtend.frequent_patterns import apriori

# Find the frequent itemsets whose support is at least 1% (min_support=0.01)
groceries_ap = apriori(groceries_tr, min_support=0.01, use_colnames=True)
groceries_ap.head(30)

Unnamed: 0,support,itemsets
0,0.033452,(UHT-milk)
1,0.017692,(baking powder)
2,0.052466,(beef)
3,0.033249,(berries)
4,0.026029,(beverages)
5,0.080529,(bottled beer)
6,0.110524,(bottled water)
7,0.06487,(brown bread)
8,0.055414,(butter)
9,0.027961,(butter milk)


<br>

###### mlxtend의 association_rules 함수

'score(점수)', 'confidence(신뢰도)' 및 'Lift(상승도)'를 포함하는 연관 규칙의 DataFrame 생성

> from mlxtend.frequent_patterns import association_rules<br>
> association_rules(df, metric='confidence', min_threshold=0.8, support_only=False)

- df : ['support', 'itemsets'] 열이 있는 빈번 항목 집합의 DataFrame
- metric : <span style='color: #CD5C5C'>{'support', 'confidence', 'lift', 'leverage', 'conviction'}</span> (default: 'confidence')<br>
&emsp; &emsp; &emsp; support_only = True 인 경우 자동으로 'support'로 설정됨
- min_threshold : metric에 지정된 rule의 최솟값 (default: 0.8)
- support_only : support만 계산하고 다른 메트릭 열을 NaN으로 출력

<br>

In [19]:
from mlxtend.frequent_patterns import association_rules

# Generate the association rules whose confidence is at least 0.3
association_rules(groceries_ap, metric="confidence", min_threshold=0.3)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(beef),(other vegetables),0.052466,0.193493,0.019725,0.375969,1.943066,0.009574,1.292416
1,(beef),(root vegetables),0.052466,0.108998,0.017387,0.331395,3.040367,0.011668,1.332628
2,(beef),(whole milk),0.052466,0.255516,0.021251,0.405039,1.585180,0.007845,1.251315
3,(berries),(other vegetables),0.033249,0.193493,0.010269,0.308869,1.596280,0.003836,1.166938
4,(berries),(whole milk),0.033249,0.255516,0.011795,0.354740,1.388328,0.003299,1.153774
...,...,...,...,...,...,...,...,...,...
120,"(soda, yogurt)",(whole milk),0.027351,0.255516,0.010473,0.382900,1.498535,0.003484,1.206423
121,"(yogurt, tropical fruit)",(whole milk),0.029283,0.255516,0.015150,0.517361,2.024770,0.007668,1.542528
122,"(whole milk, tropical fruit)",(yogurt),0.042298,0.139502,0.015150,0.358173,2.567516,0.009249,1.340701
123,"(yogurt, whipped/sour cream)",(whole milk),0.020742,0.255516,0.010880,0.524510,2.052747,0.005580,1.565719


<br><br>

[예시] 여러 개의 기준을 충족하는 연관 규칙 탐색

In [20]:
# Build the rules whose lift is at least 3, then record how many items
# each rule's antecedent contains.
rules = association_rules(groceries_ap, metric="lift", min_threshold=3)
# Antecedents are set-like, so the built-in len works without a wrapping lambda.
rules["antecedent_len"] = rules["antecedents"].apply(len)
rules


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
0,(root vegetables),(beef),0.108998,0.052466,0.017387,0.159515,3.040367,0.011668,1.127366,1
1,(beef),(root vegetables),0.052466,0.108998,0.017387,0.331395,3.040367,0.011668,1.332628,1
2,"(root vegetables, citrus fruit)",(other vegetables),0.017692,0.193493,0.010371,0.586207,3.029608,0.006948,1.949059,2
3,"(other vegetables, citrus fruit)",(root vegetables),0.028876,0.108998,0.010371,0.359155,3.295045,0.007224,1.390354,2
4,(root vegetables),"(other vegetables, citrus fruit)",0.108998,0.028876,0.010371,0.095149,3.295045,0.007224,1.073242,1
5,(other vegetables),"(root vegetables, citrus fruit)",0.193493,0.017692,0.010371,0.0536,3.029608,0.006948,1.037941,1
6,"(whole milk, yogurt)",(curd),0.056024,0.053279,0.010066,0.179673,3.372304,0.007081,1.154078,2
7,(curd),"(whole milk, yogurt)",0.053279,0.056024,0.010066,0.188931,3.372304,0.007081,1.163866,1
8,"(root vegetables, tropical fruit)",(other vegetables),0.021047,0.193493,0.012303,0.584541,3.020999,0.008231,1.941244,2
9,"(other vegetables, tropical fruit)",(root vegetables),0.035892,0.108998,0.012303,0.342776,3.14478,0.008391,1.355705,2


In [21]:
# Keep the rules that satisfy all three criteria at once:
# at least one antecedent item, confidence above 0.4, and lift above 1.
rules[rules['antecedent_len'].ge(1)
      & rules['confidence'].gt(0.4)
      & rules['lift'].gt(1)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
2,"(root vegetables, citrus fruit)",(other vegetables),0.017692,0.193493,0.010371,0.586207,3.029608,0.006948,1.949059,2
8,"(root vegetables, tropical fruit)",(other vegetables),0.021047,0.193493,0.012303,0.584541,3.020999,0.008231,1.941244,2
