<a href="https://colab.research.google.com/github/ykjoy/mining/blob/main/association.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 연관성 분석 (Association Rule)


## 스키용품 구매 

In [1]:
! pip install mlxtend

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import pandas as pd
import numpy as np

In [6]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Nine ski-equipment transactions; each inner list holds the items bought together.
dataset = [
    ['보드데크', '부츠', '보호대'],
    ['부츠', '고글'],
    ['부츠', '바인딩', '보드데크'],
    ['보드데크', '부츠', '고글'],
    ['보드데크', '바인딩'],
    ['부츠', '바인딩'],
    ['보드데크', '바인딩'],
    ['보드데크', '바인딩', '부츠', '보호대'],
    ['보드데크', '바인딩', '부츠'],
]

# One-hot encode the baskets: learn the item vocabulary first, then build the
# transaction-by-item matrix from the same data.
encoder = TransactionEncoder()
encoder.fit(dataset)
np_array = encoder.transform(dataset)
print(np_array)

# Wrap the matrix in a DataFrame whose columns carry the learned item names.
df = pd.DataFrame(data=np_array, columns=encoder.columns_)
display(df)
           

[[False False  True  True  True]
 [ True False False False  True]
 [False  True  True False  True]
 [ True False  True False  True]
 [False  True  True False False]
 [False  True False False  True]
 [False  True  True False False]
 [False  True  True  True  True]
 [False  True  True False  True]]


Unnamed: 0,고글,바인딩,보드데크,보호대,부츠
0,False,False,True,True,True
1,True,False,False,False,True
2,False,True,True,False,True
3,True,False,True,False,True
4,False,True,True,False,False
5,False,True,False,False,True
6,False,True,True,False,False
7,False,True,True,True,True
8,False,True,True,False,True


In [7]:
# Keep only itemsets whose support is at least 0.35; use_colnames=True reports
# the item names instead of column indices.
frequent_itemsets = apriori(
    df,
    min_support=0.35,
    use_colnames=True,
)
display(frequent_itemsets)

Unnamed: 0,support,itemsets
0,0.666667,(바인딩)
1,0.777778,(보드데크)
2,0.777778,(부츠)
3,0.555556,"(바인딩, 보드데크)"
4,0.444444,"(바인딩, 부츠)"
5,0.555556,"(부츠, 보드데크)"


In [8]:
# Derive rules from the ski-equipment itemsets, keeping those with confidence >= 0.7.
# Left as the cell's final expression so the resulting table is rendered.
association_rules(
    frequent_itemsets,
    metric='confidence',
    min_threshold=0.7,
)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(바인딩),(보드데크),0.666667,0.777778,0.555556,0.833333,1.071429,0.037037,1.333333
1,(보드데크),(바인딩),0.777778,0.666667,0.555556,0.714286,1.071429,0.037037,1.166667
2,(부츠),(보드데크),0.777778,0.777778,0.555556,0.714286,0.918367,-0.049383,0.777778
3,(보드데크),(부츠),0.777778,0.777778,0.555556,0.714286,0.918367,-0.049383,0.777778


In [9]:
# Same itemsets, this time ranked by lift: keep rules with lift >= 1.
# Left as the cell's final expression so the resulting table is rendered.
association_rules(
    frequent_itemsets,
    metric='lift',
    min_threshold=1,
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(바인딩),(보드데크),0.666667,0.777778,0.555556,0.833333,1.071429,0.037037,1.333333
1,(보드데크),(바인딩),0.777778,0.666667,0.555556,0.714286,1.071429,0.037037,1.166667


## 식료품 구매

In [10]:
# Mount Google Drive at /content/drive (Colab-only); the CSV files read later in
# this notebook are loaded from paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
 
 # Author's note: the item columns may be coded either as 0/1 or as False/True.
grocery_csv = "/content/drive/MyDrive/Colab Notebooks/ITB/asso_product.csv"
df = pd.read_csv(grocery_csv)
# Preview the first rows of the loaded table.
display(df.head())

Unnamed: 0,herring,corned_b,olives,ham,cracker,artichok,heineken,turkey,coke,avocado,apples,peppers,baguette,chicken,bourbon,sardines,soda
0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1
2,0,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0
4,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [12]:
# Mine every itemset with support of at least 0.05 and preview the first few.
frequent_itemsets = apriori(
    df,
    min_support=0.05,
    use_colnames=True,
)
display(frequent_itemsets.head(3))

# Record how many items each itemset contains so the table can be filtered by size.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
# Inspect the tail to see how large the itemsets get, then report the table size.
display(frequent_itemsets.tail(3))
print(frequent_itemsets.shape)

# Shrink the table: keep itemsets of at most two items whose support does not
# exceed 0.9. No additional lower bound is applied here, so the minimum support
# remains the 0.05 passed to apriori above.
at_most_two_items = frequent_itemsets['length'] <= 2
support_capped = frequent_itemsets['support'] <= 0.9
frequent_itemsets = frequent_itemsets[at_most_two_items & support_capped]
print(frequent_itemsets.shape)
display(frequent_itemsets.tail(3))


Unnamed: 0,support,itemsets
0,0.383838,(herring)
1,0.313131,(corned_b)
2,0.444444,(olives)


Unnamed: 0,support,itemsets,length
115,0.080808,"(cracker, artichok, avocado, heineken)",4
116,0.090909,"(sardines, coke, chicken, heineken)",4
117,0.090909,"(sardines, baguette, apples, peppers)",4


(118, 3)
(69, 3)


Unnamed: 0,support,itemsets,length
66,0.090909,"(baguette, soda)",2
67,0.090909,"(sardines, chicken)",2
68,0.141414,"(bourbon, soda)",2


In [13]:
# Grocery rules: keep those with confidence >= 0.8.
# Left as the cell's final expression so the resulting table is rendered.
association_rules(
    frequent_itemsets,
    metric='confidence',
    min_threshold=0.8,
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(ham),(herring),0.111111,0.383838,0.111111,1.0,2.605263,0.068462,inf
1,(ham),(corned_b),0.111111,0.313131,0.111111,1.0,3.193548,0.076319,inf
2,(ham),(olives),0.111111,0.444444,0.111111,1.0,2.25,0.061728,inf
3,(turkey),(olives),0.10101,0.444444,0.10101,1.0,2.25,0.056117,inf
4,(soda),(cracker),0.232323,0.424242,0.232323,1.0,2.357143,0.133762,inf
5,(artichok),(avocado),0.171717,0.171717,0.171717,1.0,5.823529,0.14223,inf
6,(avocado),(artichok),0.171717,0.171717,0.171717,1.0,5.823529,0.14223,inf
7,(chicken),(heineken),0.090909,0.171717,0.090909,1.0,5.823529,0.075298,inf
8,(turkey),(coke),0.10101,0.191919,0.10101,1.0,5.210526,0.081624,inf
9,(turkey),(bourbon),0.10101,0.353535,0.10101,1.0,2.828571,0.065299,inf


In [14]:
# Grocery rules again, filtered on lift instead: keep those with lift >= 5.0.
# Left as the cell's final expression so the resulting table is rendered.
association_rules(
    frequent_itemsets,
    metric='lift',
    min_threshold=5.0,
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(artichok),(avocado),0.171717,0.171717,0.171717,1.0,5.823529,0.14223,inf
1,(avocado),(artichok),0.171717,0.171717,0.171717,1.0,5.823529,0.14223,inf
2,(chicken),(heineken),0.090909,0.171717,0.090909,1.0,5.823529,0.075298,inf
3,(heineken),(chicken),0.171717,0.090909,0.090909,0.529412,5.823529,0.075298,1.931818
4,(coke),(turkey),0.191919,0.10101,0.10101,0.526316,5.210526,0.081624,1.897868
5,(turkey),(coke),0.10101,0.191919,0.10101,1.0,5.210526,0.081624,inf
6,(coke),(chicken),0.191919,0.090909,0.090909,0.473684,5.210526,0.073462,1.727273
7,(chicken),(coke),0.090909,0.191919,0.090909,1.0,5.210526,0.073462,inf
8,(sardines),(chicken),0.181818,0.090909,0.090909,0.5,5.5,0.07438,1.818182
9,(chicken),(sardines),0.090909,0.181818,0.090909,1.0,5.5,0.07438,inf


## 화장품 구매 

In [15]:
import pandas as pd
 
 # Author's note: the item columns may be coded either as 0/1 or as False/True.
cosmetic_csv = "/content/drive/MyDrive/Colab Notebooks/ITB/cosmetic.csv"
df = pd.read_csv(cosmetic_csv)
# Preview the first rows, then report (rows, columns) of the loaded table.
display(df.head())
print(df.shape)

Unnamed: 0,Trans. #,Bag,Blush,Nail Polish,Brushes,Concealer,Eyebrow Pencils,Bronzer,Lip liner,Mascara,Eye shadow,Foundation,Lip Gloss,Lipstick,Eyeliner
0,1,,T,T,T,T,,T,T,T,,,,,T
1,2,,,T,,T,,T,T,,,T,T,,
2,3,,T,,,T,T,T,T,T,T,T,T,T,
3,4,,,T,T,T,,T,,,,T,,,T
4,5,,T,,,T,,T,T,T,T,,T,T,


(1000, 15)


In [16]:
# Preprocessing: drop the leading transaction-number column (it is not an item)
# and turn the 'T' / missing purchase flags into real booleans.
#
# The guard keeps the cell safe to re-run: once every column is already bool the
# frame has been processed, and slicing again would silently drop an item column.
if not (df.dtypes == bool).all():
    # Comparing against 'T' produces a bool-dtype frame in a single step:
    # 'T' -> True, anything else (including NaN) -> False. This avoids the
    # replace()/in-place fillna() pair, whose resulting dtype depends on how the
    # installed pandas version downcasts object columns.
    df = df.iloc[:, 1:].eq('T')
print(df.shape)
display(df.head())

(1000, 14)


Unnamed: 0,Bag,Blush,Nail Polish,Brushes,Concealer,Eyebrow Pencils,Bronzer,Lip liner,Mascara,Eye shadow,Foundation,Lip Gloss,Lipstick,Eyeliner
0,False,True,True,True,True,False,True,True,True,False,False,False,False,True
1,False,False,True,False,True,False,True,True,False,False,True,True,False,False
2,False,True,False,False,True,True,True,True,True,True,True,True,True,False
3,False,False,True,True,True,False,True,False,False,False,True,False,False,True
4,False,True,False,False,True,False,True,True,True,True,False,True,True,False


In [17]:
# Build the frequent itemsets: keep every itemset with support of at least 0.1,
# reported with item names rather than column indices.
frequent_itemsets = apriori(
    df,
    min_support=0.1,
    use_colnames=True,
)
display(frequent_itemsets)


Unnamed: 0,support,itemsets
0,0.363,(Blush)
1,0.280,(Nail Polish)
2,0.149,(Brushes)
3,0.442,(Concealer)
4,0.279,(Bronzer)
...,...,...
97,0.116,"(Foundation, Lipstick, Lip Gloss)"
98,0.156,"(Foundation, Lip Gloss, Eyeliner)"
99,0.119,"(Blush, Concealer, Mascara, Eye shadow)"
100,0.114,"(Concealer, Mascara, Eye shadow, Eyeliner)"


In [18]:
# Restrict the frequent itemsets to those with at most two items.
# First record the number of items in each itemset.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
# Inspect the tail to see how large the itemsets get.
display(frequent_itemsets.tail(3))

# Only the size filter is applied; support is not filtered any further in this
# step, so the lower bound remains whatever apriori was run with.
pairs_or_smaller = frequent_itemsets['length'] <= 2
frequent_itemsets = frequent_itemsets[pairs_or_smaller]
print(frequent_itemsets.shape)

Unnamed: 0,support,itemsets,length
99,0.119,"(Blush, Concealer, Mascara, Eye shadow)",4
100,0.114,"(Concealer, Mascara, Eye shadow, Eyeliner)",4
101,0.111,"(Foundation, Mascara, Eye shadow, Lip Gloss)",4


(63, 3)


In [19]:
# Cosmetics rules: keep those with confidence >= 0.7.
# Left as the cell's final expression so the resulting table is rendered.
association_rules(
    frequent_itemsets,
    metric='confidence',
    min_threshold=0.7,
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Brushes),(Nail Polish),0.149,0.28,0.149,1.0,3.571429,0.10728,inf
1,(Lip liner),(Concealer),0.234,0.442,0.179,0.764957,1.730673,0.075572,2.374036
2,(Mascara),(Eye shadow),0.357,0.381,0.321,0.89916,2.359999,0.184983,6.138417
3,(Eye shadow),(Mascara),0.381,0.357,0.321,0.84252,2.359999,0.184983,4.08305
4,(Lip Gloss),(Foundation),0.49,0.536,0.356,0.726531,1.355468,0.09336,1.696716


In [20]:
# Cosmetics rules again, filtered on lift instead: keep those with lift >= 2.
# Left as the cell's final expression so the resulting table is rendered.
association_rules(
    frequent_itemsets,
    metric='lift',
    min_threshold=2,
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Brushes),(Nail Polish),0.149,0.28,0.149,1.0,3.571429,0.10728,inf
1,(Nail Polish),(Brushes),0.28,0.149,0.149,0.532143,3.571429,0.10728,1.818931
2,(Mascara),(Eye shadow),0.357,0.381,0.321,0.89916,2.359999,0.184983,6.138417
3,(Eye shadow),(Mascara),0.381,0.357,0.321,0.84252,2.359999,0.184983,4.08305
