### 연관규칙

In [1]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Toy market-basket data: one inner list of purchased items per transaction.
df_raw = [
    ['우유', '라면'],
    ['라면'],
    ['아이스크림', '우유'],
    ['과자', '라면'],
    ['아이스크림'],
]
df_raw

[['우유', '라면'], ['라면'], ['아이스크림', '우유'], ['과자', '라면'], ['아이스크림']]

### 데이터 구성하기
### 데이터 전처리

In [3]:
# One-hot encode the transactions: one boolean column per distinct item,
# True where that row's transaction contains the item, False otherwise.
enc = TransactionEncoder()
enc.fit(df_raw)
df_raw_enc = enc.transform(df_raw)

# The encoder hands the data back as a matrix, so wrap it in a DataFrame
# and label the columns with the item names the encoder learned.
df_asso = pd.DataFrame(data=df_raw_enc, columns=enc.columns_)
df_asso.head()

Unnamed: 0,과자,라면,아이스크림,우유
0,False,True,False,True
1,False,True,False,False
2,False,False,True,True
3,True,True,False,False
4,False,False,True,False


###  라이브러리 불러오기
### 규칙 척도 임계값(지지도 10% 이상)

In [7]:
# Minimum support an itemset must reach to be kept.
v_min_sup = 0.1

# Compute the support of every frequent itemset, labelled by item name.
df_freq = apriori(
    df_asso,
    min_support=v_min_sup,
    use_colnames=True,
)
df_freq.round(3)

Unnamed: 0,support,itemsets
0,0.2,(과자)
1,0.6,(라면)
2,0.4,(아이스크림)
3,0.4,(우유)
4,0.2,"(라면, 과자)"
5,0.2,"(라면, 우유)"
6,0.2,"(아이스크림, 우유)"


### 라이브러리 불러오기
### 지지도, 신뢰도, 향상도 산출

In [14]:
# Rule filter: metric = lift, minimum lift = 1.5.
df_asso_rule = association_rules(
    df_freq,
    metric='lift',
    min_threshold=1.5,
)
df_asso_rule.round(3)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(맥주),(콜라),0.5,0.667,0.5,1.0,1.5,0.167,inf
1,(콜라),(맥주),0.667,0.5,0.5,0.75,1.5,0.167,2.0
2,(소주),(와인),0.667,0.333,0.333,0.5,1.5,0.111,1.333
3,(와인),(소주),0.333,0.667,0.333,1.0,1.5,0.111,inf
4,(콜라),(와인),0.667,0.333,0.333,0.5,1.5,0.111,1.333
5,(와인),(콜라),0.333,0.667,0.333,1.0,1.5,0.111,inf
6,"(소주, 맥주)",(콜라),0.333,0.667,0.333,1.0,1.5,0.111,inf
7,(콜라),"(소주, 맥주)",0.667,0.333,0.333,0.5,1.5,0.111,1.333
8,"(소주, 콜라)",(와인),0.5,0.333,0.333,0.667,2.0,0.167,2.0
9,"(와인, 콜라)",(소주),0.333,0.667,0.333,1.0,1.5,0.111,inf


## 실습

In [15]:
# Load the product-purchase log used for the exercise.
df_raw = pd.read_csv(filepath_or_buffer='/home/piai/Downloads/상품구매.csv')
df_raw

Unnamed: 0,ID,PRODUCT
0,C-11,BAGUETTE
1,C-11,HERRING
2,C-11,AVOCADO
3,C-11,ARTICHOKE
4,C-11,HEINEKEN
...,...,...
135,C-30,COKE
136,C-30,TURKEY
137,C-30,ICE CREAM
138,C-30,SODA


In [16]:
# Customer IDs in ascending order.
ID = sorted(set(df_raw['ID']))

# Build one alphabetically sorted basket (list of products) per customer.
# groupby splits the frame once and yields the groups in sorted-ID order,
# so the whole frame is no longer re-scanned with a boolean mask per ID.
list_association = [
    sorted(products)
    for _, products in df_raw.groupby('ID', sort=True)['PRODUCT']
]

# Print each customer's basket.
for row in list_association:
    print(row)

['APPLES', 'ARTICHOKE', 'AVOCADO', 'BAGUETTE', 'CORNED BEEF', 'HEINEKEN', 'HERRING']
['APPLES', 'CORNED BEEF', 'HEINEKEN', 'HERRING', 'OLIVES', 'SARDINES', 'STEAK']
['APPLES', 'AVOCADO', 'BAGUETTE', 'ICE CREAM', 'PEPPERS', 'SARDINES', 'STEAK']
['APPLES', 'COKE', 'CORNED BEEF', 'HAM', 'HERRING', 'OLIVES', 'TURKEY']
['ARTICHOKE', 'BOURBON', 'COKE', 'HAM', 'ICE CREAM', 'OLIVES', 'TURKEY']
['ARTICHOKE', 'AVOCADO', 'BAGUETTE', 'COKE', 'HEINEKEN', 'HERRING', 'TURKEY']
['APPLES', 'CHICKEN', 'COKE', 'CORNED BEEF', 'HEINEKEN', 'ICE CREAM', 'SARDINES']
['BAGUETTE', 'BOURBON', 'CRACKERS', 'HEINEKEN', 'OLIVES', 'PEPPERS', 'SODA']
['BOURBON', 'CRACKERS', 'HEINEKEN', 'HERRING', 'OLIVES', 'SODA', 'STEAK']
['APPLES', 'BAGUETTE', 'CORNED BEEF', 'HAM', 'HERRING', 'OLIVES', 'TURKEY']
['ARTICHOKE', 'AVOCADO', 'BAGUETTE', 'BOURBON', 'CORNED BEEF', 'HEINEKEN', 'HERRING']
['ARTICHOKE', 'BOURBON', 'CRACKERS', 'HEINEKEN', 'OLIVES', 'SODA', 'STEAK']
['BOURBON', 'CORNED BEEF', 'CRACKERS', 'HEINEKEN', 'HERRING', 

In [26]:
# One-hot encode the per-customer baskets: one boolean column per product,
# True where that customer's basket contains the product, False otherwise.
enc = TransactionEncoder()
enc.fit(list_association)
df_raw_enc = enc.transform(list_association)

# The encoder hands the data back as a matrix, so wrap it in a DataFrame
# and label the columns with the product names the encoder learned.
df_asso = pd.DataFrame(data=df_raw_enc, columns=enc.columns_)
df_asso.head()

Unnamed: 0,APPLES,ARTICHOKE,AVOCADO,BAGUETTE,BOURBON,CHICKEN,COKE,CORNED BEEF,CRACKERS,HAM,HEINEKEN,HERRING,ICE CREAM,OLIVES,PEPPERS,SARDINES,SODA,STEAK,TURKEY
0,True,True,True,True,False,False,False,True,False,False,True,True,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,True,False,False,True,True,False,True,False,True,False,True,False
2,True,False,True,True,False,False,False,False,False,False,False,False,True,False,True,True,False,True,False
3,True,False,False,False,False,False,True,True,False,True,False,True,False,True,False,False,False,False,True
4,False,True,False,False,True,False,True,False,False,True,False,False,True,True,False,False,False,False,True


In [32]:
# Minimum support -- try adjusting it and check how many itemsets come out.
v_min_sup = 0.3

# Compute the support of every frequent itemset, labelled by item name.
df_freq = apriori(
    df_asso,
    min_support=v_min_sup,
    use_colnames=True,
)
df_freq.round(3)

Unnamed: 0,support,itemsets
0,0.4,(APPLES)
1,0.3,(ARTICHOKE)
2,0.4,(BAGUETTE)
3,0.55,(BOURBON)
4,0.5,(COKE)
5,0.4,(CORNED BEEF)
6,0.6,(HEINEKEN)
7,0.4,(HERRING)
8,0.45,(ICE CREAM)
9,0.65,(OLIVES)


In [36]:
# Rule filter: metric = lift, minimum lift = 1.5.
df_asso_rule = association_rules(
    df_freq,
    metric='lift',
    min_threshold=1.5,
)
df_asso_rule.round(3)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(BOURBON),(SODA),0.55,0.4,0.4,0.727,1.818,0.18,2.2
1,(SODA),(BOURBON),0.4,0.55,0.4,1.0,1.818,0.18,inf
2,(ICE CREAM),(COKE),0.45,0.5,0.4,0.889,1.778,0.175,4.5
3,(COKE),(ICE CREAM),0.5,0.45,0.4,0.8,1.778,0.175,2.75
4,(COKE),(TURKEY),0.5,0.4,0.35,0.7,1.75,0.15,2.0
5,(TURKEY),(COKE),0.4,0.5,0.35,0.875,1.75,0.15,4.0
6,(HERRING),(CORNED BEEF),0.4,0.4,0.3,0.75,1.875,0.14,2.4
7,(CORNED BEEF),(HERRING),0.4,0.4,0.3,0.75,1.875,0.14,2.4
8,(SODA),(OLIVES),0.4,0.65,0.4,1.0,1.538,0.14,inf
9,(OLIVES),(SODA),0.65,0.4,0.4,0.615,1.538,0.14,1.56


지지도: 전체 거래에서 a와 b를 동시에 구매한 비율  
신뢰도: a를 구매한 거래 중에서 a와 b를 동시에 구매한 비율  
향상도: 전체 거래에서 b를 구매한 비율 대비, a를 구매한 거래에서 b를 함께 구매한 비율 (향상도 = 신뢰도(a→b) / b의 지지도)

In [None]:
* 오분류율: 1- 정분류율 
* 정분류율 :{ (실제1,예측1) + (실제0, 예측0) } / 전체 빈도
* 민감도 : (실제1,예측1)/실제1 = 실제 1사건에서의 정분류율
* 특이도 : (실제0, 예측0)/실제0 = 실제 0사건에서의 정분류율
    
AUC는 roc 아래면적으로 0.9~1사이 매우 정확, 0.7~0.9 정확, 그 아래 덜 정확

* F1 점수
정밀도 : 예측긍정에서 실제긍정 비율  => TP/(TP+FP)
재현율 : 실제긍정에서 예측긍정의 비율 = 민감도 => TP/(TP+FN)
F1 점수 : 정밀도와 재현율의 조화 평균
    
* 향상도(lift)
임의 모델 대비 해당 모델의 향상된 정확도 비율 -> 등급이 늘어날수록 값이 떨어짐. 