In [None]:
#!conda install mlxtend

In [19]:
# Data preprocessing

import pandas as pd

# Load the raw retail records from 'data/retail_data.csv' into a DataFrame.
df = pd.read_csv('data/retail_data.csv')

# Reshape the data into transactions: for every order number (OrderID),
# gather the product names (ProdName) of that order into a single list, so
# each row shows which products were bought together in one order.
basket_df = (
    df.groupby('OrderID')['ProdName']
    .apply(list)
    .reset_index()
)

# Preview the first rows of the per-order table.
basket_df.head()

Unnamed: 0,OrderID,ProdName
0,536365,"[WHITE HANGING HEART T-LIGHT HOLDER, WHITE MET..."
1,536366,"[HAND WARMER UNION JACK, HAND WARMER RED POLKA..."
2,536367,"[ASSORTED COLOUR BIRD ORNAMENT, POPPY'S PLAYHO..."
3,536368,"[JAM MAKING SET WITH JARS, RED COAT RACK PARIS..."
4,536369,[BATH BUILDING BLOCK WORD]


In [None]:
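# fit_transform() combines fit() and transform() in a single call.
# fit(): learns which unique items appear in the data.
# transform(): one-hot encodes the data. One-hot encoding is a way of converting categorical data into numeric form:
# each item becomes its own column, marked 1/True when the transaction contains the item and 0/False otherwise.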

In [6]:
from mlxtend.preprocessing import TransactionEncoder

# Encoder object used to turn the per-order product lists into a table.
te = TransactionEncoder()

# Learn the distinct product names found in basket_df['ProdName'], then
# encode every transaction as a True/False row: True where the order
# contains the product, False where it does not.
te_result = te.fit(basket_df['ProdName']).transform(basket_df['ProdName'])

# Wrap the encoded array in a pandas DataFrame; te.columns_ supplies the
# product names that appeared in the transactions as the column labels.
te_df = pd.DataFrame(data=te_result, columns=te.columns_)

# Inspect the first five rows of the encoded table.
te_df.head()


Unnamed: 0,4 PURPLE FLOCK DINNER CANDLES,SET 2 TEA TOWELS I LOVE LONDON,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,...,"WRAP, BILLBOARD FONTS DESIGN",YELLOW BREAKFAST CUP AND SAUCER,YELLOW COAT RACK PARIS FASHION,YELLOW GIANT GARDEN THERMOMETER,YELLOW SHARK HELICOPTER,YOU'RE CONFUSING ME METAL SIGN,YULETIDE IMAGES GIFT WRAP SET,ZINC FINISH 15CM PLANTER POTS,ZINC METAL HEART DECORATION,ZINC WILLIE WINKIE CANDLE STICK
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
te_df.shape  # Result: a DataFrame marking, for 270 transactions, whether each of 1343 products is present

(270, 1343)

In [13]:
from mlxtend.frequent_patterns import apriori

# Run the Apriori algorithm on the encoded transactions, keeping itemsets
# whose support is at least 0.05.
# If min_support is not given, its default value is 0.5.
apriori(df=te_df, min_support=0.05)

Unnamed: 0,support,itemsets
0,0.051852,(60)
1,0.062963,(84)
2,0.066667,(339)
3,0.066667,(511)
4,0.066667,(544)
...,...,...
3358,0.051852,"(642, 1315, 1252, 1253, 1286, 1311, 1289, 976,..."
3359,0.051852,"(1315, 1252, 1253, 1286, 1311, 1289, 974, 976,..."
3360,0.051852,"(642, 1315, 1252, 1253, 1286, 1289, 974, 976, ..."
3361,0.051852,"(642, 1315, 1252, 1253, 1286, 1311, 1289, 974,..."


In [None]:
# 3363개의 빈발 항목 집합 반환

In [14]:
# use_colnames=True labels the items with their actual product names
# instead of numbers.
frequent_itemsets = apriori(
    te_df,
    min_support=0.06,
    use_colnames=True,
)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.062963,(ASSORTED COLOUR BIRD ORNAMENT)
1,0.066667,(CREAM CUPID HEARTS COAT HANGER)
2,0.066667,(GLASS STAR FROSTED T-LIGHT HOLDER)
3,0.066667,(HAND WARMER BIRD DESIGN)
4,0.077778,(HAND WARMER OWL DESIGN)
...,...,...
56,0.062963,"(WHITE HANGING HEART T-LIGHT HOLDER, WHITE MET..."
57,0.062963,"(RED WOOLLY HOTTIE WHITE HEART., WHITE METAL L..."
58,0.062963,"(RED WOOLLY HOTTIE WHITE HEART., WHITE HANGING..."
59,0.062963,"(RED WOOLLY HOTTIE WHITE HEART., WHITE HANGING..."


In [None]:
# 61개의 빈발 항목 집합 반환

In [16]:
from mlxtend.frequent_patterns import association_rules

# association_rules() extracts association rules from the frequent itemsets.
# metric='confidence' selects confidence as the measure rules are judged by,
# and min_threshold=0.8 sets the minimum confidence a rule must reach.
association_rules(
    frequent_itemsets,
    metric='confidence',
    min_threshold=0.8,
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(GLASS STAR FROSTED T-LIGHT HOLDER),(KNITTED UNION FLAG HOT WATER BOTTLE),0.066667,0.085185,0.062963,0.944444,11.086957,1.0,0.057284,16.466667,0.974790,0.708333,0.939271,0.841787
1,(GLASS STAR FROSTED T-LIGHT HOLDER),(RED WOOLLY HOTTIE WHITE HEART.),0.066667,0.103704,0.062963,0.944444,9.107143,1.0,0.056049,16.133333,0.953782,0.586207,0.938017,0.775794
2,(GLASS STAR FROSTED T-LIGHT HOLDER),(WHITE HANGING HEART T-LIGHT HOLDER),0.066667,0.137037,0.066667,1.000000,7.297297,1.0,0.057531,inf,0.924603,0.486486,1.000000,0.743243
3,(WHITE METAL LANTERN),(GLASS STAR FROSTED T-LIGHT HOLDER),0.066667,0.066667,0.062963,0.944444,14.166667,1.0,0.058519,16.800000,0.995798,0.894737,0.940476,0.944444
4,(GLASS STAR FROSTED T-LIGHT HOLDER),(WHITE METAL LANTERN),0.066667,0.066667,0.062963,0.944444,14.166667,1.0,0.058519,16.800000,0.995798,0.894737,0.940476,0.944444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,"(WHITE HANGING HEART T-LIGHT HOLDER, KNITTED U...","(RED WOOLLY HOTTIE WHITE HEART., GLASS STAR FR...",0.070370,0.062963,0.062963,0.894737,14.210526,1.0,0.058532,8.901852,1.000000,0.894737,0.887664,0.947368
155,"(WHITE METAL LANTERN, KNITTED UNION FLAG HOT W...","(RED WOOLLY HOTTIE WHITE HEART., GLASS STAR FR...",0.062963,0.062963,0.062963,1.000000,15.882353,1.0,0.058999,inf,1.000000,1.000000,1.000000,1.000000
156,"(WHITE METAL LANTERN, WHITE HANGING HEART T-LI...","(RED WOOLLY HOTTIE WHITE HEART., GLASS STAR FR...",0.066667,0.062963,0.062963,0.944444,15.000000,1.0,0.058765,16.866667,1.000000,0.944444,0.940711,0.972222
157,(GLASS STAR FROSTED T-LIGHT HOLDER),"(RED WOOLLY HOTTIE WHITE HEART., WHITE HANGING...",0.066667,0.062963,0.062963,0.944444,15.000000,1.0,0.058765,16.866667,1.000000,0.944444,0.940711,0.972222


In [None]:
# 159 rules meet the criteria.
# antecedents is the IF part of a rule and consequents is the THEN part.
# Once candidate rules have been selected, the best rules should be chosen by weighing support, confidence, and lift against the business context.

In [18]:
# Generate association rules from the frequent itemsets, filtering on
# confidence and keeping only rules with a value of at least 0.8.
# Example: if 80% or more of the customers who bought 'bread' also bought
# 'butter', that rule is included.
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.8,
)

# Order the rules by lift from highest to lowest and show the top five,
# so the strongest associations are reviewed first.
rules.sort_values('lift', ascending=False).head()


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
79,"(RED WOOLLY HOTTIE WHITE HEART., WHITE METAL L...","(GLASS STAR FROSTED T-LIGHT HOLDER, KNITTED UN...",0.062963,0.062963,0.062963,1.0,15.882353,1.0,0.058999,inf,1.0,1.0,1.0,1.0
151,"(GLASS STAR FROSTED T-LIGHT HOLDER, KNITTED UN...","(RED WOOLLY HOTTIE WHITE HEART., WHITE METAL L...",0.062963,0.062963,0.062963,1.0,15.882353,1.0,0.058999,inf,1.0,1.0,1.0,1.0
138,"(RED WOOLLY HOTTIE WHITE HEART., GLASS STAR FR...","(WHITE METAL LANTERN, KNITTED UNION FLAG HOT W...",0.062963,0.062963,0.062963,1.0,15.882353,1.0,0.058999,inf,1.0,1.0,1.0,1.0
143,"(WHITE HANGING HEART T-LIGHT HOLDER, GLASS STA...","(RED WOOLLY HOTTIE WHITE HEART., WHITE METAL L...",0.062963,0.062963,0.062963,1.0,15.882353,1.0,0.058999,inf,1.0,1.0,1.0,1.0
146,"(WHITE HANGING HEART T-LIGHT HOLDER, WHITE MET...","(RED WOOLLY HOTTIE WHITE HEART., GLASS STAR FR...",0.062963,0.062963,0.062963,1.0,15.882353,1.0,0.058999,inf,1.0,1.0,1.0,1.0
