## 1. import libraries ##

In [11]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
import matplotlib.pyplot as plt

## 2. import data ##

In [19]:
d = pd.read_csv('https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv', sep=',')
df = d
print(df.head(5))

        0       1     2       3       4       5       6
0   Bread    Wine  Eggs    Meat  Cheese  Pencil  Diaper
1   Bread  Cheese  Meat  Diaper    Wine    Milk  Pencil
2  Cheese    Meat  Eggs    Milk    Wine     NaN     NaN
3  Cheese    Meat  Eggs    Milk    Wine     NaN     NaN
4    Meat  Pencil  Wine     NaN     NaN     NaN     NaN


## 3. view data ##

In [6]:
## check unique items ##
unique_li = [df[col].unique().tolist() for col in df.columns]
items = set([item for item_li in unique_li for item in item_li if item is not np.NaN ])
items

{'Bagel',
 'Bread',
 'Cheese',
 'Diaper',
 'Eggs',
 'Meat',
 'Milk',
 'Pencil',
 'Wine'}

## One-hot encoding : (한 칼럼에 목록 다 모여져 있을 때 이 data에는 적용 X) ##
te = TransactionEncoder()
te_ary = te.fit(df).transform(df)
ohe_df = pd.DataFrame(te_ary, columns = te.columns_)

In [50]:
## one-hot encoding(0 or 1) item_rows ##
encoded_vals = []

for index, row in df.iterrows():
    labels = {}
    uncommons = list(set(items) - set(row))
    commons = list(set(items).intersection(row))
    for uc_item in uncommons:
        labels[uc_item] = 0
    for com_item in commons:
        labels[com_item] = 1
    encoded_vals.append(labels)

ohe_df = pd.DataFrame(encoded_vals)

In [51]:
# 지지도 도출 : min_support로 일정 support 이상 rule만 도출
freq_itemsets = apriori(ohe_df, min_support = 0.1, use_colnames = True)
freq_itemsets

Unnamed: 0,support,itemsets
0,0.501587,(Milk)
1,0.425397,(Bagel)
2,0.501587,(Cheese)
3,0.438095,(Wine)
4,0.504762,(Bread)
...,...,...
95,0.111111,"(Bread, Meat, Pencil)"
96,0.101587,"(Cheese, Wine, Milk, Meat)"
97,0.104762,"(Cheese, Wine, Eggs, Milk)"
98,0.152381,"(Cheese, Eggs, Milk, Meat)"


In [33]:
#특정 개수 이상의 itemsets만 추출 (Optional)
freq_itemsets['length'] = freq_itemsets['itemsets'].apply(lambda x : len(x))
freq_itemsets[freq_itemsets['length']>2]
#특정 item이 포함된 것만 추출 (Optional)
freq_itemsets[freq_itemsets['itemsets'].apply(lambda x: 'Bread' in list(x))]

Unnamed: 0,support,itemsets,length
4,0.504762,(Bread),1
12,0.279365,"(Bread, Milk)",2
19,0.279365,"(Bread, Bagel)",2
25,0.238095,"(Cheese, Bread)",2
30,0.244444,"(Bread, Wine)",2
35,0.231746,"(Bread, Diaper)",2
36,0.2,"(Bread, Pencil)",2
37,0.206349,"(Bread, Meat)",2
38,0.187302,"(Bread, Eggs)",2
45,0.171429,"(Bread, Milk, Bagel)",3


In [54]:
# 연관 규칙 도출 : 최소 신뢰도 0.6
rules = association_rules(freq_itemsets, metric = "confidence", min_threshold=0.3).sort_values(by = ['lift', 'confidence', 'support'], ascending = False)

In [None]:
# 연관 규칙 도출 : 최소 향상도 0.7
rules = association_rules(freq_itemsets, metric = "lift", min_threshold=0.7)

In [55]:
# 특정 상품에 어울리는 조합 찾기(frozenset) #
rules[rules['antecedents']==frozenset({'Wine'})].sort_values(by='lift', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
168,(Wine),"(Cheese, Diaper)",0.438095,0.2,0.136508,0.311594,1.557971,0.048889,1.162105
231,(Wine),"(Bread, Meat)",0.438095,0.206349,0.133333,0.304348,1.474916,0.042933,1.140873
222,(Wine),"(Bread, Diaper)",0.438095,0.231746,0.149206,0.34058,1.469625,0.04768,1.165044
164,(Wine),"(Cheese, Bread)",0.438095,0.238095,0.142857,0.326087,1.369565,0.038549,1.130568
44,(Wine),(Diaper),0.438095,0.406349,0.234921,0.536232,1.319633,0.056901,1.28006
257,(Wine),"(Eggs, Meat)",0.438095,0.266667,0.149206,0.34058,1.277174,0.032381,1.112088
111,(Wine),"(Milk, Eggs)",0.438095,0.244444,0.136508,0.311594,1.274704,0.029418,1.097544
184,(Wine),"(Cheese, Eggs)",0.438095,0.298413,0.165079,0.376812,1.26272,0.034346,1.125803
46,(Wine),(Pencil),0.438095,0.361905,0.2,0.456522,1.261442,0.041451,1.174095
50,(Wine),(Eggs),0.438095,0.438095,0.24127,0.550725,1.257089,0.049342,1.250691
