In [44]:
# 分析market basket 中的频繁项集和关联规则
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [45]:
# load data
data = pd.read_csv('./Market_Basket_Optimisation.csv', header=None)
data.fillna(0, inplace=True)

In [46]:
feat_sets = set()
for col in set(data):
    feat_sets = feat_sets | set(data[col].values)
feat = list(feat_sets - {0})
# print(feat)

In [47]:
transactions = pd.DataFrame()
transactions['index'] = feat

In [49]:
for i in range(len(data)):
    transactions[i] = transactions['index'].apply(lambda x: 1 if x in list(set(data.loc[i,])-{0}) else 0)

In [54]:
transactions.set_index('index', inplace=True)
transactions = transactions.T
transactions.reset_index(inplace=True, drop=True)

In [25]:
data.head(n=3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,chutney,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [66]:
transactions.head(3)

index,red wine,milk,bramble,shampoo,strong cheese,water spray,pet food,cottage cheese,carrots,spinach,...,strawberries,pancakes,black tea,chutney,meatballs,vegetables mix,asparagus,whole weat flour,yogurt cake,body spray
0,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,1,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [68]:
# 挖掘频繁项集，最小支持度为0.02
itemsets = apriori(transactions,use_colnames=True, min_support=0.05)

# 按照支持度从大到小进行时候粗
itemsets = itemsets.sort_values(by="support" , ascending=False) 

In [70]:
print('-'*20, '频繁项集', '-'*20)
print(itemsets)

-------------------- 频繁项集 --------------------
     support                    itemsets
10  0.238368             (mineral water)
12  0.179709                      (eggs)
15  0.174110                 (spaghetti)
1   0.170911              (french fries)
13  0.163845                 (chocolate)
4   0.132116                 (green tea)
0   0.129583                      (milk)
23  0.098254               (ground beef)
2   0.095321         (frozen vegetables)
24  0.095054                  (pancakes)
22  0.087188                   (burgers)
6   0.081056                      (cake)
8   0.080389                   (cookies)
19  0.079323                  (escalope)
17  0.076523            (low fat yogurt)
21  0.071457                    (shrimp)
3   0.068391                  (tomatoes)
16  0.065858                 (olive oil)
18  0.063325           (frozen smoothie)
14  0.062525                    (turkey)
20  0.059992                   (chicken)
27  0.059725  (mineral water, spaghetti)
5   0.0585

In [81]:
# 根据频繁项集计算关联规则，设置最小提升度为2
rules =  association_rules(itemsets, metric='lift', min_threshold=1)

# 按照提升度从大到小进行排序
rules = rules.sort_values(by="lift" , ascending=False) 

#rules.to_csv('./rules.csv')
print('-'*20, '关联规则', '-'*20)
print(rules)

-------------------- 关联规则 --------------------
       antecedents      consequents  antecedent support  consequent support  \
0  (mineral water)      (spaghetti)            0.238368            0.174110   
1      (spaghetti)  (mineral water)            0.174110            0.238368   
3      (chocolate)  (mineral water)            0.163845            0.238368   
2  (mineral water)      (chocolate)            0.238368            0.163845   
4  (mineral water)           (eggs)            0.238368            0.179709   
5           (eggs)  (mineral water)            0.179709            0.238368   

    support  confidence      lift  leverage  conviction  
0  0.059725    0.250559  1.439085  0.018223    1.102008  
1  0.059725    0.343032  1.439085  0.018223    1.159314  
3  0.052660    0.321400  1.348332  0.013604    1.122357  
2  0.052660    0.220917  1.348332  0.013604    1.073256  
4  0.050927    0.213647  1.188845  0.008090    1.043158  
5  0.050927    0.283383  1.188845  0.008090    1.06