In [43]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules, fpgrowth, fpmax

## 1 构造数据
构造一个购物数据

### 1.1 购物明细数据

In [22]:
# Line-item purchase records: one row per (basket id, item) pair.
# Every pair is listed exactly once so that row counts reflect distinct
# purchases; repeated pairs would only inflate the table, because the later
# bitmap encoding records presence/absence of an item per basket.
shopping_detail = pd.DataFrame(
    [
        # basket 0
        [0, 'Eggs'],
        [0, 'Kidney Beans'],
        [0, 'Milk'],
        [0, 'Nutmeg'],
        [0, 'Onion'],
        [0, 'Yogurt'],
        # basket 1
        [1, 'Dill'],
        [1, 'Eggs'],
        [1, 'Kidney Beans'],
        [1, 'Nutmeg'],
        [1, 'Onion'],
        [1, 'Yogurt'],
        # basket 2
        [2, 'Apple'],
        [2, 'Eggs'],
        [2, 'Kidney Beans'],
        [2, 'Milk'],
        # basket 3
        [3, 'Corn'],
        [3, 'Kidney Beans'],
        [3, 'Milk'],
        [3, 'Unicorn'],
        [3, 'Yogurt'],
        # basket 4
        [4, 'Corn'],
        [4, 'Eggs'],
        [4, 'Ice cream'],
        [4, 'Kidney Beans'],
        [4, 'Onion'],
    ],
    columns=['id', 'item']
)
# Preview the first ten line items.
shopping_detail.head(10)

Unnamed: 0,id,item
0,0,Eggs
1,0,Kidney Beans
2,0,Milk
3,0,Nutmeg
4,0,Onion
5,0,Yogurt
6,1,Dill
7,1,Eggs
8,1,Kidney Beans
9,1,Nutmeg


In [23]:
# Item catalogue, one name per line in alphabetical order.
all_items_list = [
    "Apple",
    "Corn",
    "Dill",
    "Eggs",
    "Ice cream",
    "Kidney Beans",
    "Milk",
    "Nutmeg",
    "Onion",
    "Unicorn",
    "Yogurt",
]

### 1.2 数据encode为bitmap形式

In [27]:
# One-hot encode the item column: one indicator column per item, named
# "item_<name>". Dense output is used because the frame is aggregated right away.
onehot_dt = pd.get_dummies(shopping_detail, columns=['item'])
# Collapse the line items into one row per basket id. A cell is True when the
# basket contains that item at least once. The built-in groupby().any() replaces
# a per-group Python lambda and yields plain boolean columns.
bitmap_dt = onehot_dt.groupby(by='id').any()
bitmap_dt

Unnamed: 0_level_0,item_Apple,item_Corn,item_Dill,item_Eggs,item_Ice cream,item_Kidney Beans,item_Milk,item_Nutmeg,item_Onion,item_Unicorn,item_Yogurt
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,False,False,False,True,False,True,True,True,True,False,True
1,False,False,True,True,False,True,False,True,True,False,True
2,True,False,False,True,False,True,True,False,False,False,False
3,False,True,False,False,False,True,True,False,False,True,True
4,False,True,False,True,True,True,False,False,True,False,False


## 2 挖掘频繁项

### 2.1 FPMax
参考: http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpmax/

In [44]:
# FP-Max on the basket bitmap, keeping itemsets with support of at least 0.6.
# With use_colnames left off, items are reported as column positions.
df_fp_max_freq_items = fpmax(
    bitmap_dt,
    min_support=0.6,
    use_colnames=False,
)
df_fp_max_freq_items

Unnamed: 0,support,itemsets
0,0.6,"(5, 6)"
1,0.6,"(8, 3, 5)"
2,0.6,"(10, 5)"


In [45]:
# Same FP-Max run, but itemsets are reported with the bitmap's column names.
df_fp_max_freq_items_col = fpmax(
    bitmap_dt,
    use_colnames=True,
    min_support=0.6,
)
df_fp_max_freq_items_col

Unnamed: 0,support,itemsets
0,0.6,"(item_Kidney Beans, item_Milk)"
1,0.6,"(item_Kidney Beans, item_Onion, item_Eggs)"
2,0.6,"(item_Yogurt, item_Kidney Beans)"


### 2.2 apriori
参考: http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/

In [46]:
# Apriori on the basket bitmap, keeping itemsets with support of at least 0.6.
# With use_colnames left off, items are reported as column positions.
df_freq_items = apriori(
    bitmap_dt,
    min_support=0.6,
    use_colnames=False,
)
df_freq_items

Unnamed: 0,support,itemsets
0,0.8,(3)
1,1.0,(5)
2,0.6,(6)
3,0.6,(8)
4,0.6,(10)
5,0.8,"(3, 5)"
6,0.6,"(8, 3)"
7,0.6,"(5, 6)"
8,0.6,"(8, 5)"
9,0.6,"(10, 5)"


In [47]:
# Same Apriori run, but itemsets are reported with the bitmap's column names.
# This frame is the input to the leverage-based rule mining further down.
df_freq_items_col = apriori(
    bitmap_dt,
    use_colnames=True,
    min_support=0.6,
)
df_freq_items_col

Unnamed: 0,support,itemsets
0,0.8,(item_Eggs)
1,1.0,(item_Kidney Beans)
2,0.6,(item_Milk)
3,0.6,(item_Onion)
4,0.6,(item_Yogurt)
5,0.8,"(item_Kidney Beans, item_Eggs)"
6,0.6,"(item_Onion, item_Eggs)"
7,0.6,"(item_Kidney Beans, item_Milk)"
8,0.6,"(item_Kidney Beans, item_Onion)"
9,0.6,"(item_Yogurt, item_Kidney Beans)"


### 2.3 fpgrowth
参考: http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpgrowth/


In [48]:
# FP-Growth on the basket bitmap, keeping itemsets with support of at least 0.6.
# With use_colnames left off, items are reported as column positions.
df_fp_freq_items = fpgrowth(
    bitmap_dt,
    min_support=0.6,
    use_colnames=False,
)
df_fp_freq_items

Unnamed: 0,support,itemsets
0,1.0,(5)
1,0.8,(3)
2,0.6,(10)
3,0.6,(8)
4,0.6,(6)
5,0.8,"(3, 5)"
6,0.6,"(10, 5)"
7,0.6,"(8, 3)"
8,0.6,"(8, 5)"
9,0.6,"(8, 3, 5)"


In [49]:
# Same FP-Growth run, but itemsets are reported with the bitmap's column names.
# This frame is the input to the confidence-based rule mining further down.
df_fp_freq_items_col = fpgrowth(
    bitmap_dt,
    use_colnames=True,
    min_support=0.6,
)
df_fp_freq_items_col

Unnamed: 0,support,itemsets
0,1.0,(item_Kidney Beans)
1,0.8,(item_Eggs)
2,0.6,(item_Yogurt)
3,0.6,(item_Onion)
4,0.6,(item_Milk)
5,0.8,"(item_Kidney Beans, item_Eggs)"
6,0.6,"(item_Yogurt, item_Kidney Beans)"
7,0.6,"(item_Onion, item_Eggs)"
8,0.6,"(item_Kidney Beans, item_Onion)"
9,0.6,"(item_Kidney Beans, item_Onion, item_Eggs)"


## 3 association_rules挖掘关联规则

- 输入: 上述频繁项集, 需要包含support和itemsets

- 计算方式可以基于: support、confidence、lift、leverage、conviction

In [50]:
# Derive association rules from the Apriori itemsets, filtering on leverage
# with a minimum threshold of 0.1.
df_associate_rules = association_rules(
    df_freq_items_col,
    metric="leverage",
    min_threshold=0.1,
)
df_associate_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(item_Onion),(item_Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
1,(item_Eggs),(item_Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6
2,"(item_Kidney Beans, item_Onion)",(item_Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
3,"(item_Kidney Beans, item_Eggs)",(item_Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6
4,(item_Onion),"(item_Kidney Beans, item_Eggs)",0.6,0.8,0.6,1.0,1.25,0.12,inf
5,(item_Eggs),"(item_Kidney Beans, item_Onion)",0.8,0.6,0.6,0.75,1.25,0.12,1.6


In [51]:
# Derive association rules from the FP-Growth itemsets, filtering on confidence
# with a minimum threshold of 0.8.
df_associate_rules_by_confidence = association_rules(
    df_fp_freq_items_col,
    metric="confidence",
    min_threshold=0.8,
)
df_associate_rules_by_confidence

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(item_Kidney Beans),(item_Eggs),1.0,0.8,0.8,0.8,1.0,0.0,1.0
1,(item_Eggs),(item_Kidney Beans),0.8,1.0,0.8,1.0,1.0,0.0,inf
2,(item_Yogurt),(item_Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
3,(item_Onion),(item_Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
4,(item_Onion),(item_Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
5,"(item_Kidney Beans, item_Onion)",(item_Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
6,"(item_Onion, item_Eggs)",(item_Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
7,(item_Onion),"(item_Kidney Beans, item_Eggs)",0.6,0.8,0.6,1.0,1.25,0.12,inf
8,(item_Milk),(item_Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf


下面是一些调节参数的技巧：
- Support：
支持度是指一个规则在数据集中出现的频率。
在确定支持度的阈值时，应该考虑数据集的大小和规则的复杂度。如果数据集很大，支持度阈值可以设置得较小，以便发现更多的规则。如果数据集较小，阈值应该设置得较高，以便避免过多的无意义规则。

- Confidence：
置信度是指当条件项出现时，结果项也会出现的概率。
置信度的阈值应该根据具体情况来确定。如果要找到比较强的规则，阈值应该设置得高一些。如果希望发现更多的规则，则应该设置得低一些。

- Lift：
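提升度是指条件项和结果项实际同时出现的频率，与假设二者相互独立时期望的同时出现频率之比，即 $\text{lift}(A \to C) = \dfrac{\text{confidence}(A \to C)}{\text{support}(C)} = \dfrac{\text{support}(A \cup C)}{\text{support}(A) \times \text{support}(C)}$。
当提升度大于1时，说明条件项和结果项之间存在正相关关系；当提升度小于1时，说明它们之间存在负相关关系；当提升度等于1时，说明它们相互独立。可以通过设置提升度的阈值来筛选规则。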

- Leverage：
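杠杆值是指条件项和结果项实际同时出现的频率，与假设二者相互独立时期望的同时出现频率之差，即 $\text{leverage}(A \to C) = \text{support}(A \cup C) - \text{support}(A) \times \text{support}(C)$；杠杆值为0表示二者相互独立。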
杠杆值的绝对值越大，说明条件项和结果项之间的关联程度越高。可以通过设置杠杆值的阈值来筛选规则。

- Conviction：
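确信度衡量结果项对条件项的依赖程度，即假设二者相互独立时规则预测错误（条件项出现而结果项不出现）的期望频率，与规则实际预测错误的频率之比：$\text{conviction}(A \to C) = \dfrac{1 - \text{support}(C)}{1 - \text{confidence}(A \to C)}$。
当确信度大于1时，说明条件项和结果项之间的关系是真实存在的；当确信度等于1时，说明二者相互独立；当确信度小于1时，说明它们之间存在相反的关系。当置信度为1时，确信度为无穷大（即结果表中的 inf）。可以通过设置确信度的阈值来筛选规则。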