In [9]:
import pandas as pd
import time

In [10]:
# Load the transactions, lower-case the item names, and drop the rows
# whose item is the literal 'none'.
data = (
    pd.read_csv('./BreadBasket_DMS.csv')
    .assign(Item=lambda frame: frame['Item'].str.lower())
    .loc[lambda frame: frame['Item'] != 'none']
)

In [11]:
# Uses the efficient_apriori package
def rule1(min_support=0.02, min_confidence=0.5):
	"""Mine frequent itemsets and association rules with ``efficient_apriori``.

	Reads the module-level ``data`` frame, collects the ``Item`` values of
	each ``Transaction`` into one set per transaction, runs Apriori on those
	sets, and prints the frequent itemsets, the rules and the elapsed
	wall-clock time.

	Args:
		min_support: minimum support passed to ``apriori`` (default 0.02).
		min_confidence: minimum confidence passed to ``apriori`` (default 0.5).

	Returns:
		None. All results are printed.
	"""
	from efficient_apriori import apriori
	start = time.time()
	# One set of distinct items per transaction id. Grouping on the id itself
	# (instead of watching for id changes while scanning rows) keeps every
	# transaction in a single basket even when its rows are not adjacent, and
	# does not depend on the value of the first id. sort=False keeps the
	# baskets in order of first appearance, as a row scan would.
	transactions = data.groupby('Transaction', sort=False)['Item'].apply(set).tolist()

	# Mine frequent itemsets and association rules
	itemsets, rules = apriori(transactions, min_support=min_support, min_confidence=min_confidence)
	print('频繁项集：', itemsets)
	print('关联规则：', rules)
	end = time.time()
	print("用时：", end-start)

In [12]:
def encode_units(x):
    """Convert a count into a 0/1 presence flag.

    Returns 1 when ``x`` is at least 1 and 0 for every other input. That
    includes values the two original branches did not cover (fractions
    strictly between 0 and 1, and NaN), so the function never falls through
    to an implicit ``None``.
    """
    # NaN compares False against everything, so it lands in the 0 branch.
    return 1 if x >= 1 else 0

In [13]:
# Uses the mlxtend.frequent_patterns package
def rule2(min_support=0.02, min_confidence=0.5):
	"""Mine frequent itemsets and association rules with ``mlxtend``.

	Reads the module-level ``data`` frame, builds a Transaction x Item
	presence matrix, runs mlxtend's ``apriori`` and ``association_rules`` on
	it, and prints the frequent itemsets, the rules with lift >= 1 and
	confidence >= ``min_confidence``, and the elapsed wall-clock time.

	Side effect: sets ``pd.options.display.max_columns`` to 100.

	Args:
		min_support: minimum support passed to ``apriori`` (default 0.02).
		min_confidence: confidence cut-off applied to the printed rules
			(default 0.5).

	Returns:
		None. All results are printed.
	"""
	from mlxtend.frequent_patterns import apriori
	from mlxtend.frequent_patterns import association_rules
	pd.options.display.max_columns=100
	start = time.time()
	# Count each (Transaction, Item) pair and pivot items into columns;
	# pairs that never occur are filled with 0.
	item_counts = data.groupby(['Transaction','Item'])['Item'].count().unstack(fill_value=0)
	# Boolean presence matrix: True where the item occurs at least once in the
	# transaction. A vectorised comparison replaces the element-wise
	# DataFrame.applymap call, which is deprecated in recent pandas releases.
	hot_encoded_df = item_counts >= 1
	frequent_itemsets = apriori(hot_encoded_df, min_support=min_support, use_colnames=True)
	rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.5)
	print("频繁项集：", frequent_itemsets)
	print("关联规则：", rules[ (rules['lift'] >= 1) & (rules['confidence'] >= min_confidence) ])
	end = time.time()
	print("用时：", end-start)

In [14]:
# Run the efficient_apriori variant; it prints the frequent itemsets, the rules and the elapsed time.
rule1()

频繁项集： {1: {('scandinavian',): 275, ('hot chocolate',): 552, ('cookies',): 515, ('muffin',): 364, ('pastry',): 815, ('coffee',): 4528, ('bread',): 3096, ('medialuna',): 585, ('tea',): 1350, ('farm house',): 371, ('juice',): 365, ('soup',): 326, ('cake',): 983, ('sandwich',): 680, ('alfajores',): 344, ('brownie',): 379, ('truffles',): 192, ('toast',): 318, ('scone',): 327}, 2: {('bread', 'coffee'): 852, ('bread', 'pastry'): 276, ('coffee', 'pastry'): 450, ('coffee', 'medialuna'): 333, ('coffee', 'tea'): 472, ('bread', 'tea'): 266, ('coffee', 'juice'): 195, ('coffee', 'hot chocolate'): 280, ('coffee', 'cookies'): 267, ('cake', 'coffee'): 518, ('cake', 'tea'): 225, ('bread', 'cake'): 221, ('coffee', 'sandwich'): 362, ('coffee', 'toast'): 224}}
关联规则： [{pastry} -> {coffee}, {medialuna} -> {coffee}, {juice} -> {coffee}, {hot chocolate} -> {coffee}, {cookies} -> {coffee}, {cake} -> {coffee}, {sandwich} -> {coffee}, {toast} -> {coffee}]
用时： 0.1864931583404541


In [15]:
# Run the mlxtend variant; it prints the frequent itemsets, the filtered rules and the elapsed time.
rule2()

频繁项集：      support                 itemsets
0   0.036348              (alfajores)
1   0.327134                  (bread)
2   0.040046                (brownie)
3   0.103867                   (cake)
4   0.478445                 (coffee)
5   0.054417                (cookies)
6   0.039201             (farm house)
7   0.058326          (hot chocolate)
8   0.038567                  (juice)
9   0.061813              (medialuna)
10  0.038462                 (muffin)
11  0.086116                 (pastry)
12  0.071851               (sandwich)
13  0.029057           (scandinavian)
14  0.034552                  (scone)
15  0.034446                   (soup)
16  0.142646                    (tea)
17  0.033601                  (toast)
18  0.020287               (truffles)
19  0.023352            (cake, bread)
20  0.090025          (coffee, bread)
21  0.029163          (pastry, bread)
22  0.028107             (tea, bread)
23  0.054734           (cake, coffee)
24  0.023774              (cake, tea)
25  0.