Skip to content

Commit ec6218f

Browse files
committed
add: 关联分析
1 parent c0256bb commit ec6218f

File tree

3 files changed

+434
-3
lines changed

3 files changed

+434
-3
lines changed

README.md

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,41 @@
11
# 数据挖掘算法
2-
1. 关联分析Apriori算法
3-
2. 数据分类决策树算法
4-
3. 数据聚类K-means算法
2+
1. [关联分析Apriori算法](#关联分析Apriori算法)
3+
2. [数据分类决策树算法](#数据分类决策树算法)
4+
3. [数据聚类K-means算法](#数据聚类K-means算法)
5+
6+
7+
<hr>
8+
9+
## 关联分析Apriori算法
10+
### 1. 数据集
11+
以超市交易为数据集,所有商品的项集为
12+
*I = {bread, beer, cake, cream, milk, tea}*
13+
某条交易如
14+
*Ti = {bread, beer, milk}*
15+
简化为
16+
*Ti = {a, b, d}*
17+
data.txt数据集样本如下
18+
```bash
19+
a, d, e,f
20+
a, d, e
21+
c, e
22+
e, f
23+
...
24+
```
25+
26+
### 2. 算法实现
27+
使用经典的Apriori算法,依次扫描交易记录集,计算出 *k-候选集Ck* 然后去除**支持度sup**小的项集获得 *k-频繁集Lk*, 只计算到 *3-频繁集*
28+
> 第k个候选集只会从k-1频繁集中的各项目组合连接,然后扫描记录集,以获取Ck中各项集的支持度。
29+
30+
![输出结果](https://i.loli.net/2019/06/16/5d05ad0e8f2e762317.png)
31+
32+
33+
## 数据分类决策树算法
34+
35+
36+
37+
38+
39+
40+
41+
## 数据聚类K-means算法
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
# -*- coding: UTF-8 -*-
"""
Association analysis - Apriori algorithm.
"""
5+
6+
def load_data_set():
    """Load the transaction data set from data.txt.

    Each line of data.txt is one transaction whose items are separated
    by ", " (comma + space), e.g. "a, d, e".

    Returns:
        list[list[str]]: one inner list of item strings per transaction.
    """
    data_set = []
    # The original used the Python-2-only builtins file() and
    # map(None, ...) (identity map); both fail on Python 3.  'with'
    # also guarantees the file handle is closed.
    with open("data.txt", "r") as fd:
        for line in fd:
            line = line.strip('\n')
            # Skip blank lines so a trailing newline does not produce
            # a bogus [''] transaction.
            if line:
                data_set.append(line.split(', '))
    return data_set
16+
17+
def create_C1(data_set):
    """Build the candidate 1-itemsets C1 straight from the data set.

    Returns:
        set[frozenset]: one singleton frozenset per distinct item.
    """
    # Every item seen in any transaction becomes a singleton candidate;
    # frozenset is used because the itemsets must be hashable set members.
    return {frozenset((item,)) for transaction in data_set for item in transaction}
27+
28+
def is_apriori(Ck_item, Lksub1):
    """Check the Apriori property for one candidate itemset.

    A candidate k-itemset can only be frequent when every one of its
    (k-1)-subsets is in the frequent set Lksub1.

    Returns:
        bool: True when all (k-1)-subsets of Ck_item are frequent.
    """
    # Dropping each element in turn enumerates every (k-1)-subset.
    return all(Ck_item - frozenset((elem,)) in Lksub1 for elem in Ck_item)
37+
38+
def create_Ck(Lksub1, k):
    """Generate the candidate k-itemsets Ck from the frequent (k-1)-itemsets.

    Join step: two frequent (k-1)-itemsets are merged when their first
    k-2 sorted items agree (the classic Apriori join).  Prune step: any
    merged candidate with an infrequent (k-1)-subset is discarded.

    Args:
        Lksub1: set of frozensets, the frequent (k-1)-itemsets.
        k: size of the candidates to build (k >= 2).

    Returns:
        set[frozenset]: the candidate k-itemsets.
    """
    Ck = set()
    list_Lksub1 = list(Lksub1)
    n = len(list_Lksub1)
    for i in range(n):
        # Starting at i+1 visits each unordered pair exactly once; the
        # original range(1, n) rescanned pairs in both orders (and i==j),
        # producing the same result with roughly double the work.
        for j in range(i + 1, n):
            l1 = sorted(list_Lksub1[i])
            l2 = sorted(list_Lksub1[j])
            # Join step: merge only when the k-2 item prefixes coincide.
            if l1[0:k - 2] == l2[0:k - 2]:
                Ck_item = list_Lksub1[i] | list_Lksub1[j]
                # Prune step (Apriori property).
                if is_apriori(Ck_item, Lksub1):
                    Ck.add(Ck_item)
    return Ck
56+
57+
def generate_Lk_by_Ck(data_set, Ck, min_support, support_data):
    """Filter the candidate set Ck down to the frequent itemset Lk.

    Scans data_set once, counts the transactions containing each
    candidate, and keeps candidates whose support reaches min_support.
    support_data is updated in place with the support of every kept
    itemset.

    Args:
        data_set: list of transactions (iterables of items).
        Ck: set of candidate frozensets.
        min_support: minimum support threshold (fraction, 0..1).
        support_data: dict mapping frozenset -> support, mutated here.

    Returns:
        set[frozenset]: the frequent k-itemsets Lk.
    """
    item_count = {}
    for transaction in data_set:
        for candidate in Ck:
            if candidate.issubset(transaction):
                item_count[candidate] = item_count.get(candidate, 0) + 1
    total = float(len(data_set))
    Lk = set()
    for candidate, count in item_count.items():
        support = count / total
        if support >= min_support:
            Lk.add(candidate)
            support_data[candidate] = support
    return Lk
76+
77+
def generate_L(data_set, k, min_support):
    """Generate all frequent itemsets of size 1 through k.

    Args:
        data_set: list of transactions (lists of item strings).
        k: largest itemset size to compute.
        min_support: minimum support threshold (fraction of transactions).

    Returns:
        tuple (L, support_data): L is [L1, L2, ..., Lk], one set of
        frequent frozensets per level; support_data maps each frequent
        itemset to its support.
    """
    support_data = {}
    L = []
    # Level 1 comes straight from the 1-candidates; each later level is
    # joined and pruned from the previous frequent set.
    current = generate_Lk_by_Ck(data_set, create_C1(data_set), min_support, support_data)
    L.append(current.copy())
    for size in range(2, k + 1):
        candidates = create_Ck(current, size)
        current = generate_Lk_by_Ck(data_set, candidates, min_support, support_data)
        L.append(current.copy())
    return L, support_data
93+
94+
def generate_big_rules(L, support_data, min_conf):
    """Derive association rules from the frequent itemsets.

    For every frequent itemset F and every previously seen frequent
    subset S of F, the rule (F - S) => S is kept when its confidence
    support(F) / support(F - S) reaches min_conf.

    Args:
        L: list of frequent-itemset levels as produced by generate_L.
        support_data: dict mapping frozenset -> support.
        min_conf: minimum confidence threshold (fraction, 0..1).

    Returns:
        list of (antecedent, consequent, confidence) tuples.
    """
    big_rule_list = []
    seen_subsets = []
    for level in L:
        for freq_set in level:
            for subset in seen_subsets:
                if not subset.issubset(freq_set):
                    continue
                confidence = support_data[freq_set] / support_data[freq_set - subset]
                rule = (freq_set - subset, subset, confidence)
                if confidence >= min_conf and rule not in big_rule_list:
                    big_rule_list.append(rule)
            # Every frequent itemset becomes a candidate consequent for
            # the larger itemsets processed later.
            seen_subsets.append(freq_set)
    return big_rule_list
110+
111+
if __name__ == "__main__":
    # Mine data.txt for frequent itemsets up to size 3 at 20% minimum
    # support, then derive rules at 70% minimum confidence.
    data_set = load_data_set()
    L, support_data = generate_L(data_set, k=3, min_support=0.2)
    big_rules_list = generate_big_rules(L, support_data, min_conf=0.7)
    # Print each frequency level with its supports.
    # NOTE(review): len(list(Lk)[0]) raises IndexError if a level is
    # empty — assumes every level up to k=3 is non-empty; verify on data.
    for Lk in L:
        print ("=" * 50)
        print ("frequent " + str(len(list(Lk)[0])) + "-itemsets\t\tsupport")
        print ("=" * 50)
        for freq_set in Lk:
            print (freq_set, support_data[freq_set])
    print()
    print ("Big Rules")
    # Each rule prints as: antecedent => consequent conf: confidence.
    for item in big_rules_list:
        print (item[0], "=>", item[1], "conf: ", item[2])

0 commit comments

Comments
 (0)