|
# -*- coding: UTF-8 -*-
"""
Association rule mining with the Apriori algorithm.
"""
| 5 | + |
def load_data_set(path="data.txt"):
    """Load a transaction data set from a text file.

    Each line of the file is one transaction: item names separated by
    ", ".  Returns a list of transactions, each a list of item strings.

    Fixes: the original used the Python-2-only ``file(...)`` constructor
    and ``map(None, ...)`` (identity map) — both removed in Python 3 —
    and never closed the file handle.  The hard-coded filename is now a
    defaulted parameter, so existing callers are unaffected.
    """
    data_set = []
    # with-statement guarantees the handle is closed even on error.
    with open(path, "r") as fd:
        for line in fd:
            line = line.strip('\n')
            # In Python 2, map(None, seq) was the identity; split()
            # already returns a list, so just append it.
            data_set.append(line.split(', '))
    return data_set
| 16 | + |
def create_C1(data_set):
    """Build the candidate 1-itemsets C1 directly from the data set.

    Returns a set containing one single-element frozenset per distinct
    item appearing in any transaction.
    """
    return {frozenset([item]) for transaction in data_set for item in transaction}
| 27 | + |
def is_apriori(Ck_item, Lksub1):
    """Apriori pruning test.

    Return True iff every subset of Ck_item obtained by removing one
    element is itself a member of the frequent set Lksub1.
    """
    return all(Ck_item - frozenset([elem]) in Lksub1 for elem in Ck_item)
| 37 | + |
def create_Ck(Lksub1, k):
    """Generate candidate k-itemsets Ck from the frequent (k-1)-itemsets.

    Standard Apriori join: two (k-1)-itemsets are merged when their
    first k-2 items (in sorted order) agree; the merged candidate is
    kept only if every (k-1)-subset of it is frequent (is_apriori).

    Fix: the inner loop starts at i + 1 instead of 1, so each unordered
    pair is joined exactly once.  The original examined both (i, j) and
    (j, i) as well as (i, i); since Ck is a set and self-joins are
    rejected by the pruning test, the result is identical — this only
    removes redundant work.
    """
    Ck = set()
    list_Lksub1 = list(Lksub1)
    len_Lksub1 = len(list_Lksub1)
    for i in range(len_Lksub1):
        for j in range(i + 1, len_Lksub1):
            l1 = sorted(list_Lksub1[i])
            l2 = sorted(list_Lksub1[j])
            # Join condition: first k-2 items agree (trivially true for k=2).
            if l1[0:k - 2] == l2[0:k - 2]:
                Ck_item = list_Lksub1[i] | list_Lksub1[j]
                if is_apriori(Ck_item, Lksub1):
                    Ck.add(Ck_item)
    return Ck
| 56 | + |
def generate_Lk_by_Ck(data_set, Ck, min_support, support_data):
    """Filter the candidate set Ck down to the frequent set Lk.

    Counts, for every candidate itemset in Ck, how many transactions of
    data_set contain it, and keeps those whose support (fraction of
    transactions) is at least min_support.  The support of each kept
    itemset is recorded into support_data (mutated in place).

    Returns Lk, a set of frozensets.

    Fix: an empty data_set previously raised ZeroDivisionError; it now
    returns an empty set.
    """
    Lk = set()
    if not data_set:
        return Lk
    item_count = {}
    for t in data_set:
        for item in Ck:
            # frozenset.issubset accepts any iterable transaction.
            if item.issubset(t):
                item_count[item] = item_count.get(item, 0) + 1
    t_num = float(len(data_set))
    for item, count in item_count.items():
        support = count / t_num
        if support >= min_support:
            Lk.add(item)
            support_data[item] = support
    return Lk
| 76 | + |
def generate_L(data_set, k, min_support):
    """Compute all frequent itemsets of size 1 through k.

    Returns (L, support_data) where L[i] is the set of frequent
    (i+1)-itemsets and support_data maps every frequent itemset to its
    support value.
    """
    support_data = {}
    current = generate_Lk_by_Ck(data_set, create_C1(data_set), min_support, support_data)
    L = [current.copy()]
    for size in range(2, k + 1):
        candidates = create_Ck(current, size)
        current = generate_Lk_by_Ck(data_set, candidates, min_support, support_data)
        L.append(current.copy())
    return L, support_data
| 93 | + |
def generate_big_rules(L, support_data, min_conf):
    """Derive association rules from the frequent itemsets.

    For each frequent itemset F and each previously visited frequent
    subset S of F, the rule (F - S) => S is kept when its confidence
    support(F) / support(F - S) is at least min_conf.

    Returns a list of (antecedent, consequent, confidence) triples.
    """
    big_rule_list = []
    seen_subsets = []
    for level in L:
        for freq_set in level:
            for candidate in seen_subsets:
                if not candidate.issubset(freq_set):
                    continue
                antecedent = freq_set - candidate
                conf = support_data[freq_set] / support_data[antecedent]
                rule = (antecedent, candidate, conf)
                if conf >= min_conf and rule not in big_rule_list:
                    big_rule_list.append(rule)
            seen_subsets.append(freq_set)
    return big_rule_list
| 110 | + |
if __name__ == "__main__":
    # Mine frequent itemsets up to size 3 at min support 0.2, then
    # derive rules at min confidence 0.7, and print both reports.
    data_set = load_data_set()
    L, support_data = generate_L(data_set, k=3, min_support=0.2)
    big_rules_list = generate_big_rules(L, support_data, min_conf=0.7)
    for Lk in L:
        if not Lk:
            # Fix: an empty level made the original's list(Lk)[0]
            # raise IndexError; skip levels with no frequent itemsets.
            continue
        print("=" * 50)
        print("frequent " + str(len(next(iter(Lk)))) + "-itemsets\t\tsupport")
        print("=" * 50)
        for freq_set in Lk:
            print(freq_set, support_data[freq_set])
    print()
    print("Big Rules")
    for item in big_rules_list:
        print(item[0], "=>", item[1], "conf: ", item[2])
0 commit comments