-
Notifications
You must be signed in to change notification settings - Fork 2
/
fpgrowth.py
184 lines (158 loc) · 6.97 KB
/
fpgrowth.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# coding:utf-8
import json
import pandas as pd
def loadDataSet(df):
# df = pd.read_csv("C:/Users/DELL/Desktop/xin.csv",usecols=[0,1,2,3])
# numset = len()
matrix = df.values
headers = df.columns
m,n = df.shape
data_set = []
for i in range(m):
l = []
for j in range(n):
l.append(headers[j]+'='+str(matrix[i][j]))
data_set.append(l)
return m,data_set
def transfer2FrozenDataSet(dataSet):
frozenDataSet = {}
for elem in dataSet:
if frozenset(elem) not in frozenDataSet.keys():
frozenDataSet[frozenset(elem)] = 1
else:
frozenDataSet[frozenset(elem)] += 1
return frozenDataSet
class TreeNode:
def __init__(self, nodeName, count, nodeParent):
self.nodeName = nodeName
self.count = count
self.nodeParent = nodeParent
self.nextSimilarItem = None
self.children = {}
def increaseC(self, count):
self.count += count
def createFPTree(frozenDataSet, minSupport,numset):
headPointTable = {}
for items in frozenDataSet:
for item in items:
headPointTable[item] = headPointTable.get(item, 0) + frozenDataSet[items]
headPointTable = {k:v for k,v in headPointTable.items() if float(v)/float(numset) >= minSupport}
frequentItems = set(headPointTable.keys())
if len(frequentItems) == 0: return None, None
for k in headPointTable:
headPointTable[k] = [headPointTable[k], None]
fptree = TreeNode("null", 1, None)
for items,count in frozenDataSet.items():
frequentItemsInRecord = {}
for item in items:
if item in frequentItems:
frequentItemsInRecord[item] = headPointTable[item][0]
if len(frequentItemsInRecord) > 0:
orderedFrequentItems = [v[0] for v in sorted(frequentItemsInRecord.items(), key=lambda v:v[1], reverse = True)]
updateFPTree(fptree, orderedFrequentItems, headPointTable, count)
return fptree, headPointTable
def updateFPTree(fptree, orderedFrequentItems, headPointTable, count):
if orderedFrequentItems[0] in fptree.children:
fptree.children[orderedFrequentItems[0]].increaseC(count)
else:
fptree.children[orderedFrequentItems[0]] = TreeNode(orderedFrequentItems[0], count, fptree)
#update headPointTable
if headPointTable[orderedFrequentItems[0]][1] == None:
headPointTable[orderedFrequentItems[0]][1] = fptree.children[orderedFrequentItems[0]]
else:
updateHeadPointTable(headPointTable[orderedFrequentItems[0]][1], fptree.children[orderedFrequentItems[0]])
#handle other items except the first item
if(len(orderedFrequentItems) > 1):
updateFPTree(fptree.children[orderedFrequentItems[0]], orderedFrequentItems[1::], headPointTable, count)
def updateHeadPointTable(headPointBeginNode, targetNode):
while(headPointBeginNode.nextSimilarItem != None):
headPointBeginNode = headPointBeginNode.nextSimilarItem
headPointBeginNode.nextSimilarItem = targetNode
def mineFPTree(headPointTable, prefix, frequentPatterns, minSupport,numset):
if headPointTable == None: return
headPointItems = [v[0] for v in sorted(headPointTable.items(), key = lambda v:v[1][0])]
if(len(headPointItems) == 0): return
for headPointItem in headPointItems:
newPrefix = prefix.copy()
newPrefix.add(headPointItem)
support = headPointTable[headPointItem][0]
frequentPatterns[frozenset(newPrefix)] = support
prefixPath = getPrefixPath(headPointTable, headPointItem)
if(prefixPath != {}):
conditionalFPtree, conditionalHeadPointTable = createFPTree(prefixPath, minSupport, numset)
if conditionalHeadPointTable != None:
mineFPTree(conditionalHeadPointTable, newPrefix, frequentPatterns, minSupport, numset)
def getPrefixPath(headPointTable, headPointItem):
prefixPath = {}
beginNode = headPointTable[headPointItem][1]
prefixs = ascendTree(beginNode)
if((prefixs != [])):
prefixPath[frozenset(prefixs)] = beginNode.count
while(beginNode.nextSimilarItem != None):
beginNode = beginNode.nextSimilarItem
prefixs = ascendTree(beginNode)
if (prefixs != []):
prefixPath[frozenset(prefixs)] = beginNode.count
return prefixPath
def ascendTree(treeNode):
prefixs = []
while((treeNode.nodeParent != None) and (treeNode.nodeParent.nodeName != 'null')):
treeNode = treeNode.nodeParent
prefixs.append(treeNode.nodeName)
return prefixs
def rulesGenerator(frequentPatterns, minConf, rules):
for frequentset in frequentPatterns:
if(len(frequentset) > 1):
getRules(frequentset,frequentset, rules, frequentPatterns, minConf)
def removeStr(set, str):
tempSet = []
for elem in set:
if(elem != str):
tempSet.append(elem)
tempFrozenSet = frozenset(tempSet)
return tempFrozenSet
def getRules(frequentset,currentset, rules, frequentPatterns, minConf):
for frequentElem in currentset:
subSet = removeStr(currentset, frequentElem)
confidence = float(frequentPatterns[frequentset]) / float(frequentPatterns[subSet])
if (confidence >= minConf):
flag = False
for rule in rules:
if(rule[0] == subSet and rule[1] == frequentset - subSet):
flag = True
if(flag == False):
rules.append((subSet, frequentset - subSet, confidence))
if(len(subSet) >= 2):
getRules(frequentset, subSet, rules, frequentPatterns, minConf)
def run(df,ms,mc):
print("fptree:")
numset,dataSet = loadDataSet(df)
frozenDataSet = transfer2FrozenDataSet(dataSet)
minSupport = ms
fptree, headPointTable = createFPTree(frozenDataSet, minSupport,numset)
frequentPatterns = {}
prefix = set([])
mineFPTree(headPointTable, prefix, frequentPatterns, minSupport,numset)
print("frequent patterns:")
frequent_set = []
print len(frequentPatterns)
if len(list(frequentPatterns)) > 0:
for (k,v) in frequentPatterns.items():
item = {'itemName': list(k), 'itemNum': v, 'itemVal': float(v)/float(numset)}
frequent_set.append(item)
minConf = mc
rules = []
rulesGenerator(frequentPatterns, minConf, rules)
print("association rules:")
print len(rules)
big_rules = []
if len(list(rules)) > 0:
for k in rules:
item = {'itemName': list(k[0]), 'itemVal': list(k[1]), 'itemPercent': k[2]}
big_rules.append(item)
report_data = {}
result1 = pd.io.json.dumps(frequent_set).decode('utf-8')
report_data.update({'frequent_set': json.loads(result1)})
result2 = pd.io.json.dumps(big_rules).decode('utf-8')
report_data.update({'big_rules': json.loads(result2)})
return report_data