In [1]:
"""
    该算法首先会生成所有单个物品的列表，接着扫描交易记录来查看哪些项集满足最小支持度的要求，
    那些不满足最小支持度的集合会被去掉。然后对剩下的集合进行组合以生成两个元素的项集。接下来，
    再重复扫描交易记录，去掉不满足最小支持度的项集，该过程重复，直到所有项集都被去掉
"""

# 辅助函数
def loadDataSet():
    return [[1,3,4],[2,3,5],[1,2,3,5],[2,5]]

def createC1(dataSet):
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if [item] not in C1:
                C1.append([item])
    C1.sort()
    return list(map(frozenset, C1))
#     return map(setify, C1)

def scanD(D, Ck, minSupport):
    ssCnt = {}
    for tid in D:
        tid = set(tid)
        # print(tid, list(Ck))
        for can in Ck:
            if can.issubset(tid):
                if not ssCnt.get(can, 0):
                    ssCnt[can] = 1
                else:
                    ssCnt[can] += 1
    numItems = float(len(D))
    retList = []
    supportData ={}
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            retList.insert(0, key)
        supportData[key] = support
    return retList, supportData


dataSet = loadDataSet()

C1 = createC1(dataSet)
print("C1:", C1)
print("")

L1, suppData0 = scanD(dataSet, C1, 0.5)
print("L1:", L1)
print("")
print("suppData0", suppData0)

C1: [frozenset({1}), frozenset({2}), frozenset({3}), frozenset({4}), frozenset({5})]

L1: [frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]

suppData0 {frozenset({1}): 0.5, frozenset({3}): 0.75, frozenset({4}): 0.25, frozenset({2}): 0.75, frozenset({5}): 0.75}


In [53]:
# Apriori 算法
def apriorGen(Lk, k):
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i+1, lenLk):
            L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2]
            L1.sort(); L2.sort()
        if L1 == L2:
            retList.append(L1[i] | L1[j])
    return retList


# print(aprioriGen([frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})], 2))
# print(aprioriGen([frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})], 3))


def apriori(dataSet, minSupport=0.5):
    C1 = createC1(dataSet)
    D = list(map(set, dataSet))
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    while (len(L[k-2]) > 2):
        Ck = aprioriGen(L[k-2], k)
        Lk, supK = scanD(D, Ck, minSupport)
#         print('supportData:', supportData)
#         print('supK:', supK)
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData


L, supportData = apriori(dataSet)
print("L:", L)
print("")
print("supportData:", supportData)

L: [[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})], [frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})], [frozenset({2, 3, 5})]]

supportData: {frozenset({1}): 0.5, frozenset({3}): 0.75, frozenset({4}): 0.25, frozenset({2}): 0.75, frozenset({5}): 0.75, frozenset({1, 3}): 0.5, frozenset({2, 5}): 0.75, frozenset({3, 5}): 0.5, frozenset({2, 3}): 0.5, frozenset({1, 5}): 0.25, frozenset({1, 2}): 0.25, frozenset({2, 3, 5}): 0.5}


### 发现毒蘑菇的相似特征

In [57]:

def generateRules(L, supportData, minConf=0.7):
    """
    函数说明：关联规则生成函数
    频繁项集列表：L
    包含那些频繁项集支持数据的字典：supportData
    最小可信度阈值：minConf
    """
    bigRuleList = []           #bigRuleList是包含可信度的规则列表，此处进行初始化
    for i in range(1, len(L)):  
        #遍历(1, len(L))是要为L[i]提供索引值
        #为什么要从(1, len(L))循环？
        #注意到 L[0]是单元素项集，我们无法从单元素项集中构建关联规则；另外，L[len(L)]是空集
        #所以，只获取有两个或者更多集合的项目
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            print(H1)
            #该函数遍历L中的每一个频繁项集并对每个频繁项集创建只包含单个元素集合的列表H1
            if (i > 1):
            #如果频繁项集元素数目超过2,那么会考虑对它做进一步的合并
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:              
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList
 
#
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """
    函数说明：计算项集中只有两个元素的可信度。计算规则的可信度以及找到满足最小可信度要求的规则
    频繁项集：freqSet
    频繁项集中每个元素frozenset后组成的列表(可以出现在规则右部的元素列表,见234-236行):H
    包含那些频繁项集支持数据的字典：supportData
    包含可信度的规则列表bigRuleList：brl
    最小可信度阈值：minConf
    """
    prunedH = []                            #建立一个满足最小可信度要求的规则列表
    for conseq in H:                        #后件，遍历 H中的所有项集并计算它们的可信度值
        conf = supportData[freqSet]/supportData[freqSet-conseq] #可信度计算
        if conf >= minConf:
            print (freqSet-conseq,'-->',conseq,'conf:',conf)
            #如果某条规则满足最小可信度值,那么将这些规则输出到屏幕显示p
            brl.append((freqSet-conseq, conseq, conf))
            #添加到规则里，brl是前面通过检查的bigRuleList
            prunedH.append(conseq)          #同样需要放入列表到后面检查
    return prunedH                          #返回一个满足最小可信度要求的规则列表
 
#合并
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """
    函数说明：从最初的项集中生成更多的关联规则
    频繁项集：freqSet
    频繁项集中每个元素frozenset后组成的列表(可以出现在规则右部的元素列表,见236-238行):H
    包含那些频繁项集支持数据的字典：supportData
    包含可信度的规则列表bigRuleList：brl
    最小可信度阈值：minConf
    """
    m = len(H[0])                                #计算H中的频繁项集大小m
    print('H=',H)
    print('H[0]=',H[0])
    if (len(freqSet) > (m + 1)): 
        print('len(freqSet)=',len(freqSet))
        #查看频繁项集频繁项集freqSet是否大到可以移除大小为m的子集
        Hmp1 = aprioriGen(H, m+1)      #使用aprioriGen()来生成H中元素的无重复组合
        print('Hmp1=',Hmp1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)   #计算可信度
        print('Hmp1=',Hmp1)
        if (len(Hmp1) > 1):    
            print('len(Hmp1)=',len(Hmp1))
            #满足最小可信度要求的规则列表多于1,则递归来判断是否可以进一步组合这些规则
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)


generateRules(L, supportData)

[frozenset({2}), frozenset({3})]
[frozenset({3}), frozenset({5})]
[frozenset({2}), frozenset({5})]
frozenset({5}) --> frozenset({2}) conf: 1.0
frozenset({2}) --> frozenset({5}) conf: 1.0
[frozenset({1}), frozenset({3})]
frozenset({1}) --> frozenset({3}) conf: 1.0
[frozenset({2}), frozenset({3}), frozenset({5})]
H= [frozenset({2}), frozenset({3}), frozenset({5})]
H[0]= frozenset({2})
len(freqSet)= 3
Hmp1= [frozenset({2, 3}), frozenset({2, 5}), frozenset({3, 5})]
Hmp1= []


[(frozenset({5}), frozenset({2}), 1.0),
 (frozenset({2}), frozenset({5}), 1.0),
 (frozenset({1}), frozenset({3}), 1.0)]

In [56]:
# 关系规则生成函数
def generateRules(L, supportData, minConf=0.7):
    bigRuleList = []
    for i in range(1, len(L)):
        for freqSet in L[i]:
            for freqSet in L[i]:
                H1 = [frozenset([item]) for item in freqSet]
                if (i>1):
                    rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
                else:
                    calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList


def calcConf(freqSet, H, supportData, brl, minConf):
    prunedH = []
    for conseq in H:
        conf = supportData[freqSet] / supportData[freqSet - conseq]
        if conf >= minConf:
            print(freqSet-conseq, '-->', conseq, 'conf:', conf)
            brl.append((freqSet-conseq, conseq, conf))
            prunedH.append(conseq)
            
            
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    m = len(H[0])
    if (len(freqSet) > (m+1)):
        Hmp1 = aprioriGen(H, m+1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        if (len(Hmp1)>1):
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
            
    
rules = generateRules(L, supportData)
rules

frozenset({5}) --> frozenset({2}) conf: 1.0
frozenset({2}) --> frozenset({5}) conf: 1.0
frozenset({1}) --> frozenset({3}) conf: 1.0
frozenset({5}) --> frozenset({2}) conf: 1.0
frozenset({2}) --> frozenset({5}) conf: 1.0
frozenset({1}) --> frozenset({3}) conf: 1.0
frozenset({5}) --> frozenset({2}) conf: 1.0
frozenset({2}) --> frozenset({5}) conf: 1.0
frozenset({1}) --> frozenset({3}) conf: 1.0
frozenset({5}) --> frozenset({2}) conf: 1.0
frozenset({2}) --> frozenset({5}) conf: 1.0
frozenset({1}) --> frozenset({3}) conf: 1.0


TypeError: object of type 'NoneType' has no len()